Python web scraping in practice: scraping the 100 Python exercises from the Runoob tutorial, the Maoyan top 100 movies, and Toutiao image galleries.

1. The 100 Python exercises from the Runoob tutorial.
Code (1):

#!/usr/bin/python
# coding:utf-8
import re
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch a page and return its text, or None on any failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Collect the exercise text: the bold-label paragraphs, the highlighted
    solution code, and the first <pre> block."""
    datalist = []
    results = re.findall('<p><strong>(.*?)</strong>(.*?)</p>', html, re.S)
    for i in results:  # each match is a (label, text) tuple
        for j in i:
            datalist.append(j)
    pattern = re.compile('<div class="hl-main">(.*?)</div>', re.S)
    results = re.findall(pattern, html)
    for result in results:  # highlighted solution source code
        soup = BeautifulSoup(result, 'lxml')
        datalist.append(soup.get_text())
    soup = BeautifulSoup(html, 'lxml')
    for pre in soup.select('pre')[0:1]:  # first <pre> block only
        datalist.append(pre.get_text())
    return datalist


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        for i in content:
            f.write(i)
            f.write('\n')

def main(offset):
    url = 'http://www.runoob.com/python/python-exercise-example' + str(offset) + '.html'
    html = get_one_page(url)
    if html:  # skip pages that failed to download
        data = parse_one_page(html)
        write_to_file(data)

if __name__ == '__main__':
    for i in range(1, 101):
        main(offset=i)
        print("Downloading exercise " + str(i) + "......")

Code (2):

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from lxml import etree
def get_page(url, retries=3):
    """Fetch a page, retrying up to `retries` times on failure."""
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=4)
            res.encoding = 'utf-8'
            if res.status_code == 200:
                return res.text.encode('utf-8')
        except Exception as e:
            print(url, e)
    return None
def get_index(url):
    """Yield the absolute URL of each exercise page listed on the index page."""
    html = get_page(url)
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    data = soup.find_all('ul')[2]  # the third <ul> holds the links to the exercises
    for a in data.find_all('a'):
        yield 'http://www.runoob.com' + a.get('href')

def get_data(url):
    """Scrape one exercise page and append its text to the output file."""
    html = get_page(url)
    if not html:
        return
    doc = pq(html)
    datas = etree.HTML(html)
    title = doc('#content h1').text()
    print('Downloading: ' + title)
    data = doc('#content p')
    name = pq(data[1]).text()  # problem statement
    num = pq(data[2]).text()   # program analysis
    n = pq(data[3]).text()     # "source code" caption
    code = ''.join(datas.xpath('//div[@class="hl-main"]/span/text()'))
    with open('python_100_exercises.txt', 'a+', encoding='utf-8') as f:
        f.write(title + '\n')
        f.write(name + '\n')
        f.write(num + '\n')
        f.write(n + '\n')
        f.write(code)
        f.write('\r\n')
def main():
    url = r'http://www.runoob.com/python/python-100-examples.html'
    for i in get_index(url):
        get_data(i)

if __name__ == '__main__':
    main()
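Code (2) mixes three parsing libraries (BeautifulSoup, pyquery and raw lxml XPath) where any one of them would do; for this page they are interchangeable. A minimal sketch showing all three extracting the same title from one made-up snippet:

from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from lxml import etree

html = '<div id="content"><h1>Python 练习实例1</h1></div>'
print(BeautifulSoup(html, 'lxml').h1.get_text())   # BeautifulSoup tag access
print(pq(html)('#content h1').text())              # pyquery CSS selector
print(etree.HTML(html).xpath('//h1/text()')[0])    # lxml XPath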

2. Maoyan top 100 movies.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import json
import re
from multiprocessing import Pool

import requests
def get_one_page(url):
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    try:
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        print(e)
        return None
def parse_one_page(html):
    # One <dd> block per movie: rank, poster URL, title, cast, release date, score.
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # drop the "主演：" prefix
            'time': item[4].strip()[5:],   # drop the "上映时间：" prefix
            'score': item[5] + item[6]     # integer part + fraction part
        }
def write_to_file(content):
    # One JSON object per line (JSON Lines); the with-block closes the file.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if not html:
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])  # one worker per page: offsets 0, 10, ..., 90
    pool.close()
    pool.join()
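Since write_to_file appends one JSON object per line (the JSON Lines format), the results are easy to load back for further processing. A minimal sketch, assuming result.txt was produced by the script above:

import json

with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]
print(movies[0]['title'], movies[0]['score'])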

3. Scraping Toutiao image galleries.
Code:

#!/usr/bin/python
#coding:utf-8
import os
import json
import re
import pymongo
import requests
from config import *
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing import Pool

# Request headers shared by every request below; the cookie comes from the
# author's browser session.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
def get_page_index(offset, keyword):
    """Request one page of search results from Toutiao's AJAX search API."""
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis'
    }

    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Error requesting the index page")
        return None
def parse_page_index(html):
    """Yield the article URL of every item in one page of search results."""
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
def get_page_detail(url):
    """Fetch one article (gallery) page."""
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Error requesting the detail page", url)
        return None
def parse_page_detail(html, url):
    """Extract the gallery title and image URLs embedded in the page's JavaScript."""
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    print(title)
    # The image list is embedded as a JSON string passed to JSON.parse(...).
    images_pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }
def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print("Error requesting image", url)
        return None

def save_image(content):
    # Name the file after the MD5 of its bytes so duplicate images are saved only once.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def save_to_mongo(result):
    """Store one gallery record in MongoDB."""
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB', result)
        return True
    return False

def main(offset):
    html = get_page_index(offset, KEY)
    if not html:
        return
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)

if __name__ == '__main__':
    groups = [x * 20 for x in range(Group_start, Group_end + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
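save_image names each file after the MD5 digest of its bytes, so the same image downloaded twice collapses to a single file on disk. A quick illustration:

from hashlib import md5

content = b'the raw bytes of a downloaded image'
print(md5(content).hexdigest() + '.jpg')  # identical bytes always produce the same filename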

Config file (config.py):

MONGO_URL = '127.0.0.1'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

Group_start = 1
Group_end = 20

KEY = '詹姆斯'  # search keyword (Chinese for "James")
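For reference, get_page_index turns its parameter dict into the AJAX query string with urllib.parse.urlencode, which also percent-encodes the keyword. A minimal sketch of what that produces:

from urllib.parse import urlencode

params = {'offset': 0, 'format': 'json', 'keyword': '街拍', 'count': '20'}
print('https://www.toutiao.com/search_content/?' + urlencode(params))
# https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&count=20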

Reposted from blog.csdn.net/qq_40909772/article/details/85038044