# A simple web crawler in Python: code and steps

# 导入第三方库
from urllib.parse import urlencode
import os
import requests
# 查找或者操作文件的话就用这个os模块
from hashlib import md5
# Step 2: fetch the target data.
# Each call loads the result of a single AJAX search request. `offset` is the
# only query parameter that changes between requests, so it is the argument.
def get_page(offset):
    """Fetch one page of search results and return the decoded JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The parsed JSON body when the server answers with HTTP 200.
        ``None`` when the request fails, the status is not 200, or the body
        is not valid JSON.
    """
    # 2.1 The query string is a set of key/value pairs, so build it as a dict.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '车模',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }

    # 2.2 Build the full URL.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)

    # 2.3 Send the request. Without a timeout a stalled connection would
    # block the crawler forever.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None

    # 2.4 Only a 200 response carries a usable result.
    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # The body was not JSON (for example an HTML error page).
        return None
# Step 3: parse the target page.
# Define a function that implements the parsing step.
def getImages(json):
    """Yield one record for every image in a search-result payload.

    Args:
        json: Parsed response mapping with a ``'data'`` list of items, or
            ``None`` when no response is available.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``, one per
        entry in each item's ``'image_list'``. Items without an
        ``'image_list'`` are skipped. Nothing is yielded for an empty or
        ``None`` payload.
    """
    # get_page returns None on failure; treat that as "no results" instead
    # of raising AttributeError on None.get.
    if not json:
        return

    # 3.1 / 3.2 Walk the data and pick out each item's image list and title.
    for item in json.get('data') or ():
        image_list = item.get('image_list')
        if not image_list:
            continue
        title = item.get('title')
        for image in image_list:
            # 3.3 yield (rather than return) so iteration resumes after each
            # record: the image link and its title are handed back together.
            yield {
                'image': image.get('url'),
                'title': title,
            }
# Step 4: download the data.
# 4.1 Save one image: create a folder named after the item's title, then
# request the image link and write the bytes into that folder.
def saveImage(item):
    """Download one image into a folder named after its title.

    Args:
        item: Mapping with an ``'image'`` URL and a ``'title'``, in the
            shape yielded by ``getImages``.

    Returns:
        None. The image is written to ``<title>/<md5 of content>.jpg``
        unless that file already exists. Items without a URL, failed
        requests and non-200 responses are skipped without writing a file.
    """
    image_url = item.get('image')
    if not image_url:
        # Nothing to download; avoid the TypeError from 'http:' + None.
        return

    # Titles are free text. Replace characters that are path separators or
    # invalid in directory names, and strip dots/spaces so '.' or '..'
    # cannot redirect the download outside its own folder.
    forbidden = '\\/:*?"<>|'
    folder = str(item.get('title') or '')
    folder = folder.translate(str.maketrans(forbidden, '_' * len(forbidden)))
    folder = folder.strip(' .') or 'untitled'

    # makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair.
    os.makedirs(folder, exist_ok=True)

    # Prepend a scheme only when the link does not already carry one,
    # e.g. protocol-relative links such as '//host/path'.
    if not image_url.startswith(('http://', 'https://')):
        image_url = 'http:' + image_url

    try:
        response = requests.get(image_url, timeout=10)
    except requests.RequestException:
        return
    if response.status_code != 200:
        return

    # Naming the file after the content hash means the same image is never
    # stored twice in one folder.
    content = response.content
    file_path = os.path.join(folder, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset):
    """Fetch one page of search results and save every image it lists.

    Args:
        offset: Passed through to ``get_page`` as the page offset.
    """
    page = get_page(offset)
    if page is None:
        # get_page returns None when the response status is not 200;
        # there is nothing to parse in that case.
        return
    for item in getImages(page):
        print(item)
        saveImage(item)
# Script entry point: process the result page at offset 5 when run directly.
if __name__ == '__main__':
    main(offset=5)
# End of article: "A simple web crawler in Python: code and steps"

# (page footer: "You may also like")