爬取zol索尼相机排行榜

import requests
import re
import json
from bs4 import BeautifulSoup

def get_one_page(url):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as decoded by ``requests`` (``response.text``).

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    headers = {'User-Agent': user_agent}
    # headers must be passed by keyword: the second positional argument of
    # requests.get() is ``params``, which would append the User-Agent to the
    # query string instead of sending it as an HTTP header.
    # The timeout keeps an unresponsive server from blocking forever.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text

获取网页内容

def get_information(html_text):
    """Lazily extract ranking entries from the page markup.

    Each regex hit captures three groups, in order: the text of the link
    that ends in ``shtml">``, the content of the ``rank__price`` element,
    and the content of the next ``<span>``.

    Args:
        html_text: Page source to scan.

    Yields:
        A dict with the keys ``'index'``, ``'price'`` and ``'score'``
        holding the three captured strings of one match.
    """
    field_names = ('index', 'price', 'score')
    # re.S lets ``.`` span newlines, since one entry covers several lines.
    entry_re = re.compile('shtml">(.{1,16})</a></div>.*?"rank__price">(.{1,8})</div>.*?<span>(.*?)</span>', re.S)
    for match in entry_re.finditer(html_text):
        yield dict(zip(field_names, match.groups()))

正则匹配
yield整合起数据结构
findall返回匹配到的列表,里面为元组

def recording(information, filename='豆瓣Top250.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        information: JSON-serialisable object to store.
        filename: Path of the output file. Defaults to the name that was
            previously hard-coded, so existing one-argument calls keep
            writing to the same file.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    line = json.dumps(information, ensure_ascii=False) + '\n'
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(line)

将爬到的信息写入文件

def main():
    """Fetch the ranking page, store every parsed entry, and report progress.

    The single-iteration loop requests one fixed URL, writes each entry
    produced by ``get_information`` through ``recording``, then prints a
    per-page message followed by a completion message.
    """
    ranking_url = 'https://top.zol.com.cn/compositor/15/manu_167.html'
    for page_no in range(1):
        page_source = get_one_page(ranking_url)
        for entry in get_information(page_source):
            recording(entry)
        print('正在爬取第{}页'.format(page_no))
    print('爬取完毕!')

# Run the scraper only when executed as a script, so importing this module
# does not trigger network requests or file writes.
if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/weixin_39025679/article/details/106175025