"""Scrape the Maoyan top-100 movie board (name + release time) into a JSON file.

Note: the site uses human verification — manually visit it once in a browser
before running the spider. Written while learning.
"""
import json
import requests
from lxml import etree
def getOnePage(n):
    """Fetch page *n* of the Maoyan top-100 board and return its HTML text.

    Args:
        n: zero-based page index; each page holds 10 movies (offset = n * 10).

    Returns:
        The response body as text.
    """
    url = f'https://maoyan.com/board/4?offset={n*10}'
    # A browser-like User-Agent is sent so the site serves the normal page.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    # timeout keeps the spider from hanging forever on a stalled connection
    r = requests.get(url, headers=header, timeout=10)
    return r.text
def parse(text):
    """Parse one board page's HTML and yield a dict per movie.

    Args:
        text: raw HTML of one board page.

    Yields:
        dict with keys 'name' (movie title) and 'time' (release-time text).
    """
    html = etree.HTML(text)
    names = html.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/a/@title')
    times = html.xpath('//p[@class="releasetime"]/text()')
    for name, time in zip(names, times):
        # Yield a fresh dict per movie. The original reused ONE dict object,
        # mutating and re-yielding it, so any consumer that collected the
        # generator (e.g. list(parse(...))) got N copies of the last movie.
        yield {'name': name, 'time': time}
def save2File(data, path='D://Project//Pycharm project//spider//猫眼//movie.json'):
    """Append *data* as one JSON line (with a trailing comma) to *path*.

    Args:
        data: a JSON-serializable record, e.g. {'name': ..., 'time': ...}.
        path: output file; defaults to the original hard-coded location so
            existing callers are unaffected.
    """
    # ensure_ascii=False keeps Chinese titles human-readable in the file.
    line = json.dumps(data, ensure_ascii=False) + ',\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def run():
    """Scrape all 10 board pages and append every movie record to the file."""
    for page_index in range(10):
        html_text = getOnePage(page_index)
        # parse() is a generator, so records stream straight to disk.
        for record in parse(html_text):
            save2File(record)
# Script entry point: only scrape when executed directly, not on import.
if __name__ == "__main__":
    run()