# 1. 多线程爬虫
import threading  # thread module
# from threading import Thread
import time
# import requests
def funa():
    """Print one line, sleep for one second, then print a second line.

    Takes no arguments and returns None.
    """
    print('一直小爬虫')
    # One-second pause between the two prints.
    time.sleep(1)
    print('就是玩儿')
def funb():
    """Print one line, sleep for two seconds, then print a second line.

    Takes no arguments and returns None.
    """
    print('大海星辰')
    # Two-second pause between the two prints.
    time.sleep(2)
    print('倒车请注意')
# Sequential calls: funb() starts only after funa() has returned.
funa()
funb()

# Threaded version: run the same two functions in child threads.
if __name__ == '__main__':
    # target takes the function object itself, without parentheses.
    # Writing funa() would run the function right here in the main thread
    # and hand its return value (None) to Thread as the target.
    t1 = threading.Thread(target=funa)
    t2 = threading.Thread(target=funb)
    # Start the child threads.
    t1.start()
    t2.start()
# Address of the Douban movie Top 250 page.
url = 'https://movie.douban.com/top250'
# GET request
import requests
from lxml import etree
import json
class Douban:
    """Crawler for the Douban movie Top 250 page.

    run() fetches self.url, extracts a (title, score, link) tuple per
    movie and appends the records to 'top250.json'.
    """

    def __init__(self):
        """Set the target URL and the headers sent with the request."""
        self.url = 'https://movie.douban.com/top250'
        # Browser-style User-Agent header sent with the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }

    def get_res(self):
        """Send a GET request to self.url and store the response in self.res."""
        self.res = requests.get(self.url, headers=self.headers)

    def get_data(self):
        """Extract title, score and link from self.res into self.li_data.

        Must be called after get_res(), because it reads self.res.text.
        self.li_data becomes a list of (title, score, href) tuples.
        """
        html = etree.HTML(self.res.text)
        # XPath: @name selects an attribute, text() selects the text
        # between an opening and a closing tag.
        title = html.xpath('//div[@class="hd"]/a/span[1]/text()')
        score = html.xpath('//div[@class="star"]/span[2]/text()')
        href = html.xpath('//div[@class="hd"]/a/@href')
        # zip pairs the three lists by position and stops at the shortest.
        self.li_data = list(zip(title, score, href))

    def save_data(self):
        """Append every record in self.li_data to 'top250.json'.

        Each record is written as one JSON object followed by ',\\n'.
        The file is opened in append mode, so repeated runs accumulate
        lines; the file as a whole is not a single valid JSON document.
        """
        # With no records the file is left untouched (and not created).
        if not self.li_data:
            return
        # Open the file once instead of once per record.
        with open('top250.json', 'a', encoding='utf-8') as f:
            for title, score, href in self.li_data:
                dic = {'title': title, 'score': score, 'href': href}
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                f.write(json.dumps(dic, ensure_ascii=False) + ',\n')

    def run(self):
        """Run one fetch -> parse -> save pass.

        Only self.url is requested; pagination is not implemented.
        """
        self.get_res()
        self.get_data()
        self.save_data()
# Instantiate the crawler and run one fetch -> parse -> save pass.
db = Douban()
db.run()