# 版权声明 (copyright): 个人原创, 所属 @jerry 本人
# https://blog.csdn.net/qq_42938842/article/details/83623654
# 代码如下 (the code follows):
import json
import time
from queue import Empty, Queue
from threading import Lock, Thread

import requests
from bs4 import BeautifulSoup
# Global flags that keep the worker threads' run() loops alive;
# main() flips them to False once the corresponding queue has drained.
g_crawl = True
g_parse = True
class CrawThread(Thread):
    """Crawler thread.

    Pulls page numbers off *page_quue*, fetches the corresponding 51job
    search-result page over HTTP, and pushes the raw response body onto
    *data_queue* for the parser threads. Runs until the module-level
    flag ``g_crawl`` is set to False.
    """

    def __init__(self, name, page_quue, data_queue):
        # NOTE(review): 'page_quue' is a typo in the original; the parameter
        # name is kept so keyword callers are not broken.
        super().__init__()
        self.name = name
        self.page_quue = page_quue
        # '{}' is filled in with the page number taken from the queue.
        self.url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,c,2,{}.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        self.data_queue = data_queue

    def run(self):
        print('---线程%s---正在启动' % self.name)
        while g_crawl:
            try:
                # Non-blocking get: raises queue.Empty when no pages are
                # left, which lets the loop re-check the g_crawl flag.
                page = self.page_quue.get(False)
            except Empty:
                # Back off briefly instead of busy-spinning.
                time.sleep(0.1)
                continue
            try:
                url = self.url.format(page)
                # Timeout so a stalled server cannot hang the thread forever.
                r = requests.get(url=url, headers=self.headers, timeout=10)
                self.data_queue.put(r.content)
            except Exception as e:
                # Log instead of silently swallowing network errors.
                print('采集线程%s出错: %s' % (self.name, e))
            time.sleep(1)
        print('---线程%s---结束---' % self.name)
class ParseThread(Thread):
    """Parser thread.

    Pulls raw HTML pages off *data_queue*, extracts the job listings,
    and appends each listing as one JSON line to the shared file *fp*,
    serialized by *lock*. Runs until the module-level flag ``g_parse``
    is set to False.
    """

    def __init__(self, name, data_queue, fp, lock, page_queue):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp      # shared output file handle
        self.lock = lock  # serializes writes to fp across parser threads
        # Unused in this class; kept for interface compatibility.
        self.page_queue = page_queue

    def run(self):
        print('---%s---正在启动' % self.name)
        while g_parse:
            try:
                # Non-blocking get: raises queue.Empty when nothing is
                # waiting, which lets the loop re-check the g_parse flag.
                data = self.data_queue.get(False)
            except Empty:
                time.sleep(0.1)
                continue
            try:
                self.parse(data)
            except Exception as e:
                # A malformed page must not kill the thread; log and go on.
                print('解析线程%s出错: %s' % (self.name, e))
            time.sleep(1)
        print('---解析线程%s---结束---' % self.name)

    def parse(self, data):
        """Extract every job row from one result page and write each as
        a JSON line (UTF-8, non-ASCII preserved) to the output file."""
        soup = BeautifulSoup(data, 'lxml')
        # The first '.el' row is the table header, so skip it.
        rows = soup.select('#resultList > .el')[1:]
        for row in rows:
            # Strip newlines/whitespace that 51job embeds in the cells.
            title = row.select('.t1 a')[0].string.replace('\n', '').strip()
            company = row.select('.t2 a')[0].string.replace('\n', '').strip()
            place = row.select('.t3')[0].string
            salary = row.select('.t4')[0].string
            publish_time = row.select('.t5')[0].string
            item = {
                '职位': title,
                '公司': company,
                '地点': place,
                '薪资': salary,
                '发布时间': publish_time,
            }
            line = json.dumps(item, ensure_ascii=False)
            # `with` guarantees the lock is released even if write() raises
            # (the original acquire()/release() pair leaked on error).
            with self.lock:
                self.fp.write(line + '\n')
def create_queue(num_pages=10):
    """Build the two work queues for the pipeline.

    Args:
        num_pages: number of result pages to crawl (pages 1..num_pages).
            Defaults to 10, preserving the original behavior.

    Returns:
        A ``(page_queue, data_queue)`` tuple: *page_queue* is pre-filled
        with the page numbers to fetch; *data_queue* is an empty bounded
        queue for the raw HTML responses.
    """
    page_queue = Queue(num_pages)
    data_queue = Queue(num_pages)
    for page in range(1, num_pages + 1):
        page_queue.put(page)
    return page_queue, data_queue
def main():
    """Wire up the crawl/parse pipeline.

    Starts three crawler and three parser threads sharing a pair of
    queues, waits for the page queue and then the data queue to drain,
    flips the global stop flags, and joins all threads. Results are
    written to 'jobs.txt' as JSON lines.
    """
    global g_crawl, g_parse
    # One lock serializes writes to the shared output file.
    lock = Lock()
    page_queue, data_queue = create_queue()
    crawl_thread_list = []
    parse_thread_list = []
    crawl_name_list = ['采集线程1', '采集线程2', '采集线程3']
    parse_name_list = ['解析线程1', '解析线程2', '解析线程3']
    # `with` ensures the file is flushed and closed even on error
    # (the original never closed it).
    with open('jobs.txt', 'w', encoding='utf8') as fp:
        for name in crawl_name_list:
            t_crawl = CrawThread(name, page_queue, data_queue)
            t_crawl.start()
            crawl_thread_list.append(t_crawl)
        for name in parse_name_list:
            t_parse = ParseThread(name, data_queue, fp, lock, page_queue)
            t_parse.start()
            parse_thread_list.append(t_parse)
        # Poll with a short sleep (not a busy spin) until every page
        # number has been taken, then stop the crawlers.
        while not page_queue.empty():
            time.sleep(0.1)
        g_crawl = False
        for crawl in crawl_thread_list:
            crawl.join()
        # Likewise wait for the parsers to drain the response queue.
        while not data_queue.empty():
            time.sleep(0.1)
        g_parse = False
        for parse in parse_thread_list:
            parse.join()
    print('主线程--子线程全都结束')
# Script entry point: only run the pipeline when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()