"""Scrape Python job postings from hr.tencent.com and append them to tencent.json.

Each results page is fetched with urllib, parsed with BeautifulSoup, and the
extracted rows are appended to ``tencent.json`` as one JSON array per line.
"""
import json
import ssl
from urllib import request

from bs4 import BeautifulSoup

# NOTE(security): this replaces the default HTTPS context for the whole
# process with one that does NOT verify certificates. It is kept because the
# original script relies on it, but it exposes every HTTPS request made by
# this process to man-in-the-middle attacks.
ssl._create_default_https_context = ssl._create_unverified_context

SITE_ROOT = "https://hr.tencent.com/"
OUTPUT_FILE = "tencent.json"
PAGE_SIZE = 10


def build_url(page):
    """Return the search URL for the 1-based results page *page*.

    The ``start`` query parameter is a row offset, so page 1 maps to 0,
    page 2 to 10, and so on.
    """
    # The parentheses matter: ``%`` and ``*`` share the same precedence and
    # associate left to right, so ``fmt % (page - 1) * 10`` would format first
    # and then repeat the resulting string ten times.
    offset = (page - 1) * PAGE_SIZE
    return SITE_ROOT + "position.php?keywords=python&start=%s#a" % offset


def bs4_parse(list_obj):
    """Extract job postings from table rows and append them to the output file.

    Args:
        list_obj: iterable of ``<tr>`` elements (anything exposing
            ``select``), each expected to hold the posting title with its
            link in the first cell and category, head count, work place and
            publish time in the last four cells.

    Rows with fewer than four cells, or whose first cell has no ``<a>`` with
    an ``href``, are skipped instead of aborting the whole page.
    """
    item_list = []
    for tr in list_obj:
        td_list = tr.select('td')
        if len(td_list) < 4:
            continue

        # The first cell holds the title text and the link to the detail
        # page, e.g. <td><a href="...">python engineer</a></td>.
        title_cell = td_list[0]
        a_list = title_cell.select('a')
        if not a_list:
            continue
        href = a_list[0].attrs.get("href")
        if href is None:
            continue
        href = SITE_ROOT + href

        title = title_cell.get_text()
        catalog = td_list[-4].get_text()      # category
        num = td_list[-3].get_text()          # head count
        workPlace = td_list[-2].get_text()    # work place
        publishTime = td_list[-1].get_text()  # publish time
        print(title, catalog, num, workPlace, publishTime, href)

        item_list.append({
            "title": title,
            "catalog": catalog,
            "num": num,
            "workPlace": workPlace,
            "publishTime": publishTime,
            "href": href,
        })

    data = json.dumps(item_list, ensure_ascii=False)
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        # The file is opened in append mode once per page; the trailing
        # newline keeps each page's array on its own line so the file can be
        # parsed line by line instead of becoming "[...][...]".
        f.write(data + "\n")


def tencent(page):
    """Fetch results page *page* (1-based) and store its postings."""
    # Close the HTTP response deterministically once the body has been read.
    with request.urlopen(build_url(page)) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'lxml')
    # Data rows alternate between the "even" and "odd" classes; all "even"
    # rows are processed before the "odd" ones, as in the original script.
    tr_list1 = soup.select('tr[class="even"]')
    tr_list2 = soup.select('tr[class="odd"]')
    bs4_parse(tr_list1 + tr_list2)


if __name__ == '__main__':
    # Scrape the first 10 result pages.
    for page in range(1, 11):
        tencent(page)
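# Python3 ~ Scraping a company's job postings
# (web page residue: "You may also like")
# Reposted from blog.csdn.net/zbrj12345/article/details/80282587
# (web page residue: "Weekly ranking")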