版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/yhj198927/article/details/88828263
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# Establish the database connection: MongoDB on localhost, database "mydb",
# collection "lagou" (one document per scraped job posting).
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
lagou = mydb['lagou']

# Request headers captured from a logged-in browser session. The Cookie is
# required because lagou.com rejects anonymous/scripted traffic; it is
# session-specific and will expire — refresh it from a real browser as needed.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'user_trace_token=20171231132715-42f37e4d-edeb-11e7-b924-525400f775ce; LGUID=20171231132715-42f38117-edeb-11e7-b924-525400f775ce; _ga=GA1.2.1622077289.1514698034; WEBTJ-ID=20180520195456-1637d6710e631a-0a4246394d3832-444a022e-1049088-1637d6710e841b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526817297; _gid=GA1.2.1271768012.1526817297; LGSID=20180520195459-9efc9158-5c24-11e8-bd30-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xacba81c0000cbaf5%26issp%3D1%26f%3D3%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26tn%3D78000241_11_hao_pg%26rsv_enter%3D1%26rsv_sug3%3D6%26rsv_sug1%3D4%26rsv_sug7%3D100%26rsv_sug2%3D0%26prefixsug%3D%2525E6%25258B%252589%2525E9%252592%2525A9%26rsp%3D0%26inputT%3D2653%26rsv_sug4%3D2653; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; X_HTTP_TOKEN=b2b9763bf78611580dd3144a4c933505; JSESSIONID=ABAAABAAAGGABCBA28349B0372DF5C3C5A9CD7175E19A73; _putrc=E48AA8047CD3EAE0; login=true; unick=%E6%9D%A8%E7%BA%A2%E6%9D%B0; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=125; gate_login_token=168bd7a1cf64debe233e600dc26d3a3ee031a28d8019a5ca; index_location_city=%E5%8C%97%E4%BA%AC; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526817365; LGRID=20180520195604-c608491d-5c24-11e8-87c2-5254005c3644',
    'Host': 'www.lagou.com',
    'Upgrade-Insecure-Requests': '1',
    # Rejoined into a single literal: the blog paste had broken this string
    # across a raw line break, which is a SyntaxError in Python.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}


def get_page(url):
    """Scrape one lagou.com job-listing page and store each posting in MongoDB.

    Fetches *url*, extracts company name, industry, position title, address,
    salary and advantages for every listing on the page, prints each record,
    and inserts it into the ``lagou`` collection (one document per listing).

    Args:
        url: Full URL of a lagou.com job-listing page.

    Returns:
        None. Side effects: HTTP GET, ``print`` output, MongoDB inserts,
        and a 1-second sleep per record to throttle the scraper.
    """
    web_data = requests.get(url, headers=headers)
    # requests has already decoded the body to str, so no from_encoding is
    # needed here (passing one with str markup only triggers a bs4 warning).
    soup = BeautifulSoup(web_data.text, 'html.parser')
    # CSS selectors obtained via browser devtools: Inspect -> Copy -> Copy selector.
    companynames = soup.select('.list_item_top > div.company > div.company_name > a')
    industrys = soup.select('div.list_item_top > div.company > div.industry')
    positions = soup.select('div.list_item_top > div.position > div.p_top > a > h3')
    addresses = soup.select('.list_item_top > div.position > div.p_top > a > span > em')
    moneys = soup.select('div.list_item_top > div.position > div.p_bot > div > span')
    advantages = soup.select('div.list_item_bot > div.li_b_r')
    # Zip the parallel result lists into one JSON-like document per listing
    # and insert each into the database.
    for companyname, industry, position, address, money, advantage in zip(
            companynames, industrys, positions, addresses, moneys, advantages):
        data = {
            'companyname': companyname.get_text().strip(),
            'industry': industry.get_text().strip(),
            'position': position.get_text().strip(),
            'address': address.get_text().strip(),
            # NOTE(review): these two use split() (list of whitespace-separated
            # tokens) while the fields above use strip(); kept as-is since
            # existing documents already store lists here — confirm intent.
            'money': money.get_text().split(),
            'advantage': advantage.get_text().split()
        }
        print(data)
        # insert() is deprecated since pymongo 3 and removed in pymongo 4;
        # insert_one().inserted_id preserves the original printed ObjectId.
        lagou_id = lagou.insert_one(data).inserted_id
        time.sleep(1)  # throttle between inserts to avoid anti-scraping blocks
        print(lagou_id)
        # output_html(data)
        print('----------------------------')


if __name__ == '__main__':
    # Bug fix: the original template had no '{}' placeholder, so .format(i)
    # was a no-op and all three URLs fetched page 1. Pages 1-3 now differ.
    urls = ['https://www.lagou.com/zhaopin/Python/{}/?filterOption=3'.format(i)
            for i in range(1, 4)]
    for url in urls:
        get_page(url)