"""Scrape job listings from lagou.com and store them into a MongoDB database.

Adapted from: https://blog.csdn.net/yhj198927/article/details/88828263
(original author's reprint notice applies).
"""
import time
import pymongo
import requests
from bs4 import BeautifulSoup

# Establish the MongoDB connection: database "mydb", collection "lagou".
# (MongoClient connects lazily; no I/O happens until the first operation.)
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
lagou = mydb['lagou']

# Request headers copied from a real browser session (DevTools -> Network).
# NOTE(review): the Cookie below is a captured login session — it will expire
# and is account-specific; replace it before running. Lagou blocks requests
# without browser-like headers, which is presumably why they are hard-coded.
headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'user_trace_token=20171231132715-42f37e4d-edeb-11e7-b924-525400f775ce; LGUID=20171231132715-42f38117-edeb-11e7-b924-525400f775ce; _ga=GA1.2.1622077289.1514698034; WEBTJ-ID=20180520195456-1637d6710e631a-0a4246394d3832-444a022e-1049088-1637d6710e841b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526817297; _gid=GA1.2.1271768012.1526817297; LGSID=20180520195459-9efc9158-5c24-11e8-bd30-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xacba81c0000cbaf5%26issp%3D1%26f%3D3%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26tn%3D78000241_11_hao_pg%26rsv_enter%3D1%26rsv_sug3%3D6%26rsv_sug1%3D4%26rsv_sug7%3D100%26rsv_sug2%3D0%26prefixsug%3D%2525E6%25258B%252589%2525E9%252592%2525A9%26rsp%3D0%26inputT%3D2653%26rsv_sug4%3D2653; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; X_HTTP_TOKEN=b2b9763bf78611580dd3144a4c933505; JSESSIONID=ABAAABAAAGGABCBA28349B0372DF5C3C5A9CD7175E19A73; _putrc=E48AA8047CD3EAE0; login=true; unick=%E6%9D%A8%E7%BA%A2%E6%9D%B0; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=125; gate_login_token=168bd7a1cf64debe233e600dc26d3a3ee031a28d8019a5ca; index_location_city=%E5%8C%97%E4%BA%AC; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526817365; LGRID=20180520195604-c608491d-5c24-11e8-87c2-5254005c3644',
'Host': 'www.lagou.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}


def get_page(url):
    """Fetch one lagou.com listing page, parse the job postings and store them.

    Parameters
    ----------
    url : str
        URL of a lagou.com job-listing page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. when blocked).

    Side effects: prints each parsed record and inserts it into the module-level
    ``lagou`` MongoDB collection, sleeping 1s between inserts to throttle the scrape.
    """
    web_data = requests.get(url, headers=headers)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    web_data.raise_for_status()
    # Pass raw bytes (.content): from_encoding is only honoured for bytes input;
    # combining it with the already-decoded .text is ignored and emits a warning.
    soup = BeautifulSoup(web_data.content, 'html.parser', from_encoding='utf-8')
    # Selectors obtained via browser DevTools: Inspect -> Copy -> Copy selector.
    companynames = soup.select('.list_item_top > div.company > div.company_name > a')
    industrys = soup.select('div.list_item_top > div.company > div.industry')
    positions = soup.select('div.list_item_top > div.position > div.p_top > a > h3')
    addresses = soup.select('.list_item_top > div.position > div.p_top > a > span > em')
    moneys = soup.select('div.list_item_top > div.position > div.p_bot > div > span')
    advantages = soup.select('div.list_item_bot > div.li_b_r')

    # Zip the parallel result lists into one document per job posting.
    for companyname, industry, position, address, money, advantage in zip(companynames, industrys, positions, addresses, moneys, advantages):
        data = {
            'companyname': companyname.get_text().strip(),
            'industry': industry.get_text().strip(),
            'position': position.get_text().strip(),
            'address': address.get_text().strip(),
            # NOTE(review): split() stores these two fields as token lists,
            # unlike the stripped strings above — presumably intentional; confirm.
            'money': money.get_text().split(),
            'advantage': advantage.get_text().split()
        }
        print(data)
        # Collection.insert() was deprecated in pymongo 3 and removed in 4;
        # insert_one() is the supported API and exposes the new document's id.
        lagou_id = lagou.insert_one(data).inserted_id
        time.sleep(1)  # throttle: one insert/request pacing to avoid bans
        print(lagou_id)
        print('----------------------------')

def build_urls(first_page=1, last_page=3):
    """Return lagou listing URLs for pages first_page..last_page (inclusive).

    The original code called ``.format(str(i))`` on a URL string containing no
    ``{}`` placeholder, so all generated URLs were identical and page 1 was
    scraped three times. The placeholder below fixes that.
    """
    return ['https://www.lagou.com/zhaopin/Python/{}/?filterOption=3'.format(page)
            for page in range(first_page, last_page + 1)]


if __name__ == '__main__':
    # Same coverage as the original intent: pages 1, 2 and 3.
    for url in build_urls(1, 3):
        get_page(url)

# Adapted from: blog.csdn.net/yhj198927/article/details/88828263