Scraping Lagou (拉勾网) job listings
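
This script crawls Python job listings from Lagou. The main process logs into the site with Selenium, pages through the positionAjax.json listing API with requests (reusing the browser's cookies), and pushes every company id and position id it finds onto two queues; two worker processes pull from those queues, scrape the company profile and job detail pages with their own logged-in browsers, and write everything into MongoDB.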


#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#
import requests
import json
from random import randint, choice
import pymongo
from time import sleep
from multiprocessing import Process, JoinableQueue as Queue

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Collect company profile information
def company(q):
    # br = get_chrome()
    br = login_lagou(20)
    br.set_window_rect(602, 0, 600, 800)
    db = get_mongodb()
    # collected documents go into the company1 collection
    company = db.company1

    while True:
        if q.empty():
            break
        try:
            company_id = q.get()
            url = 'https://www.lagou.com/gongsi/' + str(company_id) + '.html'
            br.get(url)
            company_info = {}
            company_info['name'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[1]/h1/a').text
            company_info['job_num'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/ul/li[1]/strong').text
            company_info['efficiency'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/ul/li[2]/strong').text
            company_info['time_consuming'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[2]/ul/li[3]/strong').text
            company_info['last_login'] = br.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/ul/li[5]/strong').text
            company_info['introduction'] = br.find_element_by_xpath(
                '/html/body/div[6]/div[1]/div/div[2]/div[2]/div[2]/span[1]').text
            company_info['financing'] = br.find_element_by_xpath('/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[2]/span').text
            company_info['scale'] = br.find_element_by_xpath('/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[3]/span').text
            company.insert_one(company_info)

            # collect the company's interview reviews
            # company_comment(company_id, db, br)
            print('Company %d collected and stored' % company_id)
            q.task_done()
        except Exception as e:
            print('Exception encountered:', e)
            # re-queue the id for another attempt; the failed get() still gets
            # a task_done() so the unfinished-task count stays balanced
            q.put(company_id)
            q.task_done()
            sleep(10)
        sleep(1)
    br.close()
    q.join()
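
# Note on the worker pattern above: every get() is matched by one task_done(),
# and a failed id is put() back as a brand-new task, so the queue's unfinished
# count only reaches zero once every id has really been stored. The q.empty()
# exit check can still race with a producer that is mid-put; the sentinel
# sketch near the end of __main__ shows a more robust shutdown.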

# Collect a company's interview reviews
# def company_comment(id, db, br):
#     """
#     :param id: company id
#     :param db: database handle
#     :param br: browser driver
#     :return:
#     """
#     url = 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId='+str(id)
#     br.get(url)



# Collect the full description of each job posting
def work(q):
    # br = get_chrome()
    br = login_lagou(20)
    br.set_window_rect(101, 0, 600, 600)
    db = get_mongodb()
    job_table = db.job1
    while True:
        if q.empty():
            break
        try:
            id = q.get()
            url = 'https://www.lagou.com/jobs/' + str(id) + '.html'
            br.get(url)
            job = {'id': id}
            content = br.find_element_by_xpath('/html/body/div[5]/div[1]/dl[1]/dd[2]').text
            job['content'] = content

            job_table.insert_one(job)
            print('Job posting %d stored' % id)
            q.task_done()
        except Exception as e:
            print('Exception encountered:', e)
            # re-queue for retry, balancing the unfinished-task count
            q.put(id)
            q.task_done()
            sleep(10)
        sleep(1)
    br.close()
    q.join()


# Get a Chrome browser driver (headless flags left commented out for debugging)
def get_chrome():
    options = Options()
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    br = webdriver.Chrome(options=options)
    return br

# Get a MongoDB database handle
def get_mongodb():
    # connect and authenticate against the xxx database (credentials go to
    # MongoClient directly, which works on both pymongo 3.x and 4.x)
    cli = pymongo.MongoClient(host='192.168.12.244', port=27017,
                              username='ss', password='123456', authSource='xxx')
    return cli.xxx

# Harvest cookies from the logged-in browser session
def get_cookie(br):
    # br = get_chrome()
    br.get('https://www.lagou.com/')
    tmp_cookies = br.get_cookies()
    # convert Selenium's cookie list into the dict format requests expects
    return {i['name']: i['value'] for i in tmp_cookies}
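
# The positionAjax API tends to reject requests that arrive without fresh
# session cookies, so the logged-in Selenium session doubles as the cookie
# source and is re-harvested whenever the API starts refusing us (see the
# retry branch in __main__).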

def login_lagou(sec):
    # Log in to Lagou. The submit click below is commented out, so the login
    # (button click and any captcha) is finished by hand during sleep(sec).
    br = get_chrome()
    br.get('https://www.lagou.com/frontLogin.do')
    mobi = br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[1]/input')
    pwd = br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[2]/input')
    mobi.send_keys('15324818121')
    pwd.send_keys('123456')
    sleep(sec)
    # open two spare tabs (unused)
    # br.execute_script('window.open("https://www.lagou.com/")')
    # br.execute_script('window.open("https://www.lagou.com/")')
    # br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[5]/input').click()

    return br

if __name__ == '__main__':
    # queue of company ids to collect
    companies = Queue()
    # queue of job ids to collect
    jobs = Queue()

    br = login_lagou(20)
    br.set_window_rect(0, 0, 200, 600)

    # start one process to collect company profiles
    c = Process(target=company, args=(companies,))
    c.start()
    sleep(20)
    # start one process to collect job postings
    jo = Process(target=work, args=(jobs,))
    jo.start()

    # request headers: the Referer mimics a real listing-page visit and the
    # Content-Type matches what the Ajax endpoint expects
    header = {
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%85%A8%E5%9B%BD',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }

    UAs = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7'
    ]

    cookies = get_cookie(br)



    # get a MongoDB connection
    db = get_mongodb()

    # crawl the first 30 pages of the Python listing
    i = 1
    header['User-Agent'] = choice(UAs)

    while True:
        if i > 30:
            break
        data = {'first': 'false', 'pn': i, 'kd': 'Python'}
        resp = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false',
                             data=data, headers=header, cookies=cookies)

        try:
            content = json.loads(resp.text)
            if content['success']:
                resultData = content['content']['positionResult']['result']
                if resultData:
                    # store the raw position records in the work1 collection
                    db.work1.insert_many(resultData)

                for j in resultData:
                    # queue the company id for the company worker
                    companies.put(j['companyId'])
                    # queue the position id for the job-detail worker
                    jobs.put(j['positionId'])
        except Exception as e:
            # the API refused us or returned non-JSON: switch identity and retry
            print('Exception encountered:', e, resp.text)
            # fetch fresh cookies from the logged-in browser
            cookies = get_cookie(br)
            header['User-Agent'] = choice(UAs)
            print('Switching identity and retrying')
            # i is incremented at the bottom of the loop, but this page was not
            # collected, so step back to retry it
            i -= 1

        sleep_time = randint(1, 3)
        print('Listing page %d done, sleeping %d seconds' % (i, sleep_time))
        sleep(sleep_time)
        i += 1
    # companies.put(None)
    # jobs.put(None)
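
    # A sketch of the sentinel shutdown hinted at by the two commented lines
    # above: enqueue one None per queue after the crawl, and have each worker
    # break on it instead of polling q.empty():
    #
    #     item = q.get()
    #     if item is None:
    #         q.task_done()
    #         break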
    br.close()

    c.join()
    jo.join()
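
Once a run finishes, the three collections (work1, company1, job1) can be spot-checked from Python. A minimal sketch, reusing get_mongodb() from the script above (count_documents needs pymongo 3.7+):

db = get_mongodb()
print('listing records:', db.work1.count_documents({}))
print('company profiles:', db.company1.count_documents({}))
print('job descriptions:', db.job1.count_documents({}))
for doc in db.company1.find().limit(3):
    print(doc['name'], doc.get('scale'), doc.get('financing'))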



Reposted from www.cnblogs.com/imshun/p/10513049.html