# SpiderTencent.py
# 爬取腾讯招聘信息存入mongodb数据库
# (Crawl Tencent recruitment listings and store them in MongoDB.)

import requests
from lxml import etree
import time
import pymongo
import random


class SpiderTencent(object):
    """Crawl Tencent job postings page by page and persist them to MongoDB.

    Usage: construct, then call :meth:`start_switch`, which loops over the
    paginated listing (offset 0..2500 in steps of 10), buffers every row in
    ``self.tencent_data`` and finally flushes the buffer to the local
    MongoDB database ``py3``, collection ``tencent``.
    """

    # Pool of User-Agent strings; one is chosen at random per request to
    # reduce the chance of being blocked. Hoisted to a class constant so the
    # list is built once instead of on every get_html() call.
    HEADERS_LIST = [
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"},
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"},
    ]

    def __init__(self):
        """Initialize the base URL, the page-offset index, the run switch
        and the in-memory buffer for the scraped records.
        """
        self.url = "http://hr.tencent.com/position.php?&start="
        self.index = 0           # page offset appended to self.url (advances by 10)
        self.switch = True       # crawler run switch; False stops start_switch()
        self.tencent_data = []   # buffered job dicts, flushed to MongoDB at the end

    def con_mongodb(self):
        """Connect to the local MongoDB and bulk-insert all buffered records.

        Fixes vs. original: ``Collection.insert`` is deprecated (removed in
        PyMongo 4) — replaced by one ``insert_many`` call, which is also a
        single round trip instead of one per document. The client is closed
        when done, and an empty buffer is skipped because ``insert_many``
        raises on an empty document list.
        """
        client = pymongo.MongoClient(host="localhost", port=27017)
        try:
            collection = client.py3.tencent
            if self.tencent_data:
                collection.insert_many(self.tencent_data)
        finally:
            client.close()
        print("已将数据全部存入到mongodb中!")

    def get_html(self, url):
        """Fetch *url* with a random User-Agent and return the parsed lxml tree.

        Bug fixed: the original issued the request inside ``try``, then
        issued it AGAIN unconditionally in ``finally`` — doubling every
        successful request — and if both attempts failed, ``response`` was
        unbound and the method crashed with ``UnboundLocalError``. Now the
        request is made once and retried a single time on a requests error;
        if the retry also fails, the exception propagates to the caller.
        """
        headers = random.choice(self.HEADERS_LIST)
        try:
            response = requests.get(url=url, headers=headers, timeout=20)
        except requests.RequestException:
            print("have a error")
            # Single retry; a second failure propagates instead of masking
            # the error behind an UnboundLocalError.
            response = requests.get(url=url, headers=headers, timeout=20)
        return etree.HTML(response.text)

    def load_page(self, url):
        """Extract title/category/headcount/location via XPath and buffer them.

        Bug fixed: the original printed ``info_list`` after the ``for`` loop
        had exhausted the ``zip`` iterator, so it printed a useless
        ``<zip object>``; the rows are now materialized first.
        """
        content = self.get_html(url)
        job_title = content.xpath('(//tr[@class="even"] | //tr[@class="odd"])//a/text()')    # 职位名称
        job_category = content.xpath('//tr[@class="even"]//td[2]//text() | //tr[@class="odd"]//td[2]//text()')   # 职位类别
        number = content.xpath('//tr[@class="even"]//td[3]//text() | //tr[@class="odd"]//td[3]//text()')  # 人数
        location = content.xpath('//tr[@class="even"]//td[4]//text() | //tr[@class="odd"]//td[4]//text()')  # 地点
        # Materialize so the rows survive both the loop and the print below.
        info_list = list(zip(job_title, job_category, number, location))
        for title, category, num, loc in info_list:
            self.tencent_data.append({
                "job_title": title,
                "job_category": category,
                "number": num,
                "location": loc,
            })
        print(info_list)
        print("正在获取数据" + "-" * 10)

    def start_switch(self):
        """Run the crawl loop until the last page, then persist everything.

        Pages are fetched at offsets 0, 10, ..., 2500; after the offset-2500
        page has been loaded the switch flips off and the buffered data is
        written to MongoDB.
        """
        while self.switch:
            tencent_url = self.url + str(self.index)  # build the page url
            self.load_page(tencent_url)
            time.sleep(5)  # be polite to the server between pages
            if self.index < 2500:   # not at the last page yet
                self.index += 10
            else:
                self.switch = False
                self.con_mongodb()  # flush buffered records to MongoDB
                print("程序结束")


# Script entry point: build the crawler and run it to completion.
if __name__ == '__main__':
    spider = SpiderTencent()
    spider.start_switch()

# 转载自 blog.csdn.net/qq_42029527/article/details/83181348
# (Reposted from the CSDN article above.)