Tencent社招信息抓取

从mysql_link导入数据库连接

from lxml import etree
from urllib import parse
import requests
import json
from mysql_link import mysql_connect

def _first_or_empty(values):
    """Return the first xpath match, or '' when nothing matched."""
    return values[0] if values else ''


def _sql_escape(value):
    """Escape *value* for use inside a double-quoted MySQL string literal."""
    # Backslashes are doubled first so the ones added for quotes stay intact.
    # Assumes the server's default sql_mode (backslash escapes enabled).
    return str(value).replace('\\', '\\\\').replace('"', '\\"')


def get_detail(detail_url, mysql):
    """Scrape one job-detail page and insert its fields into table ``tencent``.

    Args:
        detail_url: URL of the job-detail page to download.
        mysql: Object exposing ``mysql_do(sql)``; it is called once with the
            finished INSERT statement.

    A page without the expected detail table is reported and skipped instead
    of raising IndexError. Errors raised by ``requests.get`` (including the
    timeout) propagate to the caller.
    """
    user_agent = 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    headers = {
        'User-Agent': user_agent
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(detail_url, headers=headers, timeout=10)
    html_ele = etree.HTML(response.text)
    table = '//table[@class="tablelist textl"]'

    title_cells = html_ele.xpath(table + '/tr[1]/td')  # job title
    if not title_cells:
        print('skip {}: job detail table not found'.format(detail_url))
        return
    title = title_cells[0].text or ''
    print(title)

    place = _first_or_empty(html_ele.xpath(table + '/tr[2]/td[1]/text()'))  # work location
    print(place)
    position = _first_or_empty(html_ele.xpath(table + '/tr[2]/td[2]/text()'))  # job category
    person = _first_or_empty(html_ele.xpath(table + '/tr[2]/td[3]/text()'))  # number of openings

    # Each list item is one bullet; they are stored as a single string.
    str_duty = ''.join(html_ele.xpath(table + '/tr[3]/td/ul/li/text()'))  # responsibilities
    str_requirement = ''.join(html_ele.xpath(table + '/tr[4]/td/ul/li/text()'))  # requirements

    # The values are scraped from a web page, so they are escaped before being
    # placed inside the quoted literals; a stray '"' would otherwise break or
    # alter the statement.
    # NOTE(review): if mysql_do can take query parameters, prefer passing the
    # values separately over building the statement as text.
    fields = (title, place, position, person, str_duty, str_requirement)
    sql = 'insert into tencent (title,place,posi,person,duty,requirement) values("{}","{}","{}","{}","{}","{}")'.format(
        *[_sql_escape(field) for field in fields]
    )
    print(sql)
    mysql.mysql_do(sql)


# Crawl the job-listing pages
def getPage(pages=5):
    """Walk the paginated job list and store every linked detail page.

    Args:
        pages: Number of listing pages to request. Defaults to 5, the count
            that used to be hard-coded. The ``start`` offset in the URL
            advances by 20 per page.

    Each job link found is resolved to an absolute URL and handed to
    ``get_detail`` together with one shared ``mysql_connect()`` object.
    """
    my = mysql_connect()
    url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E8%AF%8D&start={}#a'
    user_agent = 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'

    headers = {
        'User-Agent': user_agent
    }
    for page_index in range(pages):
        full_url = url.format(page_index * 20)
        print(full_url)
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(full_url, headers=headers, timeout=10)

        html_ele = etree.HTML(response.text)
        rows = html_ele.xpath('//table[@class="tablelist"]/tr')
        # The first and last rows are dropped (presumably the column header
        # and the pager row -- confirm against the live markup).
        for row in rows[1:-1]:
            # Link to the detail page of this job posting
            hrefs = row.xpath('./td[1]/a/@href')
            if not hrefs:
                # A row without a link cannot be followed; skip it.
                continue
            # urljoin resolves relative and absolute hrefs alike.
            detail_url = parse.urljoin(full_url, hrefs[0])
            print(detail_url)

            get_detail(detail_url, my)



# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    getPage()

猜你喜欢

转载自blog.csdn.net/qq_41847171/article/details/81865205