Spider学习笔记(三):抓取51招聘岗位信息

import requests
import lxml
from lxml import etree

# Request headers sent with every page fetch; the User-Agent string is that
# of a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.99 Safari/537.36"
    ),
}

# Listing page that the script scrapes when it is run.
url = "https://jobs.51job.com/guangzhou/p2/"

def getJob(url):
    """Download one listing page and print one line per job entry found.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        None. For each entry the name, link, address, city, requirements and
        introduction are printed, in that order, on a single line.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server replies with an HTTP error status.
    """
    def first(node, path, default=''):
        """Return the first XPath result for *path* under *node*, or *default*."""
        found = node.xpath(path)
        return found[0] if found else default

    # A timeout keeps an unresponsive server from hanging the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    # The page is decoded as GBK; errors='replace' stops a single stray byte
    # from aborting the whole page.
    html = resp.content.decode('gbk', errors='replace')

    mytree = etree.HTML(html)
    if mytree is None:
        # Nothing parseable came back, so there is nothing to print.
        return

    jobList = mytree.xpath('/html/body/div[3]/div[3]/div[1]/div[2]/div')
    for job in jobList:
        # Skip child <div>s without a title link instead of crashing on them.
        # NOTE(review): presumably these are non-job rows such as pagination;
        # verify against the live page.
        if not job.xpath('./p[1]/span[1]/a'):
            continue

        # Text and link of the entry's title anchor.
        # NOTE(review): the original comment called this the "company name",
        # while the variable is named jobName; confirm which one it is.
        jobName = first(job, './p[1]/span[1]/a/text()').strip()
        jobUrl = first(job, './p[1]/span[1]/a/@href')

        # Title attribute of the second link in the first paragraph
        # (labelled "company address" in the original comment).
        jobAddress = first(job, './p[1]/a/@title')

        # City in which the job is located.
        jobCity = first(job, './p[1]/span[2]/text()')

        # Job requirements: up to the first four text fragments of the second
        # paragraph, joined with commas. The XPath is evaluated only once.
        jobOrder = ','.join(
            text.strip() for text in job.xpath('./p[2]/text()')[:4]
        )

        # Company introduction, held in the third paragraph's title attribute.
        jobintro = first(job, './p[3]/@title')

        print(jobName, jobUrl, jobAddress, jobCity, jobOrder, jobintro)

# Script entry point: scrape the listing page configured in `url` and print
# every job entry found on it. This runs as soon as the module is executed.
getJob(url)

猜你喜欢

转载自blog.csdn.net/xiaohuoche175/article/details/81749126