51job (前程无忧) crawler, for learning purposes only

51job job-search URL: https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
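
The search keyword in that URL is double percent-encoded. A quick way to confirm what it says, using only the standard library (the string below is copied straight from the URL):

from urllib.parse import unquote

keyword = '%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE'  # taken from the URL above
print(unquote(unquote(keyword)))  # decoding twice -> 大数据 ("big data")
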
First, right-click and open the browser's Inspect tool to analyze the page. The detail-page link sits inside an a tag, so we can pull it out with XPath: urls = html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href"). Requesting one of those URLs takes us to the detail page.
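
To see the XPath pattern in isolation, here is a minimal, self-contained sketch run on a toy fragment whose class names mirror the 51job list page (the hrefs are made up for illustration):

from lxml import etree

fragment = '''
<div class="dw_table">
  <div class="el"><p class="t1"><span><a href="https://jobs.51job.com/a/1.html">Job A</a></span></p></div>
  <div class="el"><p class="t1"><span><a href="https://jobs.51job.com/a/2.html">Job B</a></span></p></div>
</div>
'''
html = etree.HTML(fragment)
print(html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href"))
# ['https://jobs.51job.com/a/1.html', 'https://jobs.51job.com/a/2.html']
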
On the detail page, every field visible in the rendered page also appears in the HTML source, so XPath can extract each one.
With the page analysis done, the rest is code.

The function that collects the detail-page links:

def get_urls():
    for i in range(1, 46):  # limit the page count
        print("Fetching page {}...".format(i))
        url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?'.format(i)
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        urls = html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href")
        # print(urls)
        parse_urls(urls)
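
Before looping over all 45 pages, it is worth sanity-checking the XPath against a single live list page. A minimal sketch (setting gbk here is an assumption about the list pages, mirroring what the detail-page code below does):

import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'}
url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?'
response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # assumption: list pages, like the detail pages, are gbk-encoded
html = etree.HTML(response.text)
print(html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href")[:5])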

Parsing each detail page and extracting the data:

def parse_urls(urls):
    for ul in urls:
        try:
            print(ul)
            response = requests.get(ul, headers=headers)
            response.encoding = 'gbk'  # detail pages are gbk-encoded
            html = etree.HTML(response.text)
            # print(response.text)
            position_name = html.xpath("//div[@class='cn']/h1/text()")[0]  # job title
            company_name = html.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()")[0]  # company name
            address = html.xpath("//div[@class='cn']/p[2]/text()")[0]  # location
            salary = html.xpath("//div[@class='cn']/strong/text()")[0]  # salary
            induction_requirements = html.xpath("//div[@class='cn']/p[2]/text()")[1]  # experience requirement
            education = html.xpath("//div[@class='cn']/p[2]/text()")[2]  # education
            number = html.xpath("//div[@class='cn']/p[2]/text()")[3]  # number of openings
            release_time = html.xpath("//div[@class='cn']/p[2]/text()")[4]  # posting date
            print(position_name, company_name, address, salary, induction_requirements, education, number, release_time)
            datas = [position_name, company_name, address, salary, induction_requirements, education, number, release_time]
            # writer.writerow(datas)
        except Exception as e:
            print('Error: {}; incomplete data, skipping'.format(e))
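
The text nodes under p[2] tend to be padded with non-breaking spaces ('\xa0'). A small helper (a sketch; the exact padding characters are an assumption and may vary by page) can tidy each field before it is written to the CSV:

def clean(text):
    # strip the '\xa0' padding around the p[2] fields (assumed format)
    return text.replace('\xa0', '').strip()

print(clean('\xa0上海-浦东新区\xa0\xa0'))  # -> 上海-浦东新区

Applying it is one extra line: datas = [clean(x) for x in datas].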

The complete code:

# Time: 2020/03/29
# Author: 渔戈
import requests
from lxml import etree
import csv

# Write the scraped data to a CSV file
fp = open('前程无忧.csv', 'a', encoding='utf-8', newline='')
writer = csv.writer(fp)  # initialize the CSV writer
header = ['position_name', 'company_name', 'address', 'salary', 'induction_requirements', 'education', 'number', 'release_time']
writer.writerow(header)  # write the header row
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}


def get_urls():
    for i in range(1, 46):  # limit the page count; the search has at most 45 pages
        print("Fetching page {}...".format(i))
        url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?'.format(i)
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        urls = html.xpath("//div[@class='dw_table']//div[@class='el']/p/span/a/@href")
        # print(urls)
        parse_urls(urls)

def parse_urls(urls):
    for ul in urls:
        try:
            print(ul)
            response = requests.get(ul, headers=headers)
            response.encoding = 'gbk'  # detail pages are gbk-encoded
            html = etree.HTML(response.text)
            # print(response.text)
            position_name = html.xpath("//div[@class='cn']/h1/text()")[0]  # job title
            company_name = html.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()")[0]  # company name
            address = html.xpath("//div[@class='cn']/p[2]/text()")[0]  # location
            salary = html.xpath("//div[@class='cn']/strong/text()")[0]  # salary
            induction_requirements = html.xpath("//div[@class='cn']/p[2]/text()")[1]  # experience requirement
            education = html.xpath("//div[@class='cn']/p[2]/text()")[2]  # education
            number = html.xpath("//div[@class='cn']/p[2]/text()")[3]  # number of openings
            release_time = html.xpath("//div[@class='cn']/p[2]/text()")[4]  # posting date
            print(position_name, company_name, address, salary, induction_requirements, education, number, release_time)
            datas = [position_name, company_name, address, salary, induction_requirements, education, number, release_time]
            writer.writerow(datas)
        except Exception as e:
            print('Error: {}; incomplete data, skipping'.format(e))

if __name__ == '__main__':
    get_urls()
    fp.close()
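
One refinement worth considering (not part of the original script): pause between requests and retry transient failures, so the crawl is gentler on the server. A sketch of such a wrapper, using only standard requests features; the names fetch, retries, and delay are chosen here for illustration:

import time
import requests

def fetch(url, headers, retries=3, delay=1.0):
    # Retry transient network errors, sleeping between attempts.
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print('Request failed (attempt {}): {}'.format(attempt, e))
            time.sleep(delay)
    return None  # caller decides how to handle a permanent failure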

Reposted from blog.csdn.net/FreeTime_9527/article/details/105667991