Python笔记——获取拉勾网南京python岗位数据

Fiddler抓包如下:

程序打印如下:

源码如下:

import re
import requests

class HandleLaGou(object):
    """Scrape python job listings from lagou.com, routed through a local Fiddler proxy."""

    def __init__(self):
        # One persistent session so cookies set by the listing page are
        # re-sent automatically on the subsequent Ajax POST requests.
        self.laGou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        self.city_list = ""

    # Fetch the nationwide city list.
    def handle_city(self):
        """Populate self.city_list with every city name found on the allCity page."""
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)
        # Start the per-city crawl from a clean cookie jar.
        self.laGou_session.cookies.clear()

    def handle_city_job(self, city):
        """Fetch every result page of python jobs for one city and print each response."""
        first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
        first_response = self.handle_request(method="GET", url=first_request_url)
        total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
        # Cities with no listings have no totalNum span, so search() returns
        # None and .group(1) raises AttributeError. Catch only that error —
        # a bare except would also hide genuine bugs.
        try:
            total_page = total_page_search.search(first_response).group(1)
        except AttributeError:
            return
        for page_num in range(1, int(total_page) + 1):
            data = {
                "pn": page_num,
                "kd": "python"
            }
            page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false" % city
            referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
            # Header values must be str — do not .encode() the Referer;
            # mixing bytes into a str headers dict misbehaves in requests.
            self.header['Referer'] = referer_url
            response = self.handle_request(method="POST", url=page_url, data=data)
            print(response)

    def handle_request(self, method, url, data=None, info=None):
        """Issue a GET or POST through the Fiddler proxy and return the decoded body text.

        method -- "GET" or "POST"
        data   -- form payload for POST requests
        info   -- unused; kept for interface compatibility
        """
        # Route traffic through Fiddler at 127.0.0.1:8888 for inspection.
        # Both proxy URLs need the "http://" scheme prefix (the original
        # "http:127.0.0.1:8888" for https is malformed and unparseable).
        proxies = {
            "http": "http://127.0.0.1:8888",
            "https": "http://127.0.0.1:8888",
        }
        if method == "GET":
            response = self.laGou_session.get(url=url, headers=self.header, proxies=proxies, verify=r"D:/Fiddler/FiddlerRoot.pem")
        elif method == "POST":
            response = self.laGou_session.post(url=url, headers=self.header, data=data, proxies=proxies, verify=r"D:/Fiddler/FiddlerRoot.pem")
        response.encoding = 'utf-8'
        return response.text


if __name__ == '__main__':
    laGou = HandleLaGou()
    # Build the city list first; it also clears the session cookies.
    laGou.handle_city()

    # Crawl only the first city for now — remove the break to crawl them all.
    for city in laGou.city_list:
        laGou.handle_city_job(city)
        break

这里有个小技巧

以前用C++去搞爬虫,简直累死,现在用python真是香,很多都帮忙处理了!

通过使用同一个 session:爬取数据时,网站可能会先跳转到一个页面设置 cookie,之后才允许继续爬取;session 会自动保存并带上这些 cookie。

发布了1312 篇原创文章 · 获赞 2429 · 访问量 185万+

猜你喜欢

转载自blog.csdn.net/qq78442761/article/details/104816407