Scraping Lagou: Cracking Lagou's Anti-Scraping Mechanism

Lagou's anti-scraping measures are pretty aggressive.

I'd guess that if you're reading this article, you've run into the problem of scraping only 5 or 6 records before the crawl stops working. Did you get burned? Ha, I got burned once too, so I took a serious look at Lagou's anti-scraping mechanism, and it really is a bit nasty. First, a screenshot:
[Screenshot: the crawler running]
I've comfortably scraped well past 60 pages without triggering the anti-scraping at all.

Now look at the database: it already holds several thousand records.
[Screenshot: the MongoDB collection]

Not a single hiccup, folks!!

Let's start by examining Lagou's request headers:

"
     "Accept":"application/json",
    "Accept-Encoding":"gzip, deflate, br",
    "Accept-Language":"zh-CN,zh;q=0.9",
    "Connection":"keep-alive",
    "Host":"m.lagou.com",
    "Cookie":"_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(timeStamp=timeStamp,time=time1),
    "Referer":"https://m.lagou.com/search.html",
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
    "X-Requested-With":"XMLHttpRequest",

They look fine, but Lagou actually hides an anti-scraping check in the cookie. The cookie is so long you may not have noticed, but near the end there are two timestamps (the Hm_lpvt_... value and the prefix of LGRID) that have to track the current time. That's the key!! Their anti-scraping isn't just a User-Agent check; the cookie is validated too.
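The fix is straightforward: regenerate those two values on every run and substitute them into the cookie. A minimal sketch of just that part, lifted from the full script below (the "..." stands for the long static portion of the cookie shown above):

import datetime

now = datetime.datetime.now()
timeStamp = int(now.timestamp()*1000)                     # millisecond epoch, goes into Hm_lpvt_...
time1 = datetime.datetime.strftime(now, "%Y%m%d%H%M%S")   # e.g. 20181119231628, goes into LGRID

cookie = ("...; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; "
          "LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644").format(timeStamp=timeStamp, time=time1)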
Now, here is the full source code.

One of the imports is an IP pool (get_ip). You can leave the IP pool out and just use your own IP without getting banned (a minimal stand-in is sketched right below). Also, this is my first time writing articles like this, so show some support, thanks. Once I've found a job I'll follow up with posts on scraping Douyin short videos, Huoshan short videos, and YouTube videos; that code is already written, just for fun.
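The proxies module itself isn't included here. All get_ip has to do is return an iterator of requests-style proxy dicts, since the script only ever calls next() on it, so if you don't have an IP pool a stand-in like this works (the structure is my assumption; the empty dict means "no proxy", i.e. your own IP):

# proxies.py -- minimal stand-in for the IP pool
import itertools

def get_ip():
    # yields proxy dicts in the format requests expects for its proxies= argument
    pool = [
        {},  # empty dict = no proxy, requests go out over your own IP
        # add real entries like {"http": "http://1.2.3.4:8080", "https": "http://1.2.3.4:8080"}
    ]
    return itertools.cycle(pool)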

If this helped, give it a like. Thanks!

import requests
from lxml import etree
import pymongo
import time
import datetime
import json

from proxies import get_ip  # IP pool; optional, see the note above

# the two time values the cookie check expects to be fresh
now = datetime.datetime.now()
timeStamp = int(now.timestamp()*1000)           # millisecond epoch, used in Hm_lpvt_...
geshi = "%Y%m%d%H%M%S"
time1 = datetime.datetime.strftime(now, geshi)  # e.g. 20181119231628, used in LGRID
headers = {
    "Accept":"application/json",
    "Accept-Encoding":"gzip, deflate, br",
    "Accept-Language":"zh-CN,zh;q=0.9",
    "Connection":"keep-alive",
    "Host":"m.lagou.com",
    "Cookie":"_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(timeStamp=timeStamp,time=time1),
    "Referer":"https://m.lagou.com/search.html",
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
    "X-Requested-With":"XMLHttpRequest",
}

# search parameters for the mobile API
city = "广州"
positionName = "python"
# pageNo = "1"
pageSize = "15"


def get_detail_url(pageNo,proxies):
    base_url = "https://m.lagou.com/search.json?city={city}&positionName={positionName}&pageNo={pageNo}&" \
               "pageSize={pageSize}".format(city=city,positionName=positionName,pageNo=pageNo,pageSize=pageSize)
    res = requests.get(base_url,headers=headers,proxies=proxies)
    content = res.content.decode()
    dict1 = json.loads(content)
    # print(dict1)
    list1 = dict1['content']['data']['page']['result']
    for i in list1:
        yield "https://m.lagou.com/jobs/{}.html".format(i['positionId'])

# fields scraped: position title, salary, location, years of experience, education, company name, job description
def parse_detail(url,proxies):
    # print(url)
    res = requests.get(url,headers=headers,proxies=proxies)
    content = res.content.decode()
    xml = etree.HTML(content)
    postitle = xml.xpath('//div[@class="postitle"]//h2[@class="title"]//text()')  # position title
    salary = xml.xpath('//span[@class="item salary"]//span[@class="text"]//text()')  # salary
    workaddress = xml.xpath('//span[@class="item workaddress"]//span[@class="text"]//text()')  # location
    workyear = xml.xpath("//span[@class='item workyear']//span//text()")  # years of experience
    education = xml.xpath('''//span[@class='item education']//span[@class="text"]//text()''')  # education requirement
    companyName = xml.xpath('//div[@class="dleft"]//h2//text()')  # company name
    detail = xml.xpath("//div[@class='content']/p//text()")  # job description
    dict1 = {}
    # print(postitle)
    if len(postitle) != 0:
        dict1['postitle'] = postitle[0]
        dict1['salary'] = salary[0]
        dict1['workaddress'] = workaddress[0]
        dict1['workyear'] = workyear[0]
        dict1['education'] = education[0].strip()
        dict1['companyName'] = companyName[0].strip()
        dict1['detail'] = ''.join(detail)  # join the list of text nodes into one string
        return dict1
    else:
        print('No data returned; switching proxy and retrying in 3 seconds')
        time.sleep(3)
        ip = get_ip()
        proxies = next(ip)
        return parse_detail(url,proxies)  # return the retry's result so the caller still gets a dict


def save_mongodb(dict1):
    print(dict1)
    client = pymongo.MongoClient('localhost', 27017)
    lagou_db = client.lagou.lagou  # database "lagou", collection "lagou"
    lagou_db.insert_one(dict1)
    print("Saved to MongoDB")

def run():

    ip = get_ip()
    proxies1 = next(ip)
    proxies2 = next(ip)
    for i in range(100):
        print("Scraping page {}".format(i+1))
        for j,url in enumerate(get_detail_url(i,proxies1)):
            dict1 = parse_detail(url,proxies2)
            save_mongodb(dict1)
            print("Page {}, item {} saved".format(i+1,j+1))
    print("Finished scraping")




if __name__ == '__main__':
    run()
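To double-check what actually landed in MongoDB (the script writes to the lagou database, lagou collection), a quick look with pymongo is enough; note that count_documents needs pymongo 3.7 or newer:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
print(client.lagou.lagou.count_documents({}))        # how many positions were saved
print(client.lagou.lagou.find_one({}, {'_id': 0}))   # peek at one record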


Reposted from blog.csdn.net/weixin_42712704/article/details/85017214