Anti-anti-crawler measures for Scrapy spiders

1. Disable cookies

Some websites identify and track users through their cookies, so we need to prevent the target site from recognizing our session information.

In Scrapy, cookies are enabled by default (the setting ships commented out: # COOKIES_ENABLED = False)

Uncomment that line and set COOKIES_ENABLED = False (that is, do not enable cookies). For requests that still need a cookie, add it to the request headers instead, as in the spider further down.
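
The settings change itself is a single line; a minimal sketch of settings.py:

# settings.py
COOKIES_ENABLED = False   # disable the cookie middleware so every request looks like a fresh session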

import json
import random
import time

import scrapy

from ..items import LagouItem  # assumes the usual Scrapy project layout; adjust to your items module


class LagouspiderSpider(scrapy.Spider):
    name = "lagouspider"
    allowed_domains = ["www.lagou.com"]

    url = 'https://www.lagou.com/jobs/positionAjax.json?'  # full query: city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false
    page = 1
    allpage = 0

    cookie = 'JSESSIONID=ABAAAABAAAFCAAEG34858C57541C1F9DF75AED18C3065736; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524281748; 04797acf-4515-11e8-90b5- LGSID=20180421130026-e7e614d7-4520-PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C 26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F4302345.html; LGRID=20180421130208-24b73966-4521-11e8-90f2-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524286956'
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
               'cookie': cookie}

    def start_requests(self):
        # POST the search form; the cookie travels inside the headers,
        # bypassing the (disabled) cookie middleware
        yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
            'first': 'true', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'},
            callback=self.parse)
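
One detail worth noting: with COOKIES_ENABLED = False, Scrapy's cookie middleware is switched off entirely, so the cookies= argument of scrapy.Request would be ignored; that is why the cookie string is placed directly in the headers above.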


2. Set the download delay

In Scrapy, the download delay is disabled by default (the setting ships commented out: #DOWNLOAD_DELAY = 3)

Uncomment that line, or call time.sleep(random.randint(5, 10)) between requests inside the spider itself, as in the parse() method below.
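
The settings.py variant looks like this (DOWNLOAD_DELAY and RANDOMIZE_DOWNLOAD_DELAY are standard Scrapy settings; the values are illustrative):

# settings.py
DOWNLOAD_DELAY = 3   # seconds between requests to the same site
# RANDOMIZE_DOWNLOAD_DELAY is True by default, so Scrapy actually waits
# between 0.5x and 1.5x DOWNLOAD_DELAY, which is harder to fingerprint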

    def parse(self, response):
        item = LagouItem()
        data = json.loads(response.text)

        totalCount = data['content']['positionResult']['totalCount']  # total number of job postings
        resultSize = data['content']['positionResult']['resultSize']  # postings per page

        result = data['content']['positionResult']['result']  # list of (up to) 15 postings
        for each in result:
            for field in item.fields:
                if field in each.keys():
                    item[field] = each.get(field)
            yield item

        # random pause between pages; note that time.sleep() blocks Scrapy's
        # event loop, so DOWNLOAD_DELAY is the cleaner option for large crawls
        time.sleep(random.randint(5, 10))

        if int(resultSize):
            self.allpage = int(totalCount) // int(resultSize) + 1
            if self.page < self.allpage:
                self.page += 1
                yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
                    'first': 'false', 'pn': str(self.page), 'kd': 'python', 'city': '深圳'},
                    callback=self.parse)
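
As an alternative to a fixed delay, Scrapy's built-in AutoThrottle extension adapts the delay to the server's response times; a minimal configuration sketch (these are standard Scrapy settings, the values are illustrative):

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5    # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 60     # upper bound when the server slows down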

3. Set a random User-Agent and proxy IP

In settings.py:

DOWNLOADER_MIDDLEWARES = {
    'doubanMongo.middlewares.RandomUserAgent': 300,
    'doubanMongo.middlewares.RandomProxy':400
}

USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]

PROXIES=[{'ip_port':'117.48.214.249:16817','user_passwd':'632345244:4tf9pcpw'}
        #{'ip_port':'117.48.214.249:16817','user_passwd':''},
        #{'ip_port':'117.48.214.249:16817','user_passwd':''},
        #{'ip_port':'117.48.214.249:16817','user_passwd':''}
       ]

In middlewares.py:

import base64
import random

from scrapy.utils.project import get_project_settings  # scrapy.conf was removed in newer Scrapy

settings = get_project_settings()


class RandomProxy(object):

    def process_request(self, request, spider):
        proxy = random.choice(settings['PROXIES'])
        if not proxy['user_passwd']:
            # proxy that needs no authentication
            request.meta['proxy'] = 'http://' + proxy['ip_port']
        else:
            # base64-encode "user:password" for HTTP Basic proxy auth
            b_pw = bytes(proxy['user_passwd'], encoding="utf-8")  # str to bytes
            base64_userpasswd = base64.b64encode(b_pw)  # b64encode takes a bytes object
            s_base64_userpasswd = str(base64_userpasswd, encoding="utf-8")  # bytes to str
            # header format the proxy server expects
            request.headers['Proxy-Authorization'] = 'Basic ' + s_base64_userpasswd
            request.meta['proxy'] = "http://" + proxy['ip_port']


class RandomUserAgent(object):

    def process_request(self, request, spider):
        # pick a random User-Agent from the pool defined in settings
        useragent = random.choice(settings['USER_AGENTS'])
        request.headers.setdefault('User-Agent', useragent)
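
To sanity-check that the middleware is really rotating the User-Agent, a throwaway spider pointed at httpbin.org works well (the spider below is a made-up example, not part of the project):

import scrapy


class UACheckSpider(scrapy.Spider):
    name = "uacheck"  # hypothetical test spider
    start_urls = ["https://httpbin.org/headers"]

    def parse(self, response):
        # httpbin echoes the received headers back as JSON, so the
        # rotated User-Agent should appear here in the log
        self.logger.info(response.text)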


