1. Disable cookies
Some websites identify and track users through their cookies, so we need to keep the target site from recognizing our session information across requests.
In Scrapy, cookies are enabled by default; the relevant line ships commented out in settings.py (# COOKIES_ENABLED=False).
Uncomment it and set COOKIES_ENABLED = False. If a site does require cookies, you can still supply them manually by adding a cookie entry to the request headers.
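In settings.py this is a single uncommented line:

COOKIES_ENABLED = False  # disable Scrapy's cookie middleware for the whole project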
A spider that carries its cookie in the headers looks like this:

import scrapy


class LagouspiderSpider(scrapy.Spider):
    name = "lagouspider"
    allowed_domains = ["www.lagou.com"]
    url = 'https://www.lagou.com/jobs/positionAjax.json?'  # city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false
    page = 1
    allpage = 0
    # Session cookie copied from the browser; the values below are stale and
    # must be replaced with your own before running.
    cookie = 'JSESSIONID=ABAAAABAAAFCAAEG34858C57541C1F9DF75AED18C3065736; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524281748; 04797acf-4515-11e8-90b5- LGSID=20180421130026-e7e614d7-4520-PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C 26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F4302345.html; LGRID=20180421130208-24b73966-4521-11e8-90f2-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524286956'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'cookie': cookie,
    }

    def start_requests(self):
        # POST the search form; the cookie travels in the headers because
        # Scrapy's own cookie handling is disabled.
        yield scrapy.FormRequest(self.url, headers=self.headers,
                                 formdata={'first': 'true', 'pn': str(self.page),
                                           'kd': 'python', 'city': '深圳'},
                                 callback=self.parse)
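With the project scaffolding in place (and the parse callback from the next section), the spider runs like any other; the output filename here is just an example:

scrapy crawl lagouspider -o positions.json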
2. Set the download delay
In Scrapy, the download delay is off by default; the corresponding line in settings.py is commented out (# DOWNLOAD_DELAY = 3).
Remove the #, or add a time.sleep(random.randint(5, 10)) between page requests in the spider itself. Keep in mind that time.sleep blocks Scrapy's event loop, so for large crawls the settings-based delay is usually the better choice.
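If you take the settings route, a minimal settings.py sketch looks like this. RANDOMIZE_DOWNLOAD_DELAY is on by default and spreads the actual wait between 0.5x and 1.5x of DOWNLOAD_DELAY, which makes the timing look less mechanical:

# settings.py
DOWNLOAD_DELAY = 3               # base delay, in seconds, between requests
RANDOMIZE_DOWNLOAD_DELAY = True  # default; waits 0.5x-1.5x of DOWNLOAD_DELAY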
The sleep-based variant lives in the spider's parse callback. The imports go at the top of the spider file; the items module path is project-specific:

import json
import random
import time

from lagou.items import LagouItem  # hypothetical path; adjust to your project layout

    def parse(self, response):
        item = LagouItem()
        data = json.loads(response.text)
        totalCount = data['content']['positionResult']['totalCount']  # total number of postings
        resultSize = data['content']['positionResult']['resultSize']  # postings per page
        result = data['content']['positionResult']['result']          # list of 15 postings per page
        for each in result:
            for field in item.fields:
                if field in each.keys():
                    item[field] = each.get(field)
            yield item
        time.sleep(random.randint(5, 10))  # pause 5-10 seconds before requesting the next page
        if int(resultSize):
            self.allpage = int(totalCount) // int(resultSize) + 1
            if self.page < self.allpage:
                self.page += 1
                yield scrapy.FormRequest(self.url, headers=self.headers,
                                         formdata={'first': 'false', 'pn': str(self.page),
                                                   'kd': 'python', 'city': '深圳'},
                                         callback=self.parse)
3. Set User-Agent and proxy IP
In settings.py:
DOWNLOADER_MIDDLEWARES = {
    'doubanMongo.middlewares.RandomUserAgent': 300,
    'doubanMongo.middlewares.RandomProxy': 400,
}

USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
]

PROXIES = [
    {'ip_port': '117.48.214.249:16817', 'user_passwd': '632345244:4tf9pcpw'},
    # {'ip_port': '117.48.214.249:16817', 'user_passwd': ''},
    # {'ip_port': '117.48.214.249:16817', 'user_passwd': ''},
    # {'ip_port': '117.48.214.249:16817', 'user_passwd': ''},
]
In middlewares.py:
import base64
import random


class RandomProxy(object):
    def process_request(self, request, spider):
        # spider.settings replaces the long-deprecated scrapy.conf import
        proxy = random.choice(spider.settings.get('PROXIES'))
        if not proxy.get('user_passwd'):
            # proxy with no credentials: just route the request through it
            request.meta['proxy'] = 'http://' + proxy['ip_port']
        else:
            # The Proxy-Authorization header expects HTTP Basic auth:
            # base64("user:password"), sent as a str.
            b_pw = proxy['user_passwd'].encode('utf-8')                   # str -> bytes
            s_base64_userpasswd = base64.b64encode(b_pw).decode('utf-8')  # bytes -> str
            request.headers['Proxy-Authorization'] = 'Basic ' + s_base64_userpasswd
            request.meta['proxy'] = 'http://' + proxy['ip_port']


class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(spider.settings.get('USER_AGENTS'))
        request.headers.setdefault('User-Agent', useragent)
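To sanity-check the encoding outside Scrapy, here is a standalone sketch using the credentials from the PROXIES sample above:

import base64

creds = '632345244:4tf9pcpw'  # the 'user_passwd' value from PROXIES
token = base64.b64encode(creds.encode('utf-8')).decode('utf-8')
print('Proxy-Authorization: Basic ' + token)
# prints: Proxy-Authorization: Basic NjMyMzQ1MjQ0OjR0ZjlwY3B3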