1. Problem
The number of visits per unit time of the crawler reaches the peak limit set by the server, and the server will block our local IP
2. The role of a proxy
- Break through access limits imposed on your own IP
- Hide your real IP
3. Proxy-related websites
- Kuaidaili ("Fast Proxy", kuaidaili.com)
- Xici Proxy (xicidaili.com)
- www.goubanjia.com
4. Types of proxy IP
- http: used for URLs served over the http protocol
- https: used for URLs served over the https protocol
5. Anonymity levels of proxy IPs
- Transparent: the server knows the request uses a proxy, and also knows the request's real IP
- Anonymous: the server knows a proxy is used, but does not know the real IP
- Elite (high anonymity): the server does not know a proxy is used, let alone the real IP
6. Proxy usage with requests
1. Single-IP proxy mode
import requests

# requests matches proxies-dict keys against the request URL's scheme, which
# urllib parses in LOWER case.  The original used an upper-case 'HTTPS' key,
# which never matches, so the request silently went out without any proxy.
proxy = {
    'https': '162.105.30.101:8080'
}
url = '爬取链接地址'
response = requests.get(url, proxies=proxy)
2. Multi-IP proxy mode
import requests
# random is used to pick one entry from the proxy pool for each request
import random

# Each address in the pool serves both the http and the https scheme, so
# build the per-scheme mapping from a flat tuple of proxy addresses.
_proxy_addresses = (
    'http://61.135.217.7:80',
    'http://118.114.77.47:8080',
    'http://112.114.31.177:808',
    'http://183.159.92.117:18118',
    'http://110.73.10.186:8123',
)
proxy = [{'http': addr, 'https': addr} for addr in _proxy_addresses]
url = '爬取链接地址'
# A random pool entry per call spreads requests across the proxies.
response = requests.get(url, proxies=random.choice(proxy))
3. A simple Zhilian (zhaopin.com) recruitment crawler
# --- Imports and shared constants for the Zhilian (zhaopin.com) crawler ---
import requests
from bs4 import BeautifulSoup
import re
import ssl
import time
import random
# Disable HTTPS certificate verification process-wide so requests to hosts
# with bad/self-signed certificates do not fail.
# NOTE(review): this weakens TLS security for EVERY https connection made by
# this process — confirm it is really needed for the target site.
ssl._create_default_https_context = ssl._create_unverified_context
# Pool of browser User-Agent strings; one is picked at random per request
# (see get_job_txt) so the traffic looks like it comes from many browsers.
user_agent = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
'''
代理若出错,替换代理池,但代理池需要更新
'''
# (Translation of the note above: if the proxy errors out, switch to the
# proxy pool below — but the pool's addresses must be refreshed first.)
# Backup proxy pool, intentionally left commented out; to enable it, pass
# proxies=random.choice(proxy) to requests.get in get_job_txt.
# proxy = [
# {
# 'http': 'http://61.135.217.7:80',
# 'https': 'http://61.135.217.7:80',
# },
# {
# 'http': 'http://118.114.77.47:8080',
# 'https': 'http://118.114.77.47:8080',
# },
# {
# 'http': 'http://112.114.31.177:808',
# 'https': 'http://112.114.31.177:808',
# },
# {
# 'http': 'http://183.159.92.117:18118',
# 'https': 'http://183.159.92.117:18118',
# },
# {
# 'http': 'http://110.73.10.186:8123',
# 'https': 'http://110.73.10.186:8123',
# },
# ]
def get_job_txt(city, kw, txt_name):
    """Crawl Zhilian (zhaopin.com) search results and append them to a file.

    Walks the result pages for the given city/keyword, extracts job title,
    company, salary, workplace and detail URL from each listing, prints each
    record and appends it tab-separated to ``<txt_name>.txt``.  Stops at the
    first page that yields no listings.

    :param city: city name, sent as the ``jl`` query parameter
    :param kw: job keyword, sent as the ``kw`` query parameter
    :param txt_name: output file name without the ``.txt`` extension
    """
    # Result pages are 1-based; the original loop started at p=0, which
    # wastes (or duplicates) the very first request.
    for page in range(1, 101):
        time.sleep(2)  # throttle so the server's rate limit is not hit
        url = ('https://sou.zhaopin.com/jobs/searchresult.ashx'
               '?jl={2}&kw={0}&sm=0&p={1}'.format(kw, page, city))
        response = requests.get(url, headers={
            'User-Agent': random.choice(user_agent)}).content.decode()
        soup = BeautifulSoup(response, 'lxml')
        # The first '.newlist' element is the header row — skip it.
        tables = soup.select('.newlist')[1:]
        if not tables:
            # pages 1..page-1 had content, so page-1 is the page count
            print('总页' + str(page - 1))
            break
        # Open the output file once per page instead of once per row.
        with open('{0}.txt'.format(txt_name), 'a+',
                  encoding='utf-8', errors='ignore') as f:
            for table in tables:
                job = table.select('.zwmc')[0].text
                company = table.select('.gsmc')[0].text
                money = table.select('.zwyx')[0].text
                place = table.select('.gzdd')[0].text
                # detail-page link lives on the anchor inside the title cell
                href = table.select('.zwmc')[0].find('a')['href']
                line = job + '\t' + company + '\t' + money + '\t' + place + '\t' + href + '\n'
                print(line)
                f.write(line)
if __name__ == '__main__':
    # Prompt the user for the search parameters, then run the crawler.
    target_city = input('输入城市')
    keyword = input('输入岗位')
    out_name = input('输入储存文件名')
    get_job_txt(city=target_city, kw=keyword, txt_name=out_name)