05. Python requests IP proxies

1. Problem

When a crawler's request rate per unit time exceeds the limit set by the server, the server may block our local IP.

2. What a proxy does

  • Work around access restrictions placed on your own IP
  • Hide your real IP

3. Proxy provider websites

  • Kuaidaili
  • Xici Proxy
  • www.goubanjia.com

4. Types of proxy IP

  • http: used for URLs served over the http protocol
  • https: used for URLs served over the https protocol (a short example follows this list)
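
In practice, requests picks the pool entry whose key matches the target URL's scheme. A minimal sketch; the proxy addresses below are placeholders, not live proxies:

import requests

proxies = {
    'http': 'http://10.10.1.10:3128',   # used for http:// URLs
    'https': 'http://10.10.1.10:1080',  # used for https:// URLs
}
# An https:// URL is routed through the 'https' entry.
response = requests.get('https://example.com', proxies=proxies, timeout=5)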

5. Anonymity levels of proxy IPs

  • Transparent: the server knows the request uses a proxy and also knows the real IP behind it
  • Anonymous: the server knows a proxy is used, but not the real IP
  • High anonymity (elite): the server knows neither that a proxy is used nor the real IP (a quick check is sketched below)
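
One way to see what the target server observes is to send a request through the proxy to an echo service and inspect the headers it received: transparent proxies typically leak your address in X-Forwarded-For, while high-anonymity proxies add nothing. A rough sketch; the proxy address is a placeholder and httpbin.org is assumed reachable:

import requests

# Placeholder proxy; replace with a live one before running.
proxy = {'http': 'http://10.10.1.10:3128'}

# httpbin echoes back the headers it received, so leaked headers
# such as X-Forwarded-For or Via become visible in the response.
response = requests.get('http://httpbin.org/headers', proxies=proxy, timeout=5)
print(response.json())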

6. requests proxy methods

1. Single-IP proxy mode

import requests
# Note: the proxies dict keys must be lowercase scheme names ('http'/'https'),
# and the value should include the proxy's own scheme prefix.
proxy = {
    'https': 'http://162.105.30.101:8080'
}
url = 'URL to crawl'  # placeholder: the page you want to fetch
response = requests.get(url, proxies=proxy)
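
Free proxies go stale quickly, so it is worth verifying one before a real crawl. A hedged sketch using httpbin.org/ip (an echo endpoint that returns the IP it saw) with a timeout and error handling:

import requests

proxy = {'https': 'http://162.105.30.101:8080'}

try:
    # If the proxy works, the echoed IP is the proxy's, not yours.
    r = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=5)
    print(r.json())
except requests.RequestException as e:
    print('Proxy failed:', e)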

2. Multi-IP proxy mode

import requests
# import random to pick a proxy from the pool at random
import random

proxy = [
    {
        'http': 'http://61.135.217.7:80',
        'https': 'http://61.135.217.7:80',
    },
    {
        'http': 'http://118.114.77.47:8080',
        'https': 'http://118.114.77.47:8080',
    },
    {
        'http': 'http://112.114.31.177:808',
        'https': 'http://112.114.31.177:808',
    },
    {
        'http': 'http://183.159.92.117:18118',
        'https': 'http://183.159.92.117:18118',
    },
    {
        'http': 'http://110.73.10.186:8123',
        'https': 'http://110.73.10.186:8123',
    },
]
url = 'URL to crawl'  # placeholder: the page you want to fetch
response = requests.get(url, proxies=random.choice(proxy))
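
Entries in a free pool fail often, so a common pattern is to retry the request with a different random proxy until one succeeds. A minimal sketch, reusing the pool above (fetch_with_retry is an illustrative helper, not part of the original script):

import random
import requests

def fetch_with_retry(url, pool, attempts=3):
    # Try up to `attempts` random proxies before giving up.
    for _ in range(attempts):
        try:
            return requests.get(url, proxies=random.choice(pool), timeout=5)
        except requests.RequestException:
            continue  # this proxy failed; try another
    raise RuntimeError('all proxy attempts failed')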

3. Simple Zhilian recruitment crawler, wrapped as a function

import requests
from bs4 import BeautifulSoup
import ssl
import time
import random

# Disable HTTPS certificate verification globally (a quick hack for sites with certificate issues)
ssl._create_default_https_context = ssl._create_unverified_context

user_agent = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
'''
If the proxy errors out, swap in the proxy pool below; note that free
pool entries expire and need to be refreshed.
'''
# proxy = [
#     {
#         'http': 'http://61.135.217.7:80',
#         'https': 'http://61.135.217.7:80',
#     },
#     {
#         'http': 'http://118.114.77.47:8080',
#         'https': 'http://118.114.77.47:8080',
#     },
#     {
#         'http': 'http://112.114.31.177:808',
#         'https': 'http://112.114.31.177:808',
#     },
#     {
#         'http': 'http://183.159.92.117:18118',
#         'https': 'http://183.159.92.117:18118',
#     },
#     {
#         'http': 'http://110.73.10.186:8123',
#         'https': 'http://110.73.10.186:8123',
#     },
# ]

def get_job_txt(city, kw, txt_name):
    for i in range(100):
        time.sleep(2)  # throttle requests to stay under the rate limit
        url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl={2}&kw={0}&sm=0&p={1}'.format(kw, i, city)

        response = requests.get(url, headers={
            'User-Agent': random.choice(user_agent)}).content.decode()

        soup = BeautifulSoup(response, 'lxml')
        tables = soup.select('.newlist')[1:]  # skip the header row
        if tables:
            for table in tables:
                job = table.select('.zwmc')[0].text      # job title
                company = table.select('.gsmc')[0].text  # company name
                money = table.select('.zwyx')[0].text    # salary
                place = table.select('.gzdd')[0].text    # location
                href = table.select('.zwmc')[0].find('a')['href']  # detail link
                print(job + '\t' + company + '\t' + money + '\t' + place + '\t' + href + '\n')
                with open('{0}.txt'.format(txt_name), 'a+', encoding='utf-8', errors='ignore') as f:
                    f.write(job + '\t' + company + '\t' + money + '\t' + place + '\t' + href + '\n')
        else:
            print('Total pages: ' + str(i))
            break


if __name__ == '__main__':
    city = input('Enter city: ')
    kw = input('Enter job keyword: ')
    txt_name = input('Enter output file name: ')
    get_job_txt(city=city, kw=kw, txt_name=txt_name)
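
As written, the script defines the commented-out proxy pool but never passes it to requests.get. If the target starts blocking your IP, one way to wire the pool in, assuming the proxy list above is uncommented, is to replace the request inside get_job_txt with something like:

# Inside get_job_txt, route each page fetch through a random pool entry:
response = requests.get(
    url,
    headers={'User-Agent': random.choice(user_agent)},
    proxies=random.choice(proxy),  # assumes the `proxy` pool above is uncommented
    timeout=10,
).content.decode()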

Origin: blog.csdn.net/qq_40837794/article/details/109666341