Infi-chu:
http://www.cnblogs.com/Infi-chu/
1. Set up the proxy
1. urllib
# HTTP proxy type: route urllib traffic through an HTTP proxy by installing
# a ProxyHandler into a custom opener.
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

proxy = '127.0.0.1:9743'
# For a proxy that requires authentication, put the username and password
# at the beginning of the address:
# proxy = 'username:password@127.0.0.1:9743'
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = build_opener(proxy_handler)
try:
    # The context manager closes the HTTP response once it has been read.
    with opener.open('http://httpbin.org/get') as res:
        print(res.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

# SOCKS5 proxy type: urllib has no native SOCKS support, so PySocks is used
# to replace the default socket class for the whole process.
import socks  # pip3 install PySocks
import socket
from urllib import request
from urllib.error import URLError

socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
# Every socket created after this line is tunnelled through the SOCKS5 proxy.
socket.socket = socks.socksocket
try:
    with request.urlopen('http://httpbin.org/get') as res:
        print(res.read().decode('utf-8'))
except URLError as e:
    print(e.reason)
2. requests
Configuring a proxy with requests is simpler than with urllib.
# HTTP proxy type: pass a scheme -> proxy URL mapping via `proxies`.
import requests

proxy = '127.0.0.1:9743'
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)

# SOCKS5 proxy type (1): use socks5:// URLs in the `proxies` mapping.
import requests  # pip3 install 'requests[socks]'

proxy = '127.0.0.1:9742'
proxies = {
    'http': 'socks5://' + proxy,
    'https': 'socks5://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)

# SOCKS5 proxy type (2): patch the socket class globally with PySocks.
import requests
import socks
import socket

socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
# Every socket created after this line goes through the SOCKS5 proxy, so no
# `proxies` argument is passed to requests.get() in this variant.
socket.socket = socks.socksocket
try:
    res = requests.get('http://httpbin.org/get')
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
3. Selenium
Set the browser proxy
# Launch Chrome through an unauthenticated HTTP proxy.
from selenium import webdriver

proxy = '127.0.0.1:9743'
chrome_options = webdriver.ChromeOptions()
# The proxy is handed to Chrome as a command-line switch.
chrome_options.add_argument('--proxy-server=http://' + proxy)
# `options=` is the supported keyword; the older `chrome_options=` keyword
# is deprecated and rejected by current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')
Set up a proxy that requires authentication
# Launch Chrome through a proxy that requires a username and password.
# Chrome has no command-line switch for proxy credentials, so a small
# extension is generated on the fly: it sets the proxy and answers the
# authentication challenge.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import zipfile

ip = '127.0.0.1'
port = 9743
username = 'test'
password = 'test'

# Extension manifest: requests the proxy and webRequest permissions that
# background.js needs.
# NOTE(review): this is a Manifest V2 extension; confirm that the targeted
# Chrome version still loads MV2 extensions.
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {"scripts": ["background.js"]}
}
"""

# Background script: configures a fixed proxy server and supplies the
# credentials whenever the proxy asks for authentication. The port is
# substituted as a bare number because the proxy config expects an integer.
background_js = """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: "http",
            host: "%(ip)s",
            port: %(port)s
        }
    }
}

chrome.proxy.settings.set({value: config, scope: "regular"}, function(){});

function callbackFn(details){
    return {
        authCredentials: {
            username: "%(username)s",
            password: "%(password)s"
        }
    }
}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
)
""" % {'ip': ip, 'port': port, 'username': username, 'password': password}

plugin_file = 'proxy_auth_plugin.zip'
with zipfile.ZipFile(plugin_file, 'w') as zp:
    # The manifest must be stored under the exact name "manifest.json",
    # otherwise Chrome cannot load the extension.
    zp.writestr("manifest.json", manifest_json)
    zp.writestr("background.js", background_js)

chrome_options = Options()
chrome_options.add_argument('--start-maximized')
chrome_options.add_extension(plugin_file)
# `options=` is the supported keyword; the older `chrome_options=` keyword
# is deprecated and rejected by current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')
2. Maintaining a proxy pool
A single proxy cannot complete our crawling tasks, so a larger number of proxies is needed to serve us.
We screen the proxies so that the usable ones can serve us efficiently.
1. Preparations
You need a Redis database and the aiohttp, requests, redis-py, pyquery, and Flask libraries.
2. Goals of the proxy pool: a storage module, an acquisition module, a detection module, and an interface module
3. Implementation of each module:
https://github.com/Infi-chu/proxypool
3. Use a proxy to crawl WeChat articles
https://github.com/Infi-chu/weixinspider