Python crawler + Redis: building a usable IP proxy pool with hundreds of thousands of entries

from bs4 import BeautifulSoup
import requests, os, sys, time, random, redis
from lxml import etree

conn = redis.Redis(host='127.0.0.1', port=6379)
def get_ip(page_url, headers, cookies, sui_ji_time):
    """
    Crawl the proxy IPs and assemble them into a usable format
    :param page_url:
    :param headers:
    :param cookies:
    :param sui_ji_time:
    :return:
    """
    print('{}-{}-{}-{} >> {}'.format('program sleep time', sui_ji_time, 'now crawling', page_url, 'data'))
    response = requests.get(page_url, headers=headers, cookies=cookies).text
    json_lxml = etree.HTML(response)
    table = json_lxml.xpath('//*[@id="list"]/table/tbody/tr')
    for i in table:
        html_ip = i.xpath('.//td[1]/text()')[0]
        html_ip_port = i.xpath('.//td[2]/text()')[0]
        html_ip_lei = i.xpath('.//td[4]/text()')[0]
        daili_ip = '{}{}:{}'.format('http://', html_ip, html_ip_port)
        # HTTP and HTTPS proxies are currently tested the same way
        if html_ip_lei == 'HTTP':
            ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei)
        else:
            ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei)
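
The bare `[0]` indexing above raises an IndexError as soon as a table cell has no text node. A minimal defensive sketch of the same extraction; the helper name `first_text` is my own, not from the original:

def first_text(node, path, default=''):
    # return the first text hit of an XPath query, or a default for an empty cell
    vals = node.xpath(path)
    return vals[0].strip() if vals else default

# usage inside the row loop:
# html_ip = first_text(i, './/td[1]/text()')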

def ceshi_ip(headers, cookies, sui_ji_time, daili_ip, html_ip_lei):
    """
    Test whether a crawled IP is usable
    :param headers:
    :param cookies:
    :param sui_ji_time:
    :param daili_ip:
    :param html_ip_lei:
    :return:
    """
    print(daili_ip, '@@@@@@@@@@@@')
    # list1 = []
    try:
        # requests matches proxy keys against the lowercase URL scheme,
        # so 'HTTP'/'HTTPS' from the page must be lowercased here
        requests.get('http://wenshu.court.gov.cn/', proxies={str(html_ip_lei).lower(): daili_ip})
    except:
        print('{} >> {}'.format(daili_ip, 'unavailable'))
    else:
        print('{} >> {}'.format(daili_ip, 'available'))
        """
        Store it in the Redis database
        """
        try:
            conn.sadd('proxy', '{}+{}'.format(html_ip_lei, daili_ip))
            print('{}'.format('stored in redis successfully'))
        except:
            print('{}'.format('redis storage failed'))
        ROOT_DIR = '{}'.format('D:\\web_xiangmu\\biquge_tushu\\Agent')
        # list1.append({str(html_ip_lei): str(daili_ip)})
        if not os.path.exists(ROOT_DIR):
            os.mkdir(ROOT_DIR)
            print('{}'.format('directory created successfully'))
        else:
            print('{}'.format('directory already exists'))
        """
        Also write to a file to prevent loss
        """
        try:
            with open(ROOT_DIR + '\\' + 'daili.text', "a+") as mon:
                mon.write('{}+{}\n'.format(html_ip_lei, daili_ip))
            print('{} >>> {}'.format(daili_ip, 'written successfully'))
        except:
            print('{}'.format('write failed'))
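
The availability probe above has no timeout, so a single dead proxy can stall the crawler indefinitely, and any response that does not raise counts as success. A stricter check might look like the sketch below; the name `check_proxy`, the 5-second timeout, and the status-code test are my own choices, not part of the original:

import requests

def check_proxy(daili_ip, html_ip_lei, test_url='http://wenshu.court.gov.cn/'):
    # a proxy counts as usable only if it answers within 5 seconds with HTTP 200
    try:
        resp = requests.get(test_url,
                            proxies={html_ip_lei.lower(): daili_ip},
                            timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False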




def main():
    """
    Crawl all IPs from the proxy site,
    following the pagination
    :return:
    """
    url = 'https://www.kuaidaili.com/free/inha/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Referer': 'https://www.kuaidaili.com/free/inha/',
    }
    cookies = {
        'Cookie': 'channelid=0; sid=1575640807483263; _ga=GA1.2.757045199.1575642271; _gid=GA1.2.1903168241.1575642271; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1575642272,1575686420; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1575686420',
    }
    response = requests.get(url, headers=headers, cookies=cookies).text
    json_lxml = etree.HTML(response)
    # the ninth <li> of the pager holds the last page number and its href
    ip_page = json_lxml.xpath('//ul/li[9]/a/text()')[0]
    ip_page_href = json_lxml.xpath('//ul/li[9]/a/@href')[0]
    sui_ji_time = random.choice(list_time_sleep)
    for page in range(1, int(ip_page) + 1):
        page_url = '{}/{}/{}/{}'.format('https://www.kuaidaili.com', ''.join(ip_page_href).split('/')[1], ''.join(ip_page_href).split('/')[2], page)
        time.sleep(sui_ji_time)
        get_ip(page_url, headers, cookies, sui_ji_time)

if __name__ == '__main__':
    list_time_sleep = [5, 10, 15]
    zhu_sui_ji_time = random.choice(list_time_sleep)
    print('{}<<{}>>{}'.format('main program random sleep time', zhu_sui_ji_time, 'seconds'))
    time.sleep(zhu_sui_ji_time)
    main()
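
Once the crawler has run, it is worth checking how many proxies actually landed in the set. A quick inspection sketch, assuming the same local Redis instance and the 'proxy' key used above:

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
print('pool size:', conn.scard('proxy'))       # number of stored proxies
for member in conn.srandmember('proxy', 3):    # sample a few entries
    print(member.decode('utf-8'))              # e.g. HTTP+http://1.2.3.4:8080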


"""# connect to the server RedisImport RequestsImport Redis# Import Module
Using proxy operations #




= redis.Redis Conn (Host = '127.0.0.1', Port = 6379)
# random data extracted a proxy
IP = conn.redis.srandmember ( 'Proxy')
ip_add = '' .join (IP) .split ( '+ ')
zhen_ip IP = [. 1]
Print (zhen_ip)
URL =' HTTPS: //www.baidu.com '
Proxies = {
"HTTP": "HTTP: //" + zhen_ip.decode ( "UTF-. 8")
}

# use IP proxy access Baidu, the test is valid proxy address
the try:
the Data = requests.get (url url =, = Proxies Proxies, timeout = 5)
the except:
# proxy address is invalid
delete invalid IP proxy
verification IP proxy is invalid, if invalid proxy address, you can use the following command to remove the agent, so the agent can ensure that our pool of addresses are valid

conn.redis.srem ( 'proxy', 'invalid proxy IP address')

"" "

Origin www.cnblogs.com/duanlinxiao/p/12001618.html