Python3 crawlers (5) - Building a random User-Agent pool, building a free random IP pool, and handling common exceptions

When collecting data with Python, you will find that many sites have anti-crawler mechanisms. The most common one blocks continuous collection from the same User-Agent, IP, or Cookie, so we need to build large pools of User-Agents, IPs, or Cookies to avoid being banned. Batch collection also runs into a number of common exceptions.

1. There are many ways to build these pools. Here we store a large number of User-Agents and IPs in CSV files and draw from them at collection time.
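For reference, the code below reads both files with csv.reader and uses the first column of User-Agent.csv and the first two columns of proxies.csv, so the assumed layout is roughly the following (the User-Agent strings and addresses here are placeholders, not values from the original files):

User-Agent.csv, one User-Agent string per row:

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15

proxies.csv, scheme in the first column and host:port in the second:

http,123.123.123.123:8080
https,124.124.124.124:8888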

2. What do we do when a proxied request fails?

Free proxy IPs are often already expired, in which case the request raises an error: requests.exceptions.ProxyError: .....

The IP pool holds many IPs, and some of them no longer work. When this error occurs, we simply retry the request with a different IP from the pool.
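In code, the fix is just a retry loop: catch ProxyError, draw another proxy, and send the request again. A minimal, self-contained sketch of the pattern (the pool contents are placeholder addresses; the full code below draws from the CSV pool instead):

import random
import requests
from requests.exceptions import ProxyError

proxy_pool = [{"http": "http://123.123.123.123:8080"},  # placeholder addresses,
              {"http": "http://124.124.124.124:8888"}]  # not real proxies

def get_with_retry(url, headers):
    while True:
        proxies = random.choice(proxy_pool)  # draw a random proxy from the pool
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=30)
        except ProxyError:
            print("dead proxy:", proxies)  # log the bad IP, then retry with another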

Request timeouts, URL parsing errors, and pages that do not exist are also frequent problems.
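The "page does not exist" case only surfaces indirectly in the code below: the XPath lookup returns an empty list and indexing it raises IndexError. A more direct alternative (not what the original code does) is to check the status code yourself:

import requests

r = requests.get("https://blog.csdn.net/weixin_41685388/category_9426224.html", timeout=30)
r.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx responses such as 404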

Each of these cases has a corresponding solution in the code below. The code is of course not perfect; once you understand it, optimize it further for your own development work.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from lxml import etree
from requests.exceptions import Timeout, ProxyError  # Timeout covers both connect and read timeouts
import random
import csv
import time

def User_Agent_and_proxies():
    global Headers, proxies  # make both available as globals
    Headers = {}  # build the random User-Agent pool
    User_Agent = open(r'User-Agent.csv', 'r+', encoding='UTF-8')  # open User-Agent.csv
    User_Agent = csv.reader(User_Agent)  # read the file
    User_Agent = random.choice(list(User_Agent))  # pick one row at random
    #print(User_Agent)
    Headers["User-Agent"] = User_Agent[0]  # add it to Headers as a dict entry
    #Headers["Cookie"] = r'select_city=440100; lianjia_uuid=4e6702fa-9afb-46d4-8fc6-7347b13e3c84; UM_distinctid=16fbee2b473343-074dcfa2576633-3764460c-100200-16fbee2b4742b2; _jzqckmp=1; sajssdk_2015_cross_new_user=1; _jzqy=1.1579455526.1579504146.1.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6%E7%BD%91.-; _gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga=GA1.2.1214289470.1579455528; _gid=GA1.2.220392387.1579455528; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1579455543,1579504156; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1579509996; _smt_uid=5e249426.3a588f5d; _jzqa=1.4109641768103788000.1579455526.1579504146.1579508397.3; _jzqc=1; _jzqb=1.15.10.1579508397.1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fbee2b651359-0f82d9e0f4de28-3764460c-1049088-16fbee2b652558%22%2C%22%24device_id%22%3A%2216fbee2b651359-0f82d9e0f4de28-3764460c-1049088-16fbee2b652558%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22sousuo%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; lianjia_ssid=ed2d58d1-9ed6-4ede-8922-7aba54cbaad3'
    #print(Headers)

    proxies = {}  # build the free random IP pool
    pro = open(r'proxies.csv', 'r+', encoding='UTF-8')  # open proxies.csv
    pro = csv.reader(pro)  # read the file
    pro = random.choice(list(pro))  # pick one row at random
    #print(pro)
    proxies[pro[0]] = pro[0] + r"://" + pro[1]  # add it to proxies as a dict entry, e.g. {"http": "http://ip:port"}
    #print(proxies)

def response():  # wrap the request in a function
    r = requests.get(url, headers=Headers, proxies=proxies, timeout=30)  # send the request
    r.encoding = "utf-8"  # set the response encoding
    #print(r.text)
    html = r.text  # the response body as HTML text
    #print(html)
    html = etree.HTML(html)  # initialize the parser with etree.HTML
    title = html.xpath('//*[@id="column"]/div[1]/div/div[2]/div[1]/div/div[1]/h3/text()')  # locate the title
    data = html.xpath('//*[@id="column"]/div[1]/div/div[2]/div[2]/span//text()')  # locate the stats
    print(title[0], 'followers: %s' % data[2], 'articles: %s' % data[5], 'views: %s' % data[8])  # print the results

# url = "https://blog.csdn.net/weixin_41685388/category_9426224.html"  # URL to request
# url = "https://blog.csdn.net/weixin_41685388/category_9598997.html"
for i_url in [9426224, 9598997, "adsfdddd"]:
    url = r'https://blog.csdn.net/weixin_41685388/category_' + str(i_url) + r'.html'  # build the request URL
    time.sleep(3)  # wait 3 seconds before each request
    i = 0
    while True:  # retry loop for exception handling
        try:  # runs when nothing goes wrong
            User_Agent_and_proxies()
            response()
        except ProxyError:  # bad proxy IP: draw another and retry
            print(proxies)  # log the failing IP for later handling
            continue
        except Timeout:  # request timed out
            i += 1
            if i < 2:  # allow one retry after a timeout
                continue
            else:
                print("URL dropped after timeout:", url)  # record URLs that finally failed on timeout
                break
        except IndexError:  # other errors, e.g. 404 page or parse failure
            print("error url:", url)
            break
        break  # success: leave the retry loop
'''
Results:
python followers: 10 articles: 24 views: 65730
Informatica followers: 15 articles: 13 views: 6400
error url: https://blog.csdn.net/weixin_41685388/category_adsfdddd.html
'''
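As noted above, there is room to optimize. One easy improvement, for example, is to probe each free proxy before it is ever used in a real request, so dead IPs are filtered out up front. A sketch (the probe URL is an arbitrary choice, not from the original post):

import requests

def proxy_alive(proxies, timeout=5):
    # Hypothetical helper: return True if a test request through the proxy succeeds.
    try:
        requests.get("https://www.baidu.com", proxies=proxies, timeout=timeout)
        return True
    except requests.exceptions.RequestException:  # ProxyError, Timeout, etc.
        return False

# usage: keep only working entries when loading the pool
# pool = [p for p in pool if proxy_alive(p)]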
