Get IP Proxy(代理设置,代理池的搭建,用代理获取微信公众号文章)

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:[email protected]
@file:ipFreeProxyEn.py
@time: 10/1/2019 12:13 PM
"""

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:[email protected]
@file:ipFreeProxyDotCZ.py
@time: 9/13/2019 3:10 AM
"""

#Under the Anaconda Prompt
#pip install fake-useragent
from fake_useragent import UserAgent
from selenium import webdriver
from lxml import etree
import pandas as pd
import re
import numpy as np
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import json
import pprint  # for print dict structure
import copy
import telnetlib
import requests
from selenium.webdriver.support.select import Select
from time import sleep
import urllib.request

domainList=["http://free-proxy.cz"]
domain = domainList[0]
base="/en/proxylist/main/1"
#print(domain+base)

#####responding to anti-crawler
userAgent=UserAgent()
#print(userAgent.random)
#Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17
#The actual userAgent
# (right click your mouse then select Inspection
# and then you can find it from Headers by clicking any elementName Under the Name in Network )
header = {'UserAgent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
          'Connection':'close'#countryNameRaw
         }
#print(header['UserAgent'])
#Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36

options = webdriver.ChromeOptions()
options.add_argument("'user-agent=" + header['UserAgent'] + "'")
options.add_argument('--diable-gpu') # google document mentioned this attribute can avoid some bugs
options.add_experimental_option('excludeSwitches',['enable-automation'])
#set the browser as developer model, prevent the website identifying that you are using Selenium
#browser = webdriver.Chrome(executable_path='D:/chromedriver/chromedriver',
#                           chrome_options=options)
#run command("window.navigator.webdriver")in the Console of the inspection
#result: undefine  # means: regular browser
class GetIp():
    def __init__(self, domain=domain, baseUrl=base):
        self.domain = domain
        self.url = self.domain + str(baseUrl)
        self.browser = webdriver.Chrome(executable_path='D:/chromedriver/chromedriver',
                                        chrome_options=options
                                        )
        from selenium.webdriver.support.ui import WebDriverWait
        self.wait = WebDriverWait(self.browser, 2)

    def __del__(self):

        # self.browser.close()  # close current browser's window or tab
        self.browser.quit()  # close browser

    def get_webpagecontent(self):
        print(self.url)

        # import requests
        # Python requests appear "HTTPConnectionPool Max retires exceeded"
        # requests.session().keep_alive=False
        # requests.adapters.DEFAULT_RETRIES = 5

        self.browser.keep_alive = False
        self.browser.DEFAULT_RETRIES = 5

        self.browser.get(self.url)

        # from selenium.webdriver.support import expected_conditions as EC
        # from selenium.webdriver.common.by import By
        self.wait.until(EC.presence_of_element_located((By.ID, "proxy_list")))
        pageSource = self.browser.page_source
        return pageSource

    def parse_webData(self, pageSource):
        # get ip Data and save it into a dataframe
        df = pd.read_html(pageSource, header=0)[1]
        # print(df)
        # import re
        selector = etree.HTML(pageSource)
        countryDict = {}
        # locate to the selection box on webpage
        time.sleep(.5)
        countryNameRaw = selector.xpath('//select/option/text()')
        # get country name's Abbreviation
        countryAbbreviations = selector.xpath('//select/option/@value')

        # create a dictionary{countryName:countryAbbreviations}
        for i in range(2, len(countryNameRaw)):
            # get country name
            countryName = re.findall('[\u4e00-\u9fa5_a-zA-Z]+', countryNameRaw[i])[0]
            # create a dictionary{countryName:countryAbbreviations}
            countryDict[countryName] = countryAbbreviations[i]

        for r in range(df.shape[0]):
            # get Ip Address
            ipAddress = re.findall(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', df.loc[r, 'IP address'])

            if len(ipAddress) == 0:
                df.drop([r], inplace=True)  # drop the row in which there not exist ip data
            else:
                df.loc[r, 'IP address'] = ipAddress[0]
                # get Port
                port = re.findall(r'\b[0-9]+\b', str(df.loc[r, 'Port']))[0]
                df.loc[r, 'Port'] = port

                if df.loc[r, 'Country'] in countryDict.keys():
                    # countryName, countryAbbreviations      #United States, US
                    df.loc[r, 'Country'] = df.loc[r, 'Country'] + ", " + countryDict[df.loc[r, 'Country']]

                # import numpy as np
                if pd.isnull(df.loc[r, 'Region']) and pd.isnull(df.loc[r, 'City']):
                    # df.loc[r,'Region']=np.nan#pd.NaT
                    # df.loc[r,'City']=np.nan#pd.NaT
                    df.loc[r, 'Location'] = np.nan
                elif pd.isnull(df.loc[r, 'Region']) and pd.isnull(df.loc[r, 'City']) == False:
                    df.loc[r, 'Location'] = df.loc[r, 'City']
                elif pd.isnull(df.loc[r, 'Region']) == False and pd.isnull(df.loc[r, 'City']):
                    df.loc[r, 'Location'] = df.loc[r, 'Region']
                else:
                    df.loc[r, 'Location'] = df.loc[r, 'Region'] + ", " + df.loc[r, 'City']

        df.drop(['Region', 'City'], axis=1, inplace=True)

        df.columns = ['IP Address', 'Port', 'Protocol', 'Country', 'Anonymity',
                      'Speed', 'Available', 'Response Time', 'Update', 'Location']
        return df, countryDict  # main page ip data and country Dict

    def crawler(self, pageNumber=0):  # get first 3 pages' data
        # current page
        # pageNumber = 0

        pageSource = self.get_webpagecontent()
        ipDataFrame, countryDict = self.parse_webData(pageSource)
        # print(ipDataFrame)

        hasNextPage = False
        # from lxml import etree
        selector = etree.HTML(pageSource)
        try:
            time.sleep(.5)
            # nextPageUrl = selector.xpath('//div[@class="pagination"]/a[contains(text(),"Next »")]/@href')[0]
            nextPageUrl = selector.xpath('//div[@class="paginator"]/a[last()]/@href')[0]
            if nextPageUrl:
                hasNextPage = True
        except:
            # print("No next page!")
            return ipDataFrame

        while hasNextPage and pageNumber < 2:
            self.url = self.domain + nextPageUrl
            pageSource = self.get_webpagecontent()
            pageNumber = pageNumber + 1
            ###
            currentIpDataFrame, countryDict = self.parse_webData(pageSource)
            # combine two dataframe
            ipDataFrame = pd.concat([ipDataFrame, currentIpDataFrame])

            hasNextPage = False
            selector = etree.HTML(pageSource)
            try:
                # nextPageUrl = selector.xpath('//div[@class="pagination"]/a[contains(text(),"Next »")]/@href')[0]
                nextPageUrl = selector.xpath('//div[@class="paginator"]/a[last()]/@href')[0]
                if nextPageUrl:
                    hasNextPage = True
            except:
                break

        return ipDataFrame

    def execute2(self):  # get at most (first) 5 pages's data for each value on selection('country')
        pageSource = self.get_webpagecontent()
        self.browser.implicitly_wait(2)

        selector = etree.HTML(pageSource)
        # get all values belonging to select tag
        countryAbbreviations = selector.xpath('//select/option/@value')
        # print(countryAbbreviations)

        #from selenium.webdriver.support.select import Select
        # from selenium.webdriver.common.action_chains import ActionChains #import mouse action event package
        #from time import sleep

        selectionBox = self.browser.find_element_by_id('frmsearchFilter-country')

        # Select(selectionBox).select_by_value("BT")
        # Select(selectionBox).select_by_value("MO")
        # ipCrawler.browser.find_element_by_name("send").click()

        counterCountry = 0
        ipDataFrame = None  # create a empty dataframe
        for value in countryAbbreviations:
            if value != 'all' and counterCountry < 5:
                Select(selectionBox).select_by_value(value)
                self.browser.find_element_by_name("send").click()

                self.url = self.browser.current_url  # reset url since each page's url is different
                if ipDataFrame is None:
                    ipDataFrame = self.crawler()
                else:
                    ipDataFrame = ipDataFrame.append(self.crawler(), ignore_index=True)
                sleep(2)
                counterCountry = counterCountry + 1

                # since each page's select tag is a instance
                selectionBox = self.browser.find_element_by_id('frmsearchFilter-country')
        self.browser.quit()  # close browser
        return ipDataFrame

    def execute(self, country='all', protocol='all',
                anonymity='all'):  # get at most (first) 5 pages's data for each value on selection('country')
        pageSource = self.get_webpagecontent()
        self.browser.implicitly_wait(2)

        selector = etree.HTML(pageSource)
        # get all values belonging to select tag
        countryAbbreviations = selector.xpath('//select/option/@value')
        # print(countryAbbreviations)

        from selenium.webdriver.support.select import Select
        # from selenium.webdriver.common.action_chains import ActionChains #import mouse action event package
        import time
        time.sleep(.5)

        selectionBox = self.browser.find_element_by_id('frmsearchFilter-country')
        Select(selectionBox).select_by_value(country)

        protocolButton = self.browser.find_element_by_xpath('//input[@value="{}"]'.format(protocol))
        protocolButton.click()

        anonymityButton = self.browser.find_element_by_xpath('//input[@value="{}"]'.format(anonymity))
        anonymityButton.click()
        # Filter proxies
        self.browser.find_element_by_name("send").click()

        self.url = self.browser.current_url  # reset url since each page's url is different
        ipDataFrame = self.crawler()

        self.browser.quit()  # close browser
        return ipDataFrame

    def ipData_to_file(self, fileName, ipDataFrame):
        import json
        import sys
        # set the column "IP Address" as the index of dataframe
        # and I will use this index as key of a dict
        ipDataFrame.set_index('IP Address', inplace=True)

        # convert dataframe to json string            #since the dataframe is not using ascii
        ipJsonStr = ipDataFrame.to_json(orient='index', force_ascii=False)  # type: str
        # convert json string to json dict
        ipJsonDict = json.loads(ipJsonStr)  # type: dict
        # save dict to a file            #sys.stdout.encoding=='utf-8' since my system using utf-8
        with open(fileName, 'w', encoding=sys.stdout.encoding) as f:
            ipDataDF = json.dump(ipJsonDict, f, ensure_ascii=False)
            f.write('\n')

    def getIpData_from_file(self, fileName):
        with open(fileName, 'r', encoding='utf-8') as f:
            ipJsonDict = json.load(f)
        return ipJsonDict

    def getSpecifedIpList_from_nestedJsonDict(self, jsonDict, innerKey, innerValue):
        return [k for k, v in jsonDict.items() if v[innerKey] == innerValue]

ipCrawler=GetIp(domain,base)
#level1:High  #level2:Anonymous #level3:Transparent


ipDataFrame=ipCrawler.execute(country='US',protocol='http',anonymity='level1')
ipDataFrame

Write ip data to file

ipCrawler.ipData_to_file('ipDataFrame.txt',ipDataFrame)

Read ip data from specified file and save them in a nested dict

ipDataDict=ipCrawler.getIpData_from_file('ipDataFrame.txt')
ipDataDict

import pprint #for print dict structure
pprint.pprint(ipDataDict)

get the detail by a specifed ip addresss

ipDataDict['104.152.45.45']  #get ip detatil by a specifed ip addresss

get an ip list by specifed location

ipList=ipCrawler.getSpecifedIpList_from_nestedJsonDict(ipDataDict,'Location','California, Los Angeles')
ipList

Convert the nested dict to a dataframe

import pandas as pd
dataframe = pd.DataFrame(ipDataDict)
dataframe

gives a name to the row which the columnName on

dataframe.columns.name="IP Address"
dataframe

current dataframe does transpose

import copy
ipDataframe=copy.deepcopy(dataframe.T)
ipDataframe[:5]

extract the numerical value and rename the columnName of dataframe

for ip in ipDataframe.index:
    speed = re.findall( '[0-9]+', str(ipDataframe.loc[ip,'Speed']) )
    if len(speed)!=0:
        ipDataframe.loc[ip,'Speed'] = int(speed[0])
    else:     
        ipDataframe.loc[ip,'Speed']=np.nan

    response = re.findall( '[0-9]+', str(ipDataframe.loc[ip,'Response Time']) )
    if len(response)!=0:
        ipDataframe.loc[ip,'Response Time'] = int(response[0])
    else:     
        ipDataframe.loc[ip,'Response Time']=np.nan
        
    update = re.findall( '[0-9]+', str(ipDataframe.loc[ip,'Update']) )
    if len(update)!=0:
        ipDataframe.loc[ip,'Update'] = int(update[0])
    else:     
        ipDataframe.loc[ip,'Update']=np.nan    
ipDataframe.columns= [ 'Anonymity','Available', 'Country', 'Location', 'Port', 'Protocol',
                       'Response(ms)','Speed(KB/s)', 'Update(hours ago)']
columnOrder= [ 'Port','Protocol','Anonymity','Available', 'Country', 'Location',
                       'Response(ms)','Speed(KB/s)', 'Update(hours ago)']
ipDataframe=ipDataframe[columnOrder]

ipDataframe[:10]

sort the dataframe

ipDataframe=ipDataframe.sort_values(by=['Speed(KB/s)'],ascending=False)
ipDataframe=ipDataframe.sort_values(by=['Response(ms)','Update(hours ago)'],ascending=True)

ipDataframe[:5]

convert the first column(or index column) to a regular column

ipDataFrame = ipDataframe.reset_index()
ipDataFrame[:10]

test ip dataframe for selecting an ip list without using your computer's used port

availableIPportDF= pd.DataFrame(columns=ipDataFrame.columns.values)  # create a empty dataframe with columnNames
#availableIPportDF.colulmns=ipDataFrame.columns
availableIPportDF

import telnetlib
ifValid=False
for r in ipDataFrame.index:
    try:
        telnetlib.Telnet(ipDataFrame.at[r,'IP Address'], ipDataFrame.at[r,'Port'], timeout=2)
        print(ipDataFrame.at[r,'IP Address']+ ": "+ipDataFrame.at[r,'Port'])
        availableIPportDF=availableIPportDF.append(ipDataFrame.iloc[r])
    except:
        ifValid=False

        
availableIPportDF 

availIPportDF=availableIPportDF.reset_index(drop=True)
availIPportDF

import requests

avail2IPportDF= pd.DataFrame(columns=ipDataFrame.columns.values)  # create a empty dataframe with columnNames

header = {'UserAgent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
          'Connection':'close'#'keep-alive'
         }
proxy={} #dict
for r in availIPportDF.index:
    proxy[availIPportDF.at[r,'Protocol'].lower()] = availIPportDF.at[r,'Protocol'].lower() + "://" +\
                                                    availIPportDF.at[r,'IP Address'] + ":" +\
                                                    availIPportDF.at[r,'Port']
    attempts=0
    success=False
    while attempts <1 and success==False: 
        try:#Note: try is not just one time
            attempts+=1
            page = requests.get('http://icanhazip.com', headers=header, proxies=proxy)
            
            #from lxml import etree
            #page.text='165.22.45.183\n'
            #currentIp=165.22.45.183
            currentIP=re.findall(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b',page.text)[0]
            if currentIP == availIPportDF.at[r,'IP Address'] :
                success=True
                #print("current IP: "+currentIP)

                #print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        except:#Maximum number of open connections reached.
                #OR
                #<title>ERROR:The requested URL could not be retrieved</title>
                #OR
                #ERR_PROXY_CONNECTION_FAILED
            success=False
            attempts+=1
            #print("current IP: "+currentIP+" fail!")
            #current IP: 191.96.42.82 has been attended to the avail2IPportDF
            #current IP: 191.96.42.82 fail!
            #[~]drop the current IP if the current IP in the new dataframe
            avail2IPportDF=avail2IPportDF[~avail2IPportDF['IP Address'].isin([currentIP])]
                
            #print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            break
        else:
            avail2IPportDF=avail2IPportDF.append(availIPportDF.loc[r])
            #avail2IPportDF.append(availIPportDF.loc[0])
            #go to use next ip
        sleep(2)

avail2IPportDF

#use website https://www.ipip.net/ip.html to test ip again since 'Hign anonymity'#########
#and


#import urllib.request
#from lxml import etree

import urllib.request
from lxml import etree

domain='https://en.ipip.net/ip/'

avail2IPportDF=avail2IPportDF.reset_index(drop=True)

for index in avail2IPportDF.index:
    currentIP = avail2IPportDF.at[index,'IP Address']
    currentPort = avail2IPportDF.at[index,'Port']
    currentProtocol = avail2IPportDF.at[index,'Protocol']
    
    proxyStr='{}:{}'.format(currentIP,currentPort)
    proxyDict={}
    proxyDict[currentProtocol]=proxyStr
    proxy_support = urllib.request.ProxyHandler(proxyDict)
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')]

    #req = urllib.request.Request(url)
    #response = opener.open(req)
    #OR
    urllib.request.install_opener(opener)
    response = urllib.request.urlopen(domain + currentIP + '.html')

    html = response.read().decode('utf-8')
    from lxml import etree

    selector = etree.HTML(html)
    ip = selector.xpath('//a[contains(@style,"color: #000;")]/text()')[0]
    location = selector.xpath('//td[contains(text(),"Location")]/../td[2]/span/text()')[0]
    
    if(ip==currentIP):
        print(currentIP + ":" + avail2IPportDF.at[index,'Country'] + ',' + avail2IPportDF.at[index,'Location'])
        print("              "+location)

    else:
        print('False')

Besides,

I find that the result(location) in some ip address is different with IP location in my dataset/dataframe

Besides, using different ip locator(such as baidu.com) will display another different ip address(112.213.104.1) and location after setting up ip(165.22.235.89) proxy

#####################################西刺代理#####################################

#Under the Anaconda Prompt
#pip install fake-useragent
from fake_useragent import UserAgent
from selenium import webdriver

urlList=["https://www.xicidaili.com/nn"]

url = urlList[0]

header={'UserAgent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/75.0.3770.100 Safari/537.36"}

options = webdriver.ChromeOptions()
options.add_argument("'user-agent=" + header['UserAgent'] + "'")
options.add_argument('--diable-gpu') # 谷歌文档提到需要加上这个属性来规避bug
options.add_experimental_option('excludeSwitches', ['enable-automation'])

browser = webdriver.Chrome(executable_path='D:/chromedriver/chromedriver',
                           chrome_options=options
                          )
browser.get(url)
pageSource=browser.page_source
print(pageSource)            

import pandas as pd

df1 = pd.read_html(pageSource, header=0)[0]
df1[:5]

df1.columns = ['Country', 'IP Address', 'Port', 'Server Address', 'Anonymous', 
               'Type', 'Speed', 'Response Time', 'Available','Update']
df1[:5]

from lxml import etree
selector = etree.HTML(pageSource)

for r in range(df1.shape[0]):
    #for c in range(df1.shape[1]):
        if pd.isnull(df1.loc[r,'Country']):
            #xpath cost time, so we need to run several time
            try:
                cn=selector.xpath('//table[@id="ip_list"]/tbody/tr[{0}]/td[@class="country"]/img/@alt'.format(r+2))[0]
            #print(cn) #Cn
            except:
                print("run again")
            df1.loc[r,'Country']=countryDict[cn.lower()]
            
        if df1.loc[r,'Anonymous'] in anonymousDict.keys():
            df1.loc[r,'Anonymous'] = anonymousDict[ df1.loc[r,'Anonymous'] ]
        try:
            speed = selector.xpath('//table[@id="ip_list"]/tbody/tr[{0}]/td[@class="country"]/div/@title'.format(r+2))[0]
            df1.loc[r,"Speed"]=speed
        except:
            print("speed")
        
        try:
            res = selector.xpath('//table[@id="ip_list"]/tbody/tr[{0}]/td[@class="country"]/div/@title'.format(r+2))[1]
            df1.loc[r,"Response Time"] = res
        except:
            print("response")
df1.head()



from lxml import etree
selector = etree.HTML(pageSource)

data = selector.xpath('//table[@id="ip_list"]/tbody/tr[1]/th[@class="country"]/text()')
data

data = selector.xpath('//table[@id="ip_list"]/tbody/tr[2]/td[@class="country"]/img/@alt')[0]
data

data = selector.xpath('//table[@id="ip_list"]/tbody/tr[2]/td[@class="country"]/div/@title')[0]
data

data = selector.xpath('//table[@id="ip_list"]/tbody/tr[2]/td[contains(text(),"222.89.32.140")]/../td[@class="country"]/div/@title')[0]
data

#############################Class##################################

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:[email protected]
@file:ipProxy.py
@time: 9/7/2019 3:04 AM
"""

#https://www.cnblogs.com/Micro0623/p/10905193.html
#https://www.cnblogs.com/jackadam/p/9293569.html
#http://www.mamicode.com/info-detail-2694399.html
#https://www.cppentry.com/bencandy.php?fid=77&id=213146
#https://sites.google.com/a/chromium.org/chromedriver/capabilities
#https://blog.csdn.net/zwq912318834/article/details/78933910
#https://github.com/stormdony/python_demo/tree/master/TaoBao_Login

#Under the Anaconda Prompt
#pip install fake-useragent
from fake_useragent import UserAgent
from selenium import webdriver
from lxml import etree
import pandas as pd

domainList=["https://www.xicidaili.com"]
domain = domainList[0]
url = '/nn/3818'
###########responding to anti-crawler
#####UserAgent
userAgent = UserAgent()#User agent, or ua, is a special string header that enables the server
                        # to identify the operating system and version, cpu type,
                        # browser and version, browser rendering engine, browser language,
                        # browser plug-in, etc. used by the client.

#print(userAgent.ie) #Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64;
                        # Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8
#print(userAgent.firefox)
            #Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0
#print(userAgent.chrome) #Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36
                            # (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
#print(userAgent.random) #Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)
                            # Chrome/27.0.1453.90 Safari/537.36


###########responding to anti-crawler
#actual User-Agent
# (right click your mouse then select Inspection
# and then you can find it from Headers by clicking any elementName Under Name in Network )
header={'UserAgent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/75.0.3770.100 Safari/537.36"}
#print(header['UserAgent']) #check


#from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("'user-agent=" + header['UserAgent'] + "'")
options.add_argument('--diable-gpu') # 谷歌文档提到需要加上这个属性来规避bug
#options.add_argument('disable-infobars')# 去掉提示:Chrome正收到自动测试软件的控制

#browser = webdriver.Chrome(executable_path='D:/chromedriver/chromedriver')
                           #chrome_options=options)
#if I run command("window.navigator.webdriver")in the Console of the inspection
#result: true  # means: Selenium + webdriver
#solution:
options.add_experimental_option('excludeSwitches', ['enable-automation'])
#此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium


#browser = webdriver.Chrome(executable_path='D:/chromedriver/chromedriver',
#                           chrome_options=options)
#run command("window.navigator.webdriver")in the Console of the inspection
#result: undefine  # means: regular browser

#browser.get(url)
#print(browser.page_source)

class GetIp():
    def __init__(self,domain,baseUrl):
        self.domain=domain
        self.url=self.domain+str(baseUrl)

        self.browser = webdriver.Chrome(executable_path='D:/chromedriver/chromedriver',
                                        chrome_options=options
                                       )

    def __del__(self):
        self.browser.close()

    def get_webpagecontent(self):
        print(self.url)
        self.browser.get(self.url)
        pageSource = self.browser.page_source
        return pageSource

    def parse_webData(self, pageSource):
        #import pandas as pd                  #0: first row in dataframe
        df1 = pd.read_html(pageSource, header=0)[0]
        #Converting the chinese columnName to english name
        df1.columns = ['Country', 'IP Address', 'Port', 'Server Address', 'Anonymous',
                       'Type', 'Speed', 'Response Time', 'Available', 'Update']
        #from lxml import etree
        selector = etree.HTML(pageSource)

        countryDict = {'cn': 'China'}
        anonymousDict = {'高匿': 'Highly'}

        for r in range(df1.shape[0]):
            # for c in range(df1.shape[1]):
            if pd.isnull(df1.loc[r, 'Country']):
                # xpath will cost time, so we need to run several time
                try:
                    cn = \
                    selector.xpath('//table[@id="ip_list"]/tbody/tr[{0}]/td[@class="country"]/img/@alt'.format(r + 2))[
                        0]
                # print(cn) #Cn
                except:
                    print("run again")
                df1.loc[r, 'Country'] = countryDict[cn.lower()]

            if df1.loc[r, 'Anonymous'] in anonymousDict.keys():
                df1.loc[r, 'Anonymous'] = anonymousDict[df1.loc[r, 'Anonymous']]
            try:
                speed = \
                selector.xpath('//table[@id="ip_list"]/tbody/tr[{0}]/td[@class="country"]/div/@title'.format(r + 2))[0]
                df1.loc[r, "Speed"] = speed
            except:
                print("speed")

            try:
                res = \
                selector.xpath('//table[@id="ip_list"]/tbody/tr[{0}]/td[@class="country"]/div/@title'.format(r + 2))[1]
                df1.loc[r, "Response Time"] = res
            except:
                print("response")
        #df1.head()
        return df1

    def execute(self):
        pageSource = self.get_webpagecontent()
        ipDataFrame = self.parse_webData(pageSource)
        #print(ipDataFrame.head())

        hasNextPage = True
        #counter=0
        while hasNextPage:
            hasNextPage=False
            selector = etree.HTML(pageSource)
            try:
                nextPageUrl = selector.xpath('//div[@id="body"]/div[@class="pagination"]/a[@class="next_page"]/@href')[0]
                if nextPageUrl:
                    hasNextPage = True
            except:
                break

            self.url=self.domain+nextPageUrl
            pageSource = self.get_webpagecontent()
            currentIpDataFrame = self.parse_webData(pageSource)
            #print(currentIpDataFrame.head())
            ipDataFrame=pd.concat([ipDataFrame,currentIpDataFrame], ignore_index=True)
            #counter=counter+1
        return ipDataFrame


if __name__=="__main__":

    #print(domain + url)
    ipCrawler = GetIp(domain,url)
    ipDF=ipCrawler.execute()
    print(ipDF)

发布了53 篇原创文章 · 获赞 38 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/Linli522362242/article/details/100735167