服务器采集

使用动态IP拨号服务器(下面第一段代码需保存为 changeIP.py,第二段爬虫脚本通过 import changeIP 调用它)

import os
import subprocess
import time
# Dial-up settings read by Adsl.__init__: the connection entry name plus the
# account credentials. The "..." values look like placeholders to be replaced
# with real credentials before use.
g_adsl_account = {
    "name": "adsl",
    "username": "...",
    "password": "...",
}
     
class Adsl(object):
    """Dial, hang up and redial a broadband connection through ``rasdial``.

    ``rasdial`` is invoked with an argument list instead of a formatted shell
    string, so entry names or credentials that contain spaces or shell
    metacharacters are passed through intact.
    """

    # Pause after every rasdial call; presumably gives the link time to
    # come up / go down before the caller continues.
    SETTLE_SECONDS = 5

    def __init__(self, account=None):
        """Store the connection entry name and credentials.

        Args:
            account: Optional mapping with "name", "username" and "password"
                keys. When omitted, the module-level ``g_adsl_account`` is
                used, which matches the original behaviour.
        """
        if account is None:
            account = g_adsl_account
        self.name = account["name"]
        self.username = account["username"]
        self.password = account["password"]

    def connect(self):
        """Dial the broadband connection, then wait for it to settle."""
        # check=False: the exit status is deliberately ignored, exactly as
        # the previous os.system() call ignored it.
        subprocess.run(
            ["rasdial", self.name, self.username, self.password],
            check=False,
        )
        time.sleep(self.SETTLE_SECONDS)

    def disconnect(self):
        """Hang up the broadband connection, then wait for it to settle."""
        subprocess.run(["rasdial", self.name, "/disconnect"], check=False)
        time.sleep(self.SETTLE_SECONDS)

    def reconnect(self):
        """Hang up and dial again."""
        self.disconnect()
        self.connect()

if __name__ == '__main__':
    # Running this file directly performs one hang-up-and-redial cycle.
    dialer = Adsl()
    dialer.reconnect()
import requests
import time
import random
import changeIP

# Page requested by the demo call at the bottom of this script.
link = "http://www.santostang.com/"

# Headers sent with every request; only a browser-style User-Agent is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

def scrapy(url, num_try=3, timeout=10):
    """Fetch *url* and return its body text, redialing for a new IP on failure.

    Args:
        url: Address to request.
        num_try: How many redial-and-retry rounds are still allowed after a
            failed request.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled connection fails and reaches the retry path instead of
            blocking forever.

    Returns:
        The response text, or ``None`` when the request failed and no retries
        are left.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        html = r.text
    except requests.RequestException as e:
        print (e)
        if num_try <= 0:
            return None
        # Redial to change IP before retrying. The class is spelled ``Adsl``
        # and the retry must call this function by its real name ``scrapy``.
        changeIP.Adsl().reconnect()
        return scrapy(url, num_try - 1, timeout)
    # Random pause in [0, 3) seconds after a successful fetch; presumably
    # there to avoid hammering the target site.
    time.sleep(random.randint(0, 2) + random.random())
    return html

# Demo run: fetch the page at `link` using scrapy's default retry count.
result = scrapy(link)

使用Tor服务器

from stem import Signal
from stem.control import Controller
import socket
import socks
import requests
import time
# Benchmark: fetch a page through Tor several times, requesting a new Tor
# identity (NEWNYM) after each fetch, and report the average fetch time and
# the average time spent issuing the identity change.

# Number of fetch / change-IP rounds; also the divisor for the averages.
NUM_ROUNDS = 10

print(0)
# The control connection is opened BEFORE socket.socket is replaced below, so
# it is a plain TCP connection and is not itself routed through the SOCKS
# proxy. The with-block closes the control socket when the run ends.
# 9151 / 9150 look like the Tor Browser control / SOCKS ports; confirm
# against the local Tor configuration.
with Controller.from_port(port = 9151) as controller:
    print(2)
    controller.authenticate()
    print(1)
    # From here on every new socket (including those opened by requests)
    # goes through the local SOCKS5 proxy.
    socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150)
    socket.socket = socks.socksocket

    total_scrappy_time = 0
    total_changeIP_time = 0
    for x in range(NUM_ROUNDS):
        # Show the exit IP currently in use.
        a = requests.get("http://checkip.amazonaws.com").text
        print ("第", x+1, "次IP:", a)

        time1 = time.time()
        a = requests.get("http://www.santostang.com/").text
        time2 = time.time()
        total_scrappy_time = total_scrappy_time + time2-time1
        print ("第", x+1, "次抓取花费时间:", time2-time1)

        # Time only the NEWNYM signal itself. Measuring before the sleep
        # avoids subtracting a nominal 5 s afterwards, which would count any
        # sleep overshoot as IP-change time.
        time3 = time.time()
        controller.signal(Signal.NEWNYM)
        time4 = time.time()
        total_changeIP_time = total_changeIP_time + time4-time3
        print ("第", x+1, "次更换IP花费时间:", time4-time3)
        # Fixed pause before the next round; presumably gives Tor time to
        # switch to a new circuit.
        time.sleep(5)

print ("平均抓取花费时间:", total_scrappy_time/NUM_ROUNDS)
print ("平均更换IP花费时间:", total_changeIP_time/NUM_ROUNDS)


我在 Jupyter Notebook 上运行上面的 Tor 脚本时报错:

SocketError: Socket error: 0x01: General SOCKS server failure

同样的代码在 Spyder 上运行没有问题。

猜你喜欢

转载自blog.csdn.net/liudongdong19/article/details/81141500