A Simple Python Crawler for Ethereum Solidity Smart Contract Code

In some network environments the Ethereum block explorer https://etherscan.io/ is not directly accessible, so you may need to configure network access first.

This crawler downloads the latest 500 verified smart contracts from the official website, and it is reasonably robust.

Copy the code as-is, change the file paths (the filepath variables) to match your machine, and it is ready to run.

Python 3.6 or above is recommended as the runtime environment.
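If etherscan.io is reachable only through a proxy in your environment, one option is to hand requests an explicit proxies mapping. A minimal sketch, assuming a local proxy at 127.0.0.1:1080 (a placeholder; substitute your actual proxy address):

import requests

# Placeholder proxy endpoint; replace with your own.
proxies = {'http': 'http://127.0.0.1:1080',
           'https': 'http://127.0.0.1:1080'}
response = requests.get('https://etherscan.io/', proxies=proxies, timeout=5)
print(response.status_code)  # 200 means the site is reachable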

# -*- coding: utf8 -*-
# SmartContactSpider.py
import requests
from bs4 import BeautifulSoup
import traceback
import re
import os
import time
import datetime


def printtime():
    print(time.strftime("%Y-%m-%d %H:%M:%S:", time.localtime()), end=' ')
    return 0


def getsccodecore(eachLine):
    # Disguise the request as a browser so the server does not reject it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    failedTimes = 100
    while True:  # Keep retrying until the site is reached or the attempts run out.

        if failedTimes <= 0:
            printtime()
            print('Too many failed attempts; check the network environment!')
            break

        failedTimes -= 1
        try:
            # Catch the exceptions requests may raise, then wait for the
            # network to recover, so the program keeps running uninterrupted.
            printtime()
            print('Connecting to URL: ' + eachLine, end='')
            response = requests.get(eachLine, headers=headers, timeout=5)
            break

        except requests.exceptions.ConnectionError:
            printtime()
            print('ConnectionError! Please wait 3 seconds!')
            time.sleep(3)

        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print('ChunkedEncodingError! Please wait 3 seconds!')
            time.sleep(3)

        except Exception:
            printtime()
            print('Unfortunately, an unknown error occurred! Please wait 3 seconds!')
            time.sleep(3)

    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")

    targetPRE = soup.find_all('pre', 'js-sourcecopyarea editor')

    filepath = "C:\\Users\\15321\\Desktop\\SmartContract\\code\\"

    # eachLine looks like https://etherscan.io/address/0x...; the 42-character
    # contract address starts at offset 29, so slice it out as the file name.
    filename = eachLine[29:71]

    if os.path.exists(filepath + filename + '.sol'):
        printtime()
        print(filename + ' already exists!')
        return 0

    fo = open(filepath + filename + '.sol', "w+", encoding="utf-8")
    fo.write(targetPRE[0].text)
    fo.close()
    printtime()
    print(filename + ' created!')

    return 0
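# Note: in BeautifulSoup, find_all('pre', 'js-sourcecopyarea editor') treats a
# string second argument as a CSS-class filter, i.e. it is shorthand for
# find_all('pre', class_='js-sourcecopyarea editor'); the same applies to the
# 'div' and 'a' lookups further down.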


def getsccode():
    try:
        SCAddress = open("C:\\Users\\15321\\Desktop\\SmartContract\\address\\Address.txt", "r")

    except Exception:
        printtime()
        print('Failed to open the smart contract address file! Check that the file path is correct!')
        return 1

    for eachLine in SCAddress:
        getsccodecore(eachLine)  # Core function: fetch the smart contract source code.

    SCAddress.close()
    return 0


def getSCAddress(eachurl, filepath):
    # Disguise the request as a browser so the server does not reject it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    # Maximum number of failed attempts; once it is used up, report the error and stop.
    failedTimes = 50
    while True:  # Keep retrying until the site is reached or the attempts run out.
        if failedTimes <= 0:
            printtime()
            print('Too many failed attempts; check the network environment!')
            break

        failedTimes -= 1  # Decrease by 1 on every attempt.
        try:
            # Catch the exceptions requests may raise, then wait for the
            # network to recover, so the program keeps running uninterrupted.
            printtime()
            print('Connecting to URL: ' + eachurl)

            response = requests.get(url=eachurl, headers=headers, timeout=5)

            # Reaching this line means the request succeeded, so exit the while loop.
            break

        except requests.exceptions.ConnectionError:
            printtime()
            print('ConnectionError! Please wait 3 seconds!')
            time.sleep(3)

        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print('ChunkedEncodingError! Please wait 3 seconds!')
            time.sleep(3)

        except Exception:
            printtime()
            print('Unknown error! Please wait 3 seconds!')
            time.sleep(3)

    # Convert to the detected encoding (typically UTF-8).
    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the div that contains the URLs of the smart contract code pages.
    targetDiv = soup.find_all('div', 'table-responsive mb-2 mb-md-0')

    try:
        targetTBody = targetDiv[0].table.tbody
    except Exception:
        printtime()
        print('Could not locate targetTBody!')
        return 1

    # Open the file in append mode: create it if it does not exist,
    # otherwise keep appending at the end.
    fo = open(filepath + "Address.txt", "a")

    # Write each contract's URL to the file, one per line.
    for targetTR in targetTBody:
        if targetTR.name == 'tr':
            fo.write("https://etherscan.io"
                     + targetTR.td.find('a', 'hash-tag text-truncate').attrs['href'] + "\n")
    fo.close()
    return 0


def updatescurl():
    urlList = ["https://etherscan.io/contractsVerified/1?ps=100",
               "https://etherscan.io/contractsVerified/2?ps=100",
               "https://etherscan.io/contractsVerified/3?ps=100",
               "https://etherscan.io/contractsVerified/4?ps=100 " ,
                " https://etherscan.io/contractsVerified/5?ps=100 " ] 

    # store filepath contract is smart save to address crawling file path 
    # Please change the path they want according to their needs. 
    filepath = ' C: \\ 15321 the Users \\ Desktop \\ \\ \\ SmartContract address \\ ' 

    # the old address storage contract documents cleaned 
    try :
         IF (os.path.exists (filepath + " Address.txt " )): 
            The os.remove (filepath + " Address.txt " ) 
            printTime () 
            Print ( 'It has cleared the old file (warehouse) in the% s directory! ' % Filepath)
     the except IOError: 

        printTime () 
        Print ( " error that can not handle appears, terminate the program:! IOError " ) 

        # function does not perform properly, return 1 
        return 1 # read urlList where every web page URL in intelligence contract address for eachurl in urlList: 
        Time = 0
         the while (. 1 == getSCAddress (eachurl, filepath)): 
            Time + =. 1
             IF (Time == 10 ):
                 BREAK Pass # function normally performed, return 0 return

    
    
            


    
    0 


def main():
    # Update the list of smart contract addresses to crawl.
    updatescurl()

    # Crawl the smart contract code at each address.
    getsccode()


main()
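When run, updatescurl() scrapes the five contractsVerified pages (100 entries each, hence the latest 500 contracts) into Address.txt, and getsccode() then saves each contract's source as a .sol file named after its address. Note that both target directories must already exist, or the open() calls will fail. A small sketch, using the same example paths as above, that creates them beforehand:

import os

# Create both target directories if they are missing.
for d in ["C:\\Users\\15321\\Desktop\\SmartContract\\code\\",
          "C:\\Users\\15321\\Desktop\\SmartContract\\address\\"]:
    os.makedirs(d, exist_ok=True)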

 
