Scraping Qichacha company data with a Python crawler

  Building a B2B website requires company information and somewhere to store that data, so I set my sights on the company data published by Qichacha. Without further ado, let's get started!

  

# -*- coding: utf-8 -*-
import sys
import time
import urllib
import urllib.parse

import lxml
import requests
import xlwt
from bs4 import BeautifulSoup
 
def _strip_label(text, label):
    """Return *text* with surrounding whitespace and a leading *label* removed.

    ``str.strip(label)`` treats its argument as a set of characters and can
    eat into the value itself, so the label is removed as a prefix instead.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def _parse_row(row):
    """Extract the nine company fields from one ``<tr>`` of the result table.

    Returns a tuple ordered as: company name, company tag, legal
    representative, registered capital, establishment date, e-mail, phone,
    address, company status.

    Raises:
        IndexError, AttributeError: when the row does not have the expected
            cells/elements.
    """
    # NOTE(review): the text labels below look machine-translated; the live
    # page presumably uses its own labels - confirm against the real markup.
    info = row.select('td')[2]
    paragraphs = info.select('p')
    spans = paragraphs[0].select('span')

    name = info.select('.ma_h1')[0].text
    tag = info.select('.search-tags')[0].text
    legal_rep = paragraphs[0].a.text
    capital = _strip_label(spans[0].text, 'Registered capital:')
    date = _strip_label(spans[1].text, 'establishment:')
    # The e-mail sits on the second text line of the second paragraph.
    email = _strip_label(paragraphs[1].text.split('\n')[1], 'mailboxes:')
    phone = _strip_label(paragraphs[1].select('.m-l')[0].text, 'Tel:')
    addr = _strip_label(paragraphs[2].text, 'address:')
    # NOTE(review): this selector only matches the "success" status style;
    # rows rendered with another status class raise IndexError - confirm.
    state = row.select('td')[3].select(
        '.nstatus.text-success-lt.ml-xs')[0].text.strip()
    return name, tag, legal_rep, capital, date, email, phone, addr, state


def craw(url, key_word, x):
    """Download one search-result page and collect its company rows.

    Every row that parses successfully is appended, field by field, to the
    module-level lists ``g_name_list``, ``g_tag_list``, ``r_name_list``,
    ``g_money_list``, ``g_date_list``, ``r_email_list``, ``r_phone_list``,
    ``g_addr_list`` and ``g_state_list``, which must exist before the call.
    All fields of a row are extracted before anything is appended, so the
    lists always keep equal lengths.

    Args:
        url: Address of the result page to request.
        key_word: Search keyword; appended verbatim to the Referer header.
        x: Page number. Not used by this function; kept so existing callers
            keep working.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    referer = 'https://www.qichacha.com/search?key=' + key_word
    headers = {
        'Host': 'www.qichacha.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/69.0.3497.100 Safari/537.36'),
        'Referer': referer,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # Placeholder: replace the value with the cookie of your own session.
        'Cookie': 'xxxxxxxxx replace this with your cookie xxxxxxxxx',
    }

    try:
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print('Request failed:', exc)
        # Nothing was downloaded, so there is nothing to parse.
        return

    # Decode the body as UTF-8 whatever the status code is.
    response.encoding = 'utf-8'
    if response.status_code != 200:
        print(response.status_code)
        print('ERROR')
    soup = BeautifulSoup(response.text, 'lxml')

    try:
        rows = soup.find_all(class_='m_srchList')[0].tbody.select('tr')
    except (IndexError, AttributeError):
        # No result table in the page (for example an anti-crawler page).
        print('seems to have been denied access ... Please try again later ...')
        return

    print('start crawling data, do not open the Excel')
    targets = (
        g_name_list, g_tag_list, r_name_list, g_money_list, g_date_list,
        r_email_list, r_phone_list, g_addr_list, g_state_list,
    )
    for row in rows:
        try:
            fields = _parse_row(row)
        except (IndexError, AttributeError):
            # Skip the malformed row as a whole instead of storing part of it.
            print('! error')
            continue
        for target, value in zip(targets, fields):
            target.append(value)
         
# Result columns filled by craw(). They live at module level, outside the
# __main__ guard, so craw() also finds them when this module is imported.
g_name_list = []
g_tag_list = []
r_name_list = []
g_money_list = []
g_date_list = []
r_email_list = []
r_phone_list = []
g_addr_list = []
g_state_list = []


def main():
    """Ask for a keyword, crawl the requested pages and save them as .xls.

    Reads three values from standard input (keyword, number of pages, delay
    in seconds between pages), calls ``craw`` once per page and writes the
    collected columns to a time-stamped workbook on drive ``D:``.

    Raises:
        ValueError: if the page count or the delay is not an integer.
    """
    key_word = input('Please enter your want to search keywords: ')
    page_count = int(input('Please enter the number you want to search: '))
    sleep_time = int(input(
        'Please enter the number of seconds to retrieve every delay: '))

    # The keyword goes into a query string, so it must be percent-encoded.
    key_word = urllib.parse.quote(key_word)

    print('searching, please wait')

    for page in range(1, page_count + 1):
        url = ('https://www.qichacha.com/search_index'
               '?key={}&ajaxflag=1&p={}&').format(key_word, page)
        craw(url, key_word, page)
        # Pause between requests to stay below the site's rate limit.
        time.sleep(sleep_time)

    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('corporate look up data',
                                cell_overwrite_ok=True)

    # One shared cell style that only sets the font.
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = 'Arial'
    style.font = font

    print('data is being stored, do not open the Excel')

    # Header row; the order matches the column tuple below.
    name_list = ['company name', 'company label', 'statutory corporation',
                 'registered capital', 'date of establishment',
                 'corporate email', 'corporate phone', 'address',
                 'the company states']
    for col, title in enumerate(name_list):
        sheet1.write(0, col, title, style)

    columns = (
        g_name_list, g_tag_list, r_name_list, g_money_list, g_date_list,
        r_email_list, r_phone_list, g_addr_list, g_state_list,
    )
    # zip() stops at the shortest column, so a length mismatch can never
    # raise IndexError while exporting.
    for row, record in enumerate(zip(*columns), start=1):
        print(record[0])
        for col, value in enumerate(record):
            sheet1.write(row, col, value, style)

    # An existing file with the same name is overwritten.
    workbook.save(r"D:\wyy-qcc-"
                  + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
                  + ".xls")
    print('has been saved ~')


if __name__ == '__main__':
    main()

  

Guess you like

Origin www.cnblogs.com/68xi/p/11206584.html