Crawling Hotel Information
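
This post walks through a small scraper for hotel listings in Xi'an on www.gckzw.com. It fetches each list page with requests under a randomly chosen User-Agent, uses lxml XPath to pull every hotel's name and detail-page link, extracts the phone number and address from each detail page, and writes the results both to a SQL Server table via pyodbc and to an Excel sheet via xlwt.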

import requests
from lxml import etree
import re
import xlwt
import pyodbc
import random

class Hotel():

    # initialization
    def __init__(self):

        self.headers = [
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Havens/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',
            'UCWEB7.0.2.37/28/999',
            'NOKIA5700/ UCWEB7.0.2.37/28/999',
            'Openwave/ UCWEB7.0.2.37/28/999',
            'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; 360SE)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)',
            'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
            'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/5.0 (Android; Linux armv7l; rv:5.0) Gecko/ Firefox/5.0 Fennec/5.0',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36']
        # create an Excel workbook
        self.f = xlwt.Workbook(encoding='utf-8')
        # create a sheet named Sheet1; the scraped rows are written into it
        self.sheet1 = self.f.add_sheet(u'Sheet1', cell_overwrite_ok=True)

        # database connection
        self.cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=.;DATABASE=Test;UID=sa;PWD=123456')
        # cursor
        self.cursor = self.cnxn.cursor()
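        # NOTE: the connection string above assumes a local SQL Server instance
        # with a database named Test and sa/123456 credentials; adjust the
        # DRIVER/SERVER/UID/PWD values for your own environment.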


    # GET request
    def get_html(self, jingji_url):
        # build a headers dict with a randomly chosen User-Agent from the pool
        dic = {}
        dic['User-Agent'] = random.choice(self.headers)
        # retry up to three times until a 200 response comes back
        for i in range(3):
            r = requests.get(jingji_url,headers=dic)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                return r.text
            else:
                pass
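
    # NOTE: if all three attempts fail, get_html falls off the end and
    # implicitly returns None; the callers below assume a successful fetch.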


    def parse(self, html):

        # parse a list page
        try:
            url_base = 'http://www.gckzw.com'
            html = etree.HTML(html)

            # relative URL of each hotel
            content_url = html.xpath('//div[@class="travel_left_content travel_left"]//div[@class="travel_hotel_list_content travel_celarfix"]//p[@class="travel_hotel_intro_title"]//a/@href')
            # name of each hotel
            content_name = html.xpath('//div[@class="travel_left_content travel_left"]//div[@class="travel_hotel_list_content travel_celarfix"]//p[@class="travel_hotel_intro_title"]//a/text()')

            # build a {name: detail-page url} dict for each hotel
            detail_url = []
            j = 0
            for i in content_url:
                dic = {}
                dic[content_name[j]] = url_base + i
                detail_url.append(dic)
                j += 1

            return detail_url
        except:
            pass

    def detail_parse(self, html):
        # parse a detail page
        try:
            # new list to hold the extracted fields
            list_end = []
            html = etree.HTML(html)

            # contact phone number
            content_text = html.xpath('//div[@class="em2_bg clearfix"][1]//p/text()')[0]

            # address
            location = html.xpath("//div[@class='travel_celarfix hotel_comment_header'][1]//p[2]/span/text()")[0]

            # collect the data
            list_end.append(content_text)
            list_end.append(location)
            print("contact and address:", list_end)
            # return the data
            return list_end

        except:
            pass

    def url_list(self):

        # build the URL of every list page (Xi'an hotels, pages 1-159)
        list_page_list = []
        for i in range(1,160):
            url= 'http://www.gckzw.com/jiudian-xian610100-p'+ str(i) + '.html?startDate=2019-07-24&endDate=2019-07-25'
            list_page_list.append(url)

        return list_page_list


    def sql_connect(self,list_end):
        try:
            print("**********{} **{}***{}****{}*************".format(self.cursor,list_end[0],list_end[1],list_end[2]))

            # insert into SQL; list_end is [phone, address, name], so reorder
            # the values to match the (name, phone, adress) column order
            insert = self.cursor.execute("insert into jiudian_2(name,phone,adress) values (?,?,?)", (list_end[2], list_end[0], list_end[1])).rowcount
            print("insert",insert)
            self.cursor.commit()
        except:
            pass
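
    # NOTE: the insert assumes a jiudian_2 table already exists in the Test
    # database; the column names come from the INSERT statement, while the
    # types below are assumptions:
    #   CREATE TABLE jiudian_2 (name NVARCHAR(200), phone NVARCHAR(100), adress NVARCHAR(400));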

if __name__ == '__main__':
    news = Hotel()
    list_page_list = news.url_list()
    print("所有列表页面的url", list_page_list) 

    try:
        # loop over every list page
        for url in list_page_list:
            html = news.get_html(url)
            detail_url_list = news.parse(html)
            #print("each hotel's name and url", detail_url_list)

            num = 0
            for deta_url in detail_url_list:

                # the hotel's detail-page url
                detail_url = list(deta_url.values())[0]

                # the hotel's name
                name = list(deta_url.keys())[0]
                #print("each hotel's url and name", detail_url, name)

                # fetch each hotel's detail page
                html = news.get_html(detail_url)
                list_end = news.detail_parse(html)
                list_end.append(name)
                print("required information", list_end)

                # store in SQL
                news.sql_connect(list_end)

                # write the same row into the Excel sheet
                j = 0
                for value in list_end:
                    news.sheet1.write(num, j, value)  # the three arguments are row, column, value
                    j += 1
                num += 1
    except Exception as e:
        print("错误原因",e)
    finally:
        news.cnxn.close()
        #news.f.save(r'd:\excel_jingjijiudian.xls')
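
One caveat: the xlwt workbook built in __init__ is never written to disk, because the save call in the finally block above is commented out (and originally targeted the pyodbc connection rather than the workbook). Uncommented, the cleanup would read:

    finally:
        news.cnxn.close()
        # persist the Excel workbook; the path is the one from the original comment
        news.f.save(r'd:\excel_jingjijiudian.xls')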

 


Origin www.cnblogs.com/yuanjia8888/p/11361648.html