Crawling Lianjia rental listing data for any city (example: Chaoyang District, Beijing)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @time: 2019-08-16 15:56
# @author: Anthony
# @email: ianghont7@163.com
# @file: crawl Lianjia rental data for any city.py
  . 7  
  . 8  
import os
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
 16  
# Request headers that make the crawler look like an ordinary desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}

# Shared state: if_xls_exits() stores the workbook file name under "xlsName"
# and catchHouseDetail() reads it back when appending rows.
xlsInfo = {}
 23 is  
def _parse_listing(li):
    """Extract the nine spreadsheet fields from one listing element.

    Args:
        li: lxml element for a single entry of the listing container.

    Returns:
        ``[houseWay, houseMoney, plotName, houseSize, houseType,
        houseOrientation, communityArea, subwayArea, releaseTime]``.

    Raises:
        IndexError: if the element lacks one of the expected nodes or the
            title does not split into the expected parts.
    """
    main = './/div[@class="content__list--item--main"]'

    # The title text is split on spaces into three parts; the first part is
    # further split on the middle dot into rental mode and community name.
    title = li.xpath(main + '/p[@class="content__list--item--title twoline"]/a/text()')[0].strip()
    title_parts = title.split(' ')
    way_and_plot = title_parts[0].split('·')

    des_texts = li.xpath(main + '/p[@class="content__list--item--des"]/text()')
    des_links = li.xpath(main + '/p[@class="content__list--item--des"]/a/text()')

    # Rental mode
    houseWay = way_and_plot[0]
    # Monthly rent
    houseMoney = li.xpath(main + '/span[@class="content__list--item-price"]/em/text()')[0] + '/month'
    # Community name
    plotName = way_and_plot[1]
    # Floor area (fifth text node of the description paragraph)
    houseSize = des_texts[4].strip()
    # Layout
    houseType = title_parts[1]
    # Orientation
    houseOrientation = title_parts[2]
    # District
    communityArea = des_links[0]
    # Subway station name (second link of the description paragraph)
    subwayArea = des_links[1]
    # Publication time
    releaseTime = li.xpath(main + '/p[@class="content__list--item--time oneline"]/text()')[0]

    return [houseWay, houseMoney, plotName, houseSize, houseType,
            houseOrientation, communityArea, subwayArea, releaseTime]


def catchHouseDetail(url):
    """Scrape one rental listing page and append its rows to the XLS workbook.

    Args:
        url: Address of a listing page, e.g.
            ``https://bj.lianjia.com/zufang/chaoyang/pg1``.

    Returns:
        A list with one 9-item row per listing that could be parsed. The same
        rows are appended to the workbook named in ``xlsInfo["xlsName"]``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the 10 second timeout expires).
    """
    # Fetch the page with browser-like headers; the timeout keeps a stalled
    # connection from hanging the crawl indefinitely.
    page_text = requests.get(url, headers=headers, stream=True, timeout=10)

    # Load the downloaded HTML into an lxml element tree.
    tree = etree.HTML(page_text.text)

    # Each direct child div of the listing container is one entry.
    li_list = tree.xpath('//div[@class="content w1150"]/div[@class="content__article"]/div[@class="content__list"]/div')

    all_house_list = []
    for li in li_list:
        try:
            all_house_list.append(_parse_listing(li))
        except IndexError:
            # An entry missing an expected field is skipped so that it does
            # not discard every other listing on the page.
            continue

    if if_xls_exits():
        write_excel_xls_append(xlsInfo["xlsName"], all_house_list)
    return all_house_list
 74 
 75 # print(catchHouseDetail('https://bj.lianjia.com/zufang/chaoyang/pg1'))
 76 
 77 
 78 #Xls acquired data is written in the table 
def write_excel_xls(path, sheet_name, value):
    """Create a new .xls workbook with a single sheet filled from ``value``.

    Args:
        path: File path the workbook is saved to.
        sheet_name: Name of the sheet to create.
        value: Sequence of rows; each row is a sequence of cell values.
            ``value[i][j]`` is written to row ``i``, column ``j``.
    """
    workbook = xlwt.Workbook()  # create a workbook
    sheet = workbook.add_sheet(sheet_name)  # create a sheet in the workbook
    for i, row in enumerate(value):
        for j, cell in enumerate(row):
            sheet.write(i, j, cell)  # write the cell at (row, column)
    workbook.save(path)  # save the workbook
    print("XLS format form data is successfully written!")
 88  
89  
90  
def write_excel_xls_append(path, value):
    """Append rows to the first sheet of an existing .xls workbook.

    Args:
        path: Path of the workbook to read and then overwrite.
        value: Sequence of rows; each row is a sequence of cell values.
            Row ``i`` is written below the rows already in the sheet.
    """
    workbook = xlrd.open_workbook(path)  # open the existing workbook
    sheets = workbook.sheet_names()  # names of all sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # first sheet
    rows_old = worksheet.nrows  # number of rows already present
    new_workbook = copy(workbook)  # convert the xlrd object into a writable xlwt one
    new_worksheet = new_workbook.get_sheet(0)  # first sheet of the writable copy
    for i, row in enumerate(value):
        for j, cell in enumerate(row):
            # Appending: writing starts at row i + rows_old.
            new_worksheet.write(i + rows_old, j, cell)
    new_workbook.save(path)  # save over the original file
    print("XLS format table [] is added to write data successfully!")
 104  
105  
106  
107  
def if_xls_exits():
    """Make sure the output workbook exists and record its name in ``xlsInfo``.

    If the workbook is not present in the current directory it is created
    with a single header row. The file name is then stored under
    ``xlsInfo["xlsName"]``.

    Returns:
        True, once the workbook has been found or created.
    """
    book_name_xls = 'Beijing-linked rental information table.xls'
    sheet_name_xls = 'Housing'
    value_title = [["rental mode", "monthly amount", "cell name", "housing size", "Housing Unit", "Housing orientation", "regional position", "station name", "Housing published"],
                   ]
    # Create the workbook once when missing; write_excel_xls raises if the
    # file cannot be saved, so no retry loop is needed.
    if not os.path.exists('./%s' % book_name_xls):
        write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    xlsInfo["xlsName"] = book_name_xls
    return True
119 
120 
121 
122 
123 
def catch():
    """Crawl listing pages 1 through 99 of the Beijing Chaoyang rentals.

    Each page is passed to ``catchHouseDetail``. A failure on one page is
    reported and the crawl moves on to the next page. The crawler sleeps
    for two seconds after every page.
    """
    pages = ['https://bj.lianjia.com/zufang/chaoyang/pg{}/'.format(x) for x in range(1, 100)]
    for page in pages:
        try:
            catchHouseDetail(page)
        except Exception as exc:
            # Best-effort crawl: report the failure instead of hiding it, and
            # let KeyboardInterrupt/SystemExit propagate so the run can be stopped.
            print('Failed to crawl {}: {!r}'.format(page, exc))
        time.sleep(2)
132 
133 
# Run the crawl only when the file is executed as a script.
if __name__ == '__main__':
    catch()

 Renderings:

 

Guess you like

Origin www.cnblogs.com/ipyanthony/p/11365950.html