#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @time: 2019-08-16 15:56
# @author: Anthony
# @email: ianghont7@163.com
# @file: crawling chain city rent house any data .py
"""Crawl Lianjia rental listings page by page and append them to an .xls file.

NOTE(review): this file was reconstructed from a machine-translated,
whitespace-mangled copy.  The runtime strings (workbook name, sheet name,
column titles, messages, the '/month' suffix) are the translated text with
the injected whitespace removed; confirm them against the original script
if the exact output text matters.
"""

import os
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy

# Disguise the request as coming from a regular browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}

# Shared state: if_xls_exits() stores the workbook path under "xlsName".
xlsInfo = {}

# XPath prefix shared by every per-listing field lookup.
_ITEM_MAIN = './/div[@class="content__list--item--main"]'


def _parse_listing(li):
    """Extract one listing's fields from its element, in column order.

    Raises:
        IndexError: if the element lacks any of the expected fields.
    """
    # The title text is split as "<way>·<plot> <type> <orientation>".
    title = li.xpath(
        _ITEM_MAIN + '/p[@class="content__list--item--title twoline"]/a/text()'
    )[0].strip()
    title_parts = title.split(' ')
    way_and_plot = title_parts[0].split('·')

    house_way = way_and_plot[0]
    plot_name = way_and_plot[1]
    house_type = title_parts[1]
    house_orientation = title_parts[2]

    house_money = li.xpath(
        _ITEM_MAIN + '/span[@class="content__list--item-price"]/em/text()'
    )[0] + '/month'

    # The size is the fifth text node of the description paragraph.
    house_size = li.xpath(
        _ITEM_MAIN + '/p[@class="content__list--item--des"]/text()'
    )[4].strip()

    # Description links: [0] is the district, [1] the subway/area name.
    des_links = li.xpath(
        _ITEM_MAIN + '/p[@class="content__list--item--des"]/a/text()'
    )
    community_area = des_links[0]
    subway_area = des_links[1]

    release_time = li.xpath(
        _ITEM_MAIN + '/p[@class="content__list--item--time oneline"]/text()'
    )[0]

    return [
        house_way,
        house_money,
        plot_name,
        house_size,
        house_type,
        house_orientation,
        community_area,
        subway_area,
        release_time,
    ]


def catchHouseDetail(url):
    """Fetch one listing page, parse its listings and append them to the xls.

    Args:
        url: address of a listing page.

    Returns:
        The parsed rows (one list of nine strings per listing).

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page_text = requests.get(url, headers=headers, timeout=10)

    # Load the page source into an lxml tree.
    tree = etree.HTML(page_text.text)

    # Locate the listing containers on the page.
    li_list = tree.xpath(
        '//div[@class="content w1150"]/div[@class="content__article"]'
        '/div[@class="content__list"]/div'
    )

    all_house_list = []
    for li in li_list:
        try:
            all_house_list.append(_parse_listing(li))
        except IndexError:
            # An entry missing a field is skipped so it does not discard
            # the rest of the page.
            continue

    if if_xls_exits():
        write_excel_xls_append(xlsInfo["xlsName"], all_house_list)
    return all_house_list


# print(catchHouseDetail('https://bj.lianjia.com/zufang/chaoyang/pg1'))


def write_excel_xls(path, sheet_name, value):
    """Create a new workbook at ``path`` with one sheet holding ``value``.

    Args:
        path: destination .xls file path.
        sheet_name: name of the sheet to create.
        value: list of rows, each row a list of cell values.
    """
    workbook = xlwt.Workbook()  # create a workbook
    sheet = workbook.add_sheet(sheet_name)  # create a sheet in it
    for row_idx, row in enumerate(value):
        for col_idx, cell in enumerate(row):
            sheet.write(row_idx, col_idx, cell)
    workbook.save(path)
    print("XLS format form data is successfully written!")


def write_excel_xls_append(path, value):
    """Append the rows in ``value`` below the existing rows of the first sheet.

    Args:
        path: path of an existing .xls workbook.
        value: list of rows, each row a list of cell values.
    """
    workbook = xlrd.open_workbook(path)  # open the existing workbook
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])  # first sheet
    rows_old = worksheet.nrows  # number of rows already present
    new_workbook = copy(workbook)  # convert the xlrd object to a writable xlwt one
    new_worksheet = new_workbook.get_sheet(0)
    for row_idx, row in enumerate(value):
        for col_idx, cell in enumerate(row):
            # Writing starts at row rows_old, right after the existing data.
            new_worksheet.write(row_idx + rows_old, col_idx, cell)
    new_workbook.save(path)
    print("XLS format table [] is added to write data successfully!")


def if_xls_exits():
    """Ensure the output workbook exists and record its name in ``xlsInfo``.

    The workbook is created with a header row when it is missing.

    Returns:
        True once the workbook exists.
    """
    book_name_xls = 'Beijing-linked rental information table.xls'
    sheet_name_xls = 'Housing'
    value_title = [[
        "rental mode",
        "monthly amount",
        "cell name",
        "housing size",
        "Housing Unit",
        "Housing orientation",
        "regional position",
        "station name",
        "Housing published",
    ]]
    if not os.path.exists('./%s' % book_name_xls):
        write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    xlsInfo["xlsName"] = book_name_xls
    return True


def catch():
    """Crawl listing pages 1-99 of the Chaoyang district, pausing between pages."""
    pages = ['https://bj.lianjia.com/zufang/chaoyang/pg{}/'.format(x) for x in range(1, 100)]
    for page in pages:
        try:
            catchHouseDetail(page)
        except Exception as exc:
            # Best-effort crawl: report the failing page and move on.
            print('Failed to crawl %s: %r' % (page, exc))
        time.sleep(2)


if __name__ == '__main__':
    catch()
# Renderings: