Problems encountered:
1. Indexing a list past its end raised an IndexError (list index out of range); the lookup was later wrapped in try/except:
try:
except IndexError:
pass
A simple crawler
"""Scrape rental listings from ganji.com into a CSV file.

For every listing on a result page the script extracts the title, price,
address and thumbnail URL, appends them to ``house3.csv`` and saves the
thumbnail as ``<title>.png`` in the current directory.
"""
import csv
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

CSV_PATH = "house3.csv"
CSV_HEADER = ["标题", "价格", "地址", "图片"]
REQUEST_TIMEOUT = 10  # seconds; requests waits forever when no timeout is given

# Browser-like header sent with every request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)",
}

# Characters that are not allowed (or are awkward) in file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\s]+')


def getSource(url):
    """Return the body of *url* as text, fetched with the shared headers."""
    response = requests.get(url, headers=head, timeout=REQUEST_TIMEOUT)
    return response.text


def _first(node, path):
    """Return the first XPath match under *node*, stripped of whitespace.

    Raises IndexError when the path matches nothing.
    """
    return node.xpath(path)[0].strip()


def _save_image(image_url, title):
    """Download *image_url* to ``<title>.png``; return True on success.

    Failures are reported and swallowed so that one broken thumbnail does
    not abort the whole crawl.
    """
    safe_title = _UNSAFE_FILENAME_CHARS.sub("_", title).strip("_") or "untitled"
    filename = "{}.png".format(safe_title)
    try:
        response = requests.get(
            image_url, headers=head, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        with open(filename, "wb") as image_file:
            image_file.write(response.content)
    except (requests.RequestException, OSError) as exc:
        print("could not save image {}: {}".format(image_url, exc))
        return False
    return True


def spiderData(url, writer=None):
    """Scrape one result page and write one CSV row per listing.

    Args:
        url: Address of the result page to scrape.
        writer: ``csv.writer`` that receives the rows.  When omitted, rows
            are appended to ``CSV_PATH`` (a header row is written first if
            the file is missing or empty).

    Returns:
        The number of rows written.
    """
    if writer is None:
        needs_header = (
            not os.path.exists(CSV_PATH) or os.path.getsize(CSV_PATH) == 0
        )
        with open(
            CSV_PATH, "a", newline="", encoding="utf-8-sig"
        ) as csv_handle:
            writer = csv.writer(csv_handle)
            if needs_header:
                writer.writerow(CSV_HEADER)
            return spiderData(url, writer)

    page = etree.HTML(getSource(url))
    if page is None:  # empty or unparsable response
        return 0

    listings = page.xpath(
        '//div[contains(@class, "js-tips-list")]'
        '/div[contains(@class, "f-list-item")]'
    )
    written = 0
    for item in listings:
        try:
            title = _first(item, './/dd[contains(@class, "title")]/a/text()')
            price = _first(
                item,
                './/dd[contains(@class,"info")]/div[@class="price"]'
                '/span[@class="num"]/text()',
            )
            yue = _first(
                item,
                './/dd[contains(@class,"info")]/div[@class="price"]'
                '/span[@class="yue"]/text()',
            )
            address = _first(
                item,
                './/dd[contains(@class, "address")]'
                '//a[@class="address-eara"]/text()',
            )
        except IndexError:
            # Not a regular listing (e.g. an ad slot): skip it rather than
            # reuse the previous listing's values.
            continue

        # Look inside this listing only; a page-wide query would return the
        # first image on the page for every row.
        image_sources = item.xpath('.//div[@class="img-wrap"]//img/@src')
        # urljoin resolves protocol-relative ("//host/x.jpg") and relative
        # sources against the page address.
        img = urljoin(url, image_sources[0]) if image_sources else ""

        print(title, price + yue, address, img)
        if img:
            _save_image(img, title)
        writer.writerow([title, price + yue, address, img])
        written += 1
    return written


def main():
    """Crawl result pages 2-4 and store the listings in ``CSV_PATH``."""
    base_url = "http://cs.ganji.com/zufang/b2/"
    # newline="" is required by the csv module; utf-8-sig lets spreadsheet
    # software detect the encoding of the Chinese text.
    with open(CSV_PATH, "w", newline="", encoding="utf-8-sig") as csv_handle:
        writer = csv.writer(csv_handle)
        writer.writerow(CSV_HEADER)
        for page_number in range(2, 5):
            # Build each address from the base URL; appending to the
            # previous page's address would produce an invalid URL.
            page_url = "{}pn{}/?qq-pf-to=pcqq.group".format(
                base_url, page_number
            )
            spiderData(page_url, writer)


if __name__ == "__main__":
    main()
------------ End of restored content ------------