'''
@Modify Time      @Author    Goal
------------      -------    ----
2019/8/31 15:19   laoalo     Crawl Jinjiang (jjwxc.net) keyword search results

Search URL template:
    http://www.jjwxc.net/search.php?kw=%C4%EA%CF%C2&t=1&p=1

The script walks the search result pages, collects the URL of every book
listed there, and then scrapes the title, author and introduction of each
book.
'''
import time

import requests
from lxml import etree

# Browser-like headers sent with every request.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    'Host': 'www.jjwxc.net',
}

# The scraped strings are GBK text.  Declaring the encoding on the response
# replaces the fragile ``.encode('ISO-8859-1').decode('gbk')`` round-trip,
# which only worked when requests happened to fall back to Latin-1.
PAGE_ENCODING = 'gbk'
REQUEST_TIMEOUT = 50   # seconds, passed to requests.get
MAX_RETRIES = 3        # attempts per URL before the error is re-raised
RETRY_DELAY = 2        # seconds; multiplied by the attempt number
REQUEST_DELAY = 1      # seconds to pause between two book requests


def _fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Connection problems and timeouts are retried up to ``MAX_RETRIES`` times
    with a growing pause between attempts.

    :param url: absolute URL of the page to download.
    :return: the root element produced by ``etree.HTML`` (may be ``None`` if
        lxml could not build a tree from the response).
    :raises requests.exceptions.RequestException: when the last retry fails.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(url=url, headers=head,
                                    timeout=REQUEST_TIMEOUT)
            break
        except requests.exceptions.RequestException:
            if attempt == MAX_RETRIES:
                raise
            # Back off a little longer after every failed attempt.
            time.sleep(RETRY_DELAY * attempt)
    response.encoding = PAGE_ENCODING
    return etree.HTML(response.text)


def get_page_detail(url):
    """Return the URL of every book listed on one search result page.

    :param url: URL of a search result page.
    :return: list of ``href`` values taken from ``h3.title > a``; empty when
        the page contains no such links.
    :raises requests.exceptions.RequestException: if the page cannot be
        downloaded.
    """
    tree = _fetch_html(url)
    if tree is None:
        return []
    return tree.xpath('//h3[@class="title"]/a/@href')


def get_book_detial(book_url):
    """Scrape the details of a single book.

    :param book_url: URL of the book's page.
    :return: dict with the keys ``title``, ``author`` and ``information``,
        or ``None`` when the title or author element is missing.
    :raises requests.exceptions.RequestException: if the page cannot be
        downloaded.
    """
    tree = _fetch_html(book_url)
    if tree is None:
        return None
    try:
        title = tree.xpath("//span[@itemprop='articleSection']//text()")[0]
        author = tree.xpath("//span[@itemprop='author']//text()")[0]
    except IndexError as e:
        # The expected element is missing from the page.
        print(e, '下标越界')
        return None
    # string() never raises: it yields '' when the node is absent.
    information = tree.xpath("string(//div[@id='novelintro'])")
    # NOTE (original author): the book's labels/tags could not be scraped.
    return {'title': title, 'author': author, 'information': information}


def spider():
    """Crawl search result pages 1-4 and return the details of every book.

    Books and pages that cannot be fetched or parsed are reported on stdout
    and skipped, so one failure no longer aborts the whole crawl or leaves a
    ``None`` entry in the result.

    :return: list of book dicts as produced by :func:`get_book_detial`.
    """
    bookshelf = []
    for page_num in range(1, 5):
        print("这是第{index}页的信息\n\n\n".format(index=page_num))
        url = 'http://www.jjwxc.net/search.php?kw=%C4%EA%CF%C2&t=1&p={page_num}'.format(page_num=page_num)
        try:
            book_list = get_page_detail(url)
        except requests.exceptions.RequestException as error:
            print(error, url)
            continue
        for book_url in book_list:
            try:
                book = get_book_detial(book_url)
            except requests.exceptions.RequestException as error:
                print(error, book_url)
                book = None
            if book is not None:
                bookshelf.append(book)
            # Pause between requests so the server is not hit too quickly.
            time.sleep(REQUEST_DELAY)
    return bookshelf


if __name__ == '__main__':
    print(spider())
# Running this script often fails with "IndexError: list index out of range" or
# "TimeoutError: [WinError 10060] A connection attempt failed because the
# connected party did not properly respond after a period of time, or
# established connection failed because connected host has failed to respond".
# Some experienced users say this happens because the pages are requested too
# quickly, so the response comes back incomplete and the list is never filled.
# The code still needs to be optimized ......