import requests
from pyquery import PyQuery as pq

# Boilerplate the site injects into every chapter page, stripped before saving
# (anti-scraping script tag, wrapper markup, non-breaking spaces).
# NOTE(review): the original junk strings were garbled in the source file —
# confirm these against a live chapter page before relying on them.
_JUNK_SNIPPETS = (
    '<script>chaptererror();</script>',
    '<div id="content" class="showtxt">',
    '</div>',
    '\xa0',
)

# Chapters are appended to this file, one after another.
OUTPUT_PATH = r'F:\Python\novel\1.txt'


def get_content(chapter_url):
    """Download one chapter page, strip site boilerplate, and append the
    cleaned text to OUTPUT_PATH.

    chapter_url: absolute URL of a single chapter page.
    """
    response = requests.get(chapter_url)
    response.encoding = 'gbk'  # the site serves GBK-encoded pages
    doc = pq(response.text)
    # '#content.showtxt' selects the div holding the chapter body.
    text = str(doc('#content.showtxt'))
    for junk in _JUNK_SNIPPETS:
        text = text.replace(junk, '')
    # <br/> pairs separate paragraphs in the raw markup.
    text = text.replace('<br/><br/>', '\n')
    # Bug fix vs. original: the file was opened and closed without ever
    # writing the cleaned text. Use a context manager and actually write it.
    with open(OUTPUT_PATH, 'a+', encoding='utf-8') as f:
        f.write(text)


def get_mulu():
    """Fetch the book's table of contents and download every chapter."""
    index_url = 'https://www.biqugexsw.com/75_75362/'  # book index; swap for other books
    response = requests.get(index_url)
    # Let requests guess the charset of the index page.
    response.encoding = response.apparent_encoding
    doc = pq(response.text)
    for link in doc('div.listmain a').items():
        # Each anchor's href is relative; build the absolute chapter URL.
        chapter_url = 'https://www.biqugexsw.com/' + link.attr.href
        get_content(chapter_url)
        print("succeed")


if __name__ == '__main__':
    get_mulu()
Recently started learning web crawlers; this is practice scraping a novel from the Biquge site.
To be improved:
- Simulate browser access (send request headers such as User-Agent)
- Asynchronous crawling
- Fetch the book title automatically
- Use regular expressions for content cleanup