Python: practice scraping a web novel with a crawler (beginner)

from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq

 4 def get_content(a):
 5     response=requests.get(a)
 6     #print(str(response))    
 7     response.encoding = 'gbk'
 8     #print(response.text)
 9     doc = pq(response.text)
10     text=doc('#content.showtxt')
11     a=str(text)
12     b=a.replace("# 13 is &; 13 is a & #; a " , " \ n- " ) .replace ( ' a a ' , ' \ n- ' ) .replace ( ' <Script> chaptererror (); </ script> <br/> Remember the book launching domain: www.biqugexsw.com pen Fun Club mobile version of reading the novel network URL: m.biqugexsw.com </ div>. ' , ' ' ). Replace ( ' \ XA0 ' , '' ) .replace ( ' <div ID = "Content" class = "showtxt"> ' , '' )
 13 is      File Open = (U ' F.:\ Python \ novel \ 1.txt ' , ' A + ' )
 14     File.close ()
 15  DEF get_mulu ():
 16      index_url = ' https://www.biqugexsw.com/75_75362/ ' # Books Alternatively other pages 
. 17      Response = requests.get (index_url)
 18 is      response.encoding = Response. apparent_encoding
 . 19      DOC = PQ (response.text)
 20 is      URLs = DOC ( ' div.listmain A ' )
 21 is      for I in urls.items ():
 22 is          A = ' https://www.biqugexsw.com/ ' + I. attr.href # obtain the URL of each chapter
 23         get_content (A)
 24-          Print ( " succeed " )
 25          # Print (A) 
# Start the crawl only when the file is run as a script, so importing the
# module does not trigger network requests.
if __name__ == "__main__":
    get_mulu()

I have recently been learning to write web crawlers, and as practice I scraped a novel from the Biquge site (biqugexsw.com).

To be improved:
  Simulating browser access

  Asynchronous crawling

  Getting the book name

  Using regular expressions

Guess you like

Origin www.cnblogs.com/liubingzhe/p/11262691.html