Python web crawler — regular expressions explained

 

- re.I  # ignore case
- re.M  # multi-line mode: ^ and $ match at the start and end of each line of the source data
- re.S  # single-line (dotall) mode: . also matches newlines, so a pattern can match the entire source data as one string (line feeds are printed too)

# re.M demo: with the multi-line flag, ^ anchors at the start of every
# line, so we can pull out each line that begins with "i".
string = '''fall in love with you
i love you very much
i love she
i love her'''
# Extract the lines whose first character is "i".
lines = re.findall(r'^i.*', string, re.M)
# ['i love you very much', 'i love she', 'i love her']

# re.S demo: with the dotall flag, "." also matches newlines, so ".*"
# greedily matches the whole text as a single string, followed by one
# empty match at the end of input.
string1 = """think carefully and be very afraid:
your teammates are studying
your enemies are sharpening their knives
your girlfriend is losing weight
the old wang next door is practicing his waist
"""
matches = re.findall(r'.*', string1, re.S)
# [<the entire text, embedded '\n' included>, '']

Crawl Qiushibaike (qiushibaike.com) and save all pictures
import requests
import re
import urllib.request
import os

# Page-template URL for the Qiushibaike picture section;
# %d is replaced with the page number.
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5170552'
# Custom request headers so the site sees a normal browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

# Create the ./qiutu output folder if it does not already exist.
if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

# Read the start and end page numbers from the user.
start_page = int(input('Start >>> '))
end_page = int(input('End >>> '))

for page in range(start_page, end_page + 1):
    # Splice the page number into the URL template.
    new_url = format(url % page)
    # Fetch the page HTML.
    page_text = requests.get(url=new_url, headers=headers).text
    # Regex out every picture URL on the page (re.S lets .*? span newlines).
    img_url_list = re.findall(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>',
                              page_text, re.S)
    for img_url in img_url_list:
        # The scraped src is protocol-relative; prepend the https scheme.
        img_url = 'https:' + img_url
        # Use the last path segment of the URL as the file name.
        # (The original had a trailing comma here, which made img_path a
        # tuple and would have crashed urlretrieve.)
        img_name = img_url.split('/')[-1]
        img_path = 'qiutu/' + img_name
        # Persist the picture to disk.
        urllib.request.urlretrieve(url=img_url, filename=img_path)
        print(img_path, 'download successful')

print('over')

 




You may also like

Origin www.cnblogs.com/bilx/p/11545988.html