- re.I  # ignore case
- re.M  # multi-line matching: the pattern is applied to each line of the source data ('^' and '$' match at every line)
- re.S  # single-line matching: the pattern is applied to the entire source data as one whole string ('.' also matches the newline character, so line feeds are included in the output)
string = '''fall in love with you
i love you very much
i love she
i love her'''
# extract every line that begins with "i"
re.findall('^i.*', string, re.M)
['i love you very much', 'i love she', 'i love her']
# match across all of the lines at once
string1 = """fine thinking very fear
your teammates are reading
your enemies are sharpening
your girlfriends are on a diet
the next Pharaoh is in practice waist
"""
re.findall('.*', string1, re.S)
['fine thinking very fear\nyour teammates are reading\nyour enemies are sharpening\nyour girlfriends are on a diet\nthe next Pharaoh is in practice waist\n', '']
Crawl all the pictures from Qiushibaike (the "Embarrassing Things Encyclopedia") and save them
# Download every thumbnail image from a user-chosen range of Qiushibaike
# picture pages and store the files in the local ./qiutu directory.
import os
import re
import urllib.request

import requests

# Page URL template; %d is replaced with the page number.
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5170552'

# Custom request header information (browser User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/72.0.3626.119 Safari/537.36'
}

# Create the qiutu folder if it does not exist yet.
os.makedirs('./qiutu', exist_ok=True)

# Manually input the starting page and the ending page (inclusive).
start_page = int(input('Start >>> '))
end_page = int(input('End >>> '))

for page in range(start_page, end_page + 1):
    # Build the URL of the requested page from the template.
    new_url = url % page

    # Initiate the request and take the response body as text.
    page_text = requests.get(url=new_url, headers=headers).text

    # Use a regular expression to find every picture URL in the page.
    # re.S lets '.' match newlines, since each <div> spans several lines.
    img_url_list = re.findall(
        '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>',
        page_text,
        re.S,
    )

    for img_url in img_url_list:
        # The captured src has no scheme, so prepend the https protocol.
        img_url = 'https:' + img_url

        # The last path segment of the URL is used as the stored file name.
        img_name = img_url.split('/')[-1]
        # Must be a plain string: a trailing comma here would create a tuple.
        img_path = 'qiutu/' + img_name

        # Persist the picture to disk.
        urllib.request.urlretrieve(url=img_url, filename=img_path)
        print(img_path, 'download successful')

print('over')