I had some free time recently, so I wrote an article about web crawlers.
Crawling Qiushibaike (the "Embarrassing Things Encyclopedia"). Goals: 1. Scrape the site's popular posts and extract each post's **publisher**, **comments (and their count)**, **likes (and their count)**, etc. 2. Clean the information and print it in a loop. 3. Design the program so that the range of pages to crawl can be selected. 4. Save each page's information to a text file.
Without further ado, here is the code (each step is explained in detail):
# -*- coding: utf-8 -*-
"""Scrape popular posts from qiushibaike.com (author, content, vote and
comment counts), print each post, and append it to a text file."""
import re
import time

"""URL to initialize query"""
siteURL = "https://www.qiushibaike.com/"

# Compiled once at module level so replace() does not recompile it per call.
_TAG_PATTERN = re.compile('<br>|</br>|/>|<br')


def replace(x):
    """Remove simple <br>-style tag fragments and surrounding whitespace.

    :param x: raw text extracted from the page HTML
    :return: cleaned text
    """
    x = _TAG_PATTERN.sub("", x)
    return x.strip()


def getSource(url):
    """Fetch the HTML source of *url*, spoofing a desktop browser.

    :param url: page URL to download
    :return: response body as text
    """
    # Imported lazily so the pure text helpers (e.g. replace) stay usable
    # even when the third-party `requests` package is not installed.
    import requests
    user_agent = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    # BUG FIX: the correct header name is 'User-Agent'; the original
    # 'User_agent' key was ignored, so the spoofed UA was never sent.
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def getDetaipage(detailURL):
    """Extract all posts from one listing page.

    :param detailURL: URL of the listing page
    :return: list of 7-tuples
        (author, age, content, vote count, comment count, up, down)
    """
    source = getSource(detailURL)
    pattern = re.compile(
        '<div class="author.*?<h2>(.*?)</h2>.*?Icon">(.*?)</div>.*?<div class="content">.*?<span>(.*?)</span>.*?<span.*?stats-vote.*?number">(.*?)</i>.*?stats-comments.*?number">(.*?)</i>.*?up.*?number hidden">(.*?)</span>.*?down.*?number hidden">(.*?)</span>',
        re.S)
    return re.findall(pattern, source)


def saveDetailpage(data):
    """Append one formatted post to qiushibaike.txt (UTF-8).

    :param data: formatted text for one post
    """
    with open("qiushibaike.txt", "a+", encoding='utf-8') as f:
        f.write(data)


def OnePage(detailURL):
    """Scrape a single listing page.

    :param detailURL: URL of the page
    :return: list of post tuples (see getDetaipage)
    """
    return getDetaipage(detailURL)


def getAllPage(start, end):
    """Scrape pages *start* through *end* inclusive and collect all posts.

    The two near-identical branches of the original (start == 1 vs
    start > 1) are unified here: page 1 lives at the site root, every
    other page at /8hr/page/<n>/.  The original also fell through without
    a return when start == end == 1 (yielding None); this version always
    returns the collected items.

    :param start: first page number (1-based)
    :param end: last page number, inclusive
    :return: list of post tuples gathered from every page
    """
    items = []
    for page in range(start, end + 1):
        print(u'Getting page', page, u'data...')
        if page == 1:
            detailURL = siteURL
        else:
            # e.g. https://www.qiushibaike.com/8hr/page/10/
            detailURL = siteURL + '8hr/page/' + str(page) + '/?s=4964625'
        items += OnePage(detailURL)
        time.sleep(2)  # be polite: throttle requests between pages
    print(u'\nLoading finished!')
    return items


def main():
    """Scrape pages 1-13 (the site's maximum), print and save each post."""
    items = getAllPage(start=1, end=13)
    for number, item in enumerate(items, start=1):
        data = (str(number) + u'lou' + u'\tLandlord:' + replace(item[0]) +
                u'\t' + item[1] + u'year' + u'\nSpeak: ' + replace(item[2]) +
                u'\nfunny:' + item[3] + u'\tcomment:' + item[4] +
                u'\tlike:' + item[5] + u'\tdislike : ' + item[6] + '\n')
        print(data)
        saveDetailpage(data)  # data is already a str; no str() wrapper needed