Crawling Qiushibaike (the "embarrassing things encyclopedia") with Python 3

I have had some free time recently, so here is a short article about web crawlers.

Crawl Qiushibaike
Target:
    1. Grab the popular posts from Qiushibaike and get each post's **publisher**, **comment count**, **like count**, and similar fields
    2. Clean the extracted information and print it record by record
    3. Design the program so that the range of pages to crawl can be chosen
    4. Save the information from each page to a text file

Without further ado, the code is as follows (each step is explained in detail):

# -*-coding:utf-8-*-
import re
import requests
import time

"""URL to initialize query"""
siteURL = "https://www.qiushibaike.com/"


def replace(x):
    """
    Use re.sub to delete <br> line-break tags, then strip surrounding whitespace
    :param x: raw text extracted from the page
    :return: cleaned text
    """
    x = re.sub(re.compile('<br>|</br>|/>|<br'), "", x)
    return x.strip()
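
# Illustrative example: replace('  hello<br>world  ') returns 'helloworld'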


def getSource(url):
    """
    Get webpage source code
    :param: url
    :return: result
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    headers = {'User-Agent': user_agent}  # the header name must be 'User-Agent', not 'User_agent'
    r = requests.get(url, headers=headers)
    result = r.text
    return result


def getDetailpage(detailURL):
    """
    Get the post information from a detail page
    :param detailURL: URL of the page to parse
    :return: items -- one tuple per matched post
    """
    source = getSource(detailURL)
    pattern = re.compile(
        '<div class="author.*?<h2>(.*?)</h2>.*?Icon">(.*?)</div>.*?<div class="content">.*?<span>(.*?)</span>.*?<span.*?stats-vote.*?number">(.*?)</i>.*?stats-comments.*?number">(.*?)</i>.*?up.*?number hidden">(.*?)</span>.*?down.*?number hidden">(.*?)</span>',
        re.S)
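    # Each match is a 7-tuple (as used in main() below):
    # (author, age, content, vote count, comment count, up votes, down votes)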
    items = re.findall(pattern, source)
    return items


def saveDetailpage(data):
    """
    Save information to a file
    :param data: formatted record to append
    :return: None
    """
    with open("qiushibaike.txt", "a+", encoding='utf-8') as f:
        f.write(data)
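        # "a+" opens in append mode, so repeated runs keep adding records to qiushibaike.txt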


def OnePage(detailURL):
    """
    Operations on one page
    :param detailURL: URL of the page to crawl
    :return: data
    """
    data = getDetailpage(detailURL)
    return data


def getAllPage(start, end):
    """
    Operations on a range of pages (page 1 is the site root, later pages use /8hr/page/<n>/)
    :param start: first page to crawl
    :param end: last page to crawl
    :return: items -- the merged results of all crawled pages
    """
    items = []
    for page in range(start, end + 1):
        print(u'Getting the data of page', page, u'...')
        if page == 1:
            detailURL = siteURL
        else:
            # e.g. https://www.qiushibaike.com/8hr/page/10/
            detailURL = siteURL + '8hr/page/' + str(page) + '/?s=4964625'
        data = OnePage(detailURL)
        # merge the data from each page into one list
        items += data
        time.sleep(2)
    print(u'\nLoading finished!')
    return items
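
# Usage sketch: getAllPage(1, 3) collects the posts from pages 1 to 3,
# sleeping two seconds between requests to avoid hammering the server.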


def main():
    """The start and end arguments can be changed; the site offers up to 13 pages"""
    items = getAllPage(start=1, end=13)
    number = 1
    for item in items:
        data = (str(number) + u' floor' + u'\tPoster: ' + replace(item[0]) + u'\t' +
                item[1] + u' years old' + u'\nContent: ' + replace(item[2]) +
                u'\nFunny: ' + item[3] + u'\tComments: ' + item[4] +
                u'\tLike: ' + item[5] + u'\tDislike: ' + item[6] + '\n')
        print(data)
        saveDetailpage(data)
        number += 1


if __name__ == "__main__":
    main()
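
A practical note: requests.get() has no timeout by default, so a slow or unreachable
server can hang the crawl, and an HTTP error page would be parsed as if it were a normal
post list. A slightly hardened drop-in variant of getSource might look like the sketch
below; the function name getSourceSafe, the 10-second timeout, and the choice to return
an empty string on failure are my own assumptions, not part of the original script.

def getSourceSafe(url):
    """Sketch of getSource with a timeout and basic error handling (assumed variant)."""
    user_agent = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=10)  # assumed 10-second timeout
        r.raise_for_status()                                 # fail loudly on 4xx/5xx responses
        return r.text
    except requests.RequestException as e:
        print(u'Request failed:', e)
        return ''                                            # callers then simply find no matches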
