Based on Python 3: grab pictures from Baidu Tieba posts and comments, download them, and save them to disk

Crawl Baidu Tieba
Target:
    1. Get the post title, total number of pages, comments, and pictures
    2. Write the pictures to files and save them
    3. Print various information (for test tracking)
    4. Enter the post number to perform the above operations (also works for other posts); see the URL sketch below
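
For reference, the post number is simply the trailing string of digits in the thread's URL, and page N of a thread is selected with the pn query parameter. A minimal sketch of how the crawler below builds its page URLs (the post number is only an example):

# Sketch: how a post number and a page index map to a Tieba page URL
post_number = 4252370485                                    # the trailing digits of the thread URL
site_url = "http://tieba.baidu.com/p/" + str(post_number)
page_url = site_url + '?pn=' + str(2)                       # page 2 of the thread
print(page_url)                                             # http://tieba.baidu.com/p/4252370485?pn=2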

 

First edition:

# -*-coding:utf-8-*-
import random
import re
import os
import urllib
import requests
import urllib.request
import time
from PIL import Image
from io import BytesIO

from bs4 import BeautifulSoup

"""URL to initialize query"""
siteURL = "http://tieba.baidu.com/p/"


def replace(x):
    """
    Convenience helper: strip tags, line breaks, etc. with a series of regex substitutions
    :param: x
    :return: the cleaned, stripped string
    """
    # Convert a list to a string first, otherwise re.sub() raises "expected string or bytes-like object"
    x = ''.join(x)
    removeImg = re.compile('<img.*?>| {7}|&nbsp;')  # remove img tags, runs of 7 spaces, and &nbsp;
    removeAddr = re.compile('<a.*?>|</a>') # remove hyperlink tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # replace these tags with a newline \n
    replaceTD = re.compile('<td>')  # replace tabular <td> with \t
    replaceBR = re.compile('<br><br>|<br>|</br>|</br></br>')  # replace single or double <br> tags with \n
    removeExtraTag = re.compile('<.*?>')  # remove the remaining tags
    removeNoneLine = re.compile('\n+')  # collapse extra blank lines
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # strip() removes redundant whitespace at the start and end


def getSource(url):
    """
    Get webpage source code
    :param: url
    :return: result
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]

    length = len(user_agents) - 1
    print(length)
    index = random.randint(0, length)
    user_agent = user_agents[index]
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def saveImage(imageURL, path, title, name, pageName):
    """
    save image to file
    :param: imageURL, path, title, name, pageName
    :return:
    """
    try:
        # Naming problem: the file cannot be named after its URL because the URL contains '/' characters
        # Workaround: the caller passes in a safe name instead
        proDir = os.path.split(os.path.realpath(__file__))[0]
        fileName = name + '.jpg'
        filePath = os.path.join(proDir, "photo", title, path)
        urllib.request.urlretrieve(imageURL, os.path.join(filePath, fileName))
        # urllib.request.urlretrieve(imageURL, filePath)
        # urllib.request.urlretrieve(imageURL, filePath + '\\%s.jpg' % imageURL)
    except Exception as e:
        print(e)


def getTitle(url):
    """
    Get the title of the post and print it out
    :param: url
    :return: text
    """
    result = getSource(url)
    pattern = re.compile('<h1.*?title.*?>(.*?)</h1>', re.S)
    item = re.findall(pattern, result)
    text = replace(item)
    print(u'The title of this article is------' + text)
    return text


def getPageNumber(url):
    """
    Get the total number of pages for this post and print it out
    :param: url
    :return: page
    """
    result = getSource(url)
    soup = BeautifulSoup(result, 'lxml')
    # pattern = re.compile('<div class="pb_footer".*?<ul class="l_posts_num".*?<li class="l_reply_num".*?<span.*?red.*?></span>', re.S)
    # If there are multiple pattern substrings, only the first one is returned
    # iteams = re.findall(pattern, result)
    items = soup.find_all('span', attrs={'class': 'red'})
    item = items[-1].get_text()
    print(item)
    page = replace(item)
    print(u'The total number of pages of this article is ------' + page)
    return page


def getContent(url):
    """
    Get the comments
    :param: url
    :return:items
    """
    result = getSource(url)
    pattern = re.compile('<a data-field.*?p_author_name.*?>(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, result)
    number = 1
    for item in items:
        data = str(number) + u' floor\tAuthor: ' + item[0] + '\nContent: ' + item[1] + '\n'
        print(data)
        number += 1
    return items


def makeDir(path, title):
    """
    Create a directory
    :param: path,title
    :return: path
    """
    # Get the main path of proDir
    proDir = os.path.split(os.path.realpath(__file__))[0]
    # filePath configuration file path address
    filePath = os.path.join(proDir, "photo", title)
    # strip() is used to remove newlines and the like
    # create a path for each page
    path = path.strip()
    # os.path.exists() is used to check if the path exists false true
    E = os.path.exists(os.path.join(filePath, path))
    if not E:
        # Create the directory; os.makedirs() raises an error if it already exists,
        # so it is only called when the path does not exist yet
        # os.chdir() then switches the working directory into the new folder
        os.makedirs(os.path.join(filePath, path))
        os.chdir(os.path.join(filePath, path))
        print(u'Creating a folder named ', path, u'...')
        return path
    else:
        print(u'A folder named ', path, u' already exists...')
        return path


def getImage(url):
    """
    Get the pictures: extract their links and collect them in a list
    :param: url
    :return: images
    """
    result = getSource(url)
    soup = BeautifulSoup(result, 'lxml')
    # It is obviously more efficient to use BeautifulSoup here
    # find_all() returns a list, find() returns an element
    # Note that the class attribute coincides with the built-in python, so adding _ becomes class_
    # items = soup.find_all('img', class_="BDE_Image")
    items = soup.find_all('img', attrs={'class': 'BDE_Image'})
    images = []
    number = 0
    for item in items:
        print(u'found a picture, the link is ------', item['src'])
        images.append(item['src'])
        number += 1
    if number >= 1:
        print(u'\n', u'Found', number, u'pictures in total, nice!')
    else:
        print(u'No pictures here...')
    return images


def getAllPage(Num, siteURL=siteURL):
    """
    :param: Num,siteURL
    :return:
    """
    siteURL = siteURL + str(Num)
    # Get the post title
    title = getTitle(siteURL)
    # Get the number of post pages
    numbers = getPageNumber(siteURL)
    # browse all pages
    for page in range(1, int(numbers) + 1):
        # format the index link
        url = siteURL + '?pn=' + str(page)
        print(u'\n\n', u'About to get the content of page ', page, u'...')
        # get comments
        print(u'\n', u'About to get comments...')
        getContent(url)
        # save Picture
        # Create a folder for each page: page1, page2, page3 ...
        path = makeDir(path='page' + str(page), title=title)
        # get image
        print(u'\n', u'Getting pictures...')
        images = getImage(url)
        print(images)
        print(u'\n', u'About to save pictures...')
        number = 1
        # Save the picture, first find the link from the previous list
        for detailURL in images:
            pageName = str(page) + str(number)
            name = 'page' + str(page) + 'num' + str(number)
            saveImage(detailURL, path, title, name, pageName)
            time.sleep(0.1)
            number += 1

        print(u'\n\n', u'Finished page ', page)

    print(u'\n\n', u'Congratulations, a complete success!')


def main():
    """The main function Num fills in the post number to open the post to view the last string of numbers in the URL"""
    Num = 4252370485
    items = getAllPage (Num)


if __name__ == "__main__":
    main()
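
As a quick illustration of what the replace() helper above does to a raw comment fragment, here is a small made-up example (the snippet is hypothetical, not taken from a real post):

# Hypothetical input resembling the HTML captured by the getContent() regex
sample = '<a href="#">@someone</a> hello<br><br>see the picture<img src="x.jpg"> <td>cell</td>'
print(replace(sample))
# Expected output (roughly): the <a>, <img> and leftover tags are stripped,
# <br><br> becomes a newline, and <td> becomes a tab, giving two lines:
# "@someone hello" and "see the picture", a tab, then "cell"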

 

Second edition: optimized to remove some of the duplication

'''
Crawl Baidu Tieba (optimized)
Target:
    1. Get the post title, total number of pages, comments, and pictures
    2. Write the pictures to files and save them
    3. Print various information (for test tracking)
    4. Enter the post number to perform the above operations (also works for other posts)
'''
# -*-coding:utf-8-*-
import random
import re
import os
import urllib
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

"""URL to initialize query"""
siteURL = "http://tieba.baidu.com/p/"


def replace(x):
    """
    Convenience helper: strip tags, line breaks, etc. with a series of regex substitutions
    :param: x
    :return: the cleaned, stripped string
    """
    # Convert a list to a string first, otherwise re.sub() raises "expected string or bytes-like object"
    x = ''.join(x)
    removeImg = re.compile('<img.*?>| {7}|&nbsp;')  # remove img tags, runs of 7 spaces, and &nbsp;
    removeAddr = re.compile('<a.*?>|</a>') # remove hyperlink tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # replace these tags with a newline \n
    replaceTD = re.compile('<td>')  # replace tabular <td> with \t
    replaceBR = re.compile('<br><br>|<br>|</br>|</br></br>')  # replace single or double <br> tags with \n
    removeExtraTag = re.compile('<.*?>')  # remove the remaining tags
    removeNoneLine = re.compile('\n+')  # collapse extra blank lines
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # strip() removes redundant whitespace at the start and end


def getSource(url):
    """
    Get webpage source code
    :param: url
    :return: result
    """
    # A pool of 18 browser User-Agent strings; picking one at random reduces the chance of being blocked
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]

    length = len(user_agents)
    # Randomly select one of the 18: randint(0, length-1) gives 0 <= n <= 17
    index = random.randint(0, length-1)
    user_agent = user_agents[index]
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def get_Title_Numbers(url):
    """
    Get the title and total page count of the post, and print the output
    :param: url
    :return:title,page
    """
    result = getSource(url)
    # A regex was tried first (kept below for reference), but the page count could not be matched reliably with it, so BeautifulSoup is used for both the title and the page count
    # pattern = re.compile('<h1.*?title.*?>(.*?)</h1>', re.S)
    # iteam = re.findall(pattern, result)
    # text = replace(iteam)
    # print(u'The title of this article is------' + text)
    soup = BeautifulSoup(result, 'lxml')
    titles = soup.find_all('h1', attrs={'class': 'core_title_txt'})
    title = titles[0].get_text()
    print(u'The title of this article is------' + title)
    pages = soup.find_all('span', attrs={'class': 'red'})
    page = pages[-1].get_text()
    print(u'The total number of pages of this article is ------' + page)
    return title, page


def makeDir(title):
    """
    Create a directory
    :param:title
    :return:filePath
    Defect (handled below): creating a folder that already exists would fail, so the existing-folder case is checked first
    """
    # Get the main path of proDir
    proDir = os.path.split(os.path.realpath(__file__))[0]
    filePath = os.path.join(proDir, "photo", title)
    # os.path.exists() is used to check if the path exists false true
    E = os.path.exists(filePath)
    if not E:
        # Create the directory; os.makedirs() raises an error if it already exists,
        # so it is only called when the path does not exist yet
        # os.chdir() then switches the working directory into the new folder
        os.makedirs(os.path.join(filePath))
        os.chdir(os.path.join(filePath))
        print(u'Creating a folder named ', title, u'...')
        return filePath + os.sep
    else:
        print(u'A folder named ', title, u' already exists...')
        return filePath + os.sep


def get_Content_Images(url, page, path):
    """
    Get the pictures and comments for one page: print the comments and save the pictures
    :param: url, page, path
    :return:
    """
    result = getSource(url)
    pattern = re.compile('<a data-field.*?p_author_name.*?>(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, result)
    print(u'\n', u'About to get comments...')
    number = 1
    for item in items:
        data = str(number) + u' floor\tAuthor: ' + item[0] + '\nContent: ' + item[1] + '\n'
        print(data)
        number += 1
    soup = BeautifulSoup(result, 'lxml')
    items = soup.find_all('img', attrs={'class': 'BDE_Image'})
    images = []
    number = 0
    print(u'\n', u'Getting pictures...')
    for item in items:
        print(u'found a picture, the link is ------', item['src'])
        images.append(item['src'])
        number += 1
    if number >= 1:
        print(u'\n', u'Found', number, u'pictures in total!')
    else:
        print(u'emm, no picture...')
    print(u'\n', u'About to save pictures...')
    number = 1
    # Save the picture, first find the link from the previous list
    for detailURL in images:
        name = 'page' + str(page) + 'num' + str(number)
        fileName = name + '.jpg'
        urllib.request.urlretrieve(detailURL, path+fileName)
        time.sleep(0.1)
        number += 1
    print(u'\n\n', u'Finished saving the pictures on page ', page, u'!')


def getAllPage(Num, siteURL=siteURL):
    """
    get all pages
    :param: Num,siteURL
    :return:
    """
    siteURL = siteURL + str(Num)
    # Get the title of the post and the number of pages can be combined into one
    title, numbers = get_Title_Numbers(siteURL)
    # create folder
    path = makeDir(title)
    # Browse all pages from the first page to the last page
    for page in range(1, int(numbers) + 1):
        # Format the page URL; the pattern comes from inspecting the thread URL while paging through it
        url = siteURL + '?pn=' + str(page)
        print(u'\n\n', u'About to get the content of page ', page, u'...')
        # Each page needs its own getSource(url) call, so fetching the comments and pictures is combined into one function
        get_Content_Images(url, page, path)
    print(u'\n\n', u'Congratulations, a complete success!')


def shuru():
    """
    input function
    :param:
    :return: x
    """
    print("Please enter the post number of Baidu Tieba: ")
    x = input("Please enter...")
    return x


def main():
    """The main function Num fills in the post number to open the post to view the last string of numbers in the URL"""
    # 4252370485
    Num = 4252370485
    items = getAllPage (Num)


if __name__ == "__main__":
    main()
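
The second edition defines shuru() but main() above still hardcodes the post number. One possible way to wire the two together (a sketch, not part of the original code):

def main():
    """Read the post number from the user, then crawl that post"""
    Num = shuru()      # e.g. 4252370485; getAllPage() calls str(Num) itself, so the raw input() string is fine
    getAllPage(Num)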

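One more robustness note: urllib.request.urlretrieve() sends urllib's default User-Agent rather than the randomized one used by getSource(), so image downloads can still be blocked by some servers. A small sketch of an alternative download helper (hypothetical, not part of the original code) that reuses requests with an explicit header:

import requests

def save_image_with_headers(image_url, file_path, user_agent):
    # Hypothetical helper: download one picture with the same User-Agent used for the page requests
    resp = requests.get(image_url, headers={'User-Agent': user_agent}, timeout=10)
    resp.raise_for_status()  # fail loudly instead of silently saving an error page
    with open(file_path, 'wb') as f:
        f.write(resp.content)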