Python crawler: crawling Baidu Tieba (Post Bar) posts

I have recently been learning Python, so I put together a simple crawler project to consolidate what I have learned.

The idea of my implementation:

1. Get every listing page of the forum that contains the topics;

2. Extract the post URLs from each listing page and save them to a queue; each URL corresponds to one post;

3. Take URLs from the queue in order and fetch the content of each post.
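
Expressed with the functions defined later in this post, the flow looks roughly like this (just a sketch; the real main() below iterates a plain list instead of a queue):

from collections import deque

base_url = 'https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0'
# steps 1 and 2: collect the post URLs from every listing page into a queue
url_queue = deque(getSingleUrl(getMainPageUrl(base_url)))
# step 3: take post URLs off the queue in order and fetch each post's content
while url_queue:
    getTitlePageContent(url_queue.popleft())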

Implementation:

First, fetch the full content of a page.

I started with urllib but later switched to requests, which I personally find much more convenient.

def getPageAllContent(base_url):

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    try:
        request = requests.get(base_url, headers=headers)
        # request = urllib.request.Request(base_url, headers=headers)
        # response = urllib.request.urlopen(request).read()  # fetch the page content
        # print(response)
        return request
    except Exception as e:
        if hasattr(e, "reason"):
            print(e.reason)
        else:
            print(e)
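
A quick usage check of this function: Tieba pages are served as UTF-8 and requests usually detects this on its own, but forcing the encoding is a harmless safeguard (this call is only an illustration, not part of the original script):

response = getPageAllContent('https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0')
if response is not None:
    response.encoding = 'utf-8'   # force UTF-8 in case auto-detection guesses wrong
    print(response.status_code, len(response.text))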

Each listing page of the forum contains 50 posts, so the URL of every page can be generated in a loop.

    for i in range(0,pageNum):
        list_url.append('https://tieba.baidu.com/f?kw=' + tiebaName[0] + '&ie=utf-8&pn=' + str(i*50))

The number of listing pages is the forum's total topic count divided by 50; a regular expression extracts the topic count from the page.

def getTitlePageNumber(base_url):

    content = getPageAllContent(base_url)
    titleNum = re.findall(r'共有主题数<span class="red_text">([0-9]+)</span>个',content.text)
    # print(titleNum[0])
    pageNum = math.floor(int(titleNum[0])/50)
    # print(pageNum)
    return int(pageNum)
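
Note that math.floor drops the last, partially filled listing page; if those remaining topics should be crawled as well, rounding up would include them (a small variation, not what the original code does):

pageNum = math.ceil(int(titleNum[0]) / 50)   # round up so the final partial page is also crawled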

XPath is used to extract each post's URL; if the listing-page URL contains Chinese characters, it has to be percent-encoded first.

def getSingleUrl(list_url):
    final_url = []
    for url in list_url:
        print(url)
        urlcode = quote(url, safe='/:?=&')
        content = getPageAllContent(urlcode)
        # print(content)
        txt = content.text
        html_obj = html.fromstring(txt)

        urls = html_obj.xpath('//*[@id="thread_list"]/li/div/div[2]/div/div/a[@rel]/@href')
        for _url in urls:
            #print(_url)
            final_url.append("https://tieba.baidu.com" + _url)
        # print(final_url)
    return final_url
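
The quote call only escapes the Chinese keyword, while the URL structure (/, :, ?, =, &) stays untouched. For example:

from urllib.parse import quote

url = 'https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0'
print(quote(url, safe='/:?=&'))
# https://tieba.baidu.com/f?kw=%E5%8C%97%E5%AE%8B&ie=utf-8&pn=0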

Fetch the content of every floor (reply) of a post from its URL.

def getTitlePageContent(single_url):
    page = getContentPageNumber(single_url)
    # print(page)
    for i in range(1,int(page)+1):
        page_url = "%s?pn=%s" % (single_url,i)
        content = getPageAllContent(page_url)
        txt = content.text
        content_obj = html.fromstring(txt)
        if i == 1:
            tieTitle = content_obj.xpath("//*[@id='j_core_title_wrap']/h3/@title")
            saveTitle(i,str(tieTitle))
        tr_list = content_obj.xpath("//*[@id='j_p_postlist']//div[@data-field]/div")
        # print(type(tr_list))

        for j,tr in enumerate(tr_list):
            # print(type(tr))
            if j<len(tr_list)/2:
                try:
                    tieContent = tr.xpath("//cc/div")[j].xpath('string(.)').strip()
                    tieAuthor = tr.xpath("//ul/li[3]/a/text()")[j+1].strip()
                    tieFloor = tr.xpath("//span[@class='tail-info'][last()-1]/text()")[j].strip()
                    tieTime = tr.xpath("//span[@class='tail-info'][last()]/text()")[j].strip()
                    # print(tieTime)
                    # write to file
                    saveContent(tieAuthor,tieTime,tieFloor,tieContent)
                    # print(tieContent)
                except Exception as e:
                    print(single_url, "----", j, "----", e)
                    with open("error.html", "w") as f:
                        f.write(txt.encode("utf-8"))
                    return
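
To try this function on its own, it can be pointed at a single post URL (the same post URL that appears commented out near the end of the full code):

single_post = 'https://tieba.baidu.com/p/3864746283'
getTitlePageContent(single_post)   # writes the post's floors into TiebaData.txt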

Here is the full code:

#-*- coding:utf-8 -*-
# import urllib.request
import requests
import math
import re
from lxml import html
from urllib.parse import quote


# Get the URLs of the listing pages to crawl
def getMainPageUrl(base_url):
    list_url = []
    pageNum = getTitlePageNumber(base_url)
    tiebaName = re.findall(r'f?kw=(.+)&ie',base_url)
    for i in range(0,pageNum):
        list_url.append('https://tieba.baidu.com/f?kw=' + tiebaName[0] + '&ie=utf-8&pn=' + str(i*50))
    # print(list_url[0])
    return list_url

# Get the URL of each individual post
def getSingleUrl(list_url):
    final_url = []
    for url in list_url:
        print(url)
        urlcode = quote(url, safe='/:?=&')
        content = getPageAllContent(urlcode)
        # print(content)
        txt = content.text
        html_obj = html.fromstring(txt)

        urls = html_obj.xpath('//*[@id="thread_list"]/li/div/div[2]/div/div/a[@rel]/@href')
        for _url in urls:
            #print(_url)
            final_url.append("https://tieba.baidu.com" + _url)
        # print(final_url)
    return final_url


# Fetch the full content of a page
def getPageAllContent(base_url):

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    try:
        request = requests.get(base_url, headers=headers)
        # request = urllib.request.Request(base_url, headers=headers)
        # response = urllib.request.urlopen(request).read()  # fetch the page content
        # print(response)
        return request
    except Exception as e:
        if hasattr(e, "reason"):
            print(e.reason)
        else:
            print(e)

# Fetch the content of every page inside a post
def getTitlePageContent(single_url):
    page = getContentPageNumber(single_url)
    # print(page)
    for i in range(1,int(page)+1):
        page_url = "%s?pn=%s" % (single_url,i)
        content = getPageAllContent(page_url)
        txt = content.text
        content_obj = html.fromstring(txt)
        if i == 1:
            tieTitle = content_obj.xpath("//*[@id='j_core_title_wrap']/h3/@title")
            saveTitle(i,str(tieTitle))
        tr_list = content_obj.xpath("//*[@id='j_p_postlist']//div[@data-field]/div")
        # print(type(tr_list))

        for j,tr in enumerate(tr_list):
            # print(type(tr))
            if j<len(tr_list)/2:
                try:
                    tieContent = tr.xpath("//cc/div")[j].xpath('string(.)').strip()
                    tieAuthor = tr.xpath("//ul/li[3]/a/text()")[j+1].strip()
                    tieFloor = tr.xpath("//span[@class='tail-info'][last()-1]/text()")[j].strip()
                    tieTime = tr.xpath("//span[@class='tail-info'][last()]/text()")[j].strip()
                    # print(tieTime)
                    # write to file
                    saveContent(tieAuthor,tieTime,tieFloor,tieContent)
                    # print(tieContent)
                except Exception as e:
                    print(single_url, "----", j, "----", e)
                    with open("error.html", "w") as f:
                        f.write(txt.encode("utf-8"))
                    return

# Save the post title to the file
def saveTitle(i,tieTitle):
    tiebaData = './TiebaData.txt'
    f = open(tiebaData,'a+',encoding='utf-8')
    f.write('----------------- ' + str(i) + ' ----------------\n')
    f.write(tieTitle + '\n')
    f.write('------------------------------------\n')
    f.close()

# Save the post content to the file
def saveContent(tieAuthor,tieTime,tieFloor,tieContent):
    tiebaData = './TiebaData.txt'
    f = open(tiebaData,'a+',encoding='utf-8')
    f.write(tieFloor + '  ' + tieAuthor + '  ' + tieTime + '\n')
    f.write(tieContent)
    f.write('\n')
    f.close()

# Get the total number of listing pages
def getTitlePageNumber(base_url):

    content = getPageAllContent(base_url)
    # print(content.text)
    titleNum = re.findall(r'共有主题数<span class="red_text">([0-9]+)</span>个',content.text)
    # print(titleNum[0])
    pageNum = math.floor(int(titleNum[0])/50)
    # print(pageNum)
    return int(pageNum)

# Get the number of content pages inside a post
def getContentPageNumber(url):
    content = getPageAllContent(url)
    pageNum = re.findall(r'回复贴,共<span class="red">([0-9]+)</span>页',content.text)
    # print(pageNum[0])
    return pageNum[0]


def main(base_url):

    # getPageAllContent(base_url)
    # getTitlePageNumber(base_url)
    # getContentPageNumber(base_url)
    # getMainPageUrl(base_url)
    # list_url = getMainPageUrl(base_url)
    # getSingleUrl(list_url)
    # getTitlePageContent(base_url)

    # Get every listing page URL of the target forum
    list_url = getMainPageUrl(base_url)
    # Extract the individual post URLs from each listing page
    single_url = getSingleUrl(list_url)
    # Visit each post URL in turn and fetch its content
    for j,surl in enumerate(single_url):
        getTitlePageContent(surl)
        print('Writing post ' + str(j+1) + '...')

base_url = 'https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0'
# base_url = 'https://tieba.baidu.com/p/3864746283'
if __name__ == '__main__':
    main(base_url)
In actual operation the program still has a few small bugs, and the crawling efficiency could be further optimized.
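
One possible direction for the efficiency problem is to fetch several posts at the same time instead of one by one; a rough sketch with a thread pool (my own variation, not part of the original code):

from concurrent.futures import ThreadPoolExecutor

def crawlConcurrently(single_url, workers=5):
    # keep the pool small to stay polite to the server
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map submits every post URL to the pool; the with-block waits for all of them to finish
        pool.map(getTitlePageContent, single_url)

saveTitle and saveContent would then need a lock (or one file per post), otherwise several threads would interleave their writes in TiebaData.txt.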


Origin blog.csdn.net/caorya/article/details/80318262