Scraping the Weibo Hot Search List with a Python Crawler

Environment

Python 3.7.4
bs4==0.0.1
xlwt==1.3.0
urllib, re (standard library; the code uses the built-in urllib.request, not the third-party urllib3 package)
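
The third-party packages can be installed with pip, e.g. pip install bs4==0.0.1 xlwt==1.3.0 (the bs4 package on PyPI is a thin wrapper that pulls in beautifulsoup4).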

Initialization

def __init__(self):
    # Regular expression patterns.
    # Regular hot-search entries and promoted (ad) entries use
    # different HTML, so each gets its own pattern.
    self.hotSearchPattern = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>')
    self.advertisementPattern = re.compile(r'<a action-data.*?href_to="(.*?)" suda-data=.*?word=".*?">(.*?)</a>')

    # URL of the rank list and the prefix for the extracted links
    self.url = 'https://s.weibo.com/top/summary'
    self.prefixUrl = 'https://s.weibo.com'

    self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }

Crawling the page

def crawlPage(self, url):
    # Fetch the page and decode it as UTF-8
    # Build the urllib request with custom headers
    request = urllib.request.Request(headers=self.headers, url=url)

    page = None

    try:
        response = urllib.request.urlopen(request)
        page = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        print("Get page fail!")
        print(e)

    return page
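
The same request can also be issued standalone; the timeout below is an addition for illustration, not part of the original code:

import urllib.request
import urllib.error

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
request = urllib.request.Request(url='https://s.weibo.com/top/summary', headers=headers)

try:
    # A timeout keeps the crawler from hanging on a stalled connection
    with urllib.request.urlopen(request, timeout=10) as response:
        page = response.read().decode('utf-8')
        print(len(page))
except urllib.error.URLError as e:
    print("Failed to get the page!", e)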

Extracting the data

def extractData(self):
    # Extract the data and return the hot-search list
    page = self.crawlPage(self.url)

    # Parse the page with Python's built-in HTML parser
    beautifulSoup = BeautifulSoup(page, 'html.parser')

    hotSearchList = []

    for tag in beautifulSoup.find_all('td', class_='td-02'):
        tag = str(tag)
        # Extract the link and the search word with the regex
        item = re.findall(self.hotSearchPattern, tag)

        if len(item) == 0:
            # Hot-search entries and ad entries use different HTML,
            # so fall back to the ad pattern when the first one misses
            item = re.findall(self.advertisementPattern, tag)

        # Prepend the URL prefix and append [word, URL] to the list;
        # skip cells that match neither pattern to avoid an IndexError
        if item:
            hotSearchList.append([item[0][1], self.prefixUrl + item[0][0]])

    return hotSearchList
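
For comparison, the same fields can be read with BeautifulSoup's tag accessors instead of regexes. This is a minimal sketch of an alternative, not the author's method, assuming the same td-02 cell structure (page is the HTML returned by crawlPage):

from bs4 import BeautifulSoup

prefixUrl = 'https://s.weibo.com'
beautifulSoup = BeautifulSoup(page, 'html.parser')

hotSearchList = []
for td in beautifulSoup.find_all('td', class_='td-02'):
    a = td.find('a')
    if a is None:
        continue
    # Ad entries keep the real link in href_to, regular entries in href
    link = a.get('href_to') or a.get('href') or ''
    hotSearchList.append([a.get_text(), prefixUrl + link])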

Exporting to an XLS file

class Exporter():
    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')

    def exportXLS(self, columns, data, path, sheetName):
        if len(data) == 0:
            print("Please get the data first!")
            return

        sheet = self.workbook.add_sheet(sheetName)

        # Row 0 holds the column labels
        for column in range(len(columns)):
            sheet.write(0, column, columns[column])

        # Data rows start at row 1
        for i in range(1, len(data) + 1):
            for column in range(len(columns)):
                sheet.write(i, column, data[i - 1][column])

        self.workbook.save(path)
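
A quick usage sketch with a made-up row, assuming the Exporter class above and import xlwt (the file name and labels here are arbitrary):

exporter = Exporter()
columns = ['Word', 'URL']
data = [['ExampleTopic', 'https://s.weibo.com/weibo?q=ExampleTopic']]
exporter.exportXLS(columns, data, './Example.xls', 'HotSearch')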

Full implementation

# -*- coding:utf-8 -*-
import re
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import xlwt

class Spider():
    '''
        Description:
            Spider program to crawl data from the Weibo hot-search list
        Attributes:
            None
    '''
    def __init__(self):
        self.hotSearchPattern = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>')
        self.advertisementPattern = re.compile(r'<a action-data.*?href_to="(.*?)" suda-data=.*?word=".*?">(.*?)</a>')

        self.url = 'https://s.weibo.com/top/summary'
        self.prefixUrl = 'https://s.weibo.com'

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
        }

    def crawlPage(self, url):
        '''
        Description:
            crawl the page at the given URL
        Args:
            url: the URL of the page to fetch
        Returns:
            the content of the page, or None on failure
        '''
        request = urllib.request.Request(headers=self.headers, url=url)

        page = None

        try:
            response = urllib.request.urlopen(request)
            page = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            print("Get page fail!")
            print(e)

        return page

    def extractData(self):
        '''
        Description:
            extract the data from the crawled page
        Args:
            None
        Returns:
            a list of [word, URL] pairs extracted from the page
        '''
        page = self.crawlPage(self.url)

        beautifulSoup = BeautifulSoup(page, 'html.parser')

        hotSearchList = []

        for tag in beautifulSoup.find_all('td', class_='td-02'):
            tag = str(tag)
            item = re.findall(self.hotSearchPattern, tag)

            if len(item) == 0:
                item = re.findall(self.advertisementPattern, tag)

            # Skip cells that match neither pattern to avoid an IndexError
            if item:
                hotSearchList.append([item[0][1], self.prefixUrl + item[0][0]])

        return hotSearchList

class Exporter():
    '''
        Description:
            export column labels and rows of data to an XLS file
        Attributes:
            None
    '''

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')

    def exportXLS(self, columns, data, path, sheetName):
        '''
        Description:
            export the column labels and data rows to an XLS file
        Args:
            columns: the labels of each column
            data: the list of data rows
            path: the path to save the XLS file
            sheetName: the name of the sheet created in the XLS file
        Returns:
            None
        '''
        if len(data) == 0:
            print("Please get the data first!")
            return

        sheet = self.workbook.add_sheet(sheetName)

        # Row 0 holds the column labels
        for column in range(len(columns)):
            sheet.write(0, column, columns[column])

        # Data rows start at row 1
        for i in range(1, len(data) + 1):
            for column in range(len(columns)):
                sheet.write(i, column, data[i - 1][column])

        self.workbook.save(path)


if __name__ == "__main__":
    spider = Spider()
    hotSearchList = spider.extractData()
    for hotSearch in hotSearchList:
        print("%s %s" % (hotSearch[0], hotSearch[1]))

    columns = ['Word', 'URL']
    sheetName = 'HotSearch'
    path = './HotSearch.xls'

    exporter = Exporter()
    exporter.exportXLS(columns, hotSearchList, path, sheetName)
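
As a side note, xlwt only writes the legacy .xls format. If plain CSV is acceptable, the standard-library csv module can stand in for the Exporter; a minimal sketch (the utf-8-sig encoding helps Excel display Chinese text correctly):

import csv

def exportCSV(columns, data, path):
    # Header row first, then one row per hot-search entry
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(data)

exportCSV(['Word', 'URL'], hotSearchList, './HotSearch.csv')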

Output

XLS file result

Closing remarks

  • Given the author's limited expertise, some oversights are inevitable; readers are welcome to point them out at any time to avoid unnecessary misunderstandings!

Reposted from blog.csdn.net/qq_44486439/article/details/108026278