class Exporter():
    """
    Description:
        Export a header row plus data rows to an xls file through xlwt.
    Attributes:
        workbook: the utf-8 encoded xlwt.Workbook that sheets are added to
    """

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')

    def exportXLS(self, columns, data, path, sheetName):
        """
        Description:
            Write the column labels and the data rows into a new sheet,
            then save the workbook. When data is empty a hint is printed
            and nothing is written or saved.
        Args:
            columns: the labels of each column, written to row 0
            data: the list of rows; each row is indexed by column position
            path: the path to save the xls file
            sheetName: the name of the sheet created in the xls file
        Returns:
            None
        """
        if not data:
            print("Please get data first!")
            return
        sheet = self.workbook.add_sheet(sheetName)
        for columnIndex, label in enumerate(columns):
            sheet.write(0, columnIndex, label)
        # Data rows start at row 1 because row 0 holds the column labels.
        for rowIndex, row in enumerate(data, start=1):
            # Only as many cells as there are labels are written per row.
            for columnIndex in range(len(columns)):
                sheet.write(rowIndex, columnIndex, row[columnIndex])
        self.workbook.save(path)
实现代码
# -*- coding:utf-8 -*-
import re
import urllib
import urllib.error
import urllib.request

import xlwt
from bs4 import BeautifulSoup
class Spider():
    """
    Description:
        Spider program to crawl data from Weibo hot search rank list
    Attributes:
        hotSearchPattern: compiled regex capturing (href, text) of a hot search link
        advertisementPattern: compiled regex capturing (href_to, text) of an advertisement link
        url: the URL of the hot search summary page
        prefixUrl: the prefix prepended to every extracted link
        headers: the HTTP headers (User-Agent) sent with each request
    """

    def __init__(self):
        self.hotSearchPattern = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>')
        self.advertisementPattern = re.compile(r'<a action-data.*?href_to="(.*?)" suda-data=.*?word=".*?">(.*?)</a>')
        self.url = 'https://s.weibo.com/top/summary'
        self.prefixUrl = 'https://s.weibo.com'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}

    def crawlPage(self, url):
        """
        Description:
            crawl page from the given URL
        Args:
            url: the URL of page need to get
        Returns:
            page of the given URL decoded as utf-8, or None when the
            request fails with urllib.error.URLError (the error is printed)
        """
        request = urllib.request.Request(headers=self.headers, url=url)
        try:
            # The context manager closes the response even if read/decode raises.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            print("Get page fail!")
            print(e)
            return None

    def extractData(self):
        """
        Description:
            extract data from the page at self.url, return the list of data
        Args:
            None
        Returns:
            list of [word, url] pairs extracted from the page; an empty
            list when the page could not be fetched
        """
        page = self.crawlPage(self.url)
        if page is None:
            # crawlPage already reported the failure; there is nothing to parse.
            return []
        beautifulSoup = BeautifulSoup(page, 'html.parser')
        hotSearchList = []
        for tag in beautifulSoup.find_all('td', class_='td-02'):
            item = self._parseTag(str(tag))
            if item is not None:
                hotSearchList.append(item)
        return hotSearchList

    def _parseTag(self, tagHtml):
        """
        Description:
            turn the HTML of one table cell into a [word, url] pair
        Args:
            tagHtml: the HTML text of the cell
        Returns:
            [word, url] built from the first match, or None when neither
            pattern matches the cell
        """
        matches = self.hotSearchPattern.findall(tagHtml)
        if not matches:
            # Fall back to the advertisement markup when the plain link is absent.
            matches = self.advertisementPattern.findall(tagHtml)
        if not matches:
            return None
        link, word = matches[0]
        return [word, self.prefixUrl + link]
class Exporter():
    """
    Description:
        export the list of data to xls file
    Attributes:
        workbook: the utf-8 encoded xlwt.Workbook that sheets are added to
    """

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')

    def exportXLS(self, columns, data, path, sheetName):
        """
        Description:
            export the column labels and the list of data to xls file;
            when data is empty a hint is printed and nothing is written
            or saved
        Args:
            columns: the labels of each column
            data: the list of data
            path: the path to save xls file
            sheetName: the name of sheet created in the xls file
        Returns:
            None
        """
        if not data:
            print("Please get data first!")
            return
        sheet = self.workbook.add_sheet(sheetName)
        for columnIndex, label in enumerate(columns):
            sheet.write(0, columnIndex, label)
        # Data rows start at row 1 because row 0 holds the column labels.
        for rowIndex, row in enumerate(data, start=1):
            # Only as many cells as there are labels are written per row.
            for columnIndex in range(len(columns)):
                sheet.write(rowIndex, columnIndex, row[columnIndex])
        self.workbook.save(path)


if __name__ == "__main__":
    # Crawl the hot search list, echo it to stdout, then export it to xls.
    spider = Spider()
    hotSearchList = spider.extractData()
    for hotSearch in hotSearchList:
        print("%s %s" % (hotSearch[0], hotSearch[1]))
    columns = ['Word', 'URL']
    sheetName = 'HotSearch'
    path = './HotSearch.xls'
    exporter = Exporter()
    exporter.exportXLS(columns, hotSearchList, path, sheetName)