The Python crawler application was the first module I worked on, but I never covered it in a blog post at the time, so here is a quick review.
The task was to collect information from Weibo related to the 70th anniversary National Day (2019).
First we need the trending search keywords related to National Day; at the time Weibo had a "new era" column, and the code is as follows.
# encoding: utf-8
from lxml import etree
import time
import requests


class WEIBO():
    # cookie: skip the login step by reusing a logged-in session
    cookie = {
        'Cookie': '_T_WM=52165934645; ALF=1573204852; SCF=ArdMMKY9SBOgWxi4HE1DCrEm8vYkDcTnT_8NIoAFJhr3yiG1ryIrOOKbX6ecfBCNdCFo6T_cvboV37xveAwUh34.; SUB=_2A25wmdaYDeRhGeFP61sV-ESoq2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeK54Shnfeoqc; SUHB=0OMk-etS2Ek-ET; SSOLoginState=1570612936'}

    def __init__(self, choose):
        self.choose = choose  # feature selector: this program crawls two different pages

    def getdate(self):  # get the current date as a string
        self.time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        return self.time

    def get_weibo_newage(self):
        url = 'https://s.weibo.com/top/summary?cate=socialevent'  # the "new era" page
        html = requests.get(url, cookies=self.cookie).content  # fetch the page content
        selector = etree.HTML(html)  # parse into an element tree
        self.weibo = selector.xpath('//td[@class="td-02"]/*/text()')  # extract the keyword text
        self.weibo_1 = []
        for i in self.weibo:
            i = i[1:-1]  # strip the surrounding '#' characters
            self.weibo_1.append(i)
        self.weibo = self.weibo_1
        self.time = self.getdate()
        self.path = str(self.time) + 'newAge.txt'  # name the output file by date
        with open(self.path, 'a', encoding='utf-8') as f:
            f.write(str(self.weibo))  # write the list literal; the second script reads it back
        print(self.weibo)

    def start(self):  # crawl the "new era" page
        self.get_weibo_newage()


if __name__ == '__main__':
    k = WEIBO(1)  # choose=1: crawl the "new era" page
    k.start()
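The file written above is just a Python list literal, which the second script later reads back with eval(). A safer way to parse the same file is ast.literal_eval, which only accepts literals and cannot execute arbitrary code. A minimal sketch (the filename here is only an example):

# encoding: utf-8
import ast

# Parse the keyword file written by the first script.
# Unlike eval(), ast.literal_eval refuses anything that is not
# a plain literal, so a tampered file cannot run code.
with open('2019-10-11newAge.txt', 'r', encoding='utf-8') as f:
    word_list = ast.literal_eval(f.read())
print(word_list)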
Next, we construct URLs from the crawled keyword list; this program is more involved than the first one.
# encoding: utf-8
import requests
from lxml import etree
import re
from collections import OrderedDict
import pandas as pd
import time
from datetime import datetime
import random
from time import sleep


class WEIBO():
    cookie = {
        'Cookie': '_T_WM=52165934645; ALF=1573204852; SCF=ArdMMKY9SBOgWxi4HE1DCrEm8vYkDcTnT_8NIoAFJhr3yiG1ryIrOOKbX6ecfBCNdCFo6T_cvboV37xveAwUh34.; SUB=_2A25wmdaYDeRhGeFP61sV-CvOzTqIHXVQZfrQrDV6PUJbktANLUiikW1NQSI-eIFPm_5zxcxo3ah_9S8cH-4Nf-Iy; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W54z7GLQ_uRCDy3AoKHpPxB5JpX5K-hUgL.FoMpeh.X1h-ESoq2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeK54Shnfeoqc; SUHB=0OMk-etS2Ek-ET; SSOLoginState=1570612936'}

    def __init__(self):  # initialize the counter and the result list
        self.weibo_num = 0
        self.weibo = []

    def deal_html(self, url):  # fetch a page and parse it into an element tree
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)
        return selector

    def get_id(self, info):  # extract the post id
        id = info.xpath('@id')[0][2:]
        return id

    def get_date(self, info):  # extract the publication date
        times = info.xpath('div/span[@class="ct"]/text()')  # the xpath must match exactly, or nothing is read
        times = ''.join(times)
        date = str(times[:times.find(' ')])
        if u'今天' in times or u'分钟' in times or u'刚刚' in times:
            # relative timestamps ("today", "minutes ago", "just now") mean the post is from today
            month = datetime.now().strftime('%m')
            day = datetime.now().strftime('%d')
            date = month + '月' + day + '日'
        return date

    def get_name(self, info):  # extract the author's name
        name = info.xpath('div/a[@class="nk"]/text()')[0]
        return name

    def get_content(self, info):  # extract the post text
        content = ''.join(info.xpath('div//text()'))
        contents = content[content.find(':') + 1:content.find(u'赞')]
        return contents

    def get_fonter(self, info):  # extract like/repost/comment counts from the post footer
        pattern = r'\d+'
        halfcontent = info.xpath('div/a/text()')
        halfcontent = ''.join(halfcontent)
        foot = halfcontent[halfcontent.find(u'赞'):halfcontent.find(u'收藏')]
        foots = re.findall(pattern, foot)
        return foots

    def printAweibo(self, info, k):  # print the extracted fields
        print(self.word_list[k])
        print(self.get_id(info))
        print(self.get_date(info))
        print(self.get_name(info))
        print(self.get_content(info))
        print("likes: " + self.get_fonter(info)[0])
        print("reposts: " + self.get_fonter(info)[1])
        print("comments: " + self.get_fonter(info)[2])

    def get_weibo_tuple(self, info, k):  # collect one post's fields into an ordered dict
        weibo = OrderedDict()
        weibo['user id'] = self.get_id(info)
        weibo['weibo keyword'] = self.word_list[k]
        weibo['send date'] = self.get_date(info)
        weibo['user name'] = self.get_name(info)
        weibo['weibo content'] = self.get_content(info)
        weibo['weibo support'] = self.get_fonter(info)[0]
        weibo['weibo transpound'] = self.get_fonter(info)[1]
        weibo['weibo comment'] = self.get_fonter(info)[2]
        return weibo

    def get_pagenum(self, k):  # get the number of result pages for a keyword
        try:
            url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot' % (self.word_list[k])
            html = self.deal_html(url)
            pageNum = html.xpath('//div[@class="pa"]/form/div/input[@name="mp"]')[0].attrib['value']
            pageNum = int(pageNum)
            return pageNum
        except:
            pass

    def get_keywordlist(self):  # read the keyword list written by the first crawler
        with open(self.filename, 'r', encoding='utf8') as f:
            self.word_list = f.read()
        self.word_list = eval(self.word_list)  # the file holds a list literal
        self.word_num = len(self.word_list)

    def deal_url(self, words, pageNum):  # build the search URL (to be revisited later)
        # restrict the posting window to 20190920 through 20191008
        urls = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot&page=%d' % (words, pageNum)
        return urls

    def write_weibo(self, info, k):  # append one post's dict to the result list
        weibo = self.get_weibo_tuple(info, k)
        self.weibo.append(weibo)

    def get_pageweibo(self, url, k):  # crawl one page of posts
        # error handling is needed here; otherwise a failed fetch raises
        # "'NoneType' object has no attribute 'xpath'"
        self.selector = self.deal_html(url)
        info = self.selector.xpath("//div[@class='c']")
        for i in range(2, len(info) - 2):
            try:
                self.weibo_num += 1
                print(self.weibo_num)
                self.write_weibo(info[i], k)
                self.printAweibo(info[i], k)
                print("-----" * 100)
            except:
                continue

    def write_csv(self, keepfile):  # dump the collected posts to a CSV file
        filename = keepfile
        DataFrame = pd.DataFrame(self.weibo, columns=['user id', 'weibo keyword', 'send date', 'user name', 'weibo support', 'weibo transpound', 'weibo comment', 'weibo content'])
        DataFrame.to_csv(filename, index=False, sep=',')

    def start(self, filename, keepfilename):  # run the crawler
        self.filename = filename
        self.get_keywordlist()
        for k in range(self.word_num):  # iterate over every keyword
            try:
                num = self.get_pagenum(k)
                if not num:  # no page count could be read; skip this keyword
                    continue
                pagenum = 0
                randompage = random.randint(1, 3)
                for j in range(1, num):
                    # sleep between requests so the crawler is not rate-limited
                    try:
                        if j < num and j == pagenum + randompage:
                            sleep(random.randint(25, 30))
                        url = self.deal_url(self.word_list[k], j)
                        self.get_pageweibo(url, k)
                        pagenum += 1
                    except:
                        continue
            except:
                continue
        print(self.weibo)
        self.write_csv(keepfilename)
        print('crawled ' + str(self.weibo_num) + ' weibo posts in total')


d = WEIBO()
d.start('2019-10-11hotSearch.txt', 'data4.csv')
# first argument: path of the keyword file to read
# second argument: path of the CSV file to write
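The processing module below queries a MySQL table named weibo in a weibofile database, but the post never shows how the crawler's CSV gets there. A minimal sketch of one way to bridge that gap, assuming the table and column names used by the queries below (the connection string mirrors the config in the next script; this loader is my addition, not part of the original project):

# encoding: utf-8
import pandas as pd
from sqlalchemy import create_engine

# Assumed connection details, copied from the config in the processing module.
engine = create_engine(
    'mysql+pymysql://muwenlong:12345678@localhost:3306/weibofile?charset=utf8mb4')

data = pd.read_csv('data4.csv')
# The count columns must be numeric, or the ORDER BY / >= 100000
# queries below would compare strings instead of numbers.
for col in ['weibo support', 'weibo transpound', 'weibo comment']:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Append the crawled rows into the `weibo` table the next script reads.
data.to_sql('weibo', engine, if_exists='append', index=False)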
Finally, the data processing module that handles the crawled data.
# encoding: utf-8
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from matplotlib.font_manager import FontProperties  # for rendering Chinese text in plots


class DEALING():
    # connect via pymysql rather than sqlalchemy:
    def connect(self):
        config = {
            'host': 'localhost',
            'port': 3306,
            'user': 'muwenlong',
            'password': '12345678',
            'db': 'weibofile',
            'charset': 'utf8mb4',
        }
        db = pymysql.connect(**config)
        sql_1 = "select `user id`,`weibo support`,`weibo transpound`,`weibo comment`,`weibo keyword`,`weibo content` from `weibo` ORDER BY `weibo support` DESC LIMIT 50"
        sql_2 = "select `user id`,`weibo support`,`weibo transpound`,`weibo comment`,`user name`,`weibo keyword`,`send date`,`weibo content` from `weibo` WHERE `weibo support`>=100000 OR `weibo transpound`>=100000 OR `weibo comment`>=100000"
        sql_3 = "select * from `weibo`"
        sql_7 = "select * from `weibo` WHERE `send date` LIKE '9月2%' OR `send date` LIKE '10月%'"
        sql_4 = "select * FROM `WEIBO` ORDER BY `weibo support` DESC LIMIT 1"
        sql_5 = "select * FROM `WEIBO` ORDER BY `weibo transpound` DESC LIMIT 1"
        sql_6 = "select * FROM `WEIBO` ORDER BY `weibo comment` DESC LIMIT 1"
        data = pd.read_sql(sql_2, db)
        data1 = pd.read_sql(sql_3, db)
        data2 = pd.read_sql(sql_4, db)
        data3 = pd.read_sql(sql_5, db)
        data4 = pd.read_sql(sql_6, db)
        data5 = pd.read_sql(sql_7, db)
        return data, data1, data2, data3, data4, data5

    def Chinese(self, lables):  # render labels with a Chinese font instead of empty boxes
        font = FontProperties(fname=r'c:\windows\fonts\simsun.ttc', size=10)
        for lable in lables:
            lable.set_fontproperties(font)

    def deal_pandas(self):
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_colwidth', 100)

    def makemaps(self, title, data, choose):
        self.deal_pandas()
        plt.figure()
        filename = str(title + '.png')
        if choose == 1:  # 1: bar chart (user names), 2: line chart (dates), 3: bar chart (keywords)
            data = data.plot(kind='bar', align='center', title=title, color=["g", "r"])
        elif choose == 2:
            data = data.plot(kind='line', title=title, color='b', style='--')
        elif choose == 3:
            data = data.plot(kind='bar', title=title, color=["blue", "red", "yellow", "purple"])
        lables = data.get_xticklabels() + data.legend().texts + [data.title]  # all labels that need the Chinese font
        self.Chinese(lables)
        plt.tight_layout()  # keep x labels and titles from being clipped
        plt.savefig(filename, dpi=600)  # output resolution
        plt.show()

    def getusername_map_csv(self):
        data = self.connect()[0]
        groupData = data['user id'].groupby(data['user name']).count()
        groupData = groupData.sort_values(ascending=False)
        self.makemaps('热门博主分布柱状图(微博热度超十万)', groupData, 1)
        groupData.to_csv('weiboUserName.csv', encoding='utf8', header=True)

    def getsenddate_map_csv(self):
        data = self.connect()[1]
        groupData = data['user id'].groupby(data['send date']).count()
        self.makemaps('国庆热度变化折线图(日期)', groupData, 2)
        groupData.to_csv('weiboSendDate.csv', encoding='utf8', header=True)

    def gettopic_map_csv(self):
        data = self.connect()[1]
        print(data)
        groupData = data['user id'].groupby(data['weibo keyword']).count()
        groupData = groupData.sort_values(ascending=False)
        self.makemaps('国庆热词占比柱状图', groupData, 3)
        groupData.to_csv('weiboKeyWords.csv', encoding='utf8', header=True)

    def get_general_username_map_csv(self):
        self.deal_pandas()
        data = self.connect()[1]
        groupData = data['user id'].groupby(data['user name']).count()
        groupData = groupData.sort_values(ascending=False)  # sort in descending order
        groupData = groupData.head(20)
        self.makemaps('微博博主柱状图(按微博发帖数)', groupData, 1)
        groupData.to_csv('weiboUserName_general.csv', encoding='utf8', header=True)

    def max_csv(self):  # append the top post by likes, reposts, and comments to one CSV
        for i in range(2, 5):
            self.deal_pandas()
            data = self.connect()[i]
            data.to_csv('max.csv', mode='a', encoding='utf8', header=True)

    def start(self):
        self.get_general_username_map_csv()
        self.getusername_map_csv()
        self.getsenddate_map_csv()
        self.gettopic_map_csv()
        self.max_csv()


d = DEALING()
d.start()
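A side note on the Chinese() helper: instead of attaching a FontProperties object to every label by hand, matplotlib can be told globally which font to use via rcParams. A minimal sketch of that alternative, assuming a CJK-capable font such as SimHei is installed on the machine (this is not what the script above does, just another option):

import matplotlib.pyplot as plt

# Register a CJK-capable font once; every later title/label uses it.
plt.rcParams['font.sans-serif'] = ['SimHei']  # assumes SimHei is installed
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign renderable

# Any subsequent plot renders Chinese text without per-label handling.
plt.bar(['国庆', '热词'], [3, 5])
plt.title('示例图')
plt.show()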