Review of a Python Crawler and Data Analysis Example (1)

The Python crawler application was the first module I worked on, but I never covered it in a blog post at the time, so here is a brief review.
The task was to collect information related to the 70th anniversary of National Day from Weibo.

First, I needed to obtain the trending search keywords related to National Day; at the time Weibo had a "new era" (social event) hot-search column for this, and the code is as follows.

# encoding: utf-8
from lxml import etree
import time
import requests
class WEIBO():
    cookie = {
        'Cookie': '_T_WM=52165934645; ALF=1573204852; SCF=ArdMMKY9SBOgWxi4HE1DCrEm8vYkDcTnT_8NIoAFJhr3yiG1ryIrOOKbX6ecfBCNdCFo6T_cvboV37xveAwUh34.; SUB=_2A25wmdaYDeRhGeFP61sV-ESoq2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeK54Shnfeoqc; SUHB=0OMk-etS2Ek-ET; SSOLoginState=1570612936'}
    # use cookies to skip the login step
    def __init__(self, choose=1):
        self.choose = choose  # which page to crawl (this program can crawl two different pages); the default lets WEIBO() be called without arguments
    def getdate(self):  # get today's date
        self.time = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # format the current time as a 'YYYY-MM-DD' string
        return self.time

    def get_weibo_newage(self):
        url = 'https://s.weibo.com/top/summary?cate=socialevent'  # the social-event ("new era") hot list
        html = requests.get(url, cookies=self.cookie).content  # fetch the page content
        selector = etree.HTML(html)  # parse into an lxml element tree
        self.weibo = selector.xpath('//td[@class="td-02"]/*/text()')  # extract the keyword text
        self.weibo_1=[]
        for i in self.weibo:
            i = i[1:-1]  # strip the surrounding '#'
            self.weibo_1.append(i)
        self.weibo=self.weibo_1
        self.time=self.getdate()
        self.path = str(self.time) + 'newAge.txt'  # name the output file after today's date
        with open(self.path, 'a', encoding='utf-8') as f:
            f.write(str(self.weibo))  # write the list's repr; the next script reads it back with eval()
        print(self.weibo)
    
    def start(self):  # crawl the "new era" hot-list page
        self.get_weibo_newage()
if __name__=='__main__':
    k=WEIBO()
    k.start()
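
The file written by start() is simply the Python repr of the keyword list, saved under a name built from the current date (for example 2019-10-11newAge.txt); the second script below reads it back with eval(). A minimal sketch for inspecting such a file (the file name here is an assumption):

# read back the keyword file written by get_weibo_newage(); the file name is hypothetical
with open('2019-10-11newAge.txt', 'r', encoding='utf-8') as f:
    keywords = eval(f.read())  # the repr of a list -> a list again
print(len(keywords), keywords[:5])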

Next, URLs are built from the crawled keyword list and each result page is scraped; this program is more involved than the first one.
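
The search URL embeds the keyword, a fixed date range (2019-09-20 to 2019-10-08) and a page number. The code below interpolates the keyword directly and relies on requests to encode it; a small sketch of the same URL pattern with explicit percent-encoding via urllib.parse.quote (this helper is an illustration, not part of the original program):

from urllib.parse import quote

def build_search_url(keyword, page):
    # same URL pattern as deal_url() below, with the keyword percent-encoded explicitly
    base = ('https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s'
            '&advancedfilter=1&starttime=20190920&endtime=20191008'
            '&sort=hot&page=%d')
    return base % (quote(keyword), page)

print(build_search_url('国庆', 1))  # hypothetical keyword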

# encoding: utf-8
import requests
from lxml import etree
import re
from collections import OrderedDict
import pandas as pd
import time
from datetime import datetime
import random
from time import sleep
class WEIBO():
    cookie = {
        'Cookie': '_T_WM=52165934645; ALF=1573204852; SCF=ArdMMKY9SBOgWxi4HE1DCrEm8vYkDcTnT_8NIoAFJhr3yiG1ryIrOOKbX6ecfBCNdCFo6T_cvboV37xveAwUh34.; SUB=_2A25wmdaYDeRhGeFP61sV-CvOzTqIHXVQZfrQrDV6PUJbktANLUiikW1NQSI-eIFPm_5zxcxo3ah_9S8cH-4Nf-Iy; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W54z7GLQ_uRCDy3AoKHpPxB5JpX5K-hUgL.FoMpeh.X1h-ESoq2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeK54Shnfeoqc; SUHB=0OMk-etS2Ek-ET; SSOLoginState=1570612936'}
    def __init__(self):  # initialize the counter and result storage
        self.weibo_num=0
        self.weibo=[]
    def deal_html(self, url):  # fetch a URL and return the parsed lxml tree
        html=requests.get(url,cookies=self.cookie).content
        selector=etree.HTML(html)
        return selector
    def get_id(self, info):  # extract the weibo id (drop the 2-character prefix of the div id)
        id=info.xpath('@id')[0][2:]
        return id
    def get_date(self, info):  # extract the post date
        times = info.xpath('div/span[@class="ct"]/text()')  # mind the xpath syntax; the slightest mistake and nothing is read
        times = ''.join(times)
        date = str(times[:times.find(' ')])
        if u'今天' in times or u'分钟' in times or u'刚刚' in times:
            # posts marked "today" / "x minutes ago" / "just now" get today's month and day
            month = datetime.now().strftime('%m')
            day = datetime.now().strftime('%d')
            date = month + '月' + day + '日'
        return date
    def get_name(self, info):  # extract the poster's nickname
        name=info.xpath('div/a[@class="nk"]/text()')[0]
        return name

    def get_content(self, info):  # extract the post text (the part between ':' and '赞')
        content=''.join(info.xpath('div//text()'))
        contents = content[content.find(':') + 1:content.find(u'赞')]
        return contents
    def get_fonter(self, info):  # extract the like / repost / comment counts
        pattern = r'\d+'
        halfcontent = info.xpath('div/a/text()')
        halfcontent = ''.join(halfcontent)
        foot = halfcontent[halfcontent.find(u'赞'):halfcontent.find(u'收藏')]
        foots = re.findall(pattern, foot)
        return foots
    def printAweibo(self, info, k):  # print the extracted fields of one post
        print(self.word_list[k])
        print(self.get_id(info))
        print(self.get_date(info))
        print(self.get_name(info))
        print(self.get_content(info))
        print("点赞数:"+self.get_fonter(info)[0])
        print("转发数:" + self.get_fonter(info)[1])
        print("评论数:" + self.get_fonter(info)[2])
    def get_weibo_tuple(self, info, k):  # assemble one post's fields into an OrderedDict
        weibo=OrderedDict()
        weibo['user id']=self.get_id(info)
        weibo['weibo keyword']=self.word_list[k]
        weibo['send date']=self.get_date(info)
        weibo['user name']=self.get_name(info)
        weibo['weibo content']=self.get_content(info)
        weibo['weibo support']=self.get_fonter(info)[0]
        weibo['weibo transpound']=self.get_fonter(info)[1]
        weibo['weibo comment']=self.get_fonter(info)[2]
        return weibo
    def get_pagenum(self, k):  # get the number of result pages for a keyword
        try:
            url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot' % (self.word_list[k])
            html = self.deal_html(url)
            pageNum = html.xpath('//div[@class="pa"]/form/div/input[@name="mp"]')[0].attrib['value']
            pageNum = int(pageNum)
            return pageNum
        except:
            pass
    def get_keywordlist(self):
        with open(self.filename, 'r', encoding='utf8') as f:
            self.word_list = f.read()
        self.word_list = eval(self.word_list)  # the file stores the repr of a list; eval() turns it back into a list
        self.word_num=len(self.word_list)
    def deal_url(self, words, pageNum):  # build the search URL (adjust it here later if needed)
        # restrict posts to the period 2019-09-20 .. 2019-10-08
        urls='https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot&page=%d'%(words,pageNum)
        return urls
    def write_weibo(self, info, k):  # append one record to the result list
        weibo=self.get_weibo_tuple(info,k)
        self.weibo.append(weibo)
    def get_pageweibo(self, url, k):  # crawl one page of weibo posts
        self.selector = self.deal_html(url)
        if self.selector is None:  # guard, otherwise: 'NoneType' object has no attribute 'xpath'
            return
        info = self.selector.xpath("//div[@class='c']")
        for i in range(2, len(info) - 2):
            try:
                self.weibo_num += 1
                print(self.weibo_num)
                self.write_weibo(info[i], k)
                self.printAweibo(info[i], k)
                print("-----" * 100)
            except:
                continue
    def write_csv(self, keepfile):  # write the collected records to a csv file
        filename=keepfile
        df = pd.DataFrame(self.weibo, columns=['user id', 'weibo keyword', 'send date', 'user name', 'weibo support', 'weibo transpound', 'weibo comment', 'weibo content'])
        df.to_csv(filename, index=False, sep=',')

    def start(self, filename, keepfilename):  # run the crawler
        self.filename = filename
        self.get_keywordlist()
        for k in range(0, self.word_num - 1):
            try:

                num = self.get_pagenum(k)
                pagenum = 0
                randompage = random.randint(1, 3)
                for j in range(1, num):
                    # sleep now and then so the crawler does not get rate-limited
                    try:
                        if j < num and j == pagenum + randompage:
                            sleep(random.randint(25, 30))
                        url = self.deal_url(self.word_list[k], j)
                        self.get_pageweibo(url, k)
                        pagenum += 1
                    except:
                        continue

            except:
                continue
        print(self.weibo)
        self.write_csv(keepfilename)
        print(u'共爬取' + str(self.weibo_num) + u'条微博')


d = WEIBO()
d.start('2019-10-11hotSearch.txt', 'data4.csv')
# first argument: path of the keyword file to read
# second argument: path of the csv file to write
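
The crawler above stores its results in a csv file, but the processing module below reads from a MySQL table called weibo in the weibofile database, so the csv has to be imported into MySQL first. A minimal sketch of that step using pandas and SQLAlchemy (the connection string reuses the credentials from the config further down; table and column names are assumed to match the csv header):

# load the crawler's csv into the MySQL table queried by the processing module
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://muwenlong:12345678@localhost:3306/weibofile?charset=utf8mb4')
df = pd.read_csv('data4.csv')  # output of the crawler above
df.to_sql('weibo', engine, if_exists='append', index=False)  # append the rows to the weibo table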

The data-processing module that handles the crawled data:

# encoding: utf-8
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from matplotlib.font_manager import FontProperties  # for rendering Chinese labels
class DEALING():
    # connect to MySQL (an alternative connection approach)
    def connect(self):
        config = {
            'host':'localhost',
            'port':3306,
            'user':'muwenlong',
            'password':'12345678',
            'db':'weibofile',
            'charset':'utf8mb4',
        }
        db=pymysql.connect(**config)
        sql_1="select `user id`,`weibo support`,`weibo transpound`,`weibo comment`,`weibo keyword`,`weibo content` from `weibo` ORDER BY `weibo support` DESC LIMIT 50"
        sql_2="select `user id`,`weibo support`,`weibo transpound`,`weibo comment`,`user name`,`weibo keyword`,`send date`,`weibo content` from `weibo` WHERE `weibo support`>=100000 OR `weibo transpound`>=100000 OR `weibo comment`>=100000"
        sql_3 = "select * from `weibo`"
        sql_7 = "select * from `weibo` WHERE `send date` LIKE '9月2%' OR `send date` LIKE '10月%'"
        sql_4="select * FROM `WEIBO` ORDER BY `weibo support` DESC  LIMIT 1"
        sql_5="select * FROM `WEIBO` ORDER BY `weibo transpound` DESC  LIMIT 1"
        sql_6 = "select * FROM `WEIBO` ORDER BY `weibo comment` DESC  LIMIT 1"
        data=pd.read_sql(sql_2,db)
        data1=pd.read_sql(sql_3,db)
        data2=pd.read_sql(sql_4,db)
        data3 = pd.read_sql(sql_5, db)
        data4 = pd.read_sql(sql_6, db)
        data5=pd.read_sql(sql_7,db)
        return data,data1,data2,data3,data4,data5
    def Chinese(self, labels):  # make sure Chinese text renders properly instead of as unreadable boxes
        font = FontProperties(fname=r'c:\windows\fonts\simsun.ttc', size=10)
        for label in labels:
            label.set_fontproperties(font)
    def deal_pandas(self):
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_colwidth', 100)
    def makemaps(self, title, data, choose):
        self.deal_pandas()
        plt.figure()
        filename = str(title + '.png')
        if choose == 1:  # 1: bar chart (user names), 2: line chart (dates), 3: bar chart (hot keywords)
            data = data.plot(kind='bar', align='center', title=title, color=["g", "r"])
        elif choose == 2:
            data = data.plot(kind='line', title=title, color='b', style='--')
        elif choose == 3:
            data = data.plot(kind='bar', title=title, color=["blue", "red", "yellow", "purple"])
        labels = data.get_xticklabels() + data.legend().texts + [data.title]  # collect the texts that need the Chinese font
        self.Chinese(labels)
        plt.tight_layout()  # keep the x-axis labels from being cut off
        plt.savefig(filename, dpi=600)  # output resolution
        plt.show()
    def getusername_map_csv(self):
        data=self.connect()[0]
        groupData=data['user id'].groupby(data['user name']).count()
        groupData = groupData.sort_values(ascending=False)
        self.makemaps('热门博主分布柱状图(微博热度超十万)',groupData,1)
        groupData.to_csv('weiboUserName.csv', encoding='utf8', header=True)
    def getsenddate_map_csv(self):
        data=self.connect()[1]
        groupData=data['user id'].groupby(data['send date']).count()
        self.makemaps('国庆热度变化折线图(日期)',groupData,2)
        groupData.to_csv('weiboSendDate.csv', encoding='utf8', header=True)
    def gettopic_map_csv(self):
        data=self.connect()[1]
        print(data)
        groupData = data['user id'].groupby(data['weibo keyword']).count()
        groupData = groupData.sort_values(ascending=False)
        self.makemaps('国庆热词占比柱状图',groupData,3)
        groupData.to_csv('weiboKeyWords.csv', encoding='utf8', header=True)
    def get_general_username_map_csv(self):
        self.deal_pandas()
        data=self.connect()[1]
        groupData = data['user id'].groupby(data['user name']).count()
        groupData = groupData.sort_values(ascending=False)  # sort in descending order
        groupData=groupData.head(20)
        self.makemaps('微博博主柱状图(按微博发帖数)', groupData, 1)
        groupData.to_csv('weiboUserName_general.csv', encoding='utf8', header=True)
    def max_csv(self):
        for i in range(2,5):
            self.deal_pandas()
            data=self.connect()[i]
            data.to_csv('max.csv', mode='a',encoding='utf8', header=True)
    def start(self):
        self.get_general_username_map_csv()
        self.getusername_map_csv()
        self.getsenddate_map_csv()
        self.gettopic_map_csv()
        self.max_csv()
d=DEALING()
d.start()


Reprinted from: blog.csdn.net/qq_44717437/article/details/106156765