Python crawler and data analysis example review 1

The Python crawler application was the first module I worked on, but I didn't write a blog post about it at the time, so I will briefly review it now.
This example collects Weibo posts related to the 70th-anniversary National Day celebrations (September 20 to October 8, 2019).

First, we need to obtain search keywords related to the National Day. At the time, Weibo's hot list had a "New Era" section; the code is as follows:

# -*- coding: utf-8 -*-
from lxml import etree
import time
import requests
class WEIBO():
    # cookie-based login, skips the verification step
    cookie={
        'Cookie': '_T_WM=52165934645; ALF=1573204852; SCF=ArdMMKY9SBOgWxi4HE1DCrEm8vYkDcTnT_8NIoAFJhr3yiG1ryIrOOKbX6ecfBCNdCFo6T_cvboV37xveAwUh34.; SUB=_2A25wmdaYDeRhGeFP61sV-ESoq2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeK54Shnfeoqc; SUHB=0OMk-etS2Ek-ET; SSOLoginState=1570612936'}
    def __init__(self, choose):
        self.choose = choose  # selects which page to crawl, since this program crawls two different pages
    def getdate(self):  # get today's date as a string
        self.time = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # format the current time as YYYY-MM-DD
        return self.time

    def get_weibo_newage(self):
        url = 'https://s.weibo.com/top/summary?cate=socialevent'  # the "New Era" (social event) hot list
        html = requests.get(url, cookies=self.cookie).content  # request the page and get its content
        selector = etree.HTML(html)  # parse into an element tree
        self.weibo = selector.xpath('//td[@class="td-02"]/*/text()')  # extract the topic texts
        self.weibo_1 = []
        for i in self.weibo:
            i = i[1:-1]  # strip the surrounding '#' characters
            self.weibo_1.append(i)
        self.weibo = self.weibo_1
        self.time = self.getdate()
        self.path = str(self.time) + 'newAge.txt'  # name the output file after today's date
        with open(self.path, 'a', encoding='utf-8') as f:
            f.write(str(self.weibo))  # write the keyword list as a Python list literal
        print(self.weibo)

    def start(self):  # crawl the "New Era" page
        self.get_weibo_newage()
if __name__ == '__main__':
    k = WEIBO(1)  # __init__ requires a `choose` argument; its value is not used by this crawl
    k.start()
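
The output file simply contains the keyword list written out with str(), i.e. a Python list literal, which is why the second program below reads it back with eval. A small sketch of reading it back with ast.literal_eval, the safer equivalent, assuming the date-based file name produced above:

import ast
import time

# Read back the keyword file written by get_weibo_newage().
# The file holds a Python list literal such as "['关键词一', '关键词二']".
path = time.strftime('%Y-%m-%d') + 'newAge.txt'
with open(path, 'r', encoding='utf-8') as f:
    keywords = ast.literal_eval(f.read())  # parses the literal without executing arbitrary code
print(len(keywords), keywords[:3])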

Next, search URLs are constructed from the crawled keyword list and each result page is crawled; this program is more involved than the first one.
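
As a quick illustration before the full program: this is roughly how a keyword and page number become a search URL. Note that the original code below inserts the raw keyword into the URL; percent-encoding it with urllib.parse.quote, as in this sketch, is an extra precaution I've added and not part of the original program.

from urllib.parse import quote

def build_search_url(keyword, page):
    # search weibo.cn for posts containing `keyword`, limited to
    # 2019-09-20 .. 2019-10-08 and sorted by hotness (same parameters as deal_url below)
    base = ('https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s'
            '&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot&page=%d')
    return base % (quote(keyword), page)

print(build_search_url('国庆', 1))  # example keyword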

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
from collections import OrderedDict
import pandas as pd
import time
from datetime import datetime
import random
from time import sleep
class WEIBO():
    # cookie-based login, skips the verification step
    cookie = {
        'Cookie': '_T_WM=52165934645; ALF=1573204852; SCF=ArdMMKY9SBOgWxi4HE1DCrEm8vYkDcTnT_8NIoAFJhr3yiG1ryIrOOKbX6ecfBCNdCFo6T_cvboV37xveAwUh34.; SUB=_2A25wmdaYDeRhGeFP61sV-CvOzTqIHXVQZfrQrDV6PUJbktANLUiikW1NQSI-eIFPm_5zxcxo3ah_9S8cH-4Nf-Iy; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W54z7GLQ_uRCDy3AoKHpPxB5JpX5K-hUgL.FoMpeh.X1h-ESoq2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeK54Shnfeoqc; SUHB=0OMk-etS2Ek-ET; SSOLoginState=1570612936'}
    def __init__(self):  # initialisation
        self.weibo_num = 0
        self.weibo = []
    def deal_html(self, url):  # fetch a page and parse it
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)
        return selector
    def get_id(self, info):  # get the post id
        id = info.xpath('@id')[0][2:]
        return id
    def get_date(self, info):  # get the posting date
        times = info.xpath('div/span[@class="ct"]/text()')  # note: any mistake in the xpath and no data is read
        times = ''.join(times)
        date = str(times[:times.find(' ')])
        # posts timestamped "today", "x minutes ago" or "just now" get today's date
        if u'今天' in times or u'分钟' in times or u'刚刚' in times:
            month = datetime.now().strftime('%m')
            day = datetime.now().strftime('%d')
            date = month + '月' + day + '日'

        return date
    def get_name(self, info):  # get the poster's name
        name = info.xpath('div/a[@class="nk"]/text()')[0]
        return name

    def get_content(self, info):  # get the post content
        content = ''.join(info.xpath('div//text()'))
        contents = content[content.find(':') + 1:content.find(u'赞')]
        return contents
    def get_fonter(self, info):  # get the like / repost / comment counts
        pattern = r'\d+'
        halfcontent = info.xpath('div/a/text()')
        halfcontent = ''.join(halfcontent)
        foot = halfcontent[halfcontent.find(u'赞'):halfcontent.find(u'收藏')]
        foots = re.findall(pattern, foot)
        return foots
    def printAweibo(self, info, k):  # print the extracted information
        print(self.word_list[k])
        print(self.get_id(info))
        print(self.get_date(info))
        print(self.get_name(info))
        print(self.get_content(info))
        print("点赞数:" + self.get_fonter(info)[0])   # likes
        print("转发数:" + self.get_fonter(info)[1])   # reposts
        print("评论数:" + self.get_fonter(info)[2])   # comments
    def get_weibo_tuple(self, info, k):  # collect one post's fields into an ordered dict
        weibo = OrderedDict()
        weibo['user id'] = self.get_id(info)
        weibo['weibo keyword'] = self.word_list[k]
        weibo['send date'] = self.get_date(info)
        weibo['user name'] = self.get_name(info)
        weibo['weibo content'] = self.get_content(info)
        weibo['weibo support'] = self.get_fonter(info)[0]
        weibo['weibo transpound'] = self.get_fonter(info)[1]
        weibo['weibo comment'] = self.get_fonter(info)[2]
        return weibo
    def get_pagenum(self, k):  # get the number of result pages for a keyword
        try:
            url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot' % (self.word_list[k])
            html = self.deal_html(url)
            pageNum = html.xpath('//div[@class="pa"]/form/div/input[@name="mp"]')[0].attrib['value']
            pageNum = int(pageNum)
            return pageNum
        except:
            pass
    def get_keywordlist(self):  # read the keyword list written by the first crawler
        with open(self.filename, 'r', encoding='utf8') as f:
            self.word_list = f.read()
        self.word_list = eval(self.word_list)  # the file holds a list literal; eval turns it back into a list
        self.word_num = len(self.word_list)
    def deal_url(self, words, pageNum):  # build the search URL (may need updating later)
        # restrict posts to the period 2019-09-20 to 2019-10-08
        urls = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%s&advancedfilter=1&starttime=20190920&endtime=20191008&sort=hot&page=%d' % (words, pageNum)
        return urls
    def write_weibo(self, info, k):  # append one post's dict to the result list
        weibo = self.get_weibo_tuple(info, k)
        self.weibo.append(weibo)
    def get_pageweibo(self, url, k):  # crawl the posts on one page
        # error handling below; otherwise "'NoneType' object has no attribute 'xpath'" can occur
        self.selector = self.deal_html(url)
        info = self.selector.xpath("//div[@class='c']")
        for i in range(2, len(info) - 2):
            try:
                self.weibo_num += 1
                print(self.weibo_num)
                self.write_weibo(info[i], k)
                self.printAweibo(info[i], k)
                print("-----" * 100)
            except:
                continue
    def write_csv(self, keepfile):  # write the results to a CSV file
        filename = keepfile
        DataFrame = pd.DataFrame(self.weibo, columns=['user id', 'weibo keyword', 'send date', 'user name', 'weibo support', 'weibo transpound', 'weibo comment', 'weibo content'])
        DataFrame.to_csv(filename, index=False, sep=',')

    def start(self, filename, keepfilename):  # run the crawler
        self.filename = filename
        self.get_keywordlist()
        for k in range(0, self.word_num - 1):  # iterate over the keywords (this range skips the last one)
            try:
                num = self.get_pagenum(k)
                pagenum = 0
                randompage = random.randint(1, 3)
                for j in range(1, num):
                    # sleep every few pages so the crawler is not rate-limited
                    try:
                        if j < num and j == pagenum + randompage:
                            sleep(random.randint(25, 30))
                        url = self.deal_url(self.word_list[k], j)
                        self.get_pageweibo(url, k)
                        pagenum += 1
                    except:
                        continue
            except:
                continue
        print(self.weibo)
        self.write_csv(keepfilename)
        print(u'共爬取' + str(self.weibo_num) + u'条微博')  # total number of posts crawled


d = WEIBO()
d.start('2019-10-11hotSearch.txt', 'data4.csv')
# first argument: path of the keyword file to read
# second argument: path of the CSV to save
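
The data processing module in the next section queries a MySQL table named weibo, while the crawler only produces a CSV file, so the CSV first has to be imported into MySQL. A minimal sketch of that import step, assuming the same local weibofile database and credentials configured in the processing module, and letting pandas create the table columns from the CSV headers:

import pandas as pd
from sqlalchemy import create_engine

# load the crawler's CSV into the `weibo` table queried by the DEALING class below
engine = create_engine(
    'mysql+pymysql://muwenlong:12345678@localhost:3306/weibofile?charset=utf8mb4')
data = pd.read_csv('data4.csv')
data.to_sql('weibo', engine, if_exists='append', index=False)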

Finally, the data processing module analyzes the crawled data:

# -*- coding: utf-8 -*-
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from matplotlib.font_manager import FontProperties#中文处理
class DEALING():
    # connect to MySQL with pymysql (an alternative to the sqlalchemy engine)
    def connect(self):
        config = {
            'host': 'localhost',
            'port': 3306,
            'user': 'muwenlong',
            'password': '12345678',
            'db': 'weibofile',
            'charset': 'utf8mb4',
        }
        db=pymysql.connect(**config)
        sql_1="select `user id`,`weibo support`,`weibo transpound`,`weibo comment`,`weibo keyword`,`weibo content` from `weibo` ORDER BY `weibo support` DESC LIMIT 50"
        sql_2="select `user id`,`weibo support`,`weibo transpound`,`weibo comment`,`user name`,`weibo keyword`,`send date`,`weibo content` from `weibo` WHERE `weibo support`>=100000 OR `weibo transpound`>=100000 OR `weibo comment`>=100000"
        sql_3 = "select * from `weibo`"
        sql_7 = "select * from `weibo` WHERE `send date` LIKE '9月2%' OR `send date` LIKE '10月%'"
        sql_4="select * FROM `WEIBO` ORDER BY `weibo support` DESC  LIMIT 1"
        sql_5="select * FROM `WEIBO` ORDER BY `weibo transpound` DESC  LIMIT 1"
        sql_6 = "select * FROM `WEIBO` ORDER BY `weibo comment` DESC  LIMIT 1"
        data=pd.read_sql(sql_2,db)
        data1=pd.read_sql(sql_3,db)
        data2=pd.read_sql(sql_4,db)
        data3 = pd.read_sql(sql_5, db)
        data4 = pd.read_sql(sql_6, db)
        data5=pd.read_sql(sql_7,db)
        return data,data1,data2,data3,data4,data5
    def Chinese(self, lables):  # make the labels render as Chinese characters instead of unreadable boxes
        font = FontProperties(fname=r'c:\windows\fonts\simsun.ttc', size=10)
        for lable in lables:
            lable.set_fontproperties(font)
    def deal_pandas(self):  # show full DataFrames when printing
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('max_colwidth', 100)
    def makemaps(self, title, data, choose):
        self.deal_pandas()
        plt.figure()  # start a new figure (original had plt.Figure(), an apparent typo)
        filename = str(title + '.png')
        # choose == 1: bar chart (user names), 2: line chart (dates), 3: bar chart (keywords)
        if choose == 1:
            data = data.plot(kind='bar', align='center', title=title, color=["g", "r"])
        elif choose == 2:
            data = data.plot(kind='line', title=title, color='b', style='--')
        elif choose == 3:
            data = data.plot(kind='bar', title=title, color=["blue", "red", "yellow", "purple"])
        lables = data.get_xticklabels() + data.legend().texts + [data.title]  # labels that need the Chinese font
        self.Chinese(lables)
        plt.tight_layout()  # keep the x labels and title from being cut off
        plt.savefig(filename, dpi=600)  # resolution
        plt.show()
    def getusername_map_csv(self):
        data=self.connect()[0]
        groupData=data['user id'].groupby(data['user name']).count()
        groupData = groupData.sort_values(ascending=False)
        self.makemaps('热门博主分布柱状图(微博热度超十万)',groupData,1)  # bar chart of popular bloggers (posts with >100k engagement)
        groupData.to_csv('weiboUserName.csv', encoding='utf8', header=True)
    def getsenddate_map_csv(self):
        data=self.connect()[1]
        groupData=data['user id'].groupby(data['send date']).count()
        self.makemaps('国庆热度变化折线图(日期)',groupData,2)  # line chart of National Day heat over time (by date)
        groupData.to_csv('weiboSendDate.csv', encoding='utf8', header=True)
    def gettopic_map_csv(self):
        data=self.connect()[1]
        print(data)
        groupData = data['user id'].groupby(data['weibo keyword']).count()
        groupData = groupData.sort_values(ascending=False)
        self.makemaps('国庆热词占比柱状图',groupData,3)  # bar chart of National Day keyword shares
        groupData.to_csv('weiboKeyWords.csv', encoding='utf8', header=True)
    def get_general_username_map_csv(self):
        self.deal_pandas()
        data=self.connect()[1]
        groupData = data['user id'].groupby(data['user name']).count()
        groupData = groupData.sort_values(ascending=False)  # sort in descending order
        groupData=groupData.head(20)
        self.makemaps('微博博主柱状图(按微博发帖数)', groupData, 1)  # bar chart of bloggers by number of posts
        groupData.to_csv('weiboUserName_general.csv', encoding='utf8', header=True)
    def max_csv(self):
        for i in range(2,5):
            self.deal_pandas()
            data=self.connect()[i]
            data.to_csv('max.csv', mode='a',encoding='utf8', header=True)
    def start(self):
        self.get_general_username_map_csv()
        self.getusername_map_csv()
        self.getsenddate_map_csv()
        self.gettopic_map_csv()
        self.max_csv()
d=DEALING()
d.start()
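
The Chinese() helper attaches the SimSun font to every label so that the Chinese chart titles and tick labels do not render as empty boxes. An alternative sketch that sets a Chinese-capable font globally through matplotlib's rcParams (SimHei is an assumption here; any installed Chinese font works):

import matplotlib.pyplot as plt

# set a Chinese-capable font for all chart text instead of per-label FontProperties
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly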


Origin blog.csdn.net/qq_44717437/article/details/106156765