前段时间笔者写了一份爬取微博评论的代码,用的是http://m.weibo.cn的接口。通过这个接口比较好的一点是代码比较好写,因为数据都是json格式的,规律性非常直观,熟悉json数据和字典操作的朋友都可以很快掌握。但是这个接口有个比较大的问题:基本限制了只能爬取前一百页的内容,而且cookie过期得比较快。所以现在提供一种新的方式,即通过xpath方式解析http://www.weibo.cn这个入口。这种方式下,所有微博评论都是可以在代码中直观看到的,只是可能大家对于xpath的一些相关操作不熟悉,这一块儿大家可以自己去找资源学习,这里不做介绍。所谓的xpath解析网页,实质上和bs4差不多,只是xpath在工业级用得比较多,bs4适合于小型爬虫任务。这种方式下cookie的有效期较长,笔者曾经试过一个cookie用了两到三个月未失效。大家可以在打开某一个微博的评论页面时,直接查看源代码。以“https://weibo.cn/comment/Hpg589Py7?ckAll=1”为例,查看源代码发现是这样的
这样大家可以直接复制下来,找一个格式化网站格式化一下就可以很规则了,这里不做演示。
然后规则化之后就是找h5标签,进行层层剖析即可,对于标签的剖析不做解释。源码贴在下面供大家学习。
# -*- coding:utf-8 -*-
__author__ = 'TengYu'
import requests
import xlwt
import json
import time
import re
from datetime import timedelta
from datetime import datetime
from lxml import etree
import sys

# Py2 only: force the default text codec to utf-8 so implicit str/unicode
# conversions of the Chinese comment text don't raise UnicodeDecodeError.
# BUG FIX: `reload` and `sys.setdefaultencoding` do not exist on Python 3,
# so the original crashed with NameError there — guard them by version.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- Py2 builtin
    sys.setdefaultencoding('utf8')

# Shared request headers: a desktop UA plus the logged-in weibo.cn session
# cookie. Replace 'your-cookie' with a real cookie value before running.
headers = {
'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
'Cookie': 'your-cookie'
}
#工具类,用来去除爬取的正文中一些不需要的链接、标签等
class Tool:
    """Strips markup and boilerplate from scraped comment HTML: images,
    anchors, table/div tags, reply prefixes, and bare '//' separators."""
    # NOTE(review): '//.*?' ends in a lazy quantifier, so it only ever
    # matches the bare '//' — confirm whether full '//@user...' repost
    # chains were meant instead (module-level deleteAite() handles those).
    deleteImg = re.compile('<img.*?>')
    newLine = re.compile('<tr>|<div>|</tr>|</div>')
    deleteAite = re.compile('//.*?')
    deleteAddr = re.compile("<a.*?>.*?</a>|<a href='https:")
    deleteTag = re.compile('<.*?>')
    deleteWord = re.compile('回复@|回覆@|回覆|回复|:')

    @classmethod
    def replace(cls, x):
        """Apply every removal pattern to *x* and return the stripped text."""
        for pattern in (cls.deleteWord, cls.deleteImg, cls.deleteAite,
                        cls.deleteAddr, cls.newLine, cls.deleteTag):
            x = pattern.sub('', x)
        return x.strip()
def deleteAite(text):
    """Drop the repost chain ('//@user...') from a comment.

    Returns a single space when the whole text is a repost chain, the text
    up to the first '//@' when one appears mid-string, and the text
    unchanged when no '//@' is present.
    """
    match = re.search('//@', text, flags=0)
    if match is None:
        return text
    start = match.span()[0]
    return ' ' if start == 0 else text[:start]
class LxmCOM(object):
    """Scrape the comments of one weibo post from weibo.cn via xpath and
    save them (text + timestamp columns) into 'filename.xls'."""

    def __init__(self, weiboid, uid, maxpage, need):
        # weiboid: post id from the comment URL, e.g. 'H6yxq5ho2'
        # uid: post author's numeric user id, e.g. '3843591705'
        # maxpage: exclusive upper bound on comment pages to walk (int)
        # need: stop once this many comments have been collected (int)
        self.weiboid = weiboid
        self.uid = uid
        self.need = need
        self.maxpage = maxpage

    @staticmethod
    def _format_time(raw):
        """Normalize weibo.cn's relative timestamps to 'YYYY-MM-DD [HH:MM]'.

        Handles the site's relative forms ('刚刚', 'N分钟', '今天 HH:MM',
        'M月D日 ...'); anything else is truncated to its first 15 chars.
        NOTE(review): the original mixed byte and character offsets when
        slicing; the slices here assume Py3 unicode strings.
        """
        if '刚刚' in raw:
            return datetime.now().strftime('%Y-%m-%d %H:%M')
        if '分钟' in raw:
            ago = timedelta(minutes=int(raw[:raw.find('分钟')]))
            return (datetime.now() - ago).strftime('%Y-%m-%d %H:%M')
        if '今天' in raw:
            # assumes the form '今天 HH:MM...' — TODO confirm slice width
            return datetime.now().strftime('%Y-%m-%d') + ' ' + raw[3:9]
        if '月' in raw:
            year = datetime.now().strftime('%Y')
            month = raw[:raw.index('月')]
            day = raw[raw.index('月') + 1:raw.index('日')]
            return year + '-' + month + '-' + day + ' '
        return raw[:15]

    def get_url(self):
        """Walk the comment pages, clean each comment, write rows to xls.

        Network I/O: one GET per page with a 5s pause. The workbook is
        saved in a `finally`, so a clean run persists data too (the
        original only reached `excel.save` on the exception path).
        """
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        sheet.write(0, 0, 'text')
        sheet.write(0, 1, 'time')
        count = 0
        page = 1
        try:
            while page < self.maxpage and count < self.need:
                url = ('https://weibo.cn/comment/' + self.weiboid +
                       '??&uid=' + self.uid + '&&page={}'.format(page))
                print(url)
                page += 1
                # BUG FIX: the original passed 'html.parser' as requests'
                # positional `params` argument (a BeautifulSoup leftover),
                # appending garbage to the query string.
                html = requests.get(url, headers=headers).content
                selector = etree.HTML(html)
                divs = selector.xpath("//div[@class='c']")
                # divs[0] is the post itself; comments start at index 1.
                for div in divs[1:]:
                    try:
                        links = div.xpath(".//a")
                        node = div.xpath(".//span[@class='ctt']")[0]
                        ctt = node.xpath("string(.)")
                        # Drop the '回复@xxx:' prefix of reply comments.
                        if "回复" in ctt:
                            ctt = ctt[ctt.find(':') + 1:]
                        ctt = Tool.replace(ctt)
                        # Strip leading @-mentions / trailing repost chains.
                        if "@" in ctt:
                            if ctt.find("@") == 1:
                                if " " in ctt:
                                    ctt = ctt[ctt.find(" ") + 1:]
                                else:
                                    ctt = ""
                            else:
                                ctt = ctt[:ctt.find("@")]
                        if not ctt or ctt == " ":
                            continue
                        print(ctt)
                        count += 1
                        cc = div.xpath(".//span[@class='cc']")[0]
                        created_at = cc.xpath(".//a/text()")[0]
                        stamp = self._format_time(
                            div.xpath(".//span[@class='ct']/text()")[0])
                        # '赞[N]' → N; parsed but (as in the original) never
                        # written to the sheet.
                        like_counts = created_at[
                            created_at.index("[") + 1:created_at.index("]")]
                        userid = links[0].xpath("./@href")
                        if '/u/' in str(userid):
                            user = str(userid)[5:-2]
                        else:
                            user = str(userid)[3:-2]
                        sheet.write(count, 0, str(ctt))
                        # BUG FIX: the original wrote str(time) — the stdlib
                        # module — instead of the parsed timestamp.
                        sheet.write(count, 1, str(stamp))
                    except Exception as e:
                        print(e)
                print("已经抓取" + str(count) + "条数据")
                time.sleep(5)
        except Exception as e:
            print(e)
        finally:
            excel.save('filename.xls')
if __name__ == "__main__":
    # Fill these in before running; see the sample URL
    # 'https://weibo.cn/comment/H6yxq5ho2??&uid=3843591705'.
    weiboid = "此处是微博id"  # e.g. 'H6yxq5ho2'
    uid = "此处是用户id"      # e.g. '3843591705'
    # BUG FIX: get_url() compares these numerically (`i < self.maxpage`),
    # so they must be ints — the original placeholder strings raise
    # TypeError on Python 3.
    maxpage = 100   # 此处是评论最大页数
    need = 1000     # 此处是你需要的评论量
    COM = LxmCOM(weiboid, uid, maxpage, need)
    COM.get_url()
#url='https://weibo.cn/comment/H6yxq5ho2??&uid=3843591705'