前段时间笔者写了一份爬取微博评论的代码,用的是http://m.weibo.cn的接口。通过这个接口比较好的一点是代码比较好写,因为数据都是json格式的,规律性非常直观,熟悉json数据和字典操作的朋友都可以很快掌握。但是这个接口有个比较大的问题:基本限制了只能爬取前一百页的内容,而且cookie过期得比较快。所以现在提供一种新的方式,即通过xpath方式解析http://www.weibo.cn这个入口。这种方式下,所有微博评论都是可以在代码中直观看到的,只是可能大家对于xpath的一些相关操作不熟悉,这一块儿大家可以自己去找资源学习,这里不做介绍。所谓的xpath解析网页,实质上和bs4差不多,只是xpath在工业级用得比较多,bs4适合于小型爬虫任务。这种方式下cookie的有效期较长,笔者曾经试过一个cookie用了两到三个月未失效。大家可以在打开某一个微博的评论页面时,直接查看源代码。以“https://weibo.cn/comment/Hpg589Py7?ckAll=1”为例,查看源代码发现是这样的
这样大家可以直接复制下来,找一个格式化网站格式化一下就可以很规则了,这里不做演示。
然后规则化之后就是找h5标签,进行层层剖析即可,对于标签的剖析不做解释。源码贴在下面供大家学习。
# -*- coding:utf-8 -*-
__author__ = 'TengYu'
import requests
import xlwt
import json
import time
import re
from datetime import timedelta
from datetime import datetime
from lxml import etree
import sys

# Py2 only: force the default text codec to utf-8 so implicit str/unicode
# conversions of the Chinese comment text don't raise UnicodeDecodeError.
# BUG FIX: `reload` and `sys.setdefaultencoding` do not exist on Python 3,
# so the original crashed with NameError there — guard them by version.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- Py2 builtin
    sys.setdefaultencoding('utf8')

# Shared request headers: a desktop UA plus the logged-in weibo.cn session
# cookie. Replace 'your-cookie' with a real cookie value before running.
headers = {
'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
'Cookie': 'your-cookie'
}
#工具类,用来去除爬取的正文中一些不需要的链接、标签等
class Tool:
    """Strips markup and boilerplate from scraped comment HTML: images,
    anchors, table/div tags, reply prefixes, and bare '//' separators."""
    # NOTE(review): '//.*?' ends in a lazy quantifier, so it only ever
    # matches the bare '//' — confirm whether full '//@user...' repost
    # chains were meant instead (module-level deleteAite() handles those).
    deleteImg = re.compile('<img.*?>')
    newLine = re.compile('<tr>|<div>|</tr>|</div>')
    deleteAite = re.compile('//.*?')
    deleteAddr = re.compile("<a.*?>.*?</a>|<a href='https:")
    deleteTag = re.compile('<.*?>')
    deleteWord = re.compile('回复@|回覆@|回覆|回复|:')

    @classmethod
    def replace(cls, x):
        """Apply every removal pattern to *x* and return the stripped text."""
        for pattern in (cls.deleteWord, cls.deleteImg, cls.deleteAite,
                        cls.deleteAddr, cls.newLine, cls.deleteTag):
            x = pattern.sub('', x)
        return x.strip()
def deleteAite(text):
    """Drop the repost chain ('//@user...') from a comment.

    Returns a single space when the whole text is a repost chain, the text
    up to the first '//@' when one appears mid-string, and the text
    unchanged when no '//@' is present.
    """
    match = re.search('//@', text, flags=0)
    if match is None:
        return text
    start = match.span()[0]
    return ' ' if start == 0 else text[:start]
class LxmCOM(object):
    """Scrape the comments of one weibo post from weibo.cn via xpath and
    save them (text + timestamp columns) into 'filename.xls'."""

    def __init__(self, weiboid, uid, maxpage, need):
        # weiboid: post id from the comment URL, e.g. 'H6yxq5ho2'
        # uid: post author's numeric user id, e.g. '3843591705'
        # maxpage: exclusive upper bound on comment pages to walk (int)
        # need: stop once this many comments have been collected (int)
        self.weiboid = weiboid
        self.uid = uid
        self.need = need
        self.maxpage = maxpage

    @staticmethod
    def _format_time(raw):
        """Normalize weibo.cn's relative timestamps to 'YYYY-MM-DD [HH:MM]'.

        Handles the site's relative forms ('刚刚', 'N分钟', '今天 HH:MM',
        'M月D日 ...'); anything else is truncated to its first 15 chars.
        NOTE(review): the original mixed byte and character offsets when
        slicing; the slices here assume Py3 unicode strings.
        """
        if '刚刚' in raw:
            return datetime.now().strftime('%Y-%m-%d %H:%M')
        if '分钟' in raw:
            ago = timedelta(minutes=int(raw[:raw.find('分钟')]))
            return (datetime.now() - ago).strftime('%Y-%m-%d %H:%M')
        if '今天' in raw:
            # assumes the form '今天 HH:MM...' — TODO confirm slice width
            return datetime.now().strftime('%Y-%m-%d') + ' ' + raw[3:9]
        if '月' in raw:
            year = datetime.now().strftime('%Y')
            month = raw[:raw.index('月')]
            day = raw[raw.index('月') + 1:raw.index('日')]
            return year + '-' + month + '-' + day + ' '
        return raw[:15]

    def get_url(self):
        """Walk the comment pages, clean each comment, write rows to xls.

        Network I/O: one GET per page with a 5s pause. The workbook is
        saved in a `finally`, so a clean run persists data too (the
        original only reached `excel.save` on the exception path).
        """
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        sheet.write(0, 0, 'text')
        sheet.write(0, 1, 'time')
        count = 0
        page = 1
        try:
            while page < self.maxpage and count < self.need:
                url = ('https://weibo.cn/comment/' + self.weiboid +
                       '??&uid=' + self.uid + '&&page={}'.format(page))
                print(url)
                page += 1
                # BUG FIX: the original passed 'html.parser' as requests'
                # positional `params` argument (a BeautifulSoup leftover),
                # appending garbage to the query string.
                html = requests.get(url, headers=headers).content
                selector = etree.HTML(html)
                divs = selector.xpath("//div[@class='c']")
                # divs[0] is the post itself; comments start at index 1.
                for div in divs[1:]:
                    try:
                        links = div.xpath(".//a")
                        node = div.xpath(".//span[@class='ctt']")[0]
                        ctt = node.xpath("string(.)")
                        # Drop the '回复@xxx:' prefix of reply comments.
                        if "回复" in ctt:
                            ctt = ctt[ctt.find(':') + 1:]
                        ctt = Tool.replace(ctt)
                        # Strip leading @-mentions / trailing repost chains.
                        if "@" in ctt:
                            if ctt.find("@") == 1:
                                if " " in ctt:
                                    ctt = ctt[ctt.find(" ") + 1:]
                                else:
                                    ctt = ""
                            else:
                                ctt = ctt[:ctt.find("@")]
                        if not ctt or ctt == " ":
                            continue
                        print(ctt)
                        count += 1
                        cc = div.xpath(".//span[@class='cc']")[0]
                        created_at = cc.xpath(".//a/text()")[0]
                        stamp = self._format_time(
                            div.xpath(".//span[@class='ct']/text()")[0])
                        # '赞[N]' → N; parsed but (as in the original) never
                        # written to the sheet.
                        like_counts = created_at[
                            created_at.index("[") + 1:created_at.index("]")]
                        userid = links[0].xpath("./@href")
                        if '/u/' in str(userid):
                            user = str(userid)[5:-2]
                        else:
                            user = str(userid)[3:-2]
                        sheet.write(count, 0, str(ctt))
                        # BUG FIX: the original wrote str(time) — the stdlib
                        # module — instead of the parsed timestamp.
                        sheet.write(count, 1, str(stamp))
                    except Exception as e:
                        print(e)
                print("已经抓取" + str(count) + "条数据")
                time.sleep(5)
        except Exception as e:
            print(e)
        finally:
            excel.save('filename.xls')
if __name__ == "__main__":
    # Fill these in before running; see the sample URL
    # 'https://weibo.cn/comment/H6yxq5ho2??&uid=3843591705'.
    weiboid = "此处是微博id"  # e.g. 'H6yxq5ho2'
    uid = "此处是用户id"      # e.g. '3843591705'
    # BUG FIX: get_url() compares these numerically (`i < self.maxpage`),
    # so they must be ints — the original placeholder strings raise
    # TypeError on Python 3.
    maxpage = 100   # 此处是评论最大页数
    need = 1000     # 此处是你需要的评论量
    COM = LxmCOM(weiboid, uid, maxpage, need)
    COM.get_url()
#url='https://weibo.cn/comment/H6yxq5ho2??&uid=3843591705'