此次写的是python爬取微博话题下面的帖子,示例代码以爬取#转发这个杨超越#
https://s.weibo.com/weibo/%23%E8%BD%AC%E5%8F%91%E8%BF%99%E4%B8%AA%E6%9D%A8%E8%B6%85%E8%B6%8A%23
# -*- coding:utf-8 -*-
__author__ = 'TengYu'
import requests
import json
import re
import time
import xlwt
from bs4 import BeautifulSoup
# Request headers for the m.weibo.cn API.  Both values are placeholders:
# substitute a real browser User-Agent and a logged-in session Cookie,
# otherwise Weibo rejects the API calls.
headers = {
'User-agent' : 'Your-Agent',
'Cookie':'Your-cookie'
}
# Base search endpoint.  The containerid query embeds the URL-escaped topic
# "#转发这个杨超越#"; the page number is appended to this string per request.
url = 'https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23%E8%BD%AC%E5%8F%91%E8%BF%99%E4%B8%AA%E6%9D%A8%E8%B6%85%E8%B6%8A%23&page_type=searchall&page='
class Tool:
    """Strip Weibo-specific HTML noise from a post's raw text.

    The patterns are applied in a fixed order by :meth:`replace`; in
    particular ``deleteAddr`` must run before ``deleteTag`` so a link's
    visible text is removed together with its ``<a>`` tag rather than
    being left behind.
    """
    # Raw strings are the regex idiom; the pattern values are unchanged.
    deleteImg = re.compile(r'<img.*?>')                # inline images
    newLine = re.compile(r'<tr>|<div>|</tr>|</div>')   # layout tags
    deleteAite = re.compile(r'//.*?:')                 # "//@user:" repost prefixes
    deleteAddr = re.compile(r'<a.*?>.*?</a>')          # anchors incl. their text
    deleteTag = re.compile(r'<.*?>')                   # any remaining tag

    @classmethod
    def replace(cls, x):
        """Return *x* with images, repost prefixes, links and tags removed,
        stripped of surrounding whitespace."""
        # Compiled-pattern .sub() avoids re-passing the pattern to re.sub.
        x = cls.deleteImg.sub('', x)
        x = cls.deleteAite.sub('', x)
        x = cls.deleteAddr.sub('', x)
        x = cls.newLine.sub('', x)
        x = cls.deleteTag.sub('', x)
        return x.strip()
class tiezi(object):
    """Crawler for posts under a Weibo topic via the m.weibo.cn search API."""

    def get_info(self, url):
        """Fetch pages 1-40 of the topic feed rooted at *url*.

        *url* is the API endpoint ending in ``...&page=``; the page number
        is appended per request.  Every card of card_type 9 (a regular
        post) is cleaned with ``Tool.replace`` and written both to
        ``filename.txt`` (text only) and ``filename.xls``
        (id, name, time, text).
        """
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        for col, title in enumerate(('id', 'name', 'time', 'text')):
            sheet.write(0, col, title)

        count = 0
        # Context manager guarantees the text file is closed even on error
        # (the original leaked the handle).
        with open('filename.txt', 'w', encoding='utf-8') as txt:
            for page in range(1, 41):
                # Use the url parameter instead of re-hardcoding the base
                # (the module-level url is exactly this base string).
                page_url = url + str(page)
                print(page_url)  # was a Python-2-only "print url" statement
                # The module-level headers (UA + Cookie) were defined but
                # never sent; Weibo rejects anonymous API calls.
                response = requests.get(page_url, headers=headers)
                resj = json.loads(response.text)
                # Guard against a missing/empty payload instead of crashing
                # on .get('data').get('cards').
                cards = (resj.get('data') or {}).get('cards') or []
                for card in cards:
                    for item in card.get('card_group') or []:
                        # card_type 9 marks an ordinary post.
                        if int(item.get('card_type', 0)) != 9:
                            continue
                        count += 1
                        mblog = item.get('mblog')
                        user = mblog.get('user')
                        text = Tool.replace(mblog.get('text'))
                        txt.write(text + '\n')
                        # User ids are ints: str(), not .encode() — the
                        # original crashed calling id.encode('utf-8').
                        sheet.write(count, 0, str(user.get('id')))
                        sheet.write(count, 1, user.get('screen_name'))
                        sheet.write(count, 2, mblog.get('created_at'))
                        sheet.write(count, 3, text)
                        print("已经获取" + str(count) + "条数据")
                time.sleep(2)  # throttle between pages to avoid rate limiting
        excel.save('filename.xls')
if __name__ == '__main__':
    # Kick off the crawl against the module-level topic URL.
    crawler = tiezi()
    crawler.get_info(url)
为例,基本的url分析没太多难题,大家多写写就熟练了,毕竟不是什么技术活,我直接贴代码了
尊重原作,转载请注明,转载自:https://blog.csdn.net/kr2563