此次写的是python爬取微博话题下面的帖子,示例代码以爬取#转发这个杨超越#
https://s.weibo.com/weibo/%23%E8%BD%AC%E5%8F%91%E8%BF%99%E4%B8%AA%E6%9D%A8%E8%B6%85%E8%B6%8A%23
# -*- coding:utf-8 -*-
__author__ = 'TengYu'
import requests
import json
import re
import time
import xlwt
from bs4 import BeautifulSoup
# Request headers for the m.weibo.cn API.  Both values are placeholders:
# substitute a real browser User-Agent and a logged-in session Cookie,
# otherwise Weibo rejects the API calls.
headers = {
'User-agent' : 'Your-Agent',
'Cookie':'Your-cookie'
}
# Base search endpoint.  The containerid query embeds the URL-escaped topic
# "#转发这个杨超越#"; the page number is appended to this string per request.
url = 'https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23%E8%BD%AC%E5%8F%91%E8%BF%99%E4%B8%AA%E6%9D%A8%E8%B6%85%E8%B6%8A%23&page_type=searchall&page='
class Tool:
    """Strip Weibo-specific HTML noise from a post's raw text.

    The patterns are applied in a fixed order by :meth:`replace`; in
    particular ``deleteAddr`` must run before ``deleteTag`` so a link's
    visible text is removed together with its ``<a>`` tag rather than
    being left behind.
    """
    # Raw strings are the regex idiom; the pattern values are unchanged.
    deleteImg = re.compile(r'<img.*?>')                # inline images
    newLine = re.compile(r'<tr>|<div>|</tr>|</div>')   # layout tags
    deleteAite = re.compile(r'//.*?:')                 # "//@user:" repost prefixes
    deleteAddr = re.compile(r'<a.*?>.*?</a>')          # anchors incl. their text
    deleteTag = re.compile(r'<.*?>')                   # any remaining tag

    @classmethod
    def replace(cls, x):
        """Return *x* with images, repost prefixes, links and tags removed,
        stripped of surrounding whitespace."""
        # Compiled-pattern .sub() avoids re-passing the pattern to re.sub.
        x = cls.deleteImg.sub('', x)
        x = cls.deleteAite.sub('', x)
        x = cls.deleteAddr.sub('', x)
        x = cls.newLine.sub('', x)
        x = cls.deleteTag.sub('', x)
        return x.strip()
class tiezi(object):
    """Crawler for posts under a Weibo topic via the m.weibo.cn search API."""

    def get_info(self, url):
        """Fetch pages 1-40 of the topic feed rooted at *url*.

        *url* is the API endpoint ending in ``...&page=``; the page number
        is appended per request.  Every card of card_type 9 (a regular
        post) is cleaned with ``Tool.replace`` and written both to
        ``filename.txt`` (text only) and ``filename.xls``
        (id, name, time, text).
        """
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        for col, title in enumerate(('id', 'name', 'time', 'text')):
            sheet.write(0, col, title)

        count = 0
        # Context manager guarantees the text file is closed even on error
        # (the original leaked the handle).
        with open('filename.txt', 'w', encoding='utf-8') as txt:
            for page in range(1, 41):
                # Use the url parameter instead of re-hardcoding the base
                # (the module-level url is exactly this base string).
                page_url = url + str(page)
                print(page_url)  # was a Python-2-only "print url" statement
                # The module-level headers (UA + Cookie) were defined but
                # never sent; Weibo rejects anonymous API calls.
                response = requests.get(page_url, headers=headers)
                resj = json.loads(response.text)
                # Guard against a missing/empty payload instead of crashing
                # on .get('data').get('cards').
                cards = (resj.get('data') or {}).get('cards') or []
                for card in cards:
                    for item in card.get('card_group') or []:
                        # card_type 9 marks an ordinary post.
                        if int(item.get('card_type', 0)) != 9:
                            continue
                        count += 1
                        mblog = item.get('mblog')
                        user = mblog.get('user')
                        text = Tool.replace(mblog.get('text'))
                        txt.write(text + '\n')
                        # User ids are ints: str(), not .encode() — the
                        # original crashed calling id.encode('utf-8').
                        sheet.write(count, 0, str(user.get('id')))
                        sheet.write(count, 1, user.get('screen_name'))
                        sheet.write(count, 2, mblog.get('created_at'))
                        sheet.write(count, 3, text)
                        print("已经获取" + str(count) + "条数据")
                time.sleep(2)  # throttle between pages to avoid rate limiting
        excel.save('filename.xls')
if __name__ == '__main__':
    # Kick off the crawl against the module-level topic URL.
    crawler = tiezi()
    crawler.get_info(url)
为例,基本的url分析没太多难题,大家多写写就熟练了,毕竟不是什么技术活,我直接贴代码了
尊重原作,转载请注明,转载自:https://blog.csdn.net/kr2563