使用python做微博爬虫遇到的问题(1)

使用的是python 2.7

python装不了requests:

百度上的大致意思是我的电脑的用户名是中文,Python 2 对安装路径中包含中文字符的支持不好
解决办法是在python文件路径下(我的是C:\Python27)的Lib\site-packages内新建一个名为“sitecustomize.py”的文件,文件里写入:

import sys 
sys.setdefaultencoding('gbk')

保存即可

‘NoneType’ object is not iterable错误

出现这个错误的原因是有个变量的值为 None,比如:

File "XXXXXXXXX.py", line 33, in main
for card in list_cards:
TypeError: 'NoneType' object is not iterable

这里是因为 list_cards 是 None,所以要追查 list_cards 的来源,这里我用到了 debug
如何使用可查看:Python如何用自带的IDLE进行调试DEBUG

查到了获取微博用户主页的某条微博内容是有两层的(data和cards):

然后通过开发者工具查看也是如此:
这里写图片描述

然后将原来代码:

list_cards = ob_json.get('cards')

修改代码为

# _*_ coding:utf-8 _*_

from lxml import html
import requests
import json
import re


class Tool:
    """Utility for stripping HTML markup out of weibo text snippets."""

    # <img ...> tags, short runs of spaces (1-7), and &nbsp; entities
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # opening and closing anchor tags
    removeAddr = re.compile('<a.*?>|</a>')
    # assorted structural/layout tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # any tag still left after the passes above
    removeTag = re.compile('<.*?>')

    @classmethod
    def replace(cls, x):
        """Apply every cleanup pattern to `x` and return the trimmed text."""
        for pattern in (cls.removeImg, cls.removeAddr,
                        cls.replaceLine, cls.removeTag):
            x = pattern.sub('', x)
        # drop leading/trailing whitespace left over after tag removal
        return x.strip()


class Weibo(object):
    def get_weibo(self,id,page):

        url = 'https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid=107603{}&page={}'.format(id,id,id,page)
        response = requests.get(url)
        ob_json = json.loads(response.text)

        list_cards = ob_json.get('data').get('cards')


        return list_cards

    def get_comments(self,id,page):
        url = 'https://m.weibo.cn/api/comments/show?id={}&page={}'.format(id,page)
        response = requests.get(url)
        ob_json = json.loads(response.text)
        list_comments = ob_json.get('data').get('hot_data')

        return list_comments

    def main(self,uid,page):
        list_cards = self.get_weibo(uid,page)

        if list_cards != None:
            for card in list_cards:
                if card.get('card_type')==9:
                    id=card.get('mblog').get('id')
                    text=card.get('mblog').get('text')
                    text = Tool.replace(text)
                    print '******'
                    print u'微博:'+text+'\n'

                    list_comments = weibo.get_comments(id,1)
                    count_hotcomments =1
                    for comment in list_comments:
                        created_at = comment.get('created_at')
                        like_counts = comment.get('like_counts')
                        source = comment.get('source')
                        text = comment.get('text')
                        tree = html.fromstring(text)
                        text = tree.xpath('string(.)')#用string函数过滤多余标签
                        name_user = comment.get('user').get('screen_name')
                        if source =='':
                            source = u'未知'

                        print str(count_hotcomments),':**',name_user+u'  时间'+created_at+u'  点赞'+str(like_counts)+u'  来源'+source
                        print text+'\n'
                        count_hotcomments+=1
                    print '============='

if __name__ == '__main__':
    # Entry point: crawl page 1 of user 1192329374's timeline and print it.
    # NOTE(review): main() references the global name `weibo` internally,
    # so this variable name must not change.
    weibo=Weibo()
    weibo.main('1192329374',1)



猜你喜欢

转载自blog.csdn.net/qq_20366761/article/details/79443303
今日推荐