Yang Chaoyue Weibo crawler (post text + images); follower info to come

# -*- coding: utf-8 -*-
import json
import os
import random
import re
import time
import urllib.request

import requests


id = '5644764907'  # uid to crawl (Yang Chaoyue, https://m.weibo.cn/u/5644764907); note this shadows the builtin id()
# Pool of proxy IPs. These free proxies come from the original post and have
# likely gone stale; also, each entry only covers the 'http' scheme, so
# https:// URLs (such as the m.weibo.cn API) will bypass the proxy.
proxy = [{'http': '106.14.47.5:80'},
         {'http': '61.135.217.7:80'},
         {'http': '58.53.128.83:3128'},
         {'http': '58.118.228.7:1080'},
         {'http': '221.212.117.10:808'},
         {'http': '115.159.116.98:8118'},
         {'http': '121.33.220.158:808'},
         {'http': '124.243.226.18:8888'},
         {'http': '124.235.135.87:80'},
         {'http': '14.118.135.10:808'},
         {'http': '119.176.51.135:53281'},
         {'http': '114.94.10.232:43376'},
         {'http': '218.79.86.236:54166'},
         {'http': '221.224.136.211:35101'},
         {'http': '58.56.149.198:53281'}]
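# Because free proxies die quickly, a liveness check before crawling may be
# worthwhile. This is only a sketch: 'is_proxy_alive' and the http:// test URL
# are assumptions introduced here, not part of the original script.
def is_proxy_alive(proxy_addr, test_url='http://m.weibo.cn', timeout=5):
    try:
        # the http:// scheme ensures the 'http' proxy entry is actually used
        requests.get(test_url, proxies=proxy_addr, timeout=timeout)
        return True
    except requests.RequestException:
        return False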
# Open a URL through the given proxy, sending a desktop-browser User-Agent.
def use_proxy(url, proxy_addr):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    # avoid naming this 'proxy', which would shadow the global proxy list
    proxy_handler = urllib.request.ProxyHandler(proxy_addr)
    opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)
    # open through this opener directly rather than installing it globally
    data = opener.open(req).read().decode('utf-8', 'ignore')
    return data
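# For reference, the same fetch via the requests library (imported above) is a
# one-liner. 'fetch_via_requests' is a name introduced here for illustration
# only; the original flow uses use_proxy() throughout:
def fetch_via_requests(url, proxy_addr):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64)"}
    return requests.get(url, headers=headers, proxies=proxy_addr, timeout=10).text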

# Fetch the user's basic profile info: nickname, profile URL, avatar,
# follow count, follower count, gender, level, etc.
def get_userInfo(id):
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id  # profile API
    seed_num = random.randrange(len(proxy))  # pick a random proxy from the pool
    proxy_addr = proxy[seed_num]
    data = use_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    profile_image_url = content.get('userInfo').get('profile_image_url')
    description = content.get('userInfo').get('description')
    profile_url = content.get('userInfo').get('profile_url')
    verified = content.get('userInfo').get('verified')
    guanzhu = content.get('userInfo').get('follow_count')   # following count
    name = content.get('userInfo').get('screen_name')
    fensi = content.get('userInfo').get('followers_count')  # follower count
    gender = content.get('userInfo').get('gender')
    urank = content.get('userInfo').get('urank')

    print("Nickname: " + name + "\n" + "Profile URL: " + profile_url + "\n" + "Avatar URL: " + profile_image_url + "\n" + "Verified: " + str(verified) + "\n" + "Bio: " + description + "\n" + "Following: " + str(guanzhu) + "\n" + "Followers: " + str(fensi) + "\n" + "Gender: " + gender + "\n" + "Level: " + str(urank) + "\n")
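# Note: the chained .get() calls above assume the API returned valid data; if a
# proxy is rejected, json.loads(data).get('data') may be None and the lookups
# raise AttributeError. A minimal defensive lookup, as a sketch (the helper
# name 'get_user_field' is introduced here for illustration only):
def get_user_field(content, field, default=''):
    return ((content or {}).get('userInfo') or {}).get(field, default)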

def save_pics(pics_info, m):
    print("pic_save start")
    for pic_info in pics_info:
        pic_url = pic_info['large']['url']  # full-size image
        # pic_url = pic_info['url']  # low-resolution image
        pic_path = os.path.join(pics_dir, '%d.jpg' % m)  # pics_dir is set in __main__
        try:
            # stream the image to disk in chunks
            with open(pic_path, 'wb') as f:
                for chunk in requests.get(pic_url, stream=True).iter_content(chunk_size=1024):
                    f.write(chunk)
        except Exception:
            print(pic_path + ' failed to save')
        else:
            print(pic_path + ' saved')
            m += 1
    return m  # return the updated counter so later posts do not overwrite earlier files

# Get the containerid of the user's weibo tab; needed when fetching posts.
def get_containerid(url, proxy_addr):
    data = use_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    containerid = ''  # avoid a NameError if no 'weibo' tab is present
    for tab in content.get('tabsInfo').get('tabs'):
        if tab.get('tab_type') == 'weibo':
            containerid = tab.get('containerid')
    return containerid
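# The containerid returned here is appended to the profile API URL as
# '&containerid=<id>&page=<n>' to page through the user's posts; see get_weibo
# below for the actual usage.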
 

 
# Fetch the user's posts and save them to text files. For each post we keep the
# text, the post's detail-page URL, and the like/comment/repost counts.
def get_weibo(id, file, file_content):
    i = 1  # page counter
    m = 0  # image filename counter
    while True:
        num = random.randrange(len(proxy))  # rotate proxies at random
        proxy_addr = proxy[num]
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
        weibo_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id + '&containerid=' + get_containerid(url, proxy_addr) + '&page=' + str(i)
        print(url)
        print(weibo_url)
        try:
            data = use_proxy(weibo_url, proxy_addr)
            content = json.loads(data).get('data')
            cards = content.get('cards')
            if len(cards) > 0:
                for j in range(len(cards)):
                    print("page " + str(i) + ", post " + str(j))
                    card_type = cards[j].get('card_type')
                    if card_type == 9:  # card_type 9 is an ordinary weibo post
                        mblog = cards[j].get('mblog')
                        attitudes_count = mblog.get('attitudes_count')
                        comments_count = mblog.get('comments_count')
                        created_at = mblog.get('created_at')
                        reposts_count = mblog.get('reposts_count')
                        scheme = cards[j].get('scheme')

                        # post text, with inline HTML tags stripped
                        text = mblog.get('text')
                        if text is None:
                            continue  # skip posts without text instead of aborting the crawl
                        text = re.sub(r"<.*?>", "", text)
                        with open(file_content, 'a+', encoding='utf-8') as f1:
                            f1.write(str(text) + "\n")

                        # download attached images, if any
                        pics_info = mblog.get('pics')
                        if pics_info:
                            print("have pics")
                            m = save_pics(pics_info, m)

                        with open(file, 'a+', encoding='utf-8') as fh:
                            fh.write("page " + str(i) + ", post " + str(j) + "\n")
                            fh.write("Post URL: " + str(scheme) + "\n" + "Posted at: " + str(created_at) + "\n" + "Text: " + text + "\n" + "Likes: " + str(attitudes_count) + "\n" + "Comments: " + str(comments_count) + "\n" + "Reposts: " + str(reposts_count) + "\n")
                i += 1
                time.sleep(random.randint(1, 3))  # pause between pages to be gentle on the API
            else:
                break
        except Exception as e:
            print(e)
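# Note: the while-True loop above exits only when a page returns no cards; if
# every proxy keeps failing on some page, that page is retried forever. A
# bounded variant is a one-line sketch: replace 'while True:' with, for
# example, 'for _ in range(max_pages):' where max_pages is a cap you choose.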

if __name__ == "__main__":
    print('--- start ---')
    pics_dir = r"D:\software_study\my_jupyter_notebook\scrawl\pics_origin"
    os.makedirs(pics_dir, exist_ok=True)  # make sure the image directory exists
    file_all = "ycy_all.txt"          # full records: URL, time, text, counts
    file_content = "ycy_content.txt"  # post text only
    get_userInfo(id)
    get_weibo(id, file_all, file_content)
    print('--- done ---')
 

Results:

Weibo text output: (screenshot not preserved)

Downloaded images: (screenshot not preserved)

GO! Charge ahead!!! Surpass everything

Reposted from www.cnblogs.com/strangewx/p/10119994.html