Scraping Weibo Hot-Comment User IDs -- Simple Version

Sharing some code I wrote a while back. The performance is not great; when I have time I'll write a higher-performance version. If the program stops running, just refresh the cookie.
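The hardcoded cookie below is a logged-in session copied from a browser, so it goes stale over time. A quick pre-flight check can tell you whether it still works before a long run. This is a minimal sketch, assuming that logged-out requests to weibo.com land on the "Sina Visitor System" interstitial page (verify that marker against what your own logged-out browser actually sees):

import requests

def cookie_ok(session, headers):
    # Fetch the front page with the scraper's own headers/cookie; if the
    # cookie is stale we expect the visitor interstitial instead of the feed.
    # The marker string is an assumption, not something from the original code.
    try:
        r = session.get('https://weibo.com/', headers=headers, timeout=10)
    except requests.RequestException:
        return False
    return 'Sina Visitor System' not in r.text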

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 09:06:30 2018

@author: Administrator
"""
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import pymongo
s = requests.Session()  # one shared Session so the login cookie persists across requests
# Request headers copied from a logged-in browser session; when the script
# stops working, refresh the Cookie value below from your browser
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'SINAGLOBAL=540062493192.52795.1513920812166; SCF=ArfNVnv0G8snZQO-0tjU0Ny9wYh-vDVHnpo6E1gG8lLyEe3Sc6oyB6gN6QcUzJz0N5Dawz8Rx4-RQNY-C9jfvsk.; SUHB=0U1JhnCkECPwyQ; SUB=_2AkMsEDyjdcPxrAZXkP8QxGjkbopH-jyfxVVVAn7uJhMyAxh77g4MqSVutBF-XL9ifGMm870rAjKzYJ5pfxC2NAVA; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWpCGKsXaGF3kwZPywp173e5JpVF020Sh2NehqXehq0; wb_view_log=1366*7681; UOR=acm.hdu.edu.cn,widget.weibo.com,www.baidu.com; YF-Page-G0=c6cf9d248b30287d0e884a20bac2c5ff; _s_tentry=-; Apache=1746290529031.4756.1531789292730; ULV=1531789297281:20:5:4:1746290529031.4756.1531789292730:1531786194935; login_sid_t=4cae36a37c8c1d38c8cb8d8e518fb3cb; YF-Ugrow-G0=9642b0b34b4c0d569ed7a372f8823a8e; YF-V5-G0=b59b0905807453afddda0b34765f9151; WBtopGlobal_register_version=2e7679c973813872; cross_origin_proto=SSL; WBStorage=5548c0baa42e6f3d|undefined',
    'Host': 'weibo.com',
    'Referer': 'https://weibo.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4092.1 Safari/537.36'
}
# A single HTTP proxy; replace it with a live one or drop proxies=pro below
pro = {
    'http': 'http://114.229.71.72:808'
}
# Connect to MongoDB
client = pymongo.MongoClient(host='localhost')
db = client['WeiboUser']
collection = db['ID']
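# Optional hardening (not in the original script): save_id below inserts every
# id it sees, so the collection accumulates duplicates. A unique index on the
# 'user' field (the key save_id writes) makes MongoDB reject repeats; a
# duplicate insert then raises pymongo.errors.DuplicateKeyError, which the
# generic except in save_id already swallows. Uncomment to enable:
#collection.create_index('user', unique=True)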
# Get the post links off the Weibo front page
def get_page_url():
    data = {
        'ajwvr': 6,
        'category': 0,
        'page': 2,
        'lefnav': 0,
        'cursor': '',
        '__rnd': int(time.time() * 1000)
    }
    url = 'https://weibo.com/a/aj/transform/loadingmoreunlogin?' + urlencode(data)
    #print(url)
    url_data = s.get(url, headers=headers, proxies=pro).text
    pat = re.compile(r'<ul\sclass(.*?)read_pos', re.S)
    data = re.search(pat, url_data)
    #print(data.group(1))
    doc = pq(data.group(1))
    a = doc('div')
    for ul in a.items():
        if ul.attr('href'):
            d = ul.attr('href')
            url = 'https:' + re.sub(r'\\', '', d)  # strip the escaping backslashes
            url = re.sub('"', '', url)             # strip the quotes
            # The condition below has not been formally tested yet
            '''if 'feedsdkn' in url:
                url = url + '&type=comment'
            else:
                url = url + '?type=comment'
            '''
            get_page_detail(url)
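# Illustrative only: the href values in the loadingmoreunlogin response come
# back JSON-escaped, which is why get_page_url strips backslashes and quotes.
# The post path below is made up to show the shape of the data:
def _demo_unescape():
    d = '\\"\\/\\/weibo.com\\/1234567890\\/HqXyZabcd\\"'
    url = 'https:' + re.sub(r'\\', '', d)  # 'https:"//weibo.com/1234567890/HqXyZabcd"'
    url = re.sub('"', '', url)             # 'https://weibo.com/1234567890/HqXyZabcd'
    return url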
# Open each link and page through its comments
def get_page_detail(url):
    print(url)
    id_text = s.get(url, headers=headers)
    id_text.encoding = 'utf-8'
    #print(id_text.text)
    #time.sleep(15)
    # Grab the chunk between the "按热度" (by hotness) and "按时间" (by time)
    # sort tabs on the post page; the comment-feed parameters live in there
    pat = re.compile('按热度(.*?)按时间', re.S)
    text = re.search(pat, id_text.text)
    #print(text.group(1))
    doc = pq(text.group(1))
    url_id = doc('a').attr('action-data')  # the link fragment holding the request parameters
    #print(url_id)
    id_url = re.sub(r'\\"', '', url_id)
    user_id = re.sub(r'\D+', '', id_url)   # the poster's (big V's) own user id; not used below

    for page in range(1, 20000):
        head_url = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&'
        time.sleep(1)
        if page > 1:
            id_text = s.get(url, headers=headers)
            response = id_text.json()

            # From page 4 on, loading more comments needs a click, i.e. the
            # page-flip tag changes, so a different node has to be parsed
            if page >= 4:
                if response and 'data' in response.keys():
                    data = response.get('data').get('html')
                    # Pull the user ids out of this chunk of comment HTML
                    find_user_id(data)
                    #print(data)
                    soup = BeautifulSoup(data, 'lxml')
                    action = soup.find(attrs={'action-type': "click_more_comment"})
                    try:
                        id_url = action.attrs['action-data']
                    except AttributeError:
                        print('This link has been mined to the end')
                        return
                else:
                    print('Request failed')
                    continue
            else:
                if response and 'data' in response.keys():
                    data = response.get('data').get('html')
                    find_user_id(data)
                    soup = BeautifulSoup(data, 'lxml')
                    try:
                        action = soup.find(attrs={'node-type': "comment_loading"})
                        id_url = action.attrs['action-data']
                    except Exception:
                        print('Failed to parse the page-flip tag')
                    #print(action)
                else:
                    print('Request failed')
                    continue

            id_url = re.sub('all', 'hot', id_url)  # switch the comment filter from all to hot
            url = head_url + id_url + '&from=singleWeiBo' + '&__rnd=' + str(int(time.time() * 1000))
            #print(url)
            time.sleep(1)

        # First page of comments
        else:
            id_url = re.sub('all', 'hot', id_url)
            url = head_url + id_url + '&__rnd=' + str(int(time.time() * 1000))
            #print(url)
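# Illustrative only: what one pagination step above does to a hypothetical
# action-data fragment. The parameter names here are made up; the real
# fragment is scraped from the page and its exact keys may differ:
def _demo_comment_url():
    id_url = 'id=4321&root_comment_max_id=987&filter=all&page=2'
    id_url = re.sub('all', 'hot', id_url)  # ask for hot comments instead of all
    # -> .../comment/big?ajwvr=6&id=4321&...&filter=hot&page=2&from=singleWeiBo&__rnd=<ms>
    return ('https://weibo.com/aj/v6/comment/big?ajwvr=6&' + id_url
            + '&from=singleWeiBo' + '&__rnd=' + str(int(time.time() * 1000)))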
# Extract user ids from a chunk of comment HTML
def find_user_id(response):
    pat = re.compile(r'o_uid=(\d+)&', re.S)
    uid = re.findall(pat, response)
    # Store the ids
    save_id(uid)
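
# Illustrative only: o_uid=<digits>& is the user-id parameter embedded in each
# commenter's profile link. The snippet is made up to show the expected shape:
def _demo_find_uid():
    html = '<a usercard="id=123&o_uid=5550001234&fid=0">SomeUser</a>'
    return re.findall(r'o_uid=(\d+)&', html)  # ['5550001234']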
    
num = 0  # running count of stored users

# Store the user ids in MongoDB
def save_id(uid):
    global num  # global so the counter survives across calls
    name = 'user'
    # Each record is a dict of the form {'user': <id>}
    for u in uid:
        try:
            db['ID'].insert_one({name: u})
            print('Saved  ' + str(num))
        except Exception:
            print('Save failed, skipping this id')
        num += 1
   
def main():
    get_page_url()
    print('Initial mining finished')

if __name__ == '__main__':
    main()
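
A single transient network error currently kills the whole run. A minimal retry wrapper is sketched below; the attempt count and backoff are arbitrary choices, not part of the original script:

import time
from requests.exceptions import RequestException

def run_with_retries(attempts=3, backoff=30):
    # Re-run main() after transient network errors, waiting between tries
    for _ in range(attempts):
        try:
            main()
            return
        except RequestException as e:
            print('Network error (%s), retrying in %d seconds' % (e, backoff))
            time.sleep(backoff)
    print('Giving up after %d attempts' % attempts)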

Reposted from blog.csdn.net/Lzs1998/article/details/87987595