# QQ爬虫升级版 (QQ crawler, upgraded version)

#-*-coding:utf-8-*-
#该QQ_user_small_spider文件中,需要重写的是downLoad(numble)函数,其他都不用动。
#該項目需要一些前期工作,訪問https://blog.csdn.net/qq_41861526/article/details/80194266
import requests
import re
import os
import json
import time
import urllib3
from queue import Queue

# Silence urllib3 warnings; several requests in this file are sent with verify=False.
urllib3.disable_warnings()
# One shared HTTP session, so the headers and cookies installed by the helpers
# below are reused by every request.
session = requests.session()

# QQ number of the logged-in account; filled in by init().
user = ''
def init():
    """Install default request headers and determine the logged-in QQ number.

    Updates the shared ``session`` with browser-like headers, then sets the
    module-level ``user``. The number is taken from the first
    ``uin==o<digits>`` entry found in ``cookies.txt``; when the file cannot be
    read or holds no such entry, the number is asked for on stdin instead.
    """
    global user
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Referer': 'https://qzone.qq.com/',
        'Host': 'user.qzone.qq.com',
    }
    session.headers.update(headers)
    try:
        # The context manager closes the handle even if reading fails.
        with open('cookies.txt', 'r', encoding='utf-8') as cookie_file:
            saved = cookie_file.read()
        user = re.findall(r'uin==o(\d{1,15})', saved)[0]
    except (OSError, UnicodeDecodeError, IndexError):
        # No usable cookie file (missing, unreadable, or without a uin entry):
        # fall back to asking interactively.
        user = input('输入QQ号:')

def getGTK(cookie):
    """Compute the g_tk token from the ``p_skey`` entry of a cookie mapping.

    Starts from 5381 and, for every character of ``cookie['p_skey']``,
    multiplies the running value by 33 and adds the character's code point.
    The result is masked to its low 31 bits.
    """
    acc = 5381
    for ch in cookie['p_skey']:
        # acc + (acc << 5) is acc * 33.
        acc = acc * 33 + ord(ch)
    return acc & 0x7fffffff
def _strip_jsonp(text, callback='_Callback'):
    """Return the payload inside a ``callback(...)`` JSONP wrapper, or the stripped text if unwrapped."""
    body = text.strip()
    opener = callback + '('
    closer = body.rfind(')')
    if body.startswith(opener) and closer >= len(opener):
        # Cut only the outermost wrapper; any ");" inside the payload is data.
        return body[len(opener):closer]
    return body


def get_user_detail(g_tk, user, user_login):
    """Fetch the profile record of a QQ account as a dict.

    Loads ``https://user.qzone.qq.com/<user>`` to pick up the qzonetoken (the
    first ``return "..."`` string in the page), then queries the
    ``cgi_userinfo_get_all`` endpoint and decodes its JSONP reply.

    Args:
        g_tk: Token sent as the ``g_tk`` query parameter.
        user: QQ number sent as ``uin`` (the profile being requested).
        user_login: QQ number sent as ``vuin``.

    Returns:
        The decoded JSON object of the reply.

    Raises:
        IndexError: The profile page contains no ``return "..."`` token.
        ValueError: The reply is not valid JSON once the wrapper is removed.
    """
    r = session.get('https://user.qzone.qq.com/%s' % (user))
    qzonetoken = re.findall('return "(.*?)"', r.text, re.S)[0]
    params = {
        "uin": user,
        "vuin": user_login,
        "g_tk": g_tk,
        "qzonetoken": qzonetoken,
    }
    r = session.get(
        'https://h5.qzone.qq.com/proxy/domain/base.qzone.qq.com/cgi-bin/user/cgi_userinfo_get_all',
        params=params,
    )
    return json.loads(_strip_jsonp(r.text))
def cookielogin():
    """Load the cookies saved in ``cookies.txt`` and check that they are logged in.

    Each ``name==value`` line of the file is copied into the module-level
    ``cookies`` dict (created by the script entry point) and from there into
    the shared session's cookie jar. The account's ``infocenter`` page is then
    requested to see whether the session is accepted.

    Returns:
        True when the page answers with status 200 and does not contain the
        logged-out landing-page title; False otherwise, including when the
        cookie file is unreadable or lacks ``p_uin`` or the current ``user``.
    """
    try:
        with open(r'cookies.txt', 'r') as cookie_file:
            lines = cookie_file.read().splitlines()
    except (OSError, UnicodeDecodeError):
        return False

    saved = '\n'.join(lines)
    # Both markers are required. The original `not A and B` only rejected the
    # file when p_uin was missing while the user number was present.
    if not ('p_uin' in saved and user in saved):
        return False

    for line in lines:
        # Split on the first "==" only, so values that themselves contain "=="
        # (e.g. base64 padding) are kept whole.
        name, sep, value = line.partition('==')
        if not sep:
            continue  # blank or malformed line
        cookies[name] = value
    cookies['_qz_referrer'] = 'i.qq.com'
    requests.utils.add_dict_to_cookiejar(session.cookies, cookies)

    r = session.get('https://user.qzone.qq.com/%s/infocenter' % (user), verify=False)
    if r.status_code != 200:
        return False

    # The landing-page title shows up only when the session is not logged in.
    return not re.findall('QQ空间-分享生活,留住感动', r.text)


def login():
    """Open QQ-zone in Firefox and save the browser's cookies to ``cookies.txt``.

    The page is opened through selenium, the function sleeps for ten seconds,
    and every cookie the browser then holds is written as one ``name==value``
    line. The browser is always closed before returning.

    Returns:
        True when the cookies were written, False when reading or writing
        them failed.
    """
    from selenium import webdriver

    driver = webdriver.Firefox()
    try:
        driver.get('https://user.qzone.qq.com/')

        # Fixed pause before the cookies are read; presumably the window in
        # which the user completes the login by hand.
        time.sleep(10)
        try:
            with open(r'cookies.txt', 'w+') as f:
                for cookie in driver.get_cookies():
                    f.write(cookie['name'] + '==' + cookie['value'] + '\n')
        except Exception:
            # Best effort, as before: report failure instead of propagating.
            return False
        return True
    finally:
        # Release the browser process; the original left it running.
        driver.quit()
def judge(j_tk, user_i, numble):
    """Decide whether a QQ account passes the crawl filter.

    Args:
        j_tk: g_tk token forwarded to ``get_user_detail``.
        user_i: QQ number sent as ``i_login_uin`` in the feed request.
        numble: QQ number sent as ``i_uin`` in the feed request, i.e. the
            account whose feed is fetched.

    Returns:
        True when the first "浏览N次" counter found in the feed is greater
        than 10; False otherwise, including when no counter is present or it
        is not a number.
    """
    r = session.get('https://h5.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds_html_module?i_uin=%s&i_login_uin=%s' % (numble, user_i))
    r.encoding = r.apparent_encoding
    view_counts = re.findall(r'浏览(\d*?)次', r.text, re.S)

    # Use this function's own arguments: the original read the script-level
    # globals g_tk / an / user here, which only exist while the main loop runs.
    detail = get_user_detail(j_tk, numble, user_i)
    # `detail` is a dict and is the hook for custom filters. Example reply:
    #   {'uin': 11419, 'nickname': '...', 'sex': 0, 'age': 0, 'birthyear': 0,
    #    'birthday': '00-00', 'country': '', 'province': '', 'city': '',
    #    'hco': '', 'hp': '', 'hc': '', 'marriage': 1, 'career': '-', ...}
    # Fields, per the original author's notes:
    #   detail['age']        age, e.g. 19
    #   detail['birthyear']  year of birth, e.g. 1999
    #   detail['sex']        0, 1 or 2; 1 is male, 2 is female
    #   detail['hco'] / detail['hp'] / detail['hc']
    #                        home country / province / city
    # Example filter that keeps male accounts only:
    #   if detail['sex'] != 1:
    #       return False

    try:
        first_count = int(view_counts[0])
    except (IndexError, ValueError):
        # No counter in the feed, or the pattern matched an empty string.
        return False
    # Threshold: the first post must have more than 10 views.
    return first_count > 10


def downLoad(numble, delay=6, path='qqnumble.txt'):
    """Record an accepted QQ number, report it, then pause.

    Storage is deliberately simple: the number is appended as one line to
    *path*. No de-duplication is done here; the original author suggests a
    database unique key if that is wanted.

    Args:
        numble: QQ number to store; converted with ``str`` before writing.
        delay: Seconds to sleep afterwards as an anti-scraping throttle.
        path: File the number is appended to.
    """
    # The context manager flushes and closes the file on every call.
    with open(path, 'a', encoding='utf-8') as out:
        out.write(str(numble) + '\n')
    print("https://user.qzone.qq.com/%s succeed" % (numble))
    time.sleep(delay)  # anti-scraping throttle

if __name__ == "__main__":
    # Crawl entry point: log in, seed the queue from the logged-in user's own
    # page, then expand through the accounts that appear in each feed.
    init()
    # Kept at module level: cookielogin() fills this dict through its global name.
    cookies = {}
    max_pending = 100000
    q = Queue(maxsize=max_pending)
    # q.put(101538)  # Uncomment to also seed the crawl with this account (the
    #                # author calls this crawling "nationwide"); otherwise the
    #                # crawl starts from your own page only.

    if not cookielogin():
        login()
        # Re-check after the browser login instead of ignoring the result.
        if not cookielogin():
            raise SystemExit('login failed: cookies.txt does not hold a valid QQ-zone session')

    r = session.get('https://user.qzone.qq.com/%s' % (user), verify=False)
    # data-uin is the QQ number of someone who liked a post; data-origuin is
    # the owner of that post.
    numble = set(re.findall(r'data-uin="(\d*?)"', r.text))
    numble.discard('')  # the pattern also matches an empty attribute

    # `g_tk` and the loop variable `an` stay module-level names because code
    # elsewhere in this file may refer to them as globals.
    g_tk = getGTK(session.cookies.get_dict())

    for uin in numble:
        q.put(uin)

    # Accounts already judged, so an account is checked and saved at most once.
    judged = set()

    # Stop when nothing is left to expand; q.get() on an empty queue would
    # otherwise block forever.
    while not q.empty():
        r = session.get('https://h5.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds_html_module?i_uin=%s&i_login_uin=%s'
                        % (q.get(), user), verify=False)

        ans = set(re.findall(r'des_(\d{5,15})', r.text, re.S))

        for an in ans:
            if an in judged:
                continue
            judged.add(an)

            url = 'https://user.qzone.qq.com/%s' % (an)
            r = session.get(url, cookies=cookies, verify=False)
            # judge(j_tk, user_i, numble) puts numble in i_uin and user_i in
            # i_login_uin, so the candidate account goes last.
            if not re.findall('QQ空间-分享生活,留住感动', r.text, re.S) and judge(g_tk, user, an):
                downLoad(an)

            # A put() on a full queue would block forever in this
            # single-threaded loop, so stop expanding this feed instead.
            if q.full():
                break
            q.put(an)

# 猜你喜欢 (blog "you may also like" footer)
#
# 转载自 blog.csdn.net/qq_41861526/article/details/80946702