#-*-coding:utf-8-*-
# In this QQ_user_small_spider file only the downLoad(numble) function needs to be rewritten; nothing else has to change.
# This project needs some preparation first; see https://blog.csdn.net/qq_41861526/article/details/80194266
import requests
import re
import os
import json
import time
import urllib3
from queue import Queue
# Silence urllib3 warnings (requests in this file are sent with verify=False).
urllib3.disable_warnings()
# One requests session shared by every function in this module.
session = requests.session()
# QQ number of the account the crawl runs as; filled in by init().
user = ''
def init():
    """Prepare the shared HTTP session and pick the QQ account to crawl as.

    Installs browser-like default headers on the module-level ``session`` and
    sets the module-level ``user`` to the first ``uin==o<digits>`` entry found
    in ``cookies.txt``. When the file cannot be read or holds no such entry,
    the account number is asked for on the console instead.
    """
    global session
    global user
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Referer': 'https://qzone.qq.com/',
        'Host': 'user.qzone.qq.com',
    }
    session.headers.update(headers)
    try:
        # Context manager so the cookie file is closed even if reading fails.
        with open('cookies.txt', 'r', encoding='utf-8') as cookie_file:
            saved = cookie_file.read()
        user = re.findall(r'uin==o(\d{1,15})', saved)[0]
    except (OSError, UnicodeError, IndexError):
        # No readable cookie file, or no uin entry in it: ask the operator.
        user = input('输入QQ号:')
def getGTK(cookie):
    """Compute the ``g_tk`` request token from the ``p_skey`` cookie.

    Args:
        cookie: Mapping of cookie names to values; must contain ``'p_skey'``.

    Returns:
        The hash of the ``p_skey`` string, masked to 31 bits.

    Raises:
        KeyError: If ``cookie`` has no ``'p_skey'`` entry.
    """
    hashes = 5381
    for letter in cookie['p_skey']:
        # Python ints do not overflow, so the running value is only masked
        # once, on return.
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff
def get_user_detail(g_tk, user, user_login):
    """Fetch the profile record of a QQ account as a dictionary.

    First loads the account's Qzone page to scrape a ``qzonetoken`` (the first
    ``return "..."`` string literal in the page), then calls the
    ``cgi_userinfo_get_all`` endpoint and decodes its JSONP reply.

    Args:
        g_tk: Token sent as the ``g_tk`` query parameter (see ``getGTK``).
        user: QQ number whose profile is requested (``uin``).
        user_login: QQ number sent as ``vuin``.

    Returns:
        The decoded JSON object from the reply.

    Raises:
        IndexError: If no ``qzonetoken`` can be found in the page.
        ValueError: If the reply body is not valid JSON after unwrapping.
    """
    page = session.get('https://user.qzone.qq.com/%s' % (user))
    qzonetoken = re.findall('return "(.*?)"', page.text, re.S)[0]
    params = {
        "uin": user,
        "vuin": user_login,
        "g_tk": g_tk,
        "qzonetoken": qzonetoken,
    }
    reply = session.get(
        'https://h5.qzone.qq.com/proxy/domain/base.qzone.qq.com/cgi-bin/user/cgi_userinfo_get_all',
        params=params,
    )
    body = reply.text.strip()
    # Unwrap only the outer `_Callback( ... );` so that a ");" occurring inside
    # the JSON payload itself is left untouched.
    wrapped = re.match(r'_Callback\((.*)\)\s*;?$', body, re.S)
    payload = wrapped.group(1) if wrapped else body
    return json.loads(payload)
def cookielogin():
    """Try to authenticate the shared session with cookies from cookies.txt.

    Each line of the file is expected to look like ``name==value``. The parsed
    cookies are stored in the module-level ``cookies`` dict (created by the
    script entry point) and copied into ``session.cookies``; the account's
    info-center page is then requested to see whether the login is accepted.

    Returns:
        True when the page loads with status 200 and does not contain the
        marker text that the crawler treats as "not logged in". False when the
        cookie file is unreadable, lacks a ``p_uin`` cookie, does not mention
        the current ``user``, or the page check fails.
    """
    try:
        with open('cookies.txt', 'r', encoding='utf-8') as cookie_file:
            lines = cookie_file.read().splitlines()
    except (OSError, UnicodeError):
        return False

    saved = '\n'.join(lines)
    # The file is usable only if it holds a p_uin cookie AND belongs to `user`
    # (the original `not A and B` only rejected one of the bad cases).
    if 'p_uin' not in saved or str(user) not in saved:
        return False

    for line in lines:
        # Split on the first "==" only: cookie values may themselves contain
        # "==" (e.g. base64 padding).
        name, separator, value = line.partition('==')
        if not separator:
            continue  # blank or malformed line
        cookies[name] = value
    cookies['_qz_referrer'] = 'i.qq.com'
    requests.utils.add_dict_to_cookiejar(session.cookies, cookies)

    r = session.get('https://user.qzone.qq.com/%s/infocenter' % (user), verify=False)
    if r.status_code != 200:
        return False
    # This slogan is treated as the sign of a logged-out page.
    return 'QQ空间-分享生活,留住感动' not in r.text
def login():
    """Open Qzone in Firefox and save the browser's cookies to cookies.txt.

    The browser is left on the login page for 10 seconds (presumably so the
    operator can log in by hand) before its cookies are written, one
    ``name==value`` pair per line.

    Returns:
        True if the cookies were written, False if fetching or writing them
        failed.
    """
    from selenium import webdriver

    driver = webdriver.Firefox()
    try:
        driver.get('https://user.qzone.qq.com/')
        time.sleep(10)
        try:
            # Fetch the cookies before opening the file so a browser failure
            # does not truncate an existing cookies.txt.
            browser_cookies = driver.get_cookies()
            with open('cookies.txt', 'w', encoding='utf-8') as f:
                for cookie in browser_cookies:
                    f.write(cookie['name'] + '==' + cookie['value'] + '\n')
        except Exception as exc:
            # Best effort: callers only check the boolean result.
            print('failed to save cookies: %s' % (exc,))
            return False
        return True
    finally:
        # Always close the browser; the original left it running.
        driver.quit()
def judge(j_tk, user_i, numble):
    """Decide whether the account ``numble`` should be recorded.

    Loads the feed page of ``numble`` and accepts the account when the first
    "浏览N次" (view count) found there is greater than 10. According to the
    original author this is the view count of the account's first post.

    Args:
        j_tk: ``g_tk`` token forwarded to ``get_user_detail``.
        user_i: QQ number of the logged-in account (sent as ``i_login_uin``).
        numble: QQ number being judged (sent as ``i_uin``).

    Returns:
        True if a view count above 10 was found, otherwise False.
    """
    r = session.get(
        'https://h5.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds_html_module?i_uin=%s&i_login_uin=%s'
        % (numble, user_i)
    )
    r.encoding = r.apparent_encoding
    views = re.findall(r'浏览(\d*?)次', r.text, re.S)

    # Profile of the judged account, fetched so that extra filters can be added
    # here. It is unused by the default rule below. Uses this function's own
    # arguments rather than module globals.
    # The result is a dict; fields described by the original author:
    #   detail['age']        age, e.g. 19
    #   detail['birthyear']  year of birth, e.g. 1999
    #   detail['sex']        0, 1 or 2; 1 = male, 2 = female
    #   detail['hco'] / detail['hp'] / detail['hc']
    #                        home country / province / city
    # Other keys seen in a sample reply include 'uin', 'nickname',
    # 'spacename', 'desc', 'signature', 'avatar', 'constellation',
    # 'birthday', 'bloodtype', 'country', 'province', 'city', 'marriage',
    # 'career', 'company' and 'ptimestamp'.
    # Example filter (skip every account that is not male):
    #     if detail['sex'] != 1:
    #         return False
    detail = get_user_detail(j_tk, numble, user_i)

    try:
        return int(views[0]) > 10
    except (IndexError, ValueError):
        # No counter on the page, or the lazy \d*? matched an empty string.
        return False
def downLoad(numble, delay=6):
    """Record one harvested QQ number, then pause before the crawl continues.

    The storage back end is deliberately simple: the number is appended to
    ``qqnumble.txt``. The original author suggests that de-duplication could be
    done with exception handling plus a unique key in MySQL.

    Args:
        numble: QQ number to store (string or int).
        delay: Seconds to sleep afterwards as an anti-anti-crawling throttle.
            Defaults to the original 6 seconds; 0 or less skips the pause.
    """
    # Context manager so the file handle is closed after every write.
    with open('qqnumble.txt', 'a', encoding='utf-8') as out:
        out.write(str(numble) + '\n')
    print("https://user.qzone.qq.com/%s succeed" % (numble,))
    if delay > 0:
        time.sleep(delay)
if __name__ == "__main__":
    # Crawl outwards from the logged-in account: collect QQ numbers from feed
    # pages, record the ones that pass judge(), and queue them as further
    # starting points.
    init()
    cookies = {}  # filled by cookielogin(); also sent explicitly with page requests below
    q = Queue(maxsize=100000)
    # q.put(101538)  # original author's note: with this seed the crawl starts
    #                # "nationwide"; without it the crawl starts from your own account

    if not cookielogin():
        login()
        if not cookielogin():
            # Without valid cookies every later request would fail anyway.
            raise SystemExit('Qzone login failed: no usable cookies in cookies.txt')

    r = session.get('https://user.qzone.qq.com/%s' % (user), verify=False)
    # data-uin is the QQ number of someone who liked a post;
    # data-origuin is the owner of that post.
    numble = set(re.findall(r'data-uin="(\d*?)"', r.text))
    g_tk = getGTK(session.cookies.get_dict())
    for uin in numble:
        q.put(uin)

    # Stop when the queue runs dry; a bare q.get() would block forever.
    while not q.empty():
        r = session.get(
            'https://h5.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds_html_module?i_uin=%s&i_login_uin=%s'
            % (q.get(), user),
            verify=False,
        )
        ans = set(re.findall(r'des_(\d{5,15})', r.text, re.S))
        for an in ans:
            url = 'https://user.qzone.qq.com/%s' % (an)
            r = session.get(url, cookies=cookies, verify=False)
            if 'QQ空间-分享生活,留住感动' in r.text:
                continue  # slogan page instead of the space itself: skip
            # judge(j_tk, user_i, numble): the logged-in account comes second
            # and the candidate third, matching the i_login_uin / i_uin
            # parameters of the URL it builds.
            if not judge(g_tk, user, an):
                continue
            downLoad(an)
            if q.full():
                # A put() on a full queue would block forever.
                break
            q.put(an)
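# --- Text below is residue of the web page this file was copied from; it is not code ---
# QQ爬虫升级版 (QQ crawler, upgraded version)
# 猜你喜欢 (You may also like)
# 转载自 blog.csdn.net/qq_41861526/article/details/80946702 (reposted from this article)
# 今日推荐 (Today's recommendations)
# 周排行 (Weekly ranking)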