# A small crawler that collects Baidu cloud-drive (Baidu Pan) share links.
#-*-coding=utf-8-*-
#encoding=utf-8
"""
python版本：Python2
python版本：Python2.7.10
httphttplib2版本：httplib2-0.7.7
=====零号虫=====
创建开始时间：2015-8-19 星期三。
创建完成时间：2015-8-20 星期四。百度云盘链接提取Python爬虫---【零号虫】终于完成了。
都凌晨了，百度弹出个消息今天是七夕节。
恩~~那么打算给【零号虫--云蜘蛛】起个昵称，就叫----七夕云鹊---
QQ群：427248225
帮忙的人：
编辑：迷失的幽灵_苍蓝钢铁的琶音_伊401_longinnus_limbo_黑光计划
@author: Dreace
@咸阳-Victory  516742171
"""
import urllib2
import sys
import time
import os
import random
import re

#from multiprocessing.dummy import Pool as ThreadPool 
# Encoding used for console output; the UTF-8 literals in this file are
# re-encoded to it before being printed or used as prompts.
type_ = sys.getfilesystemencoding()

# Default inclusive range of Baidu account IDs to scan; stop_usernum()
# overwrites both from user input.
start_number=10
end_number=11
# Number of accounts found so far that have shared links.
user_number=0
# Account ID currently being processed by the main loop.
boy=0

# Pause in seconds applied by stop_min().
my_t=0.1
# Counts stop_min() calls; on the 20th call my_t is re-randomised and this resets.
my_tt=0

	
"""
start_number:准备要爬取百度账号的开始账号（大于0的整数）。
end_number:准备要爬取的百度账号的结束账号（大于0的整数）。
boy:记录账号个数。
x_:作为前缀表示，这个变量是局部变量。
"""
#01_文件名后缀文件创建的时间防止文件名冲突。
def rename():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``.

    The main script appends this value to output file names so that
    successive files do not collide.
    """
    stamp_format = "%Y%m%d%H%M%S"
    return time.strftime(stamp_format)

#02_创建时间戳,作为网址的一部分.
def shijianchuo():
    """Return a ``YYYYmmddHHMMSS`` timestamp string for the current local time.

    The value is used as the ``t=`` part of a request URL and as the
    "recorded at" line of the output-file footer.
    """
    now = time.localtime()
    return str(time.strftime("%Y%m%d%H%M%S", now))

#02_1检查磁盘路径exisst是否存在。
def exists_dir():
	dir_flag= True
	x_dir = ""
	while dir_flag:
		x_dir_a=raw_input("[输入quit退出]你要存在哪个盘？请输入盘符字母[默认D盘]>>>".decode("utf-8").encode(type_))
		if x_dir_a != '':
			if x_dir_a == 'quit' or x_dir_a =='QUIT':
				dir_flag = False
				
			elif x_dir_a=='a'or x_dir_a=='A':
				print 'zuosi'
			else:
				x_dir=str(x_dir_a)+':\\'
		else:
			x_dir=str("D") + ':\\'

		if os.path.exists(x_dir):
			print "true",x_dir
			dir_flag = False
		else:
			print "wrong",x_dir
	return x_dir

#2_2选择文件夹目录。file_dir
def exists_file(x_x_dir):
	global file_dir
	file_dir_flag= True
	x_file_dir = ""
	while file_dir_flag:
		x_file_dir=raw_input("输入文件夹名字(尽量用英文名)[默认Spider]>>>".decode("utf-8").encode(type_))
		if x_file_dir!="":
			file_dir=str(x_x_dir)+str(x_file_dir)
			if os.path.isdir(file_dir): 
				print "true",file_dir
				os.chdir(file_dir)
				file_dir_flag = False
			else:
				print "wrong",file_dir
				os.makedirs(file_dir)
				os.chdir(file_dir)
				print "OK"
				file_dir_flag = False
		else:
			file_dir=str(x_x_dir)+"Spider"
			if os.path.isdir(file_dir):
				print "true",file_dir
				os.chdir(file_dir)
				file_dir_flag = False
			else:
				os.makedirs(file_dir)
				print "true",file_dir
				os.chdir(file_dir)
				file_dir_flag = False
	return file_dir
	
	
#03_0_获得url_address地址,用来判断这个地址的网页是不是空的。
def get_url(x_myshijianchuo,x_start_link,x_user_uk):
    """Assemble the first-form share-list URL and remember it globally.

    Args:
        x_myshijianchuo: timestamp string placed after ``t=``.
        x_start_link: string placed after ``start=``.
        x_user_uk: account ID string placed after ``query_uk=``.

    Returns:
        The assembled URL, which is also stored in the module-level ``url_address``.
    """
    global url_address
    pieces = (
        url_address1, x_myshijianchuo,
        url_address2, x_start_link,
        url_address3, x_user_uk,
        url_address4,
    )
    url_address = "".join(pieces)
    return url_address
#03_1_获得urllink_address地址,用来获得网页分享链接。
def get_urllink(x_user_uk,x_start_page):
    """Assemble the second-form share-record URL and remember it globally.

    Args:
        x_user_uk: account ID string placed after ``uk=``.
        x_start_page: page number string placed after ``page=``.

    Returns:
        The assembled URL, which is also stored in the module-level
        ``urllink_address``.
    """
    global urllink_address
    account_part = url_address11 + x_user_uk
    page_part = url_address12 + x_start_page + url_address13
    urllink_address = account_part + page_part
    return urllink_address
	
#04_0_获得两种网页第一页内容,用来判断用户是否存在。
def open_url(x_url_address):
	global url_address
	global str_
	global time_out
	try:
		req = urllib2.Request(url = x_url_address,headers = headers)
		str_ = urllib2.urlopen(req, timeout = time_out).read()
	except Exception, e:
		print e
	return str_


	
#04_1_获得百度分享链接，这个网页才是真实的链接。
def open_urllink(x_urllink):
	global urllink_address
	global str_link
	global time_out
	global re_shorturl
	try:
		req = urllib2.Request(url = x_urllink,headers = headers)
		str_link = urllib2.urlopen(req, timeout = time_out).read()
	except Exception, e:
		print e
	return str_link
	
#04_2通过正则获得百度云盘完整短链接，判断是否有效。
def get_shorturl(x_s_url):
	#global re_shorturl
	global s_url
	global url_code
	url_code = None
	x_s_url = s_url
	re_shorturl = x_s_url.replace(':','')
	re_shorturl = re_shorturl.replace('"','')
	re_shorturl = re_shorturl.replace('shorturl','http://pan.baidu.com/s/')
	print re_shorturl
	#time.sleep(0.01)
	if re_shorturl!='http://pan.baidu.com/s/':
		print "get the shorturl url link, check the short url>>>>>"
		try:
			#req = urllib2.Request(url = x_urllink,headers = headers)
			url_code = urllib2.urlopen(re_shorturl, timeout = time_out).code
			
			share_nofound = urllib2.urlopen(re_shorturl, timeout = time_out).read()
			if share_nofound.find('share_nofound_des') != -1:
				url_code = '<<share_nofound_des>>'
			#print share_nofound
			print "url_code>>>",url_code
		except Exception, e:
			print "wrong>>>",e
	else:
		print "no shtor url ,need not check it, pass it next"
		url_code = "no shorturl"
	
	return url_code
	
	
#05_中断提醒用户输入。
def stop_infor(x_StopNumber):
	global program_exit
	"""
	先留着。
	if x_StopNumber =="1":
		x_infor="中断1，我是百度云盘小灵虫_零号虫---萌萌哒！按[Q]退出，按[回车继续]>>！"
	elif x_StopNumber =="2":
		x_infor="中断2，显示网页内容,按[Q]退出，按[回车继续]>>！"
	elif x_StopNumber =="3":
		x_infor="中断3，显示分享数量,按[Q]退出，按[回车继续]>>！"
	elif x_StopNumber =="4":
		x_infor="中断4，显示源链接,按[Q]退出，按[回车继续]>>！"	
	
	x_stop = raw_input(x_infor.decode("utf-8").encode(type_)) 
	if x_stop=="Q" or x_stop=="q":
		print "program_exit!"
		program_exit="end"
	else:
		print "三秒后继续哦>>>".decode("utf-8").encode(type_)
		time.sleep(3)
		program_exit="continue"
	"""
	if x_StopNumber =="1":
		x_infor="中断1，我是百度云盘小灵虫_零号虫---七夕云鹊---萌萌哒！>>！"
	elif x_StopNumber =="2":
		x_infor="中断2，显示网页内容>>！"
	elif x_StopNumber =="3":
		x_infor="中断3，显示分享数量>>！"
	elif x_StopNumber =="4":
		x_infor="中断4，显示源链接>>！"	
	print x_infor.decode("utf-8").encode(type_)
	print "1秒后继续哦~~>>>".decode("utf-8").encode(type_)
	time.sleep(1)
	program_exit="continue"
	return program_exit
	
#5_1用户输入开始账号和结束账号。
def stop_usernum():
	global program_exit
	global start_number
	global end_number
	global x_startnum
	global x_endnum
	
	print "例如：这是一个百度分享者的主页-----http://yun.baidu.com/share/home?uk=2635271865#category/type=0".decode("utf-8").encode(type_)
	print "uk=2635271865,里面的数字是这个账户的ID。".decode("utf-8").encode(type_)
	print "如果不输ID，那么零号虫会自己按程序的内部预置的ID进行搜索，你可以修改源代码的start_number,和end_number的初始值>>>".decode("utf-8").encode(type_)
	print "输入[Q]按回车退出>>>>>>>>>>".decode("utf-8").encode(type_)
	x_startnum = raw_input('[Q退出]输入起始ID号>>>>>'.decode("utf-8").encode(type_))
	if x_startnum.isdigit():
		x_endnum = raw_input('[Q退出]输入结束ID号>>>>>'.decode("utf-8").encode(type_))
		if x_endnum.isdigit():
			if int(x_endnum) >= int(x_startnum) :
				start_number=int(x_startnum)
				end_number=int(x_endnum)
				program_exit="continue"
			
			else:
				print "结束ID号比开始ID号还小呢~~所以，我呢把开始和结束ID号颠倒了，省的你又输错啦~~么么哒！>>>>>>>>>>".decode("utf-8").encode(type_)
				program_exit="continue"
		elif x_endnum=="Q" or x_endnum=="q":
			program_exit="end"
		else:
			print "必须输入数字,你造吗?>>>>>>>>>>".decode("utf-8").encode(type_)
			print "零号虫原谅你的错误啦，就抓取我自己的网盘作为安慰奖吧！>>>>>>>>>>".decode("utf-8").encode(type_)
			start_number=588134400
			end_number=588134400		
			program_exit="continue"
	elif x_startnum=="Q" or x_startnum=="q":
		program_exit="end"
	else:
		print "必须输入数字,你造吗?>>>>>>>>>>".decode("utf-8").encode(type_)
		print "零号虫原谅你的错误啦，就抓取我自己的网盘作为安慰奖吧！>>>>>>>>>>".decode("utf-8").encode(type_)
		print "1秒后开始>>>>>".decode("utf-8").encode(type_)
		start_number=588134400
		end_number=588134400
		time.sleep(1)
		program_exit="continue"
		
	return start_number,end_number,program_exit

#5_2程序自动运行时候使用这个比较短的中断。
def stop_min(sx_user_number):
	global user_number
	global my_tt
	global my_t
	print "<<<<<零号虫刚刚找到了".decode("utf-8").encode(type_),repr(sx_user_number),"个有分享的百度账号，么么哒！继续找！萌萌哒！>>>>>".decode("utf-8").encode(type_)
	my_tt += 1
	if my_tt==20:	
		my_t = random.randint(1,3)*0.01
		my_tt=0
	time.sleep(my_t)

#06_获得链接文本。
def get_str(x_str_1):
	global link_str
	for j in x_str_1:
		j ='"shorturl":' + j
		my_list.append(j)
	for i in range(0,len(my_list)):
		f.write('\n'.encode('utf8'))
		k = my_list[i].split(',')
		#获得分享文件,基本信息。
		for l in k:
			for m in xiangmu_list:
				if l.find(m) != -1:
					print l
					link_str=str(link_str) + l
	
	return link_str

#打印user_infor，用户信息
def p_user_infor(p_x_user_number,p_x_page,p_x_mypage,x_my_count):
    """Build the per-page header line shown on screen and written to the file.

    Args:
        p_x_user_number: accepted for interface compatibility; not used.
        p_x_page: number of the page being processed.
        p_x_mypage: total number of pages for the account.
        x_my_count: total number of shared links for the account.

    Returns:
        The header text, which is also stored in the module-level
        ``p_u_infor``.
    """
    global p_u_infor
    p_user_id1 = "分享链接总数<<<<<".decode("utf-8").encode(type_)
    p_user_id2 = str(x_my_count)

    p_page1 = ">>>>>>>>>>>>>>>>>>>>第<<<<<".decode("utf-8").encode(type_)
    # Use the page passed in; the old code read the module-level loop
    # variable `page` and ignored this argument.
    p_page2 = str(p_x_page)
    p_page3 = ">>>>>页>>>>>".decode("utf-8").encode(type_)

    p_mypage1 = "/"
    p_mypage2 = str(p_x_mypage)
    p_u_infor = p_user_id1 + p_user_id2 + p_page1 + p_page2 + p_mypage1 + p_mypage2 + p_page3
    return p_u_infor

#写入文件结尾信息。
def end_infor (x_start_number,x_boy,x_user_number):
    """Build the footer lines appended to an output file.

    The footer holds the scanned ID range, the number of accounts collected,
    a fixed signature line, a "recorded at" label and the current
    timestamp. Fixed texts are encoded as GBK.

    Args:
        x_start_number: first account ID of the scanned range.
        x_boy: last account ID processed.
        x_user_number: number of accounts with shares that were collected.

    Returns:
        The list of footer lines, also stored in the module-level
        ``f_infor``. The timestamp is stored in the module-level
        ``myshijianchuo`` as well.
    """
    global myshijianchuo
    global f_infor
    # Use the arguments; the old code ignored them and read the globals of
    # the same meaning, so callers could not describe any other range.
    daihaofanwei = "代号范围：".decode("utf-8").encode('gbk') + str(x_start_number) + ">>>>>>>>>" + str(x_boy)
    zhanghaogeshu = "收集账号个数：".decode("utf-8").encode('gbk') + str(x_user_number)
    myshijianchuo = shijianchuo()
    qq_spider = "<<<<<qq群427248225>>>>===<<<<<零号虫====七夕云鹊>>>>>".decode("utf-8").encode('gbk')
    end_time = "记录时间：".decode("utf-8").encode('gbk')

    f_infor = [daihaofanwei, zhanghaogeshu, qq_spider, end_time, myshijianchuo]
    return f_infor
	
"""
变量赋值。
i,j,k,l,m,n基本都是引用数组的时候临时声名的，作为局部变量使用。
start_link,每页的起始链接的序号。范围是1到分享着分享的总文件数。递增公差是60，每页最多显示60个链接。
total_count:每个分享者分享的链接数量。
url_address1、2、3、4地址的固定部分。
start_link，地址的变动部分，最好是60的倍数，也可以是大于1小于分享者分享链接总数的任意整数。
user_uk,地址的变动部分，百度云盘账户的代号。

作者发现有些百度账号非常诡异，以至于，[七夕云鹊]把这个账号失效的链接也抓取了。所以为了能减少失效链接的抓取数量，尽量选择优质高产百度云账号抓取。
所以start_number就可以设置成分享链接很多的账号，end_number呢就比start_number大1就好了。这样一次就能抓取一个账号的分享链接。
start_number 必须比end_number小，可以设置start_number=1,end_number=10000000,这样你就可以开着电脑睡觉了，七夕云冲会整个晚上在百度云盘搜索分享链接的。
一个晚上过去你会得到一个上G的txt链接文档。
好了碎觉啦。。。。
"""



"""
第一种形式的分享网页。
"""

# Fixed pieces of the first share-list URL form; get_url() joins them with
# the variable parts named in the comments below.
url_address1 = "http://yun.baidu.com/pcloud/feed/getsharelist?t="
# ... followed by the timestamp
url_address2 = "&category=0&auth_type=1&request_location=share_home&start="
# ... followed by the index of the first link to list
url_address3 = "&limit=60&query_uk="
# ... followed by the account ID
url_address4 = "&channel=chunlei&clienttype=0&web=1&bdstoken=null"

"""
没想到还有第二种形式的分享网页。还有第三种吗？
"""

# Fixed pieces of the second share-page URL form; get_urllink() joins them
# with the variable parts named in the comments below.
url_address11 = "http://yun.baidu.com/share/homerecord?uk="
# ... followed by the account ID
url_address12 = "&page="
# ... followed by the page number
url_address13 = "&pagelength=60"
"""
正则匹配的字符获得total_count分享者总分享链接数。
以'"shorturl":'(它后边是百度分享链接的短链。)作为分隔符，把原始页面的分割成列表list，写入文档。
获得的链接赋值给列表。
"""
# Matches runs of digits; the main loop applies it to the chunk that contains
# "total_count" to read the number of shared links.
pattern_count = re.compile(r'\d+')

"""
url_moudle:选择url_moudle，百度分享页面有两种格式。建议使用1模式，2模式好像不行。有些获取不到。
"""

# Substrings used to recognise interesting fields of a share record; most of
# them are collected into xiangmu_list further down.
find_shorturl = "shorturl"
# NOTE(review): find_title is assigned again two lines below, so this "ctime"
# value is discarded - confirm which key was intended here.
find_title = "ctime"
find_id = "typicalPath"
find_title = "shareId"
find_uk = 'fsIds'
find_link = "pan.baidu.com"
find_tag = 'tag'
"""
find_time = "feed_time"
find_time_stamp = "time_stamp"
find_source_uid = "source_uid"
find_time_source_id = "source_id"
"""

"""
my_list是从总的网页代码中分离出来的。===xiangmu_list从my_list中分离出来的各个项目。
f_infor[]是写入txt的结尾信息的。
"""
# Markers looked for in every comma-separated field of a share record; a
# field containing one of them is written to the output file.
# NOTE(review): find_title is listed twice, so a field matching it is written
# twice - confirm whether a different marker was meant for the first slot.
xiangmu_list = [find_title ,find_id ,find_title ,find_uk ,find_link,find_tag]

#xiangmu_list = [find_link,find_tag]

# Records split out of the page currently being processed.
my_list = []
# my_list entries after the "shorturl" marker has been rewritten to a full URL.
my_newlist=[]

# Footer lines produced by end_infor().
f_infor = []
# Most recent per-page header built by p_user_infor().
p_u_infor = "<<>>"
"""
这个是原来作者用过的头文件。先留着吧。
#headers = {"User-Agent":" Mozilla/5.0 (Windows NT 10.0; rv:39.0) Gecko/20100101 Firefox/39.0"}
#headers = {"Accept-Encoding":"deflate","Accept-Language":"zh-CN,zh;q=0.8","User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"}
"""
# HTTP headers sent with the page requests made by open_url()/open_urllink().
headers = {"Accept-Encoding":"deflate","Accept-Language":"zh-CN,zh;q=0.8","User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"}
# Timeout in seconds passed to urlopen.
time_out = 60
# NOTE(review): not referenced anywhere else in the visible code.
thread_num = 2


# Output folder and output file name; both are filled in once the user has
# answered the prompts.
file_dir=""
file_name=""





# Start-up banner: each line is converted from UTF-8 to the console encoding
# once, here, and printed below with a short delay between lines.
i1='python版本：Python2.7.10='.decode("utf-8").encode(type_)
i2='httphttplib2版本：httplib2-0.7.7'.decode("utf-8").encode(type_)
i3='====零号虫[版本8.0]====='.decode("utf-8").encode(type_)
i4='创建开始时间：2015-8-19 星期三。'.decode("utf-8").encode(type_)
i5='创建完成时间：2015-8-20 星期四。百度云盘链接提取Python爬虫---【零号虫】终于完成了。'.decode("utf-8").encode(type_)
i6='都凌晨了，百度弹出个消息今天是七夕节。'.decode("utf-8").encode(type_)
i7='恩~~那么打算给【零号虫--云蜘蛛】起个昵称，就叫----七夕云鹊---'.decode("utf-8").encode(type_)
i8='QQ群：427248225----'.decode("utf-8").encode(type_)
i9='python__qq群:python自学新人交流  236147801'.decode("utf-8").encode(type_)
i10='帮忙的人：'.decode("utf-8").encode(type_)
i11='编辑：迷失的幽灵_苍蓝钢铁的琶音_伊401_longinnus_limbo_黑光计划'.decode("utf-8").encode(type_)
i12='@author: Dreace'.decode("utf-8").encode(type_)
i13='@咸阳-Victory  516742171'.decode("utf-8").encode(type_)
# NOTE(review): this line looks unrelated to the program - confirm it belongs here.
i14='应建设单位要求，调整如下  15968208152'.decode("utf-8").encode(type_)
ppi=[i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14]



# Print the banner one line at a time.
for x_ppi in ppi:
	print x_ppi
	time.sleep(0.1)
# Wait for the user to press Enter; the answer itself is not used here.
a = raw_input("感谢帮助_幽灵_完成__零号虫的大神们，再次谢过>>>>>>>>[回车继续]".decode("utf-8").encode(type_)) 
print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"



"""
中断1==================================================================
"""

# Checkpoint 1: print the greeting and pause for a second.
program_exit = stop_infor("1")
# Ask which drive the results should be stored on.
x_dir = exists_dir()
# Ask for the output folder and enter it; exists_file() creates it if needed.
if os.path.exists(x_dir):
	file_dir = exists_file(x_dir)
	os.chdir(file_dir)	
	program_exit = "continue"
else:
	# exists_dir() came back with a path that does not exist (for example
	# after the user typed quit), so there is nowhere to store results.
	program_exit = "end"

"""
根据当前时间建立存储连接的文件夹和文档文件，防止目录和文件名重名。
"""

print program_exit,"..................."
if program_exit =="continue":
	runmoudle = True
	while runmoudle == True:
		#选择运行模式。默认是自动。
		a = raw_input("运行模式：<<<手动单账号模式_输入[H]>>><<<<回车:默认[自动模式]>>>>".decode("utf-8").encode(type_)) 	
		if a=="h" or a=="H":
			runmoudle = True
			#0是手动，1是自动
		else:
			runmoudle = False
		stop_usernum()
		file_name="YunSpider__"+str(start_number)+'__'+str(end_number)+'__'+rename()+".doc"
		f = open(file_name,"ab")
		if program_exit=="continue":
			if start_number>0:
				for boy in range(start_number, end_number+1):

					#文件不要大于50MB。
					if os.path.getsize(file_name) <= 1024*1024*50:
						str_=""
						user_uk = str(boy)
						start_link = str(0)
						myshijianchuo = shijianchuo()
						#url_address = get_url(myshijianchuo,start_link,user_uk)
						#str_ = open_url(url_address)
						start_page = "1"
						urllink_address = get_urllink(user_uk,start_page)
						str_link=open_urllink(urllink_address)
						
						if str_link.find("shorturl") != -1:
							finduser=True
							allpage=0
							page_flage =True
							while page_flage != False:
								start_page = repr(allpage)
								urllink_address = get_urllink(user_uk,start_page)
								str_link=open_urllink(urllink_address)
								print str_link
								print "<<<<<<<<<<<page>>>>>>>>>>>",allpage
								if str_link.find("shorturl") != -1:
									allpage+=1
								else:
									page_flage=False
						else:
							finduser = False
							
						print str_
						print str_link
						
						"""
						中断2==================================================================
						"""
						#stop_infor("2")
						print "2>>>>>>>>>>>>>>>>>>>>"
						stop_min(user_number)
						
						if program_exit=="continue":
							"""
							05_随机生成百度账号代码并，判断账号是否存在(exit)。===得到分享者的分享链接数量：total_count===计算分享页数：page=total_count/60+1
							"""	
							if finduser==True:
								user_number += 1
								print user_uk,"找到一个百度云盘分享者".decode("utf-8").encode(type_)
								
								"""
								06_向指定文件写入网页内容。===total_count:分享链接数。===my_count_list:含有连接数变量的数组列表。===my_count:从my_count_list中获得链接数。
								my_page:my_count/60+1得到的总页数。===page:for循环，获得每一页的链接。
								"""
								total_count =str_.split('"records":')
								for my_count_list in total_count:
									if my_count_list.find('"total_count":')!=-1:
										my_count = pattern_count.findall(my_count_list)
										my_page=int(my_count[1])/60 +1
									else:
										my_page = allpage
										my_count=["page",repr(my_page)]
										
										
									user_infor = "======用户 ".decode("utf-8").encode(type_)+str(user_uk)+" 分享了".decode("utf-8").encode(type_)+my_count[1]+" 个链接。一共有 ".decode("utf-8").encode(type_)+str(my_page)+" 页。=====================================================".decode("utf-8").encode(type_)
									f.write(user_infor)
									print user_infor
									print "零号虫等待您的回车确认...".decode("utf-8").encode(type_)
									"""
									中断3==================================================================
									"""
									#stop_infor("3")
									print "3>>>>>>>>>>>>>>>>>>>>"
									stop_min(user_number)
									if program_exit=="continue":
										
										start_page=str(my_page+1)
										urllink_address = get_urllink(user_uk,start_page)
										str_link=open_urllink(urllink_address)
										str_1=str_link.split('"shorturl":')
										error_links=len(str_1)+60*my_page-int(my_count[1])
																				
										for page in range(1,my_page+1):
											p_user_infor(user_number,page,my_page,my_count[1])
											
											print "--------------------------------------------------"
											print p_u_infor
											print "--------------------------------------------------"
											#time.sleep(0.01)
											
											str_link = ""
											f.write('\n'.encode('utf8'))
											f.write("--------------------------------------------------")
											f.write(str(page))
											f.write("----------")
											f.write(p_u_infor)
											f.write("--------------------------------------------------")
											f.write('\n'.encode('utf8'))

											#start_link = str(60 * page)
											#get_url(myshijianchuo,start_link,user_uk)
											#print url_address
											
											start_page=str(page)
											urllink_address = get_urllink(user_uk,start_page)
											print urllink_address
											print "第".decode("utf-8").encode(type_),user_number,"个百度账户---账户代号".decode("utf-8").encode(type_),user_uk
											"""
											中断4==================================================================
											"""
											#stop_infor("4")
											print "4>>>>>>>>>>>>>>>>>>>>"
											stop_min(user_number)
											if program_exit=="continue":
											
												str_link=open_urllink(urllink_address)
												str_1=str_link.split('"shorturl":')
												#清空my_list是十分必要的，不然列表会越来越大。
												my_list = []
												for j in str_1:
													j ='"shorturl":' + j
													my_list.append(j)
												
												
												
												if len(my_list)!=0:
													#需不需要分析失效的链接，如果失效的链接比较少比如3个以内就不进行分析了，分析太慢。
													right_vs_error=int(my_count[1])/error_links
													if right_vs_error<100 or error_links>20:
														#把失效的my_list去掉。
														n=0
														for i in range(0,len(my_list)):
															i=i-n
															j=my_list[i].split(',')
															for s_url in j:
																if s_url.find('shorturl')!=-1:
																	url_code = get_shorturl(s_url)
																	if url_code != 200:
																		print 'wrong_link_url_code>>>',url_code
																		#time.sleep(0.5)
																		del my_list[i]
																		n+=1
																		#mei删除一个my_list,i的值需要减1，否则会漏掉删除元素的下一个。
																	
													
													
													#把‘"shorurl":’替换成http://pan.baidu.com/s/
													#清空my_newlist的重要性。
													my_newlist=[]
													for i in range(0,len(my_list)):
														str_2=str(my_list[i])
														
														str_2 = str_2.replace('"','')
														str_2 = str_2.replace(':','')
														str_2 = str_2.replace('shorturl','http://pan.baidu.com/s/')
														my_newlist.append(str_2)
														#print my_newlist[i]
														
													#把有效的链接写入文本文档。
													for i in range(0,len(my_newlist)):
														print 'my_newlist----------',i*page+1,'----------Error links<<<<',repr(error_links),">>>>Right links<<<<",my_count[1]
														print my_newlist[i]
														f.write('\n'.encode('utf8'))
														k = my_newlist[i].split(',')
														#获得分享文件,基本信息。
														for l in k:
															for m in xiangmu_list:
																
																if l.find(m) != -1:
																	if l.find('\u'):
																		try:
																			l=l.decode('unicode_escape')
																			l=l.encode('gbk')
																		except Exception,e:
																			l="error code change"
																			print "error code>>>>gbk",e
																		
																	print l
																	f.write(l)
																	f.write(' :'.decode("utf-8").encode('gbk'))
														
																	
													if runmoudle==True:
														a = raw_input("[Q退出]回车继续下一个账号".decode("utf-8").encode(type_)) 	
														if a=="q" or a=="Q":
															runmoudle==False
															program_exit="end"
														else:
															program_exit="continue"
							else:
								boy = str(boy)
								str_ = "-----" + boy + "---这个百度云盘空空的>>>"
								try:
									#f.write(str_.decode("utf-8").encode(type_))
									#f.write('\n'.decode("utf-8").encode(type_))
									print boy,"这个百度云盘空空的>>>".decode("utf-8").encode(type_)
								except Exception,e:
										print '<零号虫召回中>>>>>>>>>>'.decode("utf-8").encode(type_),e
										print '<<<<<零号虫归位>>>>>'.decode("utf-8").encode(type_)
								if runmoudle==True:
									a = raw_input("[Q退出]回车继续下一个账号".decode("utf-8").encode(type_)) 	
									if a=="q" or a=="Q":
										runmoudle==False
										program_exit="end"
									else:
										program_exit="continue"
								
					elif os.path.getsize(file_name) > 1024*1024*50:
						end_infor(start_number,boy,user_number)
						try:
							for i in range(0,len(f_infor)):
								f.write(f_infor[i])
								f.write('\n'.decode("utf-8").encode('gbk'))
							f.close()
							print "链接文件大于50M，已经建立一个新的文件啦>>>>>".decode("utf-8").encode(type_)
							print "10秒后继续>>>>>".decode("utf-8").encode(type_)
							time.sleep(10)
							f = open(file_name,"wb")
						except Exception,e:
							print '<零号虫召回中>>>>>>>>>>'.decode("utf-8").encode(type_),e
			b=raw_input("按回车结束".decode("utf-8").encode(type_))			
		
			
elif program_exit== "end":
	print "end"			


end_infor(start_number,boy,user_number)
for i in range(0,len(f_infor)):
	try:
		f.write(f_infor[i])
		f.write('\n'.decode("utf-8").encode('gbk'))
	
	except Exception,e:
		print '<零号虫召回中>>>>>>>>>>'.decode("utf-8").encode(type_),e
		print '<<<<<零号虫归位>>>>>'.decode("utf-8").encode(type_)
f.close()		


print "零号虫抓取了>>>>>>>>>>".decode("utf-8").encode(type_),str(user_number),"<<<<<个百度账号的链接>>>>>送你一个巨无霸的么么哒！".decode("utf-8").encode(type_)
print "链接保存在文件夹>>>>>>>>>>".decode("utf-8").encode(type_),file_dir
print "文件名是>>>>>>>>>>".decode("utf-8").encode(type_),file_dir
print "下一个百度账号是>>>>>>>>>>".decode("utf-8").encode(type_),str(int(boy)+1)
time.sleep(2)
b=raw_input("按回车结束".decode("utf-8").encode(type_))
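# (web-page residue) A small crawler that collects Baidu cloud-drive share links
#
# (web-page residue) "You may also like"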