# coding = utf-8
# __author__ = Christopher
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import os
# from time import strftime
class Spider:
    """Baidu search spider.

    Builds Baidu search URLs, scrapes the result page for the
    'c-showurl' anchors (Baidu redirect links), and resolves those
    redirects to their real destination URLs.
    """

    def __init__(self):
        # Request headers shared by every HTTP request this spider makes;
        # spider() fills in the User-Agent before the first request.
        self.header = {}
        # Base search endpoint; the percent-encoded query is appended.
        self.url = 'http://www.baidu.com/s?wd='
        self.page = 0
        # Default search query, used when the caller passes an empty one.
        # (Original comment: "set the content you want to search here".)
        self.word = 'inurl:action'

    def EncodingWd(self, wd):
        """Percent-encode a search keyword for use in a URL query string."""
        return parse.quote(wd)

    def struct_url(self, search_word, page=0):
        """Build the full Baidu search URL for one result page.

        search_word: query to search for; falls back to self.word when empty.
        page: zero-based result page; Baidu paginates with pn = page * 10.
        Returns the complete URL as a string.
        """
        if search_word != '':
            # BUGFIX: the original assigned the search term to self.url,
            # clobbering the base endpoint for every later call; the term
            # belongs in self.word.
            self.word = search_word
        url = self.url + self.EncodingWd(self.word)
        if page != 0:
            # Baidu's pn parameter counts results, ten per page
            # (equivalent to the original str(page) + '0' concatenation).
            url += '&pn=' + str(page * 10)
        return url

    def spider(self, page, search_word):
        """Fetch one Baidu result page and return its result links.

        Returns a list of href values from the 'c-showurl' anchors —
        these are Baidu redirect URLs, resolved later by auth_url().
        """
        self.header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.1'
        auth_url_lis = []
        baidu_search_url = self.struct_url(search_word, page)
        req = request.Request(baidu_search_url, headers=self.header)
        # 'with' closes the response deterministically (original leaked it).
        with request.urlopen(req) as rsp:
            html = rsp.read()
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a', {'class': 'c-showurl'}):
            auth_url_lis.append(anchor.get('href'))
        return auth_url_lis

    def auth_url(self, crypt_url):
        """Resolve a Baidu redirect URL to its final destination.

        Returns a one-element list holding the resolved URL, or an empty
        list on failure. Failures are reported, not raised — this is
        deliberately best-effort, one bad link must not stop the crawl.
        """
        result = []
        try:
            req = request.Request(crypt_url, headers=self.header)
            # Close the connection as soon as the final URL is known.
            with request.urlopen(req) as rsp:
                result.append(rsp.geturl())
        except Exception:
            # Narrowed from a bare except: (which also caught
            # KeyboardInterrupt/SystemExit); still best-effort by design.
            print('[*]Can\'t get auth_url!%s' % str(crypt_url))
        return result
def main():
    """Interactive entry point.

    Prompts for a search keyword and a page range, crawls the Baidu
    result pages with Spider, and appends each resolved URL (one per
    line) to auth_url.txt in the current working directory.
    """
    # os.path.join is portable; the original hard-coded a Windows '\\'
    # separator, which breaks on every other platform.
    path = os.path.join(os.getcwd(), 'auth_url.txt')
    spi_ob = Spider()
    print('[#]Version 0.3\n[#]__Author__=ChristopherLam\n[#]qq:770304694', end='\n')
    # Prompts are user-facing and intentionally left in Chinese:
    # keyword (optional), lower page bound (0 = first page), upper bound.
    search_word = str(input('[*]请输入搜索关键词(可不填):'))
    subscript_page = int(input('[*]请输入页码下限(0为第一页):'))
    superscript_page = int(input('[*]请输入页码上限:'))
    print('[*]Spider is under running...')
    # 'with' guarantees the output file is closed even if the crawl raises
    # (the original open()/close() pair leaked the handle on error).
    with open(path, 'a') as out_file:
        for k in range(subscript_page, superscript_page):
            auth_url_lis = spi_ob.spider(page=k, search_word=search_word)
            # Drain the scraped redirect links, resolving each one;
            # auth_url() returns [] on failure, so bad links are skipped.
            while auth_url_lis:
                url_result = spi_ob.auth_url(auth_url_lis.pop())
                while url_result:
                    out_file.write(url_result.pop() + '\n')
    print('[*]Success. Quit...')
# Run the interactive crawler only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()