Baidu URL Collector

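A small Python 3 spider that queries Baidu for a keyword (the default is inurl:action), scrapes the c-showurl result links from each results page, follows their redirects to recover the real target URLs, and appends those URLs to auth_url.txt.
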
# coding: utf-8
# __author__ = Christopher

from urllib import request, parse
from bs4 import BeautifulSoup
import os

class Spider:
    def __init__(self):
        self.header = {}
        self.url = 'http://www.baidu.com/s?wd='
        self.page = 0
        self.word = 'inurl:action'  # default query; set the term you want to search for here

    def EncodingWd(self, wd):
        # Percent-encode the search term so it can be embedded in the URL
        return parse.quote(wd)

    def struct_url(self, search_word, page=0):
        url = self.url
        if search_word != '':
            self.word = search_word  # use the user-supplied query instead of the default

        if page != 0:
            # Baidu paginates with pn = page * 10
            url = url + self.EncodingWd(self.word) + '&pn=' + str(page) + '0'
        else:
            url += self.EncodingWd(self.word)
        return url
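
    # For illustration (with the default query): struct_url('', 2) yields
    # http://www.baidu.com/s?wd=inurl%3Aaction&pn=20,
    # i.e. the third results page, since Baidu shows 10 results per pn step.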

    def spider(self, page, search_word):
        self.header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.1'
        auth_url_lis = []

        baidu_search_url = self.struct_url(search_word, page)
        req = request.Request(baidu_search_url, headers=self.header)
        rsp = request.urlopen(req)
        html = rsp.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Each Baidu result carries its (redirecting) link in an <a class="c-showurl"> tag
        for url in soup.find_all('a', {'class': 'c-showurl'}):
            auth_url_lis.append(url.get('href'))
        return auth_url_lis

    def auth_url(self, crypt_url):
        # Follow Baidu's redirect link to recover the real target URL
        result = []
        try:
            req = request.Request(crypt_url, headers=self.header)
            url = request.urlopen(req)
            result.append(url.geturl())
        except Exception:
            print("[*]Can't get auth_url! %s" % str(crypt_url))
        return result
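
    # Note: at the time of writing, Baidu result hrefs look like
    # http://www.baidu.com/link?url=..., and urlopen().geturl() returns the
    # final URL after the HTTP redirect has been followed.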


def main():
    path = os.path.join(os.getcwd(), 'auth_url.txt')  # write results to auth_url.txt in the current working directory
    spi_ob = Spider()
    file = open(path, 'a')
    print('[#]Version 0.3\n[#]__Author__=ChristopherLam\n[#]qq:770304694', end='\n')

    search_word = str(input('[*]Enter a search keyword (optional): '))
    subscript_page = int(input('[*]Enter the first results page (0 for page one): '))
    superscript_page = int(input('[*]Enter the last results page (exclusive): '))

    print('[*]Spider is running...')

    for k in range(subscript_page, superscript_page):
        auth_url_lis = spi_ob.spider(page=k, search_word=search_word)
        while auth_url_lis:
            url_result = spi_ob.auth_url(auth_url_lis.pop())
            while url_result:
                file.write(url_result.pop() + '\n')
    file.close()
    print('[*]Success. Quit...')

if __name__ == '__main__':
    main()
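
For reference, here is a minimal non-interactive usage sketch. It assumes the script above is saved as spider.py; the module name and the example query are illustrative, not part of the original:

from spider import Spider

spi = Spider()
for page in range(0, 3):  # first three results pages
    for redirect in spi.spider(page=page, search_word='inurl:login.action'):
        for real_url in spi.auth_url(redirect):
            print(real_url)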


Reposted from blog.csdn.net/Christopher_L1n/article/details/62044047