Python: my first little crawler, which searches for a novel, scrapes it, and saves it automatically

# -*- coding:UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import chardet
import re
def download(urls):
    # Download the chapters
    for item in urls:
        print(item[3] + ':' + 'http://www.biquge.com.tw' + item[1])
    # List which chapters were found, together with their URLs
    sure = input("Download these chapters? y/n (files are saved to the current directory; the links above can be opened to read online): ")
    if sure == 'y' or sure == 'Y':
        for item in urls:
            # Create a text file named after the chapter
            with open(item[3], mode='w+', encoding='utf-8') as f:
                download_url = 'http://www.biquge.com.tw' + item[1]
                head = {}
                head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
                download_req = request.Request(url=download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read().decode('gbk', 'ignore')
                soup_texts = BeautifulSoup(download_html, 'lxml')
                # Fetch the chapter's content page, decode it, and parse it
                texts = soup_texts.find_all(id="content")
                soup_text = BeautifulSoup(str(texts), 'lxml')
                # Pull out the part we actually want
                f.write(soup_text.div.text.replace('\xa0', ''))
                # Strip the unwanted characters, then write to the file
            print(item[3] + ' downloaded\n')
    print('all done')
    return
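# Aside, just a sketch rather than what download() does above: assuming each
# chapter page has a single #content div, the find_all() plus second
# BeautifulSoup pass could be collapsed into one step, e.g.
#   text = soup_texts.find(id='content').get_text()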
def find_url(target_url):
    # Get every chapter's URL from the table-of-contents page
    head = {}
    head['User-Agent'] = "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19"
    req = request.Request(url=target_url, headers=head)
    # Set a User-Agent header; not sure whether this site accepts Python's default one
    response = request.urlopen(req)
    html = response.read()
    code = chardet.detect(html)['encoding']
    # chardet's guess is unusable here: it reports Simplified Chinese, but part of the page is Traditional
    html = html.decode('gbk')
    # Decode with gbk, which covers both Simplified and Traditional characters
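    # (gb18030, a superset of gbk, would be an even safer choice for mixed pages)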
    soup = BeautifulSoup(html, 'html.parser')
    # Structure it with BeautifulSoup first (optional; this step can be skipped)
    result = re.findall(r'<a(.*?)href="(.*?)"(.*?)>(.*?)</a>', str(soup.find_all(id='list')))
    # Regex match: groups 2 and 4 capture each chapter's href and title
    download(urls=result)
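# Aside, an untested alternative rather than the original approach: the same
# links could be collected without a regex, e.g.
#   links = [(a['href'], a.get_text()) for a in soup.find(id='list').find_all('a')]
# though download() would then need item[0]/item[1] instead of item[1]/item[3].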
def search(txt_name):
    # Use the biquge site's own search feature
    target_name = str(txt_name.encode("GBK")).upper()
    # The site searches via GET: the query is GBK-encoded and appended to the URL,
    # but it must be percent-escaped first rather than appended as-is
    target_url = 'http://www.biquge.com.tw/modules/article/soshu.php?searchkey='
    i = 2
    # Walk str(b'...') starting after the "b'" prefix, turning each \xNN escape into %NN
    while i < len(target_name) - 1:
        if target_name[i] == '\\':
            target_url += '%'
            i += 2
            continue
        target_url += target_name[i]
        i += 1
    # target_url now holds the fully escaped search URL
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
    req = request.Request(url=target_url, headers=head)
    html = request.urlopen(req)
    if target_url != html.geturl():
        # If the search has exactly one hit, the site redirects straight to the book page
        print("Found it, extracting\n")
        target_url = html.geturl()
        find_url(target_url)
        return
    html = html.read().decode('GBK')
    result = re.finditer(r'<td class="odd"><a href="(.*?)">(.*?)</a>', html)
    # Regex match over the results table
    for item in result:
        # Pick the one we want from multiple results; loosen this test to support fuzzy matching
        if item[2] == txt_name:
            target_url = item[1]
            print("Found it, extracting\n")
            find_url(target_url=target_url)
            return
    print('Not found')
if __name__ == '__main__':
    name = input("Enter the title of the novel to search for: ")
    search(txt_name=name)
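By the way, the hand-rolled percent-escaping in search() can also be done with the standard library. Below is a minimal sketch using urllib.parse.quote with an encoding argument; it emits the same uppercase %XX escapes, and unlike the str(...).upper() trick it leaves any ASCII letters in the title untouched. gbk_search_url is a made-up helper name, not part of the script above:

# -*- coding:UTF-8 -*-
from urllib import parse

def gbk_search_url(txt_name):
    # Percent-encode the query in GBK, as the site's GET search expects
    return ('http://www.biquge.com.tw/modules/article/soshu.php?searchkey='
            + parse.quote(txt_name, encoding='gbk'))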


Reposted from blog.csdn.net/qq_30754565/article/details/81084576