爬取金瓶梅小说

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u011529752/article/details/79939873

先上代码
小说的目录页面是
http://www.lhh1.com/modules/article/reader.php?aid=33

# coding: utf-8
import urllib
from urllib import request
import os
from os import path
import re

# Output directory for the per-chapter text files: ./result under the current
# working directory. exist_ok=True creates it only when missing and avoids the
# race between a separate "exists" check and the creation call.
result_dir = path.join(os.getcwd(), 'result')
os.makedirs(result_dir, exist_ok=True)

# The table-of-contents HTML is read from a local file rather than fetched:
# an early encoding problem was worked around by pasting the page source into
# src.txt saved as UTF-8. (The site actually serves GBK; this step was simply
# never changed afterwards.)
# The context manager makes sure the file handle is closed after reading.
with open('src.txt', 'r', encoding='utf-8') as src_file:
    src = src_file.read()

# Collect every link as an (href, link text) tuple; re.S lets "." match
# newlines so links broken across lines are still found.
pattern = '<a href="(.+?)">(.+?)</a>'
res1 = re.compile(pattern, re.S).findall(src)

# Patterns are loop-invariant, so compile them once. Raw strings keep the
# exact same regex text while avoiding invalid "\&" / "\<" string escapes.
# A paragraph is the text between four &nbsp; entities and the next "<br".
paragraph_re = re.compile(r'\&nbsp;\&nbsp;\&nbsp;\&nbsp;(.+?)<br', re.S)
# Matches everything before the last "</div>" inside a captured paragraph.
trailing_div_re = re.compile(r'(.+)\</div\>', re.S)

# NOTE(review): the [1:2] slice processes only the second link found in the
# index page; widen it (e.g. res1[1:]) to download more chapters — confirm
# which leading entries are not chapter links before doing so.
for name in res1[1:2]:
    # The href was copied out of raw HTML, where "&" is written as "&amp;".
    # Restore every occurrence (and tolerate URLs that contain none).
    url = name[0].replace('&amp;', '&')
    title = name[1]
    print(url, title)

    # Fetch the chapter page; the site serves GBK-encoded HTML.
    with request.urlopen(url) as response:
        html = response.read().decode('gbk')

    paragraphs = []
    for x in paragraph_re.findall(html):
        # Drop a trailing "</div>" (and anything after it) if the capture
        # ran into the end of the content container.
        stripped = trailing_div_re.findall(x)
        if stripped:
            x = stripped[0]
        paragraphs.append("  " + x + '\n')

    # One output file per chapter, named after the link text. An explicit
    # encoding keeps the result independent of the platform's locale.
    chapter_path = path.join(result_dir, title + '.txt')
    with open(chapter_path, 'w', encoding='utf-8') as resfile:
        resfile.write(''.join(paragraphs))

print('done!')

里面有两个坑:一个是编码问题(页面实际使用 gbk 编码),另一个是 URL 中的转义字符问题。HTML 中常见的转义字符有 5 个:&lt;、&gt;、&amp;、&quot;、&copy;,分别对应 <、>、&、"、©。从页面源码中取出的 URL 里 & 被写成了 &amp;,需要先还原成 & 才能正常访问。
结果:
这里写图片描述

猜你喜欢

转载自blog.csdn.net/u011529752/article/details/79939873