Python 抓取京东电子书并保存为 txt

#get jingdong.com 获取京东电子书
#coding=utf8
import re,urllib
#-------------取得目录
def getlist(url):
    """Fetch the catalogue page at *url* and return its ``<li>`` entries.

    Args:
        url: Address of the table-of-contents page to download.

    Returns:
        A list holding, for every ``<li ... </li>`` element found in the
        page, the text captured between ``<li`` and ``</li>``, in the order
        the elements appear.
    """
    # re.DOTALL makes '.' match newlines as well, so an entry whose markup
    # is spread over several lines is still captured as one item.
    item_pattern = re.compile(r'<li(.+?)</li>', re.DOTALL)

    response = urllib.urlopen(url)
    try:
        page = response.read()
    finally:
        # Always release the connection, even when reading fails.
        response.close()

    return item_pattern.findall(page)

#---------------------取得每一篇文章的内容
def getOne(address):
    """Download one chapter page and return the text of its paragraphs.

    Args:
        address: URL of the chapter page.

    Returns:
        A list with the text found between each ``<p>`` and the following
        ``</p>`` in the page, in document order.
    """
    # re.DOTALL lets a paragraph that contains line breaks match as a whole.
    paragraph_pattern = re.compile(r'<p>(.+?)</p>', re.DOTALL)

    response = urllib.urlopen(address)
    try:
        paper = response.read()
    finally:
        # Always release the connection, even when reading fails.
        response.close()

    return paragraph_pattern.findall(paper)

#---------download ebook
def downloadbook(content, bookname='book', base_url=None):
    """Download every chapter listed in *content* and save them as one file.

    Each catalogue item that contains *base_url* and holds a chapter link
    contributes its title, the chapter's paragraphs (fetched with
    ``getOne``) and a separating blank line. The collected lines are
    written to ``<bookname>.txt``, one per line.

    Args:
        content: Iterable of catalogue item strings, as returned by
            ``getlist``.
        bookname: Output file name without the ``.txt`` extension.
        base_url: Only items containing this string are treated as chapter
            entries. Defaults to the module-level ``url`` so existing
            callers keep working unchanged.
    """
    if base_url is None:
        # Backward compatible default: the catalogue address defined at
        # module level.
        base_url = url

    # Compiled once; the pattern is the same for every catalogue item.
    link_pattern = re.compile(r'<a href="(.+?)"\r\n +?title="(.+?)">\r\n')

    book = []
    for item in content:
        if base_url not in item:
            continue
        match = link_pattern.search(item)
        if match is None:
            # The item mentions the URL but is not a chapter link; skip it
            # instead of failing with an IndexError.
            continue
        address, title = match.groups()
        # Formatted so the output matches ``print a, b`` on Python 2 while
        # remaining valid syntax on Python 3.
        print('%s %s' % (address, title))
        book.append(title)
        book.extend(getOne(address))
        book.append('\n')

    # The with-block closes the file even if a write fails.
    with open(bookname + ".txt", 'w+') as f:
        f.writelines(line + '\n' for line in book)
#-----------
# Address of the book's table-of-contents page.
url = 'http://read.jd.com/4281/'
# Fetch the catalogue entries first, then download the chapters they link to.
_catalogue_items = getlist(url)
# The u prefix keeps the Chinese book name from being garbled.
downloadbook(_catalogue_items, u'日月')

猜你喜欢

转载自blog.csdn.net/viomag/article/details/38930469