#get jingdong.com -- download an e-book from JD (Jingdong) Read as plain text
#coding=utf8
import io
import re
import urllib

try:
    import urllib.request  # Python 3: urlopen lives in urllib.request
    _urlopen = urllib.request.urlopen
except ImportError:
    _urlopen = urllib.urlopen  # Python 2

# Address of the table-of-contents page of the book to download.
url = 'http://read.jd.com/4281/'

# re.DOTALL makes '.' match any character at all, including a newline,
# so one <li> / <p> element may span several lines of HTML.
_LIST_ITEM_RE = re.compile(r'<li(.+?)</li>', re.DOTALL)
_PARAGRAPH_RE = re.compile(r'<p>(.+?)</p>', re.DOTALL)
# The chapter link is laid out over several CRLF-terminated lines in the
# page source, hence the literal '\r\n' in the pattern.
_LINK_RE = re.compile(r'<a href="(.+?)"\r\n +?title="(.+?)">\r\n')


def _fetch(address):
    """Download *address* and return its body decoded to text.

    The body is decoded as UTF-8; if that fails it is decoded as GB18030
    (a superset of GBK) with undecodable bytes replaced, so this function
    never raises on a badly encoded page.
    """
    response = _urlopen(address)
    try:
        raw = response.read()
    finally:
        response.close()
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('gb18030', 'replace')


#-------------fetch the table of contents
def getlist(url):
    """Return the inner text of every ``<li...>...</li>`` element at *url*.

    Each returned string is whatever sits between ``<li`` and ``</li>``
    (so it starts with the remainder of the opening tag).
    """
    return _LIST_ITEM_RE.findall(_fetch(url))


#---------------------fetch the content of one chapter
def getOne(address):
    """Return the inner text of every ``<p>...</p>`` element at *address*."""
    return _PARAGRAPH_RE.findall(_fetch(address))


#---------download ebook
def downloadbook(content, bookname='book', base_url=None):
    """Download every chapter listed in *content* into ``<bookname>.txt``.

    content  -- list of table-of-contents entries as returned by getlist().
    bookname -- output path without the ``.txt`` extension.
    base_url -- only entries containing this string are treated as chapter
                links; defaults to the module-level ``url``.

    For each chapter the title is written first, then one line per
    paragraph, then a blank separator. The file is written as UTF-8.
    Entries that mention *base_url* but do not match the expected
    ``<a href=... title=...>`` layout are skipped instead of raising
    IndexError.
    """
    if base_url is None:
        base_url = url

    book = []
    for entry in content:
        if base_url not in entry:
            continue
        link = _LINK_RE.search(entry)
        if link is None:
            # Not a chapter link in the expected layout; nothing to fetch.
            continue
        href, title = link.groups()
        print('%s %s' % (href, title))
        book.append(title)
        book.extend(getOne(href))
        book.append(u'\n')

    # io.open gives the same text-mode, explicit-encoding behaviour on
    # Python 2 and 3; the context manager closes the file even on error.
    with io.open(bookname + '.txt', 'w', encoding='utf-8') as f:
        for line in book:
            f.write(line + u'\n')


#-----------
def main():
    """Download the book whose table of contents is at ``url``."""
    # The 'u' prefix keeps the Chinese file name from being garbled on
    # Python 2.
    downloadbook(getlist(url), u'日月')


if __name__ == '__main__':
    main()
Python 获取txt
猜你喜欢
转载自blog.csdn.net/viomag/article/details/38930469
今日推荐
周排行