from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq

# Raw string on purpose: in a normal literal the "\1" before ".txt" is an
# octal escape (U+0001), which silently corrupts the file name.
OUTPUT_PATH = r'F:\python\小说\1.txt'

# Table-of-contents page of the book; replace with another book's index page.
INDEX_URL = 'https://www.biqugexsw.com/75_75362/'

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def _clean_chapter_html(html):
    """Turn the serialized chapter ``<div>`` into plain text.

    Paragraph breaks (double ``<br/>``) become newlines; the trailing
    script/advert footer, non-breaking spaces and the opening ``<div>`` tag
    are removed. The replacements run in this fixed order.
    """
    return (
        html.replace(" <br/> <br/>", "\n")
        .replace('<br/><br/>', '\n')
        .replace('<script>chaptererror();</script><br/> 请记住本书首发域名:www.biqugexsw.com。笔趣阁小说网手机版阅读网址:m.biqugexsw.com</div>', '')
        .replace('\xa0', '')
        .replace('<div id="content" class="showtxt">', '')
    )


def get_content(a, output_path=OUTPUT_PATH):
    """Download one chapter page and append its text to ``output_path``.

    Args:
        a: URL of the chapter page.
        output_path: File the chapter text is appended to (UTF-8).

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
        OSError: if the output file cannot be opened for appending.
    """
    response = requests.get(a, timeout=REQUEST_TIMEOUT)
    # Chapter pages are decoded as GBK, as in the original script.
    response.encoding = 'gbk'
    doc = pq(response.text)
    chapter_html = str(doc('#content.showtxt'))
    text = _clean_chapter_html(chapter_html)
    # The original opened and closed the file without writing anything;
    # actually persist the text, and let ``with`` close the handle.
    with open(output_path, 'a', encoding='utf-8') as file:
        file.write(text)
        # Separator so consecutive chapters do not run together.
        file.write('\n')


def get_mulu(index_url=INDEX_URL):
    """Walk the book's table of contents and fetch every linked chapter.

    Args:
        index_url: URL of the book's index page.

    Raises:
        requests.RequestException: if the index page or a chapter page
            cannot be fetched in time.
    """
    response = requests.get(index_url, timeout=REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding
    doc = pq(response.text)
    for link in doc('div.listmain a').items():
        href = link.attr.href
        if not href:
            # Anchor without a target: nothing to download.
            continue
        # urljoin resolves root-relative, page-relative and absolute hrefs,
        # unlike plain concatenation with the site root.
        chapter_url = urljoin(index_url, href)
        get_content(chapter_url)
        print("获取成功")


if __name__ == "__main__":
    get_mulu()
最近学习爬虫,练习爬取笔趣阁的一部小说。
待完善:
浏览器模拟访问
异步爬取
获取bookname
正则表达式