Python:爬虫练习——爬取小说(初学)

 1 import requests
 2 from pyquery import PyQuery as pq 
 3 
 4 def get_content(a):
 5     response=requests.get(a)
 6     #print(str(response))    
 7     response.encoding = 'gbk'
 8     #print(response.text)
 9     doc = pq(response.text)
10     text=doc('#content.showtxt')
11     a=str(text)
12     b=a.replace("&#13;<br/>&#13;<br/>","\n").replace('<br/><br/>','\n').replace('<script>chaptererror();</script><br/> 请记住本书首发域名:www.biqugexsw.com。笔趣阁小说网手机版阅读网址:m.biqugexsw.com</div>','').replace('\xa0','').replace('<div id="content" class="showtxt">','')
13     file = open(u'F:\python\小说\1.txt','a+')
14     file.close()
def get_mulu(index_url='https://www.biqugexsw.com/75_75362/'):
    """Fetch a book's index page and download every chapter it links to.

    Args:
        index_url: URL of the book's chapter-list page. Defaults to the
            book used by the original script; pass another book's index
            page on the same site to download that book instead.
    """
    response = requests.get(index_url)
    # Let requests guess the encoding from the body.
    response.encoding = response.apparent_encoding
    doc = pq(response.text)
    for link in doc('div.listmain a').items():
        href = link.attr.href
        if not href:
            # An anchor without an href cannot be turned into a chapter URL.
            continue
        # Build the URL of each chapter.
        chapter_url = 'https://www.biqugexsw.com/' + href
        get_content(chapter_url)
        print("获取成功")
# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network requests.
if __name__ == "__main__":
    get_mulu()

最近学习爬虫,练习爬取笔趣阁的一部小说。

待完善:
  浏览器模拟访问

  异步爬取

  获取bookname

  正则表达式

猜你喜欢

转载自www.cnblogs.com/liubingzhe/p/11262691.html