很早之前就看过别人写的笔趣阁爬虫,但那时候自己还什么都不会,所以暂时没有尝试去写。最近又学了点东西,就想试着写一个静态页面爬虫,于是选了笔趣阁来爬。
利用 requests + pyquery,感觉还蛮简单的,不过因为是第一次自己动手,所以写得比较慢。
设计思路是先获取一本书所有章节的 URL 链接,然后再一章一章地爬取下来写入文件。
from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq


class biqukan():
    """Download every chapter linked from a novel's index page into one text file.

    The pipeline is a chain of lazy generators: ``get_index`` yields the
    chapter hrefs found on the index page, ``parse_url`` turns them into
    absolute URLs, ``get_text`` fetches each chapter, and ``write`` appends
    the chapters to ``self.path``.
    """

    def __init__(self,
                 url="http://www.yuetutu.com/18_18147/",
                 path='/home/xxp/git_learning/practice/spider_learning/漫漫武仙路.txt',
                 timeout=10):
        """Store the scrape configuration.

        Args:
            url: Index page of the book; chapter links are resolved against it.
            path: Output text file; chapters are appended to it.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = url
        self.path = path
        self.timeout = timeout

    def _fetch(self, url):
        """Fetch *url* and return it parsed as a PyQuery document.

        Raises:
            requests.RequestException: on timeout, connection failure, or a
                4xx/5xx response (so error pages are never parsed as content).
        """
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        # NOTE(review): response.text is decoded with the charset requests
        # infers from the response; if the site serves a different encoding
        # than it declares, the text will be garbled -- confirm on the live site.
        return pq(response.text)

    def get_index(self, url):
        """Yield the href of every anchor matched by ``.listmain a`` on *url*."""
        doc = self._fetch(url)
        for link in doc('.listmain a').items():
            href = link.attr.href
            # Anchors without an href cannot be followed; skip them.
            if href:
                yield href

    def parse_url(self, index):
        """Yield an absolute URL for each href in *index*.

        Hrefs are resolved against ``self.url`` with ``urljoin``, so both
        site-absolute paths such as ``/18_18147/123.html`` and relative ones
        work, independent of how long the book's path prefix is.
        """
        for link in index:
            yield urljoin(self.url, link)

    def get_text(self, urls):
        """Yield, for each chapter URL, the ``h1`` text followed by the ``#content`` text."""
        for url in urls:
            doc = self._fetch(url)
            title = doc('h1').text()
            text = doc('#content').text()
            yield title + text

    def write(self, texts):
        """Append each item of *texts* to ``self.path``, separated by a blank line."""
        # Open the file once for the whole run instead of once per chapter;
        # the context manager still closes (and flushes) it if a fetch fails.
        with open(self.path, 'a', encoding='utf-8') as f:
            for text in texts:
                f.write(text + '\n\n')

    def main(self):
        """Run the full pipeline: index -> chapter URLs -> chapter texts -> file."""
        index = self.get_index(self.url)
        urls = self.parse_url(index)
        texts = self.get_text(urls)
        self.write(texts)


if __name__ == "__main__":
    b = biqukan()
    b.main()