import urllib.parse
import urllib.request

from lxml import etree

# Browser-like User-Agent sent with every request (index page and chapter
# pages alike) so that all requests look the same to the server.
_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}


def _open_url(url):
    """Open *url* with the shared request headers and return the response."""
    req = urllib.request.Request(url=url, headers=_HEADERS)
    return urllib.request.urlopen(req)


def chu_url(url, shuhao):
    """Open the index page of a book.

    Args:
        url: Base URL of the site, e.g. ``"https://www.biquge5200.com/"``.
        shuhao: Book id; it is appended verbatim to *url*.

    Returns:
        The open response object produced by ``urllib.request.urlopen``.
        The caller is responsible for closing it.
    """
    return _open_url(url + shuhao)


def wanmei_spider(res):
    """Download every chapter linked from a book's index page.

    Args:
        res: Readable response for the index page (as returned by
            ``chu_url``). It is read here but not closed.

    Returns:
        A dict that is empty when no chapter page could be parsed, and
        otherwise has two keys:

        * ``'章节'`` - chapter heading text nodes of all chapters, in
          index order.
        * ``'内容'`` - body text nodes of all chapters, concatenated in
          index order (chapter boundaries are not marked).
    """
    index_tree = etree.HTML(res.read())
    if index_tree is None:
        # Nothing parsable in the index page, hence no chapter links.
        return {}

    chapter_hrefs = index_tree.xpath("//div[@class='box_con']//dl//dd/a/@href")

    # Chapter links may be relative; resolve them against the index URL.
    # A response-like object without a ``url`` attribute falls back to "",
    # in which case urljoin returns the href unchanged.
    base_url = getattr(res, "url", None) or ""

    # Collect the title and the text of every chapter. The lists are
    # extended rather than reassigned so earlier chapters are not lost.
    shus = {}
    for href in chapter_hrefs:
        chapter_url = urllib.parse.urljoin(base_url, href)
        with _open_url(chapter_url) as chapter_res:
            chapter_tree = etree.HTML(chapter_res.read())
        if chapter_tree is None:
            # Unparsable chapter page: skip it instead of aborting the run.
            continue
        shus.setdefault('章节', []).extend(
            chapter_tree.xpath("//div[@class='bookname']/h1/text()")
        )
        shus.setdefault("内容", []).extend(
            chapter_tree.xpath("//div[@id='content']/text()")
        )
    return shus


def main():
    """Prompt for a book id, scrape that book and return the result."""
    url = "https://www.biquge5200.com/"
    # The book id is the path segment after the host: for
    # https://www.biquge5200.com/0_9/ the id is "0_9".
    shuhao = input("请输入书号")
    with chu_url(url, shuhao) as index_res:
        text = wanmei_spider(index_res)
    return text


if __name__ == '__main__':
    main()
笔趣阁的小说爬取
猜你喜欢
转载自blog.csdn.net/mjp_erhuo/article/details/80106145
今日推荐
周排行