"""Qiushibaike scraper: fetch joke text from qiushibaike.com and save it to a file."""
import requests
from lxml import html


def reptile_q(url, node):
    """Fetch one page and append the extracted joke texts to the output file.

    url:  full URL of the page to fetch
    node: XPath expression selecting the text nodes to extract
    """
    page = requests.Session().get(url)
    tree = html.fromstring(page.text)
    fragments = tree.xpath(node)

    # The page's text nodes arrive as many small fragments; join them, drop
    # triple-newline runs, then split on blank lines so each joke is one entry.
    joined = ''.join(fragments)
    joined = ''.join(joined.split('\n\n\n'))
    entries = joined.split('\n\n')

    # BUG FIX: the original opened with "w+" here, truncating the file on
    # every page so only the last page survived a multi-page run. Append mode
    # keeps all pages; "with" guarantees the handle is closed even on error.
    with open("F://糗事百科.txt", "a", encoding='utf-8') as out:
        for entry in entries:
            out.write(entry + "\n")


def reptile_q_patch(url, num, node):
    """Scrape pages 1..num.

    url:  base URL; the page number and a trailing "/" are appended
    num:  number of pages to scrape
    node: XPath expression passed through to reptile_q
    """
    for page_no in range(1, num + 1):
        reptile_q(url + str(page_no) + "/", node)


if __name__ == "__main__":
    reptile_q_patch('https://www.qiushibaike.com/8hr/page/', 1,
                    '//div[@class="content"]//span/text()')
# Beginner Python web scraper (Qiushibaike jokes).
# Adapted from blog.csdn.net/qq_29499107/article/details/80015784