# Python beginner web crawler (Qiushibaike jokes)

# Web crawler for Qiushibaike: scrapes all joke entries from the site.
import requests
from lxml import html

def reptile_q(url, node, out_path="F://糗事百科.txt"):
    """Fetch *url*, extract text fragments matching the XPath *node*,
    and append each cleaned snippet as one line to *out_path*.

    Args:
        url:  Page URL to fetch.
        node: XPath expression selecting the text nodes to extract.
        out_path: Destination text file (default preserves the original
            hard-coded path; overridable for other locations).
    """
    page = requests.Session().get(url)
    tree = html.fromstring(page.text)
    fragments = tree.xpath(node)

    # Collapse the raw fragments into one string, drop the triple-newline
    # separators, then split on blank lines so each joke is one element.
    joined = ''.join(fragments)
    joined = ''.join(joined.split('\n\n\n'))
    snippets = joined.split('\n\n')

    # "with" guarantees the file is closed even if writing raises.
    with open(out_path, "w+", encoding='utf-8') as qsbk_file:
        for snippet in snippets:
            qsbk_file.write(snippet + "\n")

# url: base URL to crawl; num: how many pages; node: XPath of the content to extract
def reptile_q_patch(url, num, node):
    """Crawl pages 1..num by appending the page number (plus '/') to *url*.

    Args:
        url:  Base listing URL ending just before the page number.
        num:  Number of pages to crawl (pages 1 through num inclusive).
        node: XPath expression passed through to reptile_q.
    """
    # Idiomatic counted loop; str() instead of calling __str__ directly.
    for page_no in range(1, num + 1):
        reptile_q(url + str(page_no) + "/", node)


# Guarded entry point: importing this module no longer triggers a crawl.
if __name__ == "__main__":
    reptile_q_patch('https://www.qiushibaike.com/8hr/page/', 1, '//div[@class="content"]//span/text()')

# --- blog-page artifacts from the original paste (kept for attribution) ---
# 猜你喜欢 (You may also like)
# Reposted from blog.csdn.net/qq_29499107/article/details/80015784
# 今日推荐 (Today's picks)