Python爬虫抓取页面内容

博客园示例(提示:可用 Ctrl+Alt+L 格式化代码):

#coding:utf-8
import requests
from lxml import etree


def gettitle(url, timeout=10):
    """Download a page and return the text of its post-title link.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The first text node of the ``<a id="cb_post_title_url">`` element.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no matching title element.
    """
    # Without an explicit timeout the request may wait on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    response.raise_for_status()

    tree = etree.HTML(response.text)
    matches = tree.xpath('//a[@id="cb_post_title_url"]/text()')
    if not matches:
        # Same exception type as the bare ``[0]`` lookup, but with a message
        # that says what was missing and where.
        raise IndexError("no post title found at {!r}".format(url))
    return matches[0]

def getcontent(url, timeout=10):
    """Download a page and return the text of its post-body paragraphs.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The text nodes matched by ``//div[@class="postBody"]/div/p/text()``,
        each one preceded by a newline and concatenated in document order.
        An empty string when nothing matches.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without an explicit timeout the request may wait on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    response.raise_for_status()

    tree = etree.HTML(response.text)
    paragraphs = tree.xpath('//div[@class="postBody"]/div/p/text()')
    # Every fragment gets a leading newline (including the first), which is
    # the same output the previous ``contents + "\n" + i`` loop produced.
    return "".join("\n" + text for text in paragraphs)
# Script entry point: guarded so that importing this module does not block on
# stdin or hit the network; running the file directly behaves as before.
if __name__ == "__main__":
    print("请输入博客园文章的链接:")
    # Strip stray whitespace that often comes along with a pasted URL.
    url = input("").strip()
    print(gettitle(url))
    print(getcontent(url))

 

发布了46 篇原创文章 · 获赞 9 · 访问量 3672

猜你喜欢

转载自blog.csdn.net/weixin_41896770/article/details/100099428