BeautifulSoup语法笔记(爬取新浪新闻)

以爬取新浪新闻为例

import re
import requests
from bs4 import  BeautifulSoup
import json
from datetime import datetime

def getSoup(newsurl, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        newsurl: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.
    """
    # requests applies no timeout by default, so an unresponsive server
    # would block this call indefinitely; bound the wait explicitly.
    res = requests.get(newsurl, timeout=timeout)
    # Decode the body as UTF-8 instead of the encoding requests guessed.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')

newsurl为新浪新闻首页某则新闻的链接

打印出soup查看结构

 # Take the text of the first <title> element matched by the selector.
 title=soup.select('title')[0].text

def getArtcle(soup):
    """Return the article body as one space-separated string.

    Selects every ``<p>`` under the element with id ``article``, skips the
    last match, strips surrounding whitespace from each paragraph's text
    and joins the pieces with single spaces.
    """
    paragraphs = soup.select('#article p')
    # The final matched paragraph is deliberately left out.
    return ' '.join(node.text.strip() for node in paragraphs[:-1])
# Demo: fetch one news page with getSoup and print the text getArtcle extracts.
print(getArtcle(getSoup('http://news.sina.com.cn/w/2018-07-17/doc-ihfkffam0100205.shtml')))

注意:show_author 是该 p 标签的 class 而不是 id,因此不能用 id 选择器直接写成 soup.select('#show_author')[0].text.strip('责任编辑:')(会匹配不到元素),而应该按 class 属性来选择,用下面这种语法:

# str.strip('责任编辑:') treats its argument as a set of characters, so it would
# also eat those characters from the editor's name; remove the label as an
# anchored prefix instead (ASCII or full-width colon), then trim whitespace.
editor=re.sub(r'^\s*责任编辑\s*[::]?\s*','',soup.select('p[class="show_author"]')[0].text).strip()

最后贴出完整的代码

import re
import requests
from bs4 import  BeautifulSoup
import json
from datetime import datetime

def getSoup(newsurl, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        newsurl: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.
    """
    # requests applies no timeout by default, so an unresponsive server
    # would block this call indefinitely; bound the wait explicitly.
    res = requests.get(newsurl, timeout=timeout)
    # Decode the body as UTF-8 instead of the encoding requests guessed.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')

def getTitle(soup):
    """Return the text of the first element matched by the ``title`` selector.

    Raises:
        IndexError: If the selector matches nothing.
    """
    return soup.select('title')[0].text

def getArtcle(soup):
    """Return the article body as one space-separated string.

    Every ``<p>`` under the element with id ``article`` is selected; the
    last match is skipped, each remaining paragraph's text is stripped of
    surrounding whitespace, and the results are joined with single spaces.
    """
    body_nodes = soup.select('#article p')[:-1]  # last match is left out
    cleaned = [node.text.strip() for node in body_nodes]
    return ' '.join(cleaned)

def getEditor(soup):
    """Return the editor's name from the ``<p class="show_author">`` element.

    The element text is expected to look like ``责任编辑:<name>``. The label
    (followed by an optional ASCII or full-width colon) is removed only when
    it appears at the start, and surrounding whitespace is trimmed.

    Raises:
        IndexError: If no ``p[class="show_author"]`` element is found.
    """
    raw = soup.select('p[class="show_author"]')[0].text
    # str.strip('责任编辑:') would treat the label as a *set of characters*
    # and also eat those characters from the name itself (e.g. a name
    # beginning with 任 or 编), so strip the label as an anchored prefix.
    return re.sub(r'^\s*责任编辑\s*[::]?\s*', '', raw).strip()

def getTime(soup):
    """Return the text of the first ``<span class="date">`` element.

    Raises:
        IndexError: If the selector matches nothing.
    """
    date_spans = soup.select('span[class="date"]')
    return date_spans[0].text

def catch_all(newsurl):
    """Fetch one news page and print its title, time, body and editor.

    Args:
        newsurl: URL of the news page to download and parse.
    """
    page = getSoup(newsurl)
    # Extract the fields in the same order they are printed.
    headline = getTitle(page)
    published = getTime(page)
    body = getArtcle(page)
    editor = getEditor(page)
    print('新闻标题:', headline,
          '\n时间:', published,
          '\n新闻内容:', body,
          '\n编辑作者:', editor,
          '\n')

# Pass the link of a news item from the Sina news front page to catch_all
# to print that item's title, time, content and editor.

catch_all('http://news.sina.com.cn/w/2018-07-17/doc-ihfkffam0100205.shtml')

catch_all('http://news.sina.com.cn/c/2018-07-17/doc-ihfkffak9643422.shtml')

猜你喜欢

转载自blog.csdn.net/u014165082/article/details/81083120