Python: crawling the content of Sina News


import requests
import json
from bs4 import BeautifulSoup
import re
import pandas
import sqlite3


commenturl='https://comment.sina.com.cn/page/info?version=1&format=json' \
           '&channel=gn&newsid=comos-{}&group=undefined&compress=0&' \
           'ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread' \
           '=1&callback=jsonp_1543748934208'
# Get the comment count for one article
def getCommentCounts(newsurl):
    # Extract the article ID from the URL with a regular expression
    m = re.search('doc-i(.*).shtml', newsurl)
    newsid = m.group(1)
    # Fill the {} placeholder in the comment API URL
    comments = requests.get(commenturl.format(newsid))
    # The response is JSONP: jsonp_1543748934208({...}). str.strip() removes a
    # character *set*, not a literal wrapper, so extract the JSON payload
    # inside the parentheses with a regex instead
    jd = json.loads(re.search(r'\((.*)\)', comments.text, re.S).group(1))
    # Return the total comment count
    return jd['result']['count']['total']
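
# A quick sanity check of the function above; the URL is a hypothetical
# placeholder for any article matching the doc-i<newsid>.shtml pattern:
# print(getCommentCounts('https://news.sina.com.cn/c/doc-i<newsid>.shtml'))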

# Extract the body and metadata of one news article
def getNewsDetail(newsurl):
    # Collect the fields in a dict
    result = {}
    rsp = requests.get(newsurl)
    rsp.encoding = 'utf-8'
    soup = BeautifulSoup(rsp.text, 'html.parser')
    # Title
    result['title'] = soup.select('.main-title')[0].text
    # Date
    result['time'] = soup.select('.date')[0].text
    # Source
    result['source'] = soup.select('.source')[0].text
    # Body text; the last <p> holds the editor line, so drop it
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#article p')[:-1]])
    # Editor; replace the literal prefix rather than lstrip(), which strips a character set
    result['editor'] = soup.select('.show_author')[0].text.replace('责任编辑:', '').strip()
    # Comment count
    result['comment'] = getCommentCounts(newsurl)
    return result
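
# Note: select(...)[0] raises IndexError whenever a page lacks one of the
# elements above (some articles carry no .show_author, for instance). A
# minimal guard sketch; the helper name selectText is my own, not part of
# the original post:
def selectText(soup, selector, default=''):
    nodes = soup.select(selector)
    return nodes[0].text if nodes else default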

# Fetch one page of the news list and crawl every article on it
def parseListLinks(url):
    newsdetails = []
    rsp = requests.get(url)
    # The response is JSONP: try{feedCardJsonpCallback({...});}catch(e){};
    # lstrip()/rstrip() strip character sets (which also eats the closing
    # braces of the JSON), so extract the payload with a regex instead
    m = re.search(r'feedCardJsonpCallback\((.*)\)\s*;?\s*}\s*catch', rsp.text, re.S)
    jd = json.loads(m.group(1))
    # Walk the news links on this page
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails

url='https://feed.sina.com.cn/api/roll/' \
    'get?pageid=121&lid=1356&num=20&versionNumber=1.2.4' \
    '&page={}&encode=utf-8&callback=feedCardJsonpCallback&_'
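# The trailing &_ in the template is the usual JSONP cache-buster parameter;
# the original leaves its value empty. If you prefer to fill it with the
# conventional millisecond timestamp, a sketch (requires import time):
# url = url + str(int(time.time() * 1000))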
news_total = []
for i in range(1, 3):  # set the number of pages to crawl as you like
    # Fill the {} placeholder in the list URL
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
# Tabulate the scraped records with the pandas module
df = pandas.DataFrame(news_total)
# Save as an xlsx file (pandas needs an Excel writer such as openpyxl installed)
df.to_excel('news.xlsx')
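
# sqlite3 is imported at the top but never used; a minimal sketch of also
# persisting the results to a local SQLite database (the file name
# news.sqlite and table name news are my own choices):
con = sqlite3.connect('news.sqlite')
df.to_sql('news', con, if_exists='replace', index=False)
con.close()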

Reposted from blog.csdn.net/qq_42680202/article/details/84856958