Python [Web Scraping] Scraping a Million Sina News Articles from the Sina Rolling News Center (with Multiprocessing)

I have recently been scraping news with Python, so I looked into how to scrape Sina, NetEase, and China News Service. The scrapers for the other sites are covered in my other posts, so have a look there if you need them~
First, since the data to collect is on the order of a million records, I went straight to Sina's rolling news center:

https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1

I need 1,000,000 records in total, split across 10 news categories with 100,000 per category. At 50 links per page of the rolling feed, that works out to 2,000 pages per category.

Collecting a million news links and storing them in CSV files

1. Get the 50 news links on the first page of the rolling news feed
To fetch the news links on a single page, you can simply write the following (for testing):

import requests

# Sina rolling-news API: pageid/lid select the feed, num=50 links per page
init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}'
headers = {'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/55.0.2883.87 Safari/537.36'}

page = requests.get(url=init_url.format(1), headers=headers).json()
for j in range(50):
    url = page['result']['data'][j]['wapurl']   # mobile (wap) link of the j-th item
    print(url)
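
Each element of page['result']['data'] is a dict; besides wapurl there is also a desktop url field, which is what the category script below uses. If you want to see what other fields the API returns, a quick check (a minimal sketch that only reuses the page dict fetched above) is:

# Inspect one item of the JSON payload to see which fields are available.
first_item = page['result']['data'][0]
print(sorted(first_item.keys()))   # field names the API actually returns
print(first_item['url'])           # desktop link, used in the next section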

2. Get the news links for one category
Taking the 科技 (tech) category as an example:

'''
Collect links from the Sina rolling news feed
and save them to a CSV file.
'''
import sys
import os
import requests
from pandas import DataFrame

def get_URL(i):
    # Other categories use the same API with a different lid value:
    #init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2513&k=&num=50&page={}'  # 娱乐 (entertainment)
    #init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2511&k=&num=50&page={}'  # 国际 (international)
    #init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2512&k=&num=50&page={}'  # 体育 (sports)
    #init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2514&k=&num=50&page={}'  # 军事 (military)
    init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2515&k=&num=50&page={}'  # 科技 (tech)
    #init_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&k=&num=50&page={}'  # 财经 (finance)
    page = requests.get(url=init_url.format(i), headers=headers).json()
    links = []
    for j in range(50):
        urls = page['result']['data'][j]['url']
        links.append(urls)
    return links

def main():
    global headers
    headers = {'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/55.0.2883.87 Safari/537.36'}
    pagenum = 50  # number of pages to scrape (50 links per page)
    link_list = []
    for i in range(1, pagenum + 1):
        try:
            links = get_URL(i)
            link_list = link_list + links
        except Exception:
            print("Failed to fetch the links on page " + str(i))
        else:
            print("Fetched all links on page " + str(i))

    c = {'url': link_list}
    data = DataFrame(c)

    root = ".//newsCollection//"
    path = root + "科技.csv"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
            print('mkdir success')
        data.to_csv(path)
    except IOError:
        print('sorry, write failed')
    else:
        print("---科技.csv has been created---")

if __name__ == "__main__":
    sys.setrecursionlimit(100000)  # raise the default recursion depth limit
    main()
    #print("Total time: " + str(round((end-start)/60, 2)) + ' minutes')
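
Switching categories by commenting and uncommenting the URL in get_URL() works, but it is easy to forget to change the output filename at the same time. As a possible refactor (a sketch that only uses the lid values listed in the comments above and assumes the same headers dict), the category and filename can be driven from one table:

# Sketch: select the category and output file from one dict, using the lid
# values from the comments in get_URL() above.
CATEGORIES = {
    '娱乐': 2513,  # entertainment
    '国际': 2511,  # international
    '体育': 2512,  # sports
    '军事': 2514,  # military
    '科技': 2515,  # tech
    '财经': 2516,  # finance
}

API = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid={}&k=&num=50&page={}'

def get_category_links(name, page_no):
    """Fetch the 50 links on one page of the given category."""
    page = requests.get(url=API.format(CATEGORIES[name], page_no), headers=headers).json()
    return [item['url'] for item in page['result']['data']]

# e.g. save the links to name + '.csv' so the filename always matches the category:
# links = get_category_links('科技', 1)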

Fetching the article content for each news link

1. Fetch the content of a single news link

# Fetch the content of a single news article
import requests
from bs4 import BeautifulSoup

url = 'https://mil.news.sina.com.cn/2019-12-09/doc-iihnzahi6355412.shtml'
res = requests.get(url)
res.encoding = 'utf-8'   # Sina article pages are UTF-8 encoded
soup = BeautifulSoup(res.text, 'html.parser')
title = soup.select(".main-title")[0].text
print(title)
article_content = ""
article = soup.select('.article p')[:-1]   # the last paragraph is the source line, not needed
for p in article:
    article_content = article_content + p.text.strip()
print(article_content)
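
The multiprocessing snippet in the next step calls a parse(url) function. One way to get it is simply to wrap the single-article code above in a function (a minimal sketch; the dict return format is just one possible choice, and error handling is left to the caller):

def parse(url):
    """Fetch one Sina article and return its title and body text."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.select('.main-title')[0].text
    paragraphs = soup.select('.article p')[:-1]   # drop the trailing source line
    content = ''.join(p.text.strip() for p in paragraphs)
    return {'url': url, 'title': title, 'content': content}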

2. Parse the link contents with multiple processes
Since there are as many as a million links, parsing them one at a time would take far too long, so it is worth bringing in multiprocessing. The core code is below; it assumes the parse(url) function sketched above, plus url_list holding the links and record_list collecting the results:

pool = Pool(8)   # create a pool of 8 worker processes
res_list = []
for url in url_list:
    res = pool.apply_async(func=parse, args=(url,))
    res_list.append(res)
count = 0
for res in res_list:
    count = count + 1
    try:
        result = res.get()   # block until this task's result is ready
        print('Article ' + str(count) + ' fetched successfully')
    except Exception:
        print('Article ' + str(count) + ' failed, moving on to the next one')
        continue
    record_list.append(result)

pool.close()
pool.join()   # wait for all worker processes to finish
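
The snippet above leaves out where url_list comes from and what happens to record_list afterwards. A minimal sketch of that surrounding glue, assuming the link CSV written earlier (科技.csv with a 'url' column) as input and an assumed 科技_content.csv as output, could look like this:

import pandas as pd
from multiprocessing import Pool

def run(csv_path, out_path, workers=8):
    # Read the links collected earlier; 'url' is the column written by to_csv above.
    url_list = pd.read_csv(csv_path)['url'].tolist()

    pool = Pool(workers)
    async_results = [pool.apply_async(parse, (u,)) for u in url_list]
    pool.close()

    record_list = []
    for i, res in enumerate(async_results, 1):
        try:
            record_list.append(res.get())
        except Exception:
            print('Article ' + str(i) + ' failed, skipping')
    pool.join()

    # Save whatever parse() returned for each article.
    pd.DataFrame(record_list).to_csv(out_path, index=False)

if __name__ == "__main__":
    run(".//newsCollection//科技.csv", ".//newsCollection//科技_content.csv")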

Please credit the source if you repost, thanks~

Reposted from blog.csdn.net/Iv_zzy/article/details/107535041