Python crawler: scraping The Paper (澎湃新闻) news

The script below pulls article listings from thepaper.cn's COVID-19 channel, fetches each article's body text, and stores the results in a local MongoDB collection.

import requests
import pymongo
from bs4 import BeautifulSoup
import getNews  # local helper module (not shown in the post; a sketch follows the script)

client = pymongo.MongoClient(host='localhost', port=27017, connect=False)
pengpai = client['pengpai']
# Start from a clean collection on every run
pengpai_news = pengpai['pengpai_news']
if 'pengpai_news' in pengpai.list_collection_names():
    pengpai_news.drop()
# COVID-19 ("anti-epidemic") channel: the channel page itself plus its AJAX
# pagination endpoint; lastTime is a fixed timestamp captured from the site
urlList = ['https://www.thepaper.cn/channel_90077']
url_1 = 'https://www.thepaper.cn/'
url_2 = 'load_index.jsp?nodeids=90069,&channelID=90077&topCids=,5922202,5934344,5934605,5934601,5934698&pageidx='
url_3 = '&lastTime=1581492637041'
urlList.extend(url_1 + url_2 + str(n) + url_3 for n in range(1, 30))

num = 0
for url in urlList:
    news_data = requests.get(url)
    news_data.encoding = 'utf-8'
    soup = BeautifulSoup(news_data.text, 'lxml')
    news_item = soup.select('.news_li')
    for new in news_item:
        # Skip list entries without a headline (ads, separators, etc.)
        if len(new.select('h2')) != 0:
            title = new.select('h2')[0].text.strip()
            text_href = 'https://www.thepaper.cn/' + new.select('a')[0]['href']
            pic_src = 'https://www.thepaper.cn/' + new.select('img')[0]['src']
            num += 1
            try:
                news_info = getNews.getNewsTxt(text_href)
            except Exception as e:
                # Skip the item entirely; the original bare except would have
                # gone on to insert a stale (or undefined) news_info here
                print('failed to fetch article body:', e)
                continue
            data = {
                'title': title,
                'title_link': text_href,
                'pic_link': pic_src,
                'news_info': news_info,
            }
            print('Inserted item #%d:' % num, data)
            pengpai_news.insert_one(data)
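
The script imports a local getNews module that the post never shows. Below is a minimal sketch of what its getNewsTxt helper might look like, assuming the article body sits in a .news_txt element on the detail page; the selector, timeout, and error handling are assumptions, not taken from the original post.

# getNews.py -- hypothetical sketch; the real module is not included in the post
import requests
from bs4 import BeautifulSoup

def getNewsTxt(url):
    # Fetch the article page; the 10 s timeout is an assumed safeguard
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'lxml')
    # Assumes the body text lives in a .news_txt element, which matched
    # thepaper.cn article pages around the time the post was written
    body = soup.select_one('.news_txt')
    if body is None:
        raise ValueError('no .news_txt element found at ' + url)
    return body.text.strip()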
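Once the script has run, a quick sanity check from a Python shell confirms the documents landed in MongoDB (assuming the same localhost instance as above):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
coll = client['pengpai']['pengpai_news']
print(coll.count_documents({}))   # number of stored articles
print(coll.find_one())            # inspect one document's fields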

Reposted from blog.csdn.net/weixin_44600471/article/details/104298187