Using newspaper3k in Python to extract an article's title, images, keywords, and text

Using newspaper3k in Python to extract an article's title, images, keywords, and text

1. Rendering

insert image description here

The first picture is as follows:
insert image description here
insert image description here

2. Installation and error resolution

Python 3.7.4

pip install newspaper3k==0.2.8

3. Source code

import newspaper
from newspaper import Config

# Build a newspaper source object for the Sina home page, asking for
# Chinese ('zh') language handling.
sina_paper = newspaper.build('http://www.sina.com.cn/', language='zh')

# List every category URL the source reports, one per line.
# Sample output:
#   http://health.sina.com.cn
#   http://eladies.sina.com.cn
#   http://english.sina.com
category_urls = sina_paper.category_urls()
for category_url in category_urls:
    print(category_url)

# Browser User-Agent sent with article requests.  The original script assigned
# a Safari string and then immediately overwrote it with this Chrome string;
# the dead first assignment is kept only as a commented-out alternative.
# user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'

# newspaper configuration object carrying the custom User-Agent.
config = Config()
config.browser_user_agent = user_agent

# Work on the first article the source discovered.  The list can be empty
# (NOTE(review): newspaper's article memoization may skip already-seen
# articles on repeat runs — confirm against the library docs), so stop with a
# clear message instead of an opaque IndexError.
if not sina_paper.articles:
    raise SystemExit('No articles were found for the source; nothing to download.')
article = sina_paper.articles[0]
article.config = config
print(len(article.url), article.url)

# Some discovered links are an absolute URL glued onto the site root, e.g.
# "http://www.sina.com.cn/https://news.sina.com.cn/...".  Keep only the last
# embedded absolute URL.  The previous check used str.find("http"), which
# returns 0 for any URL that already starts with "http", so the repair never
# ran for exactly this case (and split(...)[2] could raise IndexError).
embedded_start = max(article.url.rfind('http://'), article.url.rfind('https://'))
if embedded_start > 0:
    article.url = article.url[embedded_start:]

print(len(article.url), article.url)
# Drop any stray spaces left inside the URL.
article.url = article.url.replace(' ', '')
print(len(article.url), article.url)

# Fetch the page, then parse it so the fields below are populated.
article.download()
article.parse()

# Dump the parsed article fields in a fixed order, one "<name>:  <value>"
# line per attribute.
# NOTE(review): `keywords` is presumably empty here because article.nlp() is
# never called in this script — confirm against the newspaper docs.
for field_name in (
    'text',
    'title',
    'publish_date',
    'tags',
    'top_image',
    'top_img',
    'url',
    'meta_keywords',
    'meta_img',
    'keywords',
):
    print(f'{field_name}: ', getattr(article, field_name))

# The two image collections are printed together with their sizes.
all_images = article.images
print('images:', len(all_images), all_images)
all_imgs = article.imgs
print('imgs', len(all_imgs), all_imgs)


# url = "https://www.newsweek.com/new-mexico-compound-charges-dropped-children-1096830".strip()
#
# content = []
# for i in [article.url]:
#     article = Article(str(i), config=config)
#     try:
#         article.download()
#         article.parse()
#         article.download()
#         print(article.html)
#         article.parse()
#         text = article.text
#         print(i, text)
#         content.append(text)
#     except Exception as e:
#         print(e)
#         content.append('error')
#     continue
# print(content)

reference

Guess you like

Source: blog.csdn.net/qq_40985985/article/details/130603142