Crawling Hupu NBA News

1. Libraries required for crawling the data

import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image,ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
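
If any of these libraries are missing, they can usually be installed with pip (these are the standard PyPI package names for the imports above; note that bs4 is installed as beautifulsoup4 and PIL as pillow):

pip install requests beautifulsoup4 jieba pillow numpy matplotlib wordcloud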

2. An error may occur when installing the wordcloud library

The workaround is:

  • The installation error message suggests downloading the Visual C++ build tools from the official site, but that installer is several GB, which is far too large just to build one package.
  • Instead, go to https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud and download the prebuilt .whl file matching your Python version and system architecture (32- or 64-bit), then install it with pip (see the example below).
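
For example, with a wheel for 64-bit Python 3.8 (the exact filename depends on the file you downloaded, so this name is illustrative):

pip install wordcloud-1.8.1-cp38-cp38-win_amd64.whl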

3. The basic idea of the crawl

Inspect the site's HTML nodes, crawl the titles and article bodies of Hupu NBA news, save the crawled text to a .txt file, segment it with jieba, and generate a word cloud.

The crawl collects 12,000 articles, about 3 million words in total (I didn't realize it would be that many at first).
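
Before the full script, here is a tiny, self-contained sketch of the keyword-extraction step it relies on (the sample sentence is my own; on such a short text TextRank may return only a few words):

import jieba.analyse

text = '湖人队在洛杉矶击败勇士队，詹姆斯拿下三双，球迷在现场欢呼庆祝胜利。'
# TextRank keyword extraction, the same call getWord() uses on the crawled corpus
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True):
    print(word, weight)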

import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image,ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
url = 'https://voice.hupu.com/nba/1'
# Get the titles and contents of the first 12,000 Hupu NBA news items
def AlltitleAndUrl(url):
    j = 0
    reslist = requests.get(url)
    reslist.encoding = 'utf-8'
    soup_list = BeautifulSoup(reslist.text, 'html.parser')
    for news in soup_list.select('li'):  # first page
        if len(news.select('h4')) > 0:
            j = j + 1
            print(j)
            # title
            title = news.find('h4').text
            href = news.find('h4').a['href']
            reslist = requests.get(href)
            reslist.encoding = 'utf-8'
            soup = BeautifulSoup(reslist.text, 'html.parser')
            # 'artical-main-content' is the class name used in the site's own HTML
            context = soup.select('div .artical-main-content')[0].text
            f = open('dongman.txt', 'a', encoding='utf-8')
            f.write(title)
            f.write(context)
            f.close()
            print("Article title: " + title)
            print(context)
            # print('https://voice.hupu.com/nba/%s' % i)

    # pages 2 through 200
    for i in range(2, 201):
        pages = i
        nexturl = 'https://voice.hupu.com/nba/%s' % (pages)
        # nexturl = '%s%s%s' % (head, pages, tail)
        newcontent = requests.get(nexturl)
        newcontent.encoding = 'utf-8'
        soup_alllist = BeautifulSoup(newcontent.text, 'html.parser')

        for news in soup_alllist.select('li'):
            if len(news.select('h4')) > 0:
                j = j + 1
                # title
                title = news.find('h4').text
                href = news.find('h4').a['href']
                reslist = requests.get(href)
                reslist.encoding = 'utf-8'
                soup = BeautifulSoup(reslist.text, 'html.parser')
                context = soup.select('div .artical-main-content')[0].text
                f = open('dongman.txt', 'a', encoding='utf-8')
                f.write(title)
                f.write(context)
                f.close()
                print("Article title: " + title)
                print(context)
                print(j)


def getWord():
    lyric = ''
    # Read the saved text line by line to build the full corpus for the word cloud
    f = open('dongman.txt', 'r', encoding='utf-8')
    for line in f:
        lyric += line
    f.close()
    # Extract the top keywords and their weights with TextRank
    result = jieba.analyse.textrank(lyric, topK=2000, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    print(keywords)

    # Load the template image used as the word cloud mask
    image = Image.open('body.png')
    graph = np.array(image)
    # Configure the word cloud
    wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='white', max_words=230, mask=graph, random_state=30, scale=1.5)
    wc.generate_from_frequencies(keywords)
    # Recolor the words from the template image, display and save the result
    image_color = ImageColorGenerator(graph)
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis("off")
    plt.show()
    wc.to_file('dream.png')


AlltitleAndUrl(url)  # crawl first
getWord()            # then build the word cloud from the saved text
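
The per-article logic above is duplicated between the first-page loop and the pages 2-200 loop. As a minimal refactor sketch (the helper names crawl_page and save_article are mine, not from the original), the same crawl could read:

import requests
from bs4 import BeautifulSoup

def save_article(title, context, path='dongman.txt'):
    # Append one article's title and body to the output file
    with open(path, 'a', encoding='utf-8') as f:
        f.write(title)
        f.write(context)

def crawl_page(pageurl):
    # Fetch one listing page and follow every article link on it
    res = requests.get(pageurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if news.select('h4'):
            title = news.find('h4').text
            href = news.find('h4').a['href']
            article = requests.get(href)
            article.encoding = 'utf-8'
            body = BeautifulSoup(article.text, 'html.parser')
            context = body.select('div .artical-main-content')[0].text
            save_article(title, context)

# pages 1 through 200, the same range as the original two loops
for i in range(1, 201):
    crawl_page('https://voice.hupu.com/nba/%s' % i)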

Data screenshot:

Result screenshot:
