1. Class libraries required to crawl data
```python
import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
```
2. An error occurred when installing the wordcloud library
The workaround is:
- The pip error message suggests installing the Visual C++ build tools from Microsoft's site, but that installer is several GB, which is far too heavy just for one library.
- Instead, go to https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud and download the prebuilt whl file, choosing the one that matches your Python version and system bitness, then install it with pip as shown below.
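Once the file is downloaded, a local pip install finishes the job. The exact filename depends on your Python version and architecture; the one below is only an example for 64-bit CPython 3.8:

```
pip install wordcloud-1.8.1-cp38-cp38-win_amd64.whl
```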
3. The basic idea of crawling
Inspect the HTML nodes of the site, crawl the titles and article pages of Hupu NBA news, save the crawled text to a txt file, run word segmentation on it, and generate a word cloud.
The crawl collects 12,000 articles, about 3 million characters in total (I didn't realize it would be that many at first).
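Before writing the full crawler it is worth confirming the node structure by hand. The short sketch below is a minimal check, assuming Hupu's list page still wraps each headline in an `<h4>` that contains an `<a>` (the same structure the full script below relies on); it fetches page 1 and prints a few headlines with their links:

```python
import requests
from bs4 import BeautifulSoup

# Fetch the first list page and print a few headlines with their article links
res = requests.get('https://voice.hupu.com/nba/1')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

for news in soup.select('li')[:5]:
    h4 = news.find('h4')
    if h4 is not None and h4.a is not None:  # only <li> items that hold a headline
        print(h4.text.strip(), '->', h4.a['href'])
```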
```python
import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

url = 'https://voice.hupu.com/nba/1'

# Get the title and content of the first 12,000 news items on Hupu NBA news
def AlltitleAndUrl(url):
    j = 0
    reslist = requests.get(url)
    reslist.encoding = 'utf-8'
    soup_list = BeautifulSoup(reslist.text, 'html.parser')

    # front page
    for news in soup_list.select('li'):
        if len(news.select('h4')) > 0:
            j = j + 1
            print(j)
            # title and link of the article
            title = news.find('h4').text
            href = news.find('h4').a['href']
            reslist = requests.get(href)
            reslist.encoding = 'utf-8'
            soup = BeautifulSoup(reslist.text, 'html.parser')
            context = soup.select('div .artical-main-content')[0].text
            with open('dongman.txt', 'a', encoding='utf-8') as f:
                f.write(title)
                f.write(context)
            print("Article title: " + title)
            print(context)

    # the following pages
    for i in range(2, 201):
        nexturl = 'https://voice.hupu.com/nba/%s' % i
        newcontent = requests.get(nexturl)
        newcontent.encoding = 'utf-8'
        soup_alllist = BeautifulSoup(newcontent.text, 'html.parser')
        # fixed: iterate soup_alllist here (the original looped over soup_list,
        # which would re-process page 1 on every iteration)
        for news in soup_alllist.select('li'):
            if len(news.select('h4')) > 0:
                j = j + 1
                title = news.find('h4').text
                href = news.find('h4').a['href']
                reslist = requests.get(href)
                reslist.encoding = 'utf-8'
                soup = BeautifulSoup(reslist.text, 'html.parser')
                context = soup.select('div .artical-main-content')[0].text
                with open('dongman.txt', 'a', encoding='utf-8') as f:
                    f.write(title)
                    f.write(context)
                print("Article title: " + title)
                print(context)
                print(j)


def getWord():
    # Read the crawled text in one go so it can be analysed as a whole
    # ('3.txt' is the saved crawl output, i.e. a copy of dongman.txt)
    with open('3.txt', 'r', encoding='utf-8') as f:
        lyric = f.read()

    # Extract the top 2000 keywords with TextRank, keeping their weights
    result = jieba.analyse.textrank(lyric, topK=2000, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    print(keywords)

    # Template image that shapes (and colors) the word cloud
    image = Image.open('body.png')
    graph = np.array(image)

    # Set up the word cloud
    wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='white',
                   max_words=230, mask=graph, random_state=30, scale=1.5)
    wc.generate_from_frequencies(keywords)
    image_color = ImageColorGenerator(graph)
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis("off")
    plt.show()
    wc.to_file('dream.png')


# crawl first, then build the word cloud from the saved text
AlltitleAndUrl(url)
getWord()
```
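One thing to watch when running it: the crawler appends everything to dongman.txt, while getWord() reads 3.txt, so copy or rename the crawl output before generating the cloud (or point both functions at the same file). The cloud is displayed on screen and also saved as dream.png; body.png is the shape and color template, and simhei.ttf must be present so the Chinese words render correctly.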
Data screenshot:
Result screenshot: