Preparation
1. Enter the Douban book channel: https://book.douban.com
2. Find a book of interest, go to its page and view the book's reviews
3. Analyze the URL pattern of the comment pages and extract the common prefix: https://book.douban.com/subject/book_id/comments?
where book_id is the number of the book in the address bar of the web page
Code to implement crawler
# Fetch an HTML page.
# Returns the page text, or '' if the request fails for any reason,
# so a single bad page does not abort the whole crawl.
def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP errors are
        # expected here; anything else should surface as a real bug.
        return ''


# Extract the comment texts from one Douban comments page.
# Returns a list of newline-terminated comment strings.
def getComment(html):
    soup = BeautifulSoup(html, 'html.parser')
    comments_list = []  # one entry per comment, each ending in '\n'
    comment_nodes = soup.select('.comment > p')
    for node in comment_nodes:
        # Flatten multi-line comments onto a single line.
        comments_list.append(node.get_text().strip().replace("\n", "") + u'\n')
    return comments_list


# Crawl the comment pages of a book and save all comments to `fpath`.
# book_id and depth are now parameters; the defaults reproduce the
# original behavior (book 1799652, 16 pages of 20 comments each).
def saveCommentText(fpath, book_id='1799652', depth=16):
    pre_url = "https://book.douban.com/subject/" + book_id + "/comments?"
    with open(fpath, 'w', encoding='utf-8') as f:
        for i in range(depth):
            print('Start crawling comments on page {}...'.format(i))
            url = pre_url + 'start=' + str(20 * i) + '&limit=20&sort=new_score&' + 'status=P'
            html = getHtml(url)
            f.writelines(getComment(html))
            # Random sleep (1.05–2.0 s) to reduce the risk of the IP being blocked.
            time.sleep(1 + float(random.randint(1, 20)) / 20)
Generate word cloud
The word cloud is generated using the wordcloud component
In addition, you must specify the background image and the resource path of the text font file, otherwise the Chinese cannot be displayed
Because the comments are Chinese text, they must first be segmented into words (here using jieba) before the word cloud can be generated.
Word segmentation
# Segment the crawled comments with jieba and save the space-separated
# result to cut_word.txt (the input format expected by wordcloud).
def cutWords(fpath):
    # Collect segments in a list and join once at the end — repeated
    # `text +=` in a loop is quadratic in the total text size.
    pieces = []
    with open(fpath, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = line.strip('\n')
            pieces.append(' '.join(jieba.cut(line)))
            pieces.append(' ')
    text = ''.join(pieces)
    # Plain open() handles encoding in Python 3; the original's
    # codecs.open here was redundant and inconsistent with the read above.
    with open('cut_word.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print("\nThe word segmentation is completed, the file is saved successfully!")
Create word cloud images
# Build the word cloud from cut_word.txt, display three views of it
# (plain, recolored from the mask image, and the mask itself), then
# save the image to comment_cloud.jpg.
def drawWordcloud():
    with codecs.open('cut_word.txt', encoding='utf-8') as f:
        comment_text = f.read()
    color_mask = imread("comment.jpeg")  # background/mask image
    # Words excluded from the cloud.  NOTE: the original list was missing
    # commas between the last entries, so adjacent string literals were
    # silently concatenated into one bogus stopword; fixed here.  A set
    # also gives O(1) membership checks and drops the duplicate 'but'.
    stopwords = {u'is', u'author', u'you', u'so', u'but', u'what', u'no',
                 u'this', u'that', u'everyone', u'compare', u'see',
                 u'really', u'except', u'when', u'already', u'can',
                 u',', u'.'}
    cloud = WordCloud(
        font_path="FZYTK.TTF",       # a CJK-capable font, otherwise Chinese cannot be displayed
        background_color='white',
        max_words=200,
        max_font_size=200,
        min_font_size=4,
        mask=color_mask,
        stopwords=stopwords)
    word_cloud = cloud.generate(comment_text)  # generate the word cloud
    image_colors = ImageColorGenerator(color_mask)
    # Show the plain word cloud.
    plt.imshow(cloud)
    plt.axis("off")
    # Recolor the cloud using the colors of the background image.
    plt.figure()
    plt.imshow(cloud.recolor(color_func=image_colors))
    plt.axis("off")
    # Show the background image itself in grayscale.
    plt.figure()
    plt.imshow(color_mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
    # Save the picture.
    word_cloud.to_file("comment_cloud.jpg")
    print('Word cloud image saved successfully')
Run result: