Crawling Douban.com Book Reviews with Python

Preparation

1. Enter the Douban book channel: https://book.douban.com

2. Find a book you are interested in, open its page, and view its reviews

3. Analyze the structure of the comment-page URLs and extract the common part: https://book.douban.com/subject/book_id/comments?

  where book_id is the book's numeric ID shown in the browser's address bar
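
For example, with book_id 1799652 (the book crawled below) and 20 comments per page, the second page of comments is:

https://book.douban.com/subject/1799652/comments?start=20&limit=20&sort=new_score&status=P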

Implementing the crawler

import requests

#Fetch an HTML page, returning an empty string on any request error
def getHtml(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        return ''
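
Douban sometimes rejects requests that carry no browser-like User-Agent header. If getHtml keeps returning empty strings, a variant that sends one may help; this is a sketch, and the header string is only an illustrative example:

def getHtmlWithUA(url):
    # hypothetical helper: same as getHtml, but with an example User-Agent header
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        return ''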


from bs4 import BeautifulSoup

#Extract the comments from a page
def getComment(html):
    soup = BeautifulSoup(html, 'html.parser')
    comments_list = []  # list of comment strings
    comment_nodes = soup.select('.comment > p')  # each comment sits in a <p> under .comment
    for node in comment_nodes:
        comments_list.append(node.get_text().strip().replace("\n", "") + '\n')
    return comments_list
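
The selector can be sanity-checked on a hand-made fragment shaped like Douban's comment markup (the fragment below is an illustration, not actual Douban HTML):

sample = '<div class="comment"><p>A short test comment</p></div>'
print(getComment(sample))  # ['A short test comment\n']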


import time
import random

#Crawl the comments page by page and save them to a file
def saveCommentText(fpath):
    pre_url = "https://book.douban.com/subject/1799652/comments?"
    # crawl depth: number of pages to fetch
    depth = 16

    with open(fpath, 'w', encoding='utf-8') as f:
        for i in range(depth):
            print('Start crawling comments on page {}...'.format(i + 1))
            url = pre_url + 'start=' + str(20 * i) + '&limit=20&sort=new_score&status=P'
            html = getHtml(url)
            f.writelines(getComment(html))
            # sleep for a random interval to reduce the risk of the IP being blocked
            time.sleep(1 + float(random.randint(1, 20)) / 20)
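
A single call starts the crawl; the output file name is just an example:

saveCommentText('comments.txt')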

Generate word cloud

The word cloud is generated with the wordcloud library.

You must also specify a background (mask) image and the path to a font file that contains Chinese glyphs, otherwise the Chinese characters cannot be displayed.

In addition, because Chinese text has no spaces between words, the comments must be segmented first; jieba is used for that.

Word segmentation

import codecs
import jieba

#Segment the comment text into words
def cutWords(fpath):
    text = ''
    with open(fpath, 'r', encoding='utf-8') as fin:
        for line in fin.readlines():
            line = line.strip('\n')
            text += ' '.join(jieba.cut(line))  # join the segmented words with spaces
            text += ' '
    with codecs.open('cut_word.txt', 'w', encoding='utf-8') as f:
        f.write(text)

    print("\nWord segmentation finished; the file was saved successfully!")

Creating the word cloud image

import matplotlib.pyplot as plt
from imageio import imread  # scipy.misc.imread was removed from SciPy; imageio's imread is a drop-in replacement
from wordcloud import WordCloud, ImageColorGenerator

#Draw the word cloud
def drawWordcloud():
    with codecs.open('cut_word.txt', encoding='utf-8') as f:
        comment_text = f.read()

    color_mask = imread("comment.jpeg")  # read the background (mask) picture
    # stop words to exclude (Chinese words in the original, shown here in translation)
    stopwords = [u'is', u'author', u'you', u'so', u'but', u'what', u'no',
                 u'this', u'that', u'everyone', u'compare', u'see', u'really',
                 u'except', u'when', u'already', u'can', u',', u'.']
    cloud = WordCloud(font_path="FZYTK.TTF",  # a font with Chinese glyphs; without it Chinese cannot be displayed
                      background_color='white',
                      max_words=200,
                      max_font_size=200,
                      min_font_size=4,
                      mask=color_mask,
                      stopwords=stopwords)
    word_cloud = cloud.generate(comment_text)  # generate the word cloud
    image_colors = ImageColorGenerator(color_mask)

    # the following code displays three figures
    # figure 1: the word cloud with its default coloring
    plt.imshow(cloud)
    plt.axis("off")

    # figure 2: the word cloud recolored with the colors of the background picture
    plt.figure()
    plt.imshow(cloud.recolor(color_func=image_colors))
    plt.axis("off")

    # figure 3: the background picture itself, in grayscale
    plt.figure()
    plt.imshow(color_mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()

    # save the picture
    word_cloud.to_file("comment_cloud.jpg")
    print('Word cloud image saved successfully')
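
Putting the pieces together, a minimal driver (using the same example file name as above) might look like this:

if __name__ == '__main__':
    fpath = 'comments.txt'
    saveCommentText(fpath)  # crawl the comments and save them to fpath
    cutWords(fpath)         # segment the words into cut_word.txt
    drawWordcloud()         # render and save the word cloud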

 

Result: the script displays the three figures and saves the word cloud image as comment_cloud.jpg.
