Python crawler example: generating a word cloud from Douban movie reviews

A simple Python crawler example. This article explains how to crawl pages, analyze the page structure, store the extracted information in MongoDB (if you don't have MongoDB you can write to a txt file instead; it doesn't affect the rest of the program), and use jieba to segment the reviews and build a word cloud (including how to fix the empty boxes that appear in Chinese word clouds).
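If you skip MongoDB, a minimal sketch of the txt fallback could look like this (the filename is just an example; username, rate and context are the values extracted in scrape() further below):

# hypothetical txt fallback in place of the MongoDB insert
with open('comments.txt', 'a', encoding='utf-8') as f:
    f.write('%s\t%s\t%s\n' % (username, rate, context))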

Packages used

import requests
# exception raised when the connection fails
from requests.exceptions import ConnectionError
# MongoDB driver
import pymongo
from pymongo.mongo_client import MongoClient
# exception raised when the MongoDB connection fails
from pymongo.errors import PyMongoError
# BeautifulSoup for parsing the pages
from bs4 import BeautifulSoup
# jieba for Chinese word segmentation
import jieba
# the wordcloud package; STOPWORDS is its default stop-word set
from wordcloud import WordCloud, STOPWORDS
# matplotlib to draw the word cloud
import matplotlib.pyplot as plt
import numpy as np
# PIL to load the mask image
from PIL import Image
# threading for the multi-threaded crawl
import threading

Install commands for the packages

For jieba, I initially used conda install -c conda-forge jieba, but then found that the jieba version on conda is below 0.40. On jieba's GitHub page I saw the new paddle mode, so I found it better to upgrade jieba with pip instead. If you also want to use paddle mode, install as follows:

# install jieba; the -i part downloads from the Tsinghua PyPI mirror
pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddlepaddle-tiny==1.6.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
# install wordcloud
pip install wordcloud -i https://pypi.tuna.tsinghua.edu.cn/simple

The other packages are common, so I won't list their install commands.
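As a quick sanity check that paddle mode is available after the upgrade (the sample sentence is only an illustration):

import jieba

jieba.enable_paddle()  # needs jieba >= 0.40 and paddlepaddle-tiny installed as above
print(' '.join(jieba.cut('这些电影的短评', use_paddle=True)))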

The main program consists of two parts: crawling the reviews and generating the word cloud.

1. Crawling the Douban reviews

1.1 Check whether requests to Douban succeed

The goal is to crawl the Douban short reviews of the movie Joker.

def scrape(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'
        }
        res = requests.get(url, headers=headers)
        print(res.text)
    except ConnectionError:
        print('crawl failed')


if __name__ == '__main__':
    urls = [
        'https://movie.douban.com/subject/27119724/comments?start=0&limit=20&sort=new_score&status=P',
    ]
    for i in range(0, len(urls)):
        scrape(urls[i])
    print('done')

Running this prints the page source; skimming through it, these really are the Joker reviews.

1.2 Analyze the page and decide what to extract

We will store three things in MongoDB: the username, the rating the user gave, and the text of the review.
The username and the rating both sit inside the span with class="comment-info": the username is the text of the only a tag in that span, and the rating is the title attribute of the span with class="rating".
The review text is in the span with class="short".
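To make the structure concrete, here is a minimal sketch. The class names ('comment-info', 'rating', 'short') are the ones observed on the page; the HTML snippet itself is illustrative, not copied from Douban:

from bs4 import BeautifulSoup

html = '''
<span class="comment-info">
    <a href="https://www.douban.com/people/xxx/">SomeUser</a>
    <span class="allstar50 rating" title="力荐"></span>
</span>
<span class="short">A short review of the film.</span>
'''
soup = BeautifulSoup(html, 'html.parser')
info = soup.find('span', attrs={'class': 'comment-info'})
print(info.a.string)                                       # username
print(info.find('span', attrs='rating').get('title'))      # rating from the title attribute
print(soup.find('span', attrs={'class': 'short'}).string)  # review text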

1.3 Crawl the information

# connect to MongoDB
def mongo():
    try:
        mongoClient = MongoClient('localhost', 27017)
        # connect to the database named local
        mongoDatabase = mongoClient.local
        # connect to the collection named douban_two
        collection = mongoDatabase.douban_two
        # return the collection
        return collection
    except PyMongoError as e:
        print('error', e)


# crawl the Douban short reviews
def scrape(url):
    try:
        # request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'
        }
        res = requests.get(url, headers=headers)
        # if the request was rejected, retry up to 5 times
        for i in range(0, 5):
            if res.status_code == 403:
                print('got 403')
                res = requests.get(url, headers=headers)
            else:
                break

        print('crawled %s, status %s' % (url, res.status_code))
        # parse with BeautifulSoup
        soup = BeautifulSoup(res.text, 'html.parser')
        # list of spans holding the username and rating, found by class
        username_list = soup.find_all('span', attrs={'class': 'comment-info'})
        # list of review texts, found by class=short
        content_list = soup.find_all('span', attrs={'class': 'short'})
        # the two lists line up one-to-one, so loop over their shared length
        for indexOf in range(0, len(username_list)):
            # find the rating span
            rate = username_list[indexOf].find('span', attrs='rating')
            # get the collection from the MongoDB helper
            collection = mongo()
            title = ''
            # believe it or not, one user on the start=220 page gave no rating, so guard against None here
            if rate is not None:
                # read the rating from the title attribute
                title = rate.get('title')
            # insert_one adds a record; username_list[indexOf].a.string is the text of the a tag at that position
            collection.insert_one(
                {'context': content_list[indexOf].string, 'username': username_list[indexOf].a.string, 'rate': title})
    except ConnectionError:
        print('crawl failed')


if __name__ == '__main__':
    urls = [
        'https://movie.douban.com/subject/27119724/comments?start=20&limit=20&sort=new_score&status=P',
        'https://movie.douban.com/subject/27119724/comments?start=40&limit=20&sort=new_score&status=P',
        'https://movie.douban.com/subject/27119724/comments?start=60&limit=20&sort=new_score&status=P',
        # leave this last page commented out for now; it needs a login to view.
        # The complete code at the end crawls it in a logged-in session.
        #'https://movie.douban.com/subject/27119724/comments?start=220&limit=20&sort=new_score&status=P'
    ]
    for i in range(0, len(urls)):
        scrape(urls[i])
    print('done')

The result:
For a visualization tool I use MongoDB Compass; its one shortcoming is that long text fields are not fully expanded, but otherwise it works well.
At this point the Douban reviews have been crawled and stored in MongoDB.
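A quick sanity check from the Python side, assuming the same local MongoDB instance and the mongo() helper above:

collection = mongo()
print(collection.count_documents({}))    # how many reviews were stored
for doc in collection.find().limit(3):   # peek at the first few records
    print(doc['username'], doc['rate'], doc['context'])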

2. Generate the word cloud from the reviews

Process: segment the reviews with jieba, then generate the word cloud from the segmented text.

Code: for simplicity, the version here takes just one review to generate the word cloud; the complete code at the end uses all 80 reviews.

# put this right after the imports; it enables jieba's paddle mode
jieba.enable_paddle()


# word segmentation
def getCutWord():
    # get the collection
    collection = mongo()
    # [0] takes only the first record
    douban = collection.find()[0]
    # get the review text
    context = douban['context']
    list_paddle = jieba.cut(context, use_paddle=True)
    # join() glues the segments into one space-separated string
    word = " ".join(list_paddle)
    return word


# generate the word cloud
def createWordCloud():
    # call the segmentation function
    text = getCutWord()
    print(text)
    # mask = np.array(Image.open('img5.jpg'))
    # the next two lines add '这些' (these) and '电影' (movie) to the stop words so the cloud ignores them
    word = {'这些', '电影'}
    # note: set.update() returns None, so build the stop-word set with union() instead
    stopwords = STOPWORDS.union(word)
    # white background; font_path must be set
    # this is a pitfall: wordcloud only handles Latin script out of the box, so Chinese needs an explicit font
    wordcloud = WordCloud(background_color='white', font_path="msyh.ttc", stopwords=stopwords, max_words=300)
    # to use a background shape, pass the mask instead:
    # wordcloud = WordCloud(background_color='white', font_path="msyh.ttc", mask=mask, stopwords=stopwords, max_words=300)
    # build the cloud from text
    wordcloud.generate(text=text)
    # display the cloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    # save the cloud as an image
    wordcloud.to_file('cloud_one.png')


if __name__ == '__main__':
    createWordCloud()
    print('done')

The generated image:
In the code above, the most important thing to note is setting font_path to a Chinese font.
If you don't set it, every word in the cloud renders as empty boxes.

How to get a font that displays Chinese:

Open C:\Windows\Fonts and search for Microsoft YaHei (微软雅黑) in the search box.
Copy the font into the project directory. When I pasted it, it came out as three files directly; I don't know whether yours will be the same.
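If you would rather not copy the font around, a hedged alternative is to fall back to the system copy (the path assumes a default Windows install):

import os

font_path = 'msyh.ttc'  # the copy in the project folder, if present
if not os.path.exists(font_path):
    font_path = r'C:\Windows\Fonts\msyh.ttc'  # fall back to the system font
# then pass font_path to WordCloud(font_path=font_path, ...)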
That wraps up crawling the Douban reviews and generating the word cloud.

Complete code

For the login, I followed another article on logging in to Douban with Python and crawling reviews.
Things to note: s = requests.Session() is shared between Login() and scrape(), so the cookies set at login are sent with the later requests; also fill in your own Douban account name and password.

import requests
from requests.exceptions import ConnectionError
import pymongo
from pymongo.mongo_client import MongoClient
from pymongo.errors import PyMongoError
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import threading

jieba.enable_paddle()
# create a Session object to carry the cookies
s = requests.Session()


# connect to MongoDB
def mongo():
    try:
        mongoClient = MongoClient('localhost', 27017)
        # connect to the database named local
        mongoDatabase = mongoClient.local
        # connect to the collection named douban_two
        collection = mongoDatabase.douban_two
        # return the collection
        return collection
    except PyMongoError as e:
        print('error', e)


# word segmentation
def getCutWord():
    # get the collection
    collection = mongo()
    # all records this time
    douban = collection.find()
    word_total = ''
    # loop over every stored review
    for row in douban:
        context = row['context']
        list_paddle = jieba.cut(context, use_paddle=True)
        word = " ".join(list_paddle)
        # append each of the 80 reviews to word_total
        word_total = word_total + ' ' + word
    return word_total


# generate the word cloud
def createWordCloud():
    # call the segmentation function
    text = getCutWord()
    print(text)
    # use this image as the shape of the cloud
    mask = np.array(Image.open('img5.jpg'))
    # the next two lines add '这些' (these) and '电影' (movie) to the stop words so the cloud ignores them
    word = {'这些', '电影'}
    # note: set.update() returns None, so build the stop-word set with union() instead
    stopwords = STOPWORDS.union(word)
    # white background; font_path must be set
    # this is a pitfall: wordcloud only handles Latin script out of the box, so Chinese needs an explicit font
    # wordcloud = WordCloud(background_color='white', font_path="msyh.ttc", stopwords=stopwords, max_words=300)
    # with a background shape, pass the mask:
    wordcloud = WordCloud(background_color='white', font_path="msyh.ttc", mask=mask, stopwords=stopwords, max_words=300)
    # build the cloud from text
    wordcloud.generate(text=text)
    # display the cloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    # save the cloud as an image
    wordcloud.to_file('cloud_one.png')


# send the login request
def Login():
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'
    }
    # fill in your own Douban username and password
    data = {
        'name': '******',
        'password': '*****',
        'remember': 'false'
    }
    try:
        re = s.post(url, headers=headers, data=data)
        print(re.status_code)
    except ConnectionError:
        print('login failed')


# crawl the Douban short reviews
def scrape(url):
    try:
        # request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'
        }
        res = s.get(url, headers=headers)
        # if the request was rejected, retry up to 5 times
        for i in range(0, 5):
            if res.status_code == 403:
                print('got 403')
                res = s.get(url, headers=headers)
            else:
                break

        print('crawled %s, status %s' % (url, res.status_code))
        # parse with BeautifulSoup
        soup = BeautifulSoup(res.text, 'html.parser')
        # list of spans holding the username and rating, found by class
        username_list = soup.find_all('span', attrs={'class': 'comment-info'})
        # list of review texts, found by class=short
        content_list = soup.find_all('span', attrs={'class': 'short'})
        # the two lists line up one-to-one, so loop over their shared length
        for indexOf in range(0, len(username_list)):
            # find the rating span
            rate = username_list[indexOf].find('span', attrs='rating')
            # get the collection from the MongoDB helper
            collection = mongo()
            title = ''
            # believe it or not, one user on the start=220 page gave no rating, so guard against None here
            if rate is not None:
                title = rate.get('title')
            # insert_one adds a record; username_list[indexOf].a.string is the text of the a tag at that position
            collection.insert_one(
                {'context': content_list[indexOf].string, 'username': username_list[indexOf].a.string, 'rate': title})
    except ConnectionError:
        print('crawl failed')


# thread wrapper that runs scrape on one URL
class MyThread(threading.Thread):
    def __init__(self, target, args):
        threading.Thread.__init__(self)
        self.target = target
        self.args = args

    def run(self):
        self.target(self.args)


if __name__ == '__main__':
    # log in first
    Login()
    urls = [
        'https://movie.douban.com/subject/27119724/comments?start=20&limit=20&sort=new_score&status=P',
        'https://movie.douban.com/subject/27119724/comments?start=40&limit=20&sort=new_score&status=P',
        'https://movie.douban.com/subject/27119724/comments?start=60&limit=20&sort=new_score&status=P',
        'https://movie.douban.com/subject/27119724/comments?start=220&limit=20&sort=new_score&status=P'
    ]
    # for i in range(0, len(urls)):
    #     scrape(urls[i])
    # crawl the pages concurrently with threads
    thread_list = []
    for i in range(0, len(urls)):
        thread = MyThread(target=scrape, args=urls[i])
        thread_list.append(thread)
    for i in range(0, len(urls)):
        thread_list[i].start()
    for i in range(0, len(urls)):
        thread_list[i].join()
    createWordCloud()
    print('done')
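A side note on the MyThread wrapper: it only exists so that a single URL can be passed without wrapping it in a tuple. The standard threading.Thread accepts target and args directly, so an equivalent sketch with the stock class would be:

thread_list = [threading.Thread(target=scrape, args=(url,)) for url in urls]
for t in thread_list:
    t.start()
for t in thread_list:
    t.join()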

For how the login request was analyzed, see the article mentioned above.
Running the complete code produces the final word cloud, this time shaped by the img5.jpg mask.
