Word segmentation for hotel reviews

The data source, the ChnSentiCorp hotel review corpus, can be downloaded here.

Front-end visualization

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Word Cloud</title>
</head>
<body>
<div class="btn-group">
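    <!-- 正面 = positive reviews, 负面 = negative reviews; the Chinese labels are kept because they are also sent to the backend as the type parameter and used as folder names there -->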
    <button onclick="loadAndShowWordcloud('正面')">正面</button>
    <button onclick="loadAndShowWordcloud('负面')">负面</button>
</div>
<div id="main" style="width: 500px;height: 500px;"></div>
</body>
</html>
<script src="js/echarts.min.js"></script>
<script src="js/echarts-wordcloud.min.js"></script>
<script>
    function loadAndShowWordcloud(type = '正面') {
        var chart = echarts.init(document.getElementById('main'));
        // 1. Create the request object
        var request = new XMLHttpRequest();
        // 2. Set the request URL; the type parameter selects positive ('正面') or negative ('负面') reviews
        request.open('get', '/wordcloud?type=' + encodeURIComponent(type));
        // 3. Send the request
        request.send();
        // 4. Wait for the response, then render the word cloud
        request.onreadystatechange = function () {
            if (request.readyState === 4 && request.status === 200) {
                chart.setOption({
                    series: [{
                        type: 'wordCloud',
                        shape: 'circle',
                        keepAspect: false,
                        left: 'center',
                        top: 'center',
                        width: '100%',
                        height: '100%',
                        // sizeRange: [12, 60],
                        // rotationRange: [-90, 90],
                        // rotationStep: 45,
                        // drawOutOfBound: false,
                        //  shrinkToFit: false,

                        textStyle: {
                            fontFamily: 'sans-serif',
                            fontWeight: 'bold',
                            // Color can be a callback function or a color string
                            color: function () {
                                // Random color
                                return 'rgb(' + [
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160)
                                ].join(',') + ')';
                            }
                        },
                        // Data is an array. Each array item must have name and value property.
                        data: JSON.parse(request.responseText)
                    }]
                });
            }
        }
    }

    // render the positive ('正面') word cloud by default on page load
    loadAndShowWordcloud();
</script>
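As the comment in the option above notes, the wordCloud series expects data to be an array of objects, each with a name and a value property, so the /wordcloud endpoint must return JSON in that shape. An illustrative response (the words and counts below are made up):

[
    {"name": "位置", "value": 120},
    {"name": "服务", "value": 95},
    {"name": "早餐", "value": 60}
]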

Backend processing

First, build an entity class Word.

Note: to use annotations such as @Data, first add the Lombok dependency (org.projectlombok:lombok) to the project.

package com.example.springbootdemo1.entity;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.io.Serializable;

@Data
@NoArgsConstructor
@AllArgsConstructor
public  class Word implements Serializable {

    private String name;
    private int value;

}
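With @Data, @NoArgsConstructor, and @AllArgsConstructor, Lombok generates the getters, setters, equals/hashCode/toString, and both constructors at compile time, so no boilerplate has to be written by hand. A small, hypothetical check of the generated API (the sample word and count are made up):

        // somewhere in a test or main method
        Word word = new Word("位置", 12);     // all-args constructor from @AllArgsConstructor
        word.setValue(word.getValue() + 1);   // getter/setter generated by @Data
        System.out.println(word);             // toString() from @Data, e.g. Word(name=位置, value=13)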
Then build the controller that reads the reviews, segments them, and returns the word counts:

package com.example.springbootdemo1.contorller;
import com.example.springbootdemo1.entity.Word;
import com.example.springbootdemo1.util.StopWordUtil;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
@Slf4j
@RestController
public class WordCloudContorller {
    @GetMapping("/wordcloud")
    public List<Word> wordcloud(String type) throws IOException, ClassNotFoundException {
        long l = System.currentTimeMillis(); // record the start time
        // reject an empty request before any file access (type is also used as a directory name below)
        if (type == null || type.isEmpty()) {
            return new ArrayList<>(0);
        }
        //=======================================================
        // Check whether a cached result already exists; if so, return it directly
        //=======================================================

        File path = new File("D:\\word\\");
        File file1 = new File(path, type);
        // create the cache directory if it does not exist
        if (!path.exists()) {
            path.mkdirs();
        }
        // if the cache file already exists, read it directly
        long s = System.currentTimeMillis();
        if (file1.exists()) {
            // read the cached result from disk
            ObjectInputStream objectInputStream = new ObjectInputStream(new FileInputStream(file1));
            LinkedList<Word> words = (LinkedList<Word>) objectInputStream.readObject();
            if (words != null) {
                objectInputStream.close();
                long en = System.currentTimeMillis();
                // time (ms) spent reading the local cache
                System.out.println(en - s);
                return words;
            }
        }
        //=========================================================
        // No cached result: read the corpus, segment it, and count word frequencies
        //=========================================================
        File file = new File("C:\\Users\\luofei\\Desktop\\自然语言处理-张老师\\ChnSentiCorp情感分析酒店评论\\" + type);
        File[] files = file.listFiles(File::isFile);
        StringBuilder text = new StringBuilder(); // not thread-safe, but faster than StringBuffer
        assert files != null;
        for (File reviewFile : files) {
            try {
                BufferedReader bufferedReader = new BufferedReader(new FileReader(reviewFile));
                String line;
                // append every line of the review file
                while ((line = bufferedReader.readLine()) != null) {
                    text.append(line);
                }
                bufferedReader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // segment the text with HanLP
        List<Term> segment = HanLP.segment(text.toString());
        // keep only terms whose part of speech is noun ("n")
        List<Term> termList = segment.stream()
                .filter(term -> term.nature.toString().equalsIgnoreCase("n"))
                .collect(Collectors.toList());
        // extract the word strings into a List<String>
        List<String> collect = termList.stream().map(term -> term.word).collect(Collectors.toList());


        // StopWordUtil.loadStopWord loads the stop-word list (it checks that the file exists and is not empty)
        List<String> stopWord = StopWordUtil.loadStopWord("C:\\Users\\luofei\\Desktop\\自然语言处理-张老师\\停留词.txt");
        // remove stop words from the noun list
        collect.removeAll(stopWord);

        // count how many times each remaining word occurs
        LinkedList<Word> words = new LinkedList<>();
        for (String w : collect) {
            // look for an existing entry with the same name
            boolean exists = false;
            Word existing = null;
            for (Word value : words) {
                if (value.getName().equals(w)) {
                    exists = true;
                    existing = value;
                    break;
                }
            }
            if (exists) {
                existing.setValue(existing.getValue() + 1);
            } else {
                words.add(new Word(w, 1));
            }
        }
        //========================================================
        // Serialize the result and save it to the cache file
        //========================================================

        ObjectOutputStream objectOutputStream = new ObjectOutputStream(new FileOutputStream(file1));
        objectOutputStream.writeObject(words);
        // flush the buffer
        objectOutputStream.flush();
        objectOutputStream.close();
        // record the end time
        long end = System.currentTimeMillis();
        // total time (ms) spent building the word counts
        System.out.println(end - l);
        return words;
    }
}
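The StopWordUtil class is not shown in the original post. Below is a minimal sketch of what its loadStopWord method could look like, assuming the stop-word file contains one word per line in UTF-8; the class and method names simply mirror how the controller calls them.

package com.example.springbootdemo1.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class StopWordUtil {

    // Load the stop-word list; returns an empty list if the file is missing or empty.
    public static List<String> loadStopWord(String path) throws IOException {
        List<String> stopWords = new ArrayList<>();
        File file = new File(path);
        if (!file.exists() || file.length() == 0) {
            return stopWords;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty()) {
                    stopWords.add(line);
                }
            }
        }
        return stopWords;
    }
}

As a side note, the nested loop in the controller counts frequencies in O(n²). For a larger corpus a HashMap-based count is simpler and faster; a sketch of the equivalent logic over the same collect list (it additionally needs java.util.Map and java.util.HashMap imports):

        Map<String, Integer> counts = new HashMap<>();
        for (String w : collect) {
            counts.merge(w, 1, Integer::sum); // increment the count for this word
        }
        LinkedList<Word> words = counts.entrySet().stream()
                .map(e -> new Word(e.getKey(), e.getValue()))
                .collect(Collectors.toCollection(LinkedList::new));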

Show results

 

Note: this is just a classroom exercise.

Origin blog.csdn.net/qq_62249633/article/details/130864474