package com.nlp.HanFenci.Fenci;
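/*
 * This program segments each line of a long menu text into individual words, e.g.
 * [龙眼/n, 大肠/n], and then counts how often each word occurs. Only the word itself is
 * counted; the tag that follows the "/" in the segmenter's printed output is discarded.
 * Techniques used: file input and output, a HashMap for de-duplicating words and updating
 * their counts, plus basic string and list handling.
 */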
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
/**
 * Segments a text file line by line with HanLP and writes two result files: the segmented
 * text and a word-frequency table.
 *
 * <p>For every line of {@link #TRAIN_FILE}, the words returned by
 * {@link HanLP#segment(String)} are written to {@link #SEGMENTED_FILE} as a single line in
 * which each word is preceded by a tab. Once the whole input has been read, one
 * {@code word<TAB>count} line per distinct word is written to {@link #FREQUENCY_FILE}.
 */
public class Fenci
{
    /** Path of the input file; every line is segmented independently. */
    private static final String TRAIN_FILE = "1.txt";

    /** Path of the output file that receives the segmented lines. */
    private static final String SEGMENTED_FILE = "分词后文件.txt";

    /** Path of the output file that receives the word-frequency table. */
    private static final String FREQUENCY_FILE = "词频统计文件.txt";

    /**
     * Runs the segmentation and frequency count over {@link #TRAIN_FILE}.
     *
     * @param args ignored
     * @throws IOException if the input file cannot be read or an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Word -> number of occurrences across the whole input file.
        HashMap<String, Integer> wordCounts = new HashMap<String, Integer>();

        // try-with-resources closes (and therefore flushes) all three files even when
        // reading, segmenting or writing fails part-way through.
        // NOTE(review): FileReader/FileWriter use the platform default charset on JDKs
        // before 18 — confirm that matches the encoding of the input file.
        try (LineNumberReader reader = new LineNumberReader(new FileReader(TRAIN_FILE));
             BufferedWriter segmentedWriter = new BufferedWriter(new FileWriter(SEGMENTED_FILE));
             BufferedWriter frequencyWriter = new BufferedWriter(new FileWriter(FREQUENCY_FILE))) {

            String line;
            while ((line = reader.readLine()) != null) {
                // Segmentation result prints like [龙眼/n, 大肠/n].
                List<Term> terms = HanLP.segment(line);
                System.out.println(terms);

                StringBuilder segmentedLine = new StringBuilder();
                for (Term term : terms) {
                    // Take the word from the term itself rather than parsing its printed
                    // form: splitting toString() on "/" breaks for words containing "/".
                    String word = term.word;
                    segmentedLine.append('\t').append(word);

                    Integer previous = wordCounts.get(word);
                    wordCounts.put(word, previous == null ? 1 : previous + 1);
                }

                // One output line per input line, words separated by leading tabs.
                segmentedWriter.write(segmentedLine.append('\n').toString());
            }

            // Dump the frequency table; order follows HashMap iteration order.
            for (String word : wordCounts.keySet()) {
                frequencyWriter.write(word + "\t" + wordCounts.get(word) + "\n");
            }
        }
    }
}
/*
Hanlp 分词代码
猜你喜欢
转载自blog.csdn.net/qiuyushuofeng/article/details/80991960
今日推荐
周排行
*/