HanLP Word Segmentation Code

package com.nlp.HanFenci.Fenci;

/*
 * This program splits long menu entries into individual words, e.g. [龙眼/n, 大肠/n],
 * then counts word frequencies (each segmented term is split on "/" and only the
 * word before the slash is kept). Techniques involved: reading and writing files,
 * de-duplicating and updating entries in a HashMap, string arrays, and lists
 * (used here much like arrays).
 */

import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.HashMap;
import java.util.List;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

/**
 * Segments each line of an input file with HanLP, writes the segmented
 * lines to one output file and the word-frequency counts to another.
 */
public class Fenci {
	private static final String trainFile = "1.txt"; // input: one menu entry per line
	private static final String trainpplFile = "分词后文件.txt"; // output: segmented lines
	private static final String ppl_fre_File = "词频统计文件.txt"; // output: word-frequency counts
   
	public static void main(String[] args) throws IOException {
		
		// Reader that walks the input file line by line
		LineNumberReader lineNumberReader = new LineNumberReader(new FileReader(trainFile));
		// Buffered writers to speed up character output
		BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(trainpplFile));
		BufferedWriter bufferedWriter2 = new BufferedWriter(new FileWriter(ppl_fre_File));
		
		String readline = null;
		
		HashMap<String, Integer> pplHashMap = new HashMap<>();
		// Process the input file line by line
		while ((readline = lineNumberReader.readLine()) != null) {
			// Segment the line; the result looks like [龙眼/n, 大肠/n]
			List<Term> termList = HanLP.segment(readline);
			System.out.println(termList); // show the segmentation result on the console

			String writeLine = "";
			for (Term term : termList) {
				// Term.toString() yields "word/nature", e.g. 青椒/n;
				// split on "/" and keep only the word in front
				String[] termSplit = term.toString().split("/");
				writeLine += "\t" + termSplit[0];
				// First occurrence: insert with count 1; otherwise increment the count
				if (!pplHashMap.containsKey(termSplit[0])) {
					pplHashMap.put(termSplit[0], 1);
				} else {
					int tmp_count = pplHashMap.get(termSplit[0]);
					pplHashMap.put(termSplit[0], tmp_count + 1);
				}
			}
			// Write the segmented line to the output file
			bufferedWriter.write(writeLine + "\n");
			bufferedWriter.flush();
		}
		
		// Write the word-frequency counts to the second output file
		for (String key : pplHashMap.keySet()) {
			bufferedWriter2.write(key + "\t" + pplHashMap.get(key) + "\n");
		}
		
		lineNumberReader.close();
		bufferedWriter.close();
		bufferedWriter2.close();
    }
	
}
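
The same segment-and-count idea can be written more compactly: in HanLP 1.x, Term exposes the word directly through its public word field, so no split on "/" is needed, and Map.merge replaces the containsKey/put branch. A minimal sketch under those assumptions; the class name FenciDemo and the sample string are illustrative only:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;

public class FenciDemo {
	public static void main(String[] args) {
		// Segment a sample menu entry; prints something like [龙眼/n, 大肠/n]
		List<Term> termList = HanLP.segment("龙眼大肠"); // sample input, illustrative only
		System.out.println(termList);

		// Term.word already holds just the word part, so no string split is needed;
		// merge() inserts 1 on first sight and adds 1 on every repeat
		Map<String, Integer> counts = new HashMap<>();
		for (Term term : termList) {
			counts.merge(term.word, 1, Integer::sum);
		}
		System.out.println(counts);
	}
}

Map.merge does in one call what the containsKey/put branch above spells out by hand, and reading Term.word sidesteps the edge case where a word itself contains a "/".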

Reprinted from blog.csdn.net/qiuyushuofeng/article/details/80991960