// Required Apache OpenNLP jar — download from: https://opennlp.apache.org/cgi-bin/download.cgi
// Every other class used below is part of the Java standard library.
package com.npl.demo.utils;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
/**
* Filename: NlpTokenization.java
* Description:
* Copyright: Copyright (c) 2019 All Rights Reserved.
* @author: wangk
* @version: 1.0
* Create at: 2019年5月5日 下午4:28:56
*
* Modification History:
* Date Author Version Description
* ------------------------------------------------------------------
* 2019年5月5日 wangk 1.0 1.0 Version
*
*/
public class NlpTokenization {

    /** Sample English paragraph used by the word-boundary demos. */
    static String paragraph = "Let's The first sentence. The second sentence. Let's ";

    /** Sample sentences; the typos ("agood", "stille") are part of the demo data. */
    static String[] sentences = {
            "Tim was agood neighbor. Perhaps not as good Bob "+
            "Haywood, but stille pretty good. Of course Mr. Adam "+
            "took the cake!"
    };

    // Chinese text has no spaces between words. Inserting a space after every
    // character — text.replaceAll("(.{1})", "$1 ") — makes it splittable by the
    // whitespace-based tokenizers below; see openNlpSimpleTokenizer.
    static String chineseLanguage = "时代的碰撞|中国古典民乐与流行的相遇";

    public static void main(String[] args) {
        NlpTokenization to = new NlpTokenization();
        //to.scanner(paragraph);
        //to.split(chineseLanguage);
        //to.breakIterator(paragraph);
        //to.streamTokenizer(paragraph);
        //to.stringTokenizer(chineseLanguage);
        //to.textSplit(); // benchmark of the pure-Java splitting approaches
        to.openNlpSimpleTokenizer(chineseLanguage);
    }

    /**
     * Tokenizes {@code text} with {@link Scanner}, using space, comma and period
     * as delimiters, printing each token.
     *
     * @param text the text to tokenize
     * @return the tokens in order (the original returned {@code null})
     */
    public List<String> scanner(String text) {
        List<String> tokens = new ArrayList<>();
        // try-with-resources: the original leaked the Scanner.
        try (Scanner scanner = new Scanner(text)) {
            // Regex character class: delimit on space, comma or period.
            // (scanner.reset() would restore the default whitespace delimiter.)
            scanner.useDelimiter("[ ,.]");
            while (scanner.hasNext()) {
                tokens.add(scanner.next());
            }
        }
        for (String token : tokens) {
            System.out.println(token);
        }
        return tokens;
    }

    /**
     * Tokenizes {@code text} on runs of whitespace via {@link String#split},
     * printing each token.
     *
     * @param text the text to tokenize
     * @return the tokens in order (the original returned {@code null})
     */
    public List<String> split(String text) {
        List<String> tokens = Arrays.asList(text.split("\\s+"));
        for (String token : tokens) {
            System.out.println(token);
        }
        return tokens;
    }

    /**
     * Walks word boundaries with {@link BreakIterator} (default locale),
     * printing each boundary pair and the word between them. Note that
     * punctuation and whitespace are themselves boundary-delimited "words".
     *
     * @param text the text to scan
     * @return the boundary-delimited substrings (the original returned {@code null})
     */
    public List<String> breakIterator(String text) {
        List<String> tokens = new ArrayList<>();
        BreakIterator wordIterator = BreakIterator.getWordInstance();
        wordIterator.setText(text);
        int boundary = wordIterator.first();
        while (boundary != BreakIterator.DONE) { // DONE marks the last boundary
            int begin = boundary;
            System.out.print(boundary + "-");
            boundary = wordIterator.next();
            int end = boundary;
            if (end == BreakIterator.DONE) break;
            String token = text.substring(begin, end);
            System.out.println(boundary + "[" + token + "]");
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Tokenizes {@code text} with {@link StreamTokenizer} (usually built over a
     * file Reader; here a StringReader), printing each token. Numbers are
     * parsed as doubles; quote characters are demoted to ordinary characters so
     * an unmatched apostrophe does not swallow the rest of the input.
     *
     * @param text the text to tokenize
     * @return the tokens in order, numbers rendered via String.valueOf
     *         (the original returned {@code null})
     */
    public List<String> streamTokenizer(String text) {
        List<String> tokens = new ArrayList<>();
        StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(text));
        // By default ' and " introduce quoted strings; with no closing quote the
        // remainder of the input would be discarded. Treat them as ordinary chars.
        tokenizer.ordinaryChar('\'');
        tokenizer.ordinaryChar(',');
        try {
            // The original caught IOException inside the loop without setting the
            // exit flag, which would spin forever on a persistent error.
            boolean isEOF = false;
            while (!isEOF) {
                int token = tokenizer.nextToken(); // token type constant
                switch (token) {
                    case StreamTokenizer.TT_EOF:    // end of stream
                        isEOF = true;
                        break;
                    case StreamTokenizer.TT_EOL:    // end of line
                        break;
                    case StreamTokenizer.TT_NUMBER: // numeric token in nval
                        System.out.println(tokenizer.nval);
                        tokens.add(String.valueOf(tokenizer.nval));
                        break;
                    case StreamTokenizer.TT_WORD:   // word token in sval
                        System.out.println(tokenizer.sval);
                        tokens.add(tokenizer.sval);
                        break;
                    default:                        // single ordinary character
                        System.out.println((char) token);
                        tokens.add(String.valueOf((char) token));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return tokens;
    }

    /**
     * Tokenizes {@code text} on whitespace with {@link StringTokenizer}
     * (accepts a String from any source), printing each token.
     *
     * @param text the text to tokenize
     * @return the tokens in order (the original returned {@code null})
     */
    public List<String> stringTokenizer(String text) {
        List<String> tokens = new ArrayList<>();
        StringTokenizer st = new StringTokenizer(text);
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            System.out.println(token);
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Micro-benchmark comparing three pure-Java splitting strategies
     * (StringTokenizer, precompiled Pattern.split, manual indexOf loop) over a
     * 60-token sample, averaged across {@code runs} iterations, repeated five
     * times so the JIT warms up.
     */
    public void textSplit() {
        StringBuilder sb = new StringBuilder();
        for (int i = 100000; i < 100000 + 60; i++)
            sb.append(i).append(' ');
        String sample = sb.toString();
        int runs = 100000;
        for (int i = 0; i < 5; i++) {
            {
                long start = System.nanoTime();
                for (int r = 0; r < runs; r++) {
                    StringTokenizer st = new StringTokenizer(sample);
                    List<String> list = new ArrayList<String>();
                    while (st.hasMoreTokens())
                        list.add(st.nextToken());
                }
                long time = System.nanoTime() - start;
                System.out.printf("StringTokenizer took an average of %.1f us%n", time / runs / 1000.0);
            }
            {
                long start = System.nanoTime();
                // Compile once outside the timing loop — intentional: this is
                // the realistic use of Pattern.split.
                Pattern spacePattern = Pattern.compile(" ");
                for (int r = 0; r < runs; r++) {
                    List<String> list = Arrays.asList(spacePattern.split(sample, 0));
                }
                long time = System.nanoTime() - start;
                System.out.printf("Pattern.split took an average of %.1f us%n", time / runs / 1000.0);
            }
            {
                long start = System.nanoTime();
                for (int r = 0; r < runs; r++) {
                    List<String> list = new ArrayList<String>();
                    int pos = 0, end;
                    while ((end = sample.indexOf(' ', pos)) >= 0) {
                        list.add(sample.substring(pos, end));
                        pos = end + 1;
                    }
                }
                long time = System.nanoTime() - start;
                System.out.printf("indexOf loop took an average of %.1f us%n", time / runs / 1000.0);
            }
        }
    }

    /**
     * Tokenizes with OpenNLP's {@code SimpleTokenizer}; English punctuation
     * becomes a separate token. Because the sample input is Chinese (no word
     * separators), a space is first inserted after every character so each
     * character is tokenized individually.
     *
     * @param text the text to tokenize
     * @return the tokens in order (the original returned {@code null})
     */
    public List<String> openNlpSimpleTokenizer(String text) {
        SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
        String regex = "(.{1})";
        text = text.replaceAll(regex, "$1 "); // one space after every character
        String[] tokens = simpleTokenizer.tokenize(text);
        for (String token : tokens) {
            System.out.println(token);
        }
        return Arrays.asList(tokens);
    }

    /**
     * Tokenizes with OpenNLP's {@code WhitespaceTokenizer}, which splits on
     * whitespace only.
     *
     * @param text the text to tokenize
     * @return the tokens in order (the original returned {@code null})
     */
    public List<String> openNlpWhitespaceTokenizer(String text) {
        WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
        String[] tokens = whitespaceTokenizer.tokenize(text);
        for (String token : tokens) {
            System.out.println(token);
        }
        return Arrays.asList(tokens);
    }
}