Table of Contents
Part-of-Speech Tagger
The part-of-speech tagger labels each word with its word type, based on the word itself and the context in which it appears. The OpenNLP POS tagger uses a probability model to predict the correct tag for each word out of the tag set. To limit the possible tags for a word, a tag dictionary can be used, which improves both the tagging and the runtime performance of the tagger.
Common part of speech abbreviation reference:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
The model can be trained on a labeled corpus in the following format: one sentence per line, where each word is joined to its part-of-speech tag with an underscore "_" and the word_tag tokens are separated by spaces.
About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
That_DT sounds_VBZ good_JJ ._.
Model training
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.postag.POSEvaluator;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.postag.WordTagSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
 * Trains an OpenNLP part-of-speech model from a word_tag corpus, saves it to disk,
 * and prints the word accuracy of the trained model.
 *
 * <p>Input: {@code <user.dir>/resources/part-of-speech-taggin.txt}, read as UTF-8, one
 * sentence per line.<br>
 * Output: {@code <user.dir>/opennlpmodel/en-pos-maxent-my.bin}.
 */
public class PartOfSpeechTaggingTrain {

    /**
     * Runs training, serialization, and evaluation in sequence.
     *
     * @param args unused
     * @throws IOException if the corpus cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Path of the training data
        String filePath = fileResourcesDir + "part-of-speech-taggin.txt";
        // Path where the trained model is saved
        String modelPath = modelResourcesDir + "en-pos-maxent-my.bin";

        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));

        // try-with-resources closes the corpus streams even if training or evaluation fails.
        try (ObjectStream<String> lineStream =
                     new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);
             // Parse every "word_tag word_tag ..." line into a POSSample
             ObjectStream<POSSample> sampleStream = new WordTagSampleStream(lineStream)) {

            // Train the model
            POSTaggerFactory factory = new POSTaggerFactory();
            POSModel model = POSTaggerME.train("en", sampleStream, TrainingParameters.defaultParams(), factory);

            // Save the model; closing the stream flushes the buffer and releases the file handle.
            try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelPath)))) {
                model.serialize(modelOut);
            }

            // Training has consumed the sample stream; rewind it, otherwise the
            // evaluator has no samples to score.
            sampleStream.reset();

            // Evaluate the model.
            // NOTE(review): this scores the model on its own training data, so the
            // figure is optimistic; use a held-out corpus for a real estimate.
            POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model), null);
            evaluator.evaluate(sampleStream);
            double result = evaluator.getWordAccuracy();
            System.out.println("正确标记的次数:" + result);
        }
    }
}
Part-of-speech tagging
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
/**
 * Loads a serialized OpenNLP part-of-speech model and tags one pre-tokenized
 * example sentence, printing the predicted tags and their probabilities.
 *
 * <p>Model file: {@code <user.dir>/opennlpmodel/en-pos-maxent.bin}.
 * NOTE(review): the training example in this file writes {@code en-pos-maxent-my.bin};
 * presumably this class is meant to load the stock pre-trained model — confirm.
 */
public class PartOfSpeechTaggingPredit {

    /**
     * Tags a fixed example sentence and prints the results to standard output.
     *
     * @param args unused
     * @throws IOException if the model file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        String modelPath = modelResourcesDir + "en-pos-maxent.bin";

        // Load the model; the input stream is closed as soon as the model has been read.
        POSModel model;
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            model = new POSModel(modelIn);
        }

        // Instantiate the tagger
        POSTaggerME tagger = new POSTaggerME(model);

        // Tag the tokens; the result holds one tag per input token
        String[] sent = {"Most", "large", "cities", "in", "the", "US", "had",
                "morning", "and", "afternoon", "newspapers", "."};
        String[] tags = tagger.tag(sent);
        for (String tag : tags) {
            System.out.print(tag + ",");
        }
        System.out.println();

        // Probabilities of the tags assigned by the preceding tag() call
        double[] probs = tagger.probs();
        for (double prob : probs) {
            System.out.print(prob + ",");
        }
    }
}