[NLP] Use of OpenNLP part-of-speech tagger

Table of Contents

 

Part-of-Speech Tagger

Model training

Part-of-speech tagging


Part-of-Speech Tagger

The part-of-speech (POS) tagger labels each token with its word type, based on the token itself and the context in which it appears. The OpenNLP POS tagger uses a probability model to predict the most likely tag from the tag set. To restrict the tags that may be assigned to a given token, a tag dictionary can be supplied, which improves both the tagging accuracy and the runtime performance of the tagger.

Reference for common part-of-speech tag abbreviations (Penn Treebank tag set):

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

The model can be trained on a labeled corpus in the following format: one sentence per line, each word joined to its part-of-speech tag with an underscore "_", and the word_tag pairs separated from one another by spaces.

 About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
That_DT sounds_VBZ good_JJ ._.

 

Model training

import java.io.BufferedOutputStream;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStream;

import java.nio.charset.StandardCharsets;

import opennlp.tools.postag.POSEvaluator;

import opennlp.tools.postag.POSModel;

import opennlp.tools.postag.POSSample;

import opennlp.tools.postag.POSTaggerFactory;

import opennlp.tools.postag.POSTaggerME;

import opennlp.tools.postag.WordTagSampleStream;

import opennlp.tools.util.InputStreamFactory;

import opennlp.tools.util.MarkableFileInputStreamFactory;

import opennlp.tools.util.ObjectStream;

import opennlp.tools.util.PlainTextByLineStream;

import opennlp.tools.util.TrainingParameters;



public class PartOfSpeechTaggingTrain {



    public static void main(String[] args) throws IOException {

       // TODO Auto-generated method stub

       String rootDir = System.getProperty("user.dir") + File.separator;

      

       String fileResourcesDir = rootDir + "resources" + File.separator;

       String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;

      

       //训练数据的路径

        String filePath = fileResourcesDir + "part-of-speech-taggin.txt";

       //训练后模型的保存路径

        String modelPath = modelResourcesDir + "en-pos-maxent-my.bin";

          

       //按行读取数据

       InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));

       ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);

      

       //按行读取数据

       ObjectStream<POSSample> sampleStream = new WordTagSampleStream(lineStream);

       POSTaggerFactory factory =new POSTaggerFactory();



       //训练模型

        POSModel  model =POSTaggerME.train("en",sampleStream,  TrainingParameters.defaultParams(),  factory);

        

       //保存模型

       FileOutputStream fos=new FileOutputStream(new File(modelPath));

        OutputStream modelOut = new BufferedOutputStream(fos);

        model.serialize(modelOut);

        

        //评估模型

        POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model),null);

        evaluator.evaluate(sampleStream);



        Double result = evaluator.getWordAccuracy();

        System.out.println("正确标记的次数:"+result.toString());

    }

}

Part-of-speech tagging

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStream;

import opennlp.tools.postag.POSModel;

import opennlp.tools.postag.POSTaggerME;

public class PartOfSpeechTaggingPredit {



    public static void main(String[] args) throws IOException {

       // TODO Auto-generated method stub

       String rootDir = System.getProperty("user.dir") + File.separator;

      

       String fileResourcesDir = rootDir + "resources" + File.separator;

       String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;

      

       //String filePath = fileResourcesDir + "sentenceDetector.txt";

       String modelPath = modelResourcesDir + "en-pos-maxent.bin";



       InputStream modelIn = new FileInputStream(modelPath) ;

       //加载模型

       POSModel model = new POSModel(modelIn);

       //实例化模型

       POSTaggerME tagger = new POSTaggerME(model);

        

       //词性检测,返回的是一个概率数组

       String sent[] = new String[]{"Most", "large", "cities", "in", "the", "US", "had","morning", "and", "afternoon", "newspapers", "."};      

        String tags[] = tagger.tag(sent);

       

        for(String str:tags){

        System.out.print(str+",");

        }

        System.out.println();

       

        //返回概率

        double probs[] = tagger.probs();

        for(double str:probs){

        System.out.print(str+",");

        }

    }

}

 

Guess you like

Origin blog.csdn.net/henku449141932/article/details/111319424