1、自定义一个类继承TokenFilter
import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; public class MyTokenFilter extends TokenFilter { //TokenFilter的构造方法是protected类型,所以子类一定要继承重写 protected MyTokenFilter(TokenStream input) { //采用责任链模式传递TokenStream对象,交由TokenFilter处理 super(input); } @Override public boolean incrementToken() throws IOException { while(input.incrementToken()){ CharTermAttribute cta = input.getAttribute(CharTermAttribute.class); System.out.println(cta.toString()); if("江西".equals(cta.toString())){ //根据分词信息作业务逻辑处理 System.out.println("--------"+cta.toString()+"----------"); } } return false; } }
2、自定义一个类继承Analyzer类,在其tokenStream方法中先用MMSegTokenizer分词,再交给自定义的TokenFilter类处理
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer that segments text with an mmseg4j {@link MaxWordSeg} tokenizer
 * and passes the resulting tokens through {@link MyTokenFilter}.
 */
public class MyAnalyzer extends Analyzer {

    /** Directory the mmseg4j dictionary is loaded from (machine-specific absolute path). */
    private static final String DICTIONARY_DIR = "G:\\c参考资料\\lucene\\mmseg4j-1.8.5\\data";

    /**
     * Builds the token stream for the given reader.
     *
     * @param str    field name supplied by the caller; not used by this method
     * @param reader source of the text to segment
     * @return a {@link MyTokenFilter} wrapping an {@link MMSegTokenizer} over {@code reader}
     */
    @Override
    public TokenStream tokenStream(String str, Reader reader) {
        MaxWordSeg segmenter = new MaxWordSeg(Dictionary.getInstance(DICTIONARY_DIR));
        MMSegTokenizer tokenizer = new MMSegTokenizer(segmenter, reader);
        return new MyTokenFilter(tokenizer);
    }
}
3、编写一个测试类,直接调用自定义的Analyzer
public class TestAnalyzer { @Test public void test00() { Analyzer a1 = new MyAnalyzer(); TokenStream tokenStream = a1.tokenStream("anyThing", new StringReader("中华人民共和国江西省南昌市")); try { while(tokenStream.incrementToken()){ //根据分词信息作业务逻辑处理 } } catch (IOException e) { e.printStackTrace(); } } }