Learning Lucene's TokenFilter class

1. Define a custom class that extends TokenFilter

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MyTokenFilter extends TokenFilter {

	// TokenFilter's constructor is protected, so a subclass must declare its own constructor and call super
	protected MyTokenFilter(TokenStream input) {
		// The wrapped TokenStream is handed up the chain of responsibility for this filter to process
		super(input);
	}

	@Override
	public boolean incrementToken() throws IOException {
		// Advance one token per call and return true while tokens remain,
		// so that consumers further down the chain still see the stream
		if (input.incrementToken()) {
			CharTermAttribute cta = input.getAttribute(CharTermAttribute.class);
			System.out.println(cta.toString());
			if ("江西".equals(cta.toString())) {
				// Apply business logic based on the token text
				System.out.println("--------" + cta.toString() + "----------");
			}
			return true;
		}
		return false;
	}

}
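A TokenFilter can do more than inspect tokens; it can also drop or rewrite them before they reach the rest of the chain. The following is a minimal sketch of that idea (the class name MyStopFilter is made up for illustration and is not part of the original example): it skips any token whose text is "江西" and passes everything else through.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MyStopFilter extends TokenFilter {

	private final CharTermAttribute termAtt;

	protected MyStopFilter(TokenStream input) {
		super(input);
		// Register the attribute once in the constructor; it is shared with the wrapped stream
		termAtt = addAttribute(CharTermAttribute.class);
	}

	@Override
	public boolean incrementToken() throws IOException {
		// Pull tokens from the wrapped stream until one survives the filter
		while (input.incrementToken()) {
			if (!"江西".equals(termAtt.toString())) {
				return true;
			}
			// The token matched, so skip it and try the next one
		}
		return false;
	}
}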

2. Define a custom class that extends Analyzer and builds its token stream with the custom TokenFilter

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

public class MyAnalyzer extends Analyzer {

	@Override
	public TokenStream tokenStream(String fieldName, Reader reader) {
		// Load the mmseg4j dictionary, build the tokenizer, and wrap it with the custom filter
		Dictionary dic = Dictionary.getInstance("G:\\c参考资料\\lucene\\mmseg4j-1.8.5\\data");
		return new MyTokenFilter(
				new MMSegTokenizer(new MaxWordSeg(dic), reader));
	}

}
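Because a TokenFilter wraps another TokenStream, filters can be stacked, which is the chain-of-responsibility idea mentioned in the constructor comment above. Below is a minimal sketch of such a chain (the class name MyChainedAnalyzer is made up for illustration, and it assumes the Lucene 3.x LowerCaseFilter constructor that takes a Version argument): the MMSeg tokenizer feeds a LowerCaseFilter, which in turn feeds MyTokenFilter.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

public class MyChainedAnalyzer extends Analyzer {

	@Override
	public TokenStream tokenStream(String fieldName, Reader reader) {
		Dictionary dic = Dictionary.getInstance("G:\\c参考资料\\lucene\\mmseg4j-1.8.5\\data");
		// Tokenizer -> LowerCaseFilter -> MyTokenFilter: each filter wraps the previous stream
		TokenStream stream = new MMSegTokenizer(new MaxWordSeg(dic), reader);
		stream = new LowerCaseFilter(Version.LUCENE_30, stream);
		return new MyTokenFilter(stream);
	}
}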

3. Write a test class that calls the custom Analyzer directly

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

public class TestAnalyzer {

	@Test
	public void test00() {
		Analyzer a1 = new MyAnalyzer();
		TokenStream tokenStream = a1.tokenStream("anyThing", new StringReader("中华人民共和国江西省南昌市"));
		try {
			while (tokenStream.incrementToken()) {
				// Apply business logic based on the token information
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
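To actually use each token inside that loop, register a CharTermAttribute on the stream before iterating; because the whole chain shares one attribute source, the same instance is updated on every call to incrementToken. A minimal sketch of the test body with the token text printed (it additionally assumes an import of org.apache.lucene.analysis.tokenattributes.CharTermAttribute):

Analyzer a1 = new MyAnalyzer();
TokenStream tokenStream = a1.tokenStream("anyThing", new StringReader("中华人民共和国江西省南昌市"));
// Fetch the shared attribute once before the loop
CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
try {
	while (tokenStream.incrementToken()) {
		// term now holds the current token's text
		System.out.println(term.toString());
	}
} catch (IOException e) {
	e.printStackTrace();
}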


Reposted from hbiao68.iteye.com/blog/2115078