自定义lucene分词器,单字分词

在 SearchManager 的实现类 LuceneManager 里定义分词器

// Excerpt of the search manager showing where the analyzer is selected.
public class LuceneManager implements SearchManager {

// Use the custom analyzer defined in this article.
Analyzer analyzer = new ZCJChineseAnalyzer();
   
// Alternative analyzer, currently disabled:
//   Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_4_9);
    
   // Hand the chosen analyzer to the Lucene settings object.
   this.settings = new LuceneSettings(analyzer);

// (the remainder of the class is elided in the original excerpt)
......................................

}

// Source of the ZCJChineseAnalyzer class (extends Analyzer)

package com.onegrid.darj.search;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

/**
 *
 * @author 周长江 自定义分词器 2015年1月7日 14:10:53
 *
 */
public class ZCJChineseAnalyzer extends Analyzer {

 // 自定义停用词
 private static final String[] stopWords = {"and", "of", "the", "to", "is", "their", "can", "all"};
 public ZCJChineseAnalyzer() {
 }

 @Override
 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // 创建一个分词器
  Tokenizer tokenizer = new China(reader);
  // TokenStream的包装类 在2.2之中 是TokenStream
  return new TokenStreamComponents(tokenizer);
 }
 

}

 

// Source of the China class (extends Tokenizer)

package com.onegrid.darj.search;

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
/**
 *
 * @author 周长江  2015年1月7日 14:10:53
 *每个单词都分  和数据库查询似的  适合中英文
 */
public class China extends Tokenizer {
 public China(Reader in) {
  super(in);
 }

 public China(AttributeFactory factory, Reader in) {
  super(factory, in);
 }

 private int offset = 0, bufferIndex = 0, dataLen = 0;
 private final static int MAX_WORD_LEN = 255;
 private final static int IO_BUFFER_SIZE = 1024;
 private final char[] buffer = new char[MAX_WORD_LEN];
 private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
 private int length;
 private int start;
 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

 private final void push(char c) {
  if (length == 0)
   start = offset - 1; // start of token
  buffer[length++] = Character.toLowerCase(c); // buffer it
 }

 private final boolean flush() {
  if (length > 0) {
   // System.out.println(new String(buffer, 0,
   // length));
   termAtt.copyBuffer(buffer, 0, length);
   offsetAtt.setOffset(correctOffset(start), correctOffset(start
     + length));
   return true;
  } else
   return false;
 }

 @Override
 public boolean incrementToken() throws IOException {
  clearAttributes();
  length = 0;
  start = offset;
  while (true) {
   final char c;
   offset++;
   if (bufferIndex >= dataLen) {
    dataLen = input.read(ioBuffer);
    bufferIndex = 0;
   }
   if (dataLen == -1) {
    offset--;
    return flush();
   } else
    c = ioBuffer[bufferIndex++];
   switch (Character.getType(c)) {
   case Character.DECIMAL_DIGIT_NUMBER:// 注意此部分不过滤一些熟悉或者字母
   case Character.LOWERCASE_LETTER:// 注意此部分
   case Character.UPPERCASE_LETTER:// 注意此部分
   // push(c);
   // if (length == MAX_WORD_LEN) return flush();
   // break;
   case Character.OTHER_LETTER:
    if (length > 0) {
     bufferIndex--;
     offset--;
     return flush();
    }
    push(c);
    return flush();
   default:
    if (length > 0)
     return flush();
    break;
   }
  }
 }

 @Override
 public final void end() {
  // set final offset
  final int finalOffset = correctOffset(offset);
  this.offsetAtt.setOffset(finalOffset, finalOffset);
 }

 @Override
 public void reset() throws IOException {
  super.reset();
  offset = bufferIndex = dataLen = 0;
 }
}

猜你喜欢

转载自blog.csdn.net/zcjwsrf/article/details/42963865