solr版本3.4.0
paoding版本: Revision 154。从 http://paoding.googlecode.com/svn/trunk/ 下载源代码, 在paoding-analysis目录下执行build.bat, 生成paoding-analysis.jar
package com.sh2600.test.paoding;

import java.io.Reader;
import java.util.Map;

import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.TokenCollector;
import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;

/**
 * Solr tokenizer factory that creates {@link PaodingTokenizer} instances.
 *
 * <p>The segmentation mode is read from the {@code mode} attribute of the
 * {@code <tokenizer>} element in schema.xml. Accepted values (case-insensitive)
 * are {@code "most-words"}, {@code "default"} and {@code "max-word-length"}.
 * When the attribute is omitted, {@link #MOST_WORDS_MODE} is used.
 */
public class ChineseTokenizerFactory extends BaseTokenizerFactory {

    /** Most-words segmentation; this is the default mode. */
    public static final String MOST_WORDS_MODE = "most-words";

    /** Maximum-word-length segmentation. */
    public static final String MAX_WORD_LENGTH_MODE = "max-word-length";

    // Starts at the default so create() works even if no mode was ever configured.
    private String mode = MOST_WORDS_MODE;

    /**
     * Selects the segmentation mode.
     *
     * @param mode {@code null}, {@code "most-words"} or {@code "default"} select
     *             {@link #MOST_WORDS_MODE}; {@code "max-word-length"} selects
     *             {@link #MAX_WORD_LENGTH_MODE}. Matching is case-insensitive.
     * @throws IllegalArgumentException if {@code mode} is any other value
     */
    public void setMode(String mode) {
        if (mode == null
                || MOST_WORDS_MODE.equalsIgnoreCase(mode)
                || "default".equalsIgnoreCase(mode)) {
            this.mode = MOST_WORDS_MODE;
        } else if (MAX_WORD_LENGTH_MODE.equalsIgnoreCase(mode)) {
            this.mode = MAX_WORD_LENGTH_MODE;
        } else {
            throw new IllegalArgumentException("不合法的分析器Mode参数设置:" + mode);
        }
    }

    /**
     * Initializes the factory from the tokenizer attributes and applies the
     * optional {@code mode} attribute.
     *
     * @param args tokenizer attributes; a missing {@code mode} entry selects the default mode
     * @throws IllegalArgumentException if the {@code mode} attribute has an unsupported value
     */
    @Override
    public void init(Map args) {
        super.init(args);
        // The attribute is optional: pass null through so setMode() applies the
        // default instead of failing with a NullPointerException on toString().
        Object configured = (args == null) ? null : args.get("mode");
        setMode(configured == null ? null : configured.toString());
    }

    /**
     * Creates a Paoding tokenizer over the given input, using the knife returned
     * by {@code PaodingMaker.make()} and a collector matching the configured mode.
     *
     * @param input the character stream to tokenize
     * @return a new {@link PaodingTokenizer}
     */
    public Tokenizer create(Reader input) {
        return new PaodingTokenizer(input, PaodingMaker.make(), createTokenCollector());
    }

    /** Returns a fresh token collector for the currently configured mode. */
    private TokenCollector createTokenCollector() {
        if (MOST_WORDS_MODE.equals(mode)) {
            return new MostWordsTokenCollector();
        }
        if (MAX_WORD_LENGTH_MODE.equals(mode)) {
            return new MaxWordLengthTokenCollector();
        }
        // Unreachable: the field is initialized to a valid mode and setMode()
        // only ever stores one of the two constants.
        throw new IllegalStateException("Unexpected tokenizer mode: " + mode);
    }
}
以上代码打包为paoding4solr.jar, 和paoding-analysis.jar放到$solr.home/lib下
复制词典(dic目录)到/opt/paoding/dic
复制paoding-*.properties到classpath(例如tomcat/lib下, 放到$solr.home/lib似乎不行)
将paoding-dic-home.properties中的paoding.dic.home修改为/opt/paoding/dic
修改$solr.home/conf/schema.xml
<fieldType name="text" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <!--<tokenizer class="solr.WhitespaceTokenizerFactory"/>--> <tokenizer class="com.sh2600.test.paoding.ChineseTokenizerFactory" mode="most-words"/> ...... </analyzer> <analyzer type="query"> <!--<tokenizer class="solr.WhitespaceTokenizerFactory"/>--> <tokenizer class="com.sh2600.test.paoding.ChineseTokenizerFactory" mode="most-words"/> ...... </analyzer> </fieldType>
重启solr
执行数据导入,查询
参考
http://tech.ddvip.com/2009-09/1252589447132071.html
http://www.iteye.com/topic/364513