【停用词分词器】
1 /** 2 * 自定义停用词分词器 3 * @author Terry 4 * 5 */ 6 public class EnStopAnalyzer extends Analyzer{ 7 private Version version = Version.LUCENE_35; 8 //存储停用词集合 9 private Set stopWords; 10 11 /** 12 * 使用默认停用词词库 13 */ 14 public EnStopAnalyzer(){ 15 this.stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET); 16 } 17 18 /** 19 * 使用自定义+默认停用词词库 20 * @param stopWords 21 */ 22 public EnStopAnalyzer(String[] stopWords){ 23 //参数2:停用词字符串数组(存储了停用词集合) 24 //参数3:是否忽略大小写 25 this.stopWords = StopFilter.makeStopSet(version, stopWords, true); 26 27 //原有停用词列表 28 //System.out.println(StopAnalyzer.ENGLISH_STOP_WORDS_SET); 29 //将默认停用词集合添加到现有集合中 30 this.stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET); 31 } 32 33 /** 34 * 自定义是否使用停用词词库 35 * @param stopWords 36 * @param userDefault:true---使用默认停用词词库;false---不适用默认停用词词库 37 */ 38 public EnStopAnalyzer(String[] stopWords, boolean userDefault){ 39 this.stopWords = StopFilter.makeStopSet(version, stopWords, true); 40 41 if(userDefault) 42 this.stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET); 43 } 44 45 /** 46 * 对流中数据启用停用词设置 47 */ 48 @Override 49 public TokenStream tokenStream(String fieldName, Reader reader) { 50 return new StopFilter(version, 51 new LowerCaseFilter(version, new LetterTokenizer(version, reader)), 52 stopWords); 53 } 54 55 }
1 /** 2 * 停用词分词器 3 */ 4 @Test 5 public void test04(){ 6 //定义停用词字符串 7 String[] stopWords = new String[] {"to","of","him","and","take"}; 8 String str = "helped him survive the jungle of Hollywood made pick up the phone and call for a car to take him to the airport"; 9 10 Analyzer a =new EnStopAnalyzer(stopWords); 11 12 AnalyzerUtil.displayTokenStream(str,a); 13 }
【中文分词器】
1) 常用分词器
- Paoding :庖丁解牛
- MMSeg4j :传说使用了搜狗词库。另说:使用了Paoding的词库
- IK_CAnalyzer
2) 分词器的技术点
- 词库是否强大
- 算法是否优化
未完待续……