我的架构采用如下依赖:
<dependency> <groupId>io.netty</groupId> <artifactId>netty</artifactId> <version>3.10.6.Final</version> </dependency>
netty 3.10.6版本,采用3版本是老板的规定,我也就遵守了。
听老板的意思就是要做一个基于netty和Lucene多线程的东西出来就好。
好吧,我做出来了一个模板。
一个多线程查询的Lucene的方法
/**
 * Obtains an {@code IndexSearcher} for the given reader.
 *
 * @param reader   the {@code IndexReader} to search over
 * @param executor supply an {@code ExecutorService} to enable multi-threaded
 *                 search across index segments; presumably {@code null} means
 *                 single-threaded — TODO confirm against luceneManager
 * @return a searcher backed by the given reader
 */
public static IndexSearcher getIndexSearcher(IndexReader reader, ExecutorService executor) {
    IndexSearcher searcher = luceneManager.getIndexSearcher(reader, executor);
    return searcher;
}
代码就像这个样子的,很好使。
测试代码
import LuceneOne.Article; import luceneTwo.MyWordAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; public class one { @Test public void add() throws IOException { Article article = new Article(); article.setId(1); article.setTitle("Lucene全文检索"); article.setContent("Lucene是apache软件基金会4 jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包,但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎(英文与德文两种西方语言)。"); final Path path = Paths.get("./article/"); Directory directory = FSDirectory.open(path); Analyzer analyzer = new MyWordAnalyzer(); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig); Document document = new Document(); document.add(new TextField("id", article.getId()+"", Field.Store.YES)); document.add(new TextField("title", article.getTitle(), Field.Store.YES)); document.add(new TextField("content", article.getContent(), Field.Store.YES)); indexWriter.addDocument(document); indexWriter.close(); } }
对了还有word分词
package luceneTwo; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; import org.apdplat.word.segmentation.Segmentation; import org.apdplat.word.segmentation.SegmentationAlgorithm; import org.apdplat.word.segmentation.SegmentationFactory; public class MyWordAnalyzer extends Analyzer { Segmentation segmentation = null; public MyWordAnalyzer() { segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching); } public MyWordAnalyzer(Segmentation segmentation) { this.segmentation = segmentation; } @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MyWordTokenizer(segmentation); return new TokenStreamComponents(tokenizer); } }
还有
package luceneTwo;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apdplat.word.recognition.StopWord;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;
import org.apdplat.word.segmentation.Word;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Queue;
import java.util.concurrent.LinkedTransferQueue;

/**
 * Lucene {@link Tokenizer} that segments Chinese text with the apdplat "word"
 * library and filters stop words, adjusting position increments accordingly.
 *
 * <p>Fixes over the previous version:
 * <ul>
 *   <li>{@code incrementToken()} no longer wraps {@code input} in a fresh
 *       {@code BufferedReader} on every call; per-document setup now happens
 *       in {@link #reset()}, as the Tokenizer contract requires.</li>
 *   <li>{@code clearAttributes()} is called at the start of each
 *       {@code incrementToken()}, as the TokenStream contract requires.</li>
 *   <li>{@code words} and {@code startOffset} are cleared in {@code reset()},
 *       so a reused tokenizer cannot emit stale tokens from a previous
 *       document.</li>
 * </ul>
 */
public class MyWordTokenizer extends Tokenizer {

    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

    // Buffered segmentation results for the current document.
    private final Queue<Word> words = new LinkedTransferQueue<>();
    private Segmentation segmentation = null;
    private BufferedReader reader = null;
    private int startOffset = 0;

    /** Defaults to the bidirectional maximum-matching segmentation algorithm. */
    public MyWordTokenizer() {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
    }

    /**
     * @param segmentation the segmentation strategy used to split the input
     */
    public MyWordTokenizer(Segmentation segmentation) {
        this.segmentation = segmentation;
    }

    /**
     * (Re)initializes per-document state. Consumers must call this before the
     * first {@code incrementToken()}.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        reader = new BufferedReader(input);
        words.clear();
        startOffset = 0;
    }

    /**
     * Returns the next segmented word, draining the whole input into
     * {@code words} on first demand; {@code null} when exhausted.
     */
    private Word getWord() throws IOException {
        Word word = words.poll();
        if (word == null) {
            String line;
            while ((line = reader.readLine()) != null) {
                words.addAll(segmentation.seg(line));
            }
            // Preserved from the original: offsets restart at 0 once the
            // buffer is (re)filled. NOTE(review): offsets also ignore line
            // breaks, so they are approximate for multi-line input — confirm
            // whether exact offsets are needed (e.g. for highlighting).
            startOffset = 0;
            word = words.poll();
        }
        return word;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes(); // required by the TokenStream contract
        if (reader == null) {
            // Defensive fallback for callers that skipped reset().
            reader = new BufferedReader(input);
        }
        Word word = getWord();
        if (word == null) {
            return false;
        }
        int positionIncrement = 1;
        // Skip stop words, accumulating the position increment so phrase
        // queries still see the gap they leave behind.
        while (StopWord.is(word.getText())) {
            positionIncrement++;
            startOffset += word.getText().length();
            word = getWord();
            if (word == null) {
                return false;
            }
        }
        charTermAttribute.setEmpty().append(word.getText());
        offsetAttribute.setOffset(startOffset, startOffset + word.getText().length());
        positionIncrementAttribute.setPositionIncrement(positionIncrement);
        startOffset += word.getText().length();
        return true;
    }
}
然后再引入如下依赖:
<!-- https://mvnrepository.com/artifact/org.apdplat/word --> <dependency> <groupId>org.apdplat</groupId> <artifactId>word</artifactId> <version>1.3.1</version> </dependency>
我用的是最新版,出问题了再换
CSDN 上这方面的资料不多。
word 分词支持词向量计算,借助词向量之间的权重近似值,可以实现数学意义上的模糊搜索。
我之前做过分词之后基于基数统计模型的,这是我第一次用词向量进行检索,不知道后面有什么坑。