A Search Engine Based on Netty, Lucene, and word Segmentation

My architecture uses:

<dependency>
    <groupId>io.netty</groupId>
    <artifactId>netty</artifactId>
    <version>3.10.6.Final</version>
</dependency>

Netty 3.10.6. Sticking to the 3.x line was the boss's rule, so I followed it.

From what the boss said, the goal is simply to build something multi-threaded on top of Netty and Lucene.
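Before the Lucene part, here is a minimal sketch of what the Netty 3 side could look like, assuming a plain text protocol. The port, thread pools, and the inline echo handler are my placeholders, not the post's actual server:

import java.net.InetSocketAddress;
import java.util.concurrent.Executors;

import org.jboss.netty.bootstrap.ServerBootstrap;
import org.jboss.netty.channel.ChannelHandlerContext;
import org.jboss.netty.channel.ChannelPipeline;
import org.jboss.netty.channel.ChannelPipelineFactory;
import org.jboss.netty.channel.Channels;
import org.jboss.netty.channel.MessageEvent;
import org.jboss.netty.channel.SimpleChannelUpstreamHandler;
import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory;
import org.jboss.netty.handler.codec.string.StringDecoder;
import org.jboss.netty.handler.codec.string.StringEncoder;

public class SearchServer {
    public static void main(String[] args) {
        // Netty 3 style: a boss pool accepts connections, a worker pool handles I/O.
        ServerBootstrap bootstrap = new ServerBootstrap(
                new NioServerSocketChannelFactory(
                        Executors.newCachedThreadPool(),
                        Executors.newCachedThreadPool()));

        bootstrap.setPipelineFactory(new ChannelPipelineFactory() {
            public ChannelPipeline getPipeline() throws Exception {
                return Channels.pipeline(
                        new StringDecoder(),
                        new StringEncoder(),
                        new SimpleChannelUpstreamHandler() {
                            @Override
                            public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) {
                                // A real handler would run the query through the Lucene
                                // code below; echoing the query is just a placeholder.
                                e.getChannel().write("query: " + e.getMessage() + "\n");
                            }
                        });
            }
        });

        bootstrap.bind(new InetSocketAddress(8080));
    }
}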

Fine, so I put together a template.

Here's a method for multi-threaded Lucene queries:

/**
 * Get an IndexSearcher.
 *
 * @param reader   the IndexReader
 * @param executor pass an ExecutorService if you want multi-threaded search
 * @return an IndexSearcher backed by the given reader
 */
public static IndexSearcher getIndexSearcher(IndexReader reader, ExecutorService executor) {
    return luceneManager.getIndexSearcher(reader, executor);
}

That's what the code looks like; it works nicely.
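The luceneManager instance itself isn't shown in the post. Since Lucene's IndexSearcher constructor accepts an ExecutorService directly, the wrapper presumably boils down to something like this; the class name and the null check are my guesses:

import java.util.concurrent.ExecutorService;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;

// Hypothetical wrapper; the post's actual luceneManager class is not shown.
public class LuceneManager {
    public IndexSearcher getIndexSearcher(IndexReader reader, ExecutorService executor) {
        if (executor == null) {
            // No executor: plain single-threaded search.
            return new IndexSearcher(reader);
        }
        // With an executor, Lucene searches the index segments in parallel.
        return new IndexSearcher(reader, executor);
    }
}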

Test code:

import LuceneOne.Article;
import luceneTwo.MyWordAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

public class one {
    @Test
    public void add() throws IOException {

        Article article = new Article();
        article.setId(1);
        article.setTitle("Lucene全文检索");
        article.setContent("Lucene是apache软件基金会4 jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包,但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎(英文与德文两种西方语言)。");

        final Path path = Paths.get("./article/");

        Directory directory = FSDirectory.open(path);
        Analyzer analyzer = new MyWordAnalyzer();

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        // OpenMode.CREATE discards any existing index and rebuilds from scratch on each run.
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);

        Document document = new Document();
        document.add(new TextField("id", article.getId()+"", Field.Store.YES));
        document.add(new TextField("title", article.getTitle(), Field.Store.YES));
        document.add(new TextField("content", article.getContent(), Field.Store.YES));

        indexWriter.addDocument(document);
        indexWriter.close();

    }
}
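The post only shows the indexing test; a companion search test against the same index might look like this. The thread count and the query term "检索" are assumptions, since the term has to match a token the segmenter actually produced:

import java.nio.file.Paths;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.jupiter.api.Test;

public class SearchTest {
    @Test
    public void search() throws Exception {
        Directory directory = FSDirectory.open(Paths.get("./article/"));
        IndexReader reader = DirectoryReader.open(directory);

        // Passing an ExecutorService makes IndexSearcher query the
        // index segments in parallel, one task per segment.
        ExecutorService executor = Executors.newFixedThreadPool(4);
        IndexSearcher searcher = new IndexSearcher(reader, executor);

        // "检索" is assumed to be one of the tokens the segmenter produced
        // from the indexed content.
        TopDocs topDocs = searcher.search(new TermQuery(new Term("content", "检索")), 10);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            System.out.println(doc.get("title"));
        }

        executor.shutdown();
        reader.close();
    }
}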

Oh, and there's also the word segmentation part:

package luceneTwo;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;

public class MyWordAnalyzer extends Analyzer {
    private Segmentation segmentation;

    public MyWordAnalyzer() {
        // Default to bidirectional maximum matching, a dictionary-based algorithm.
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
    }

    public MyWordAnalyzer(Segmentation segmentation) {
        this.segmentation = segmentation;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // The tokenizer does all the work here; no extra token filters are chained.
        Tokenizer tokenizer = new MyWordTokenizer(segmentation);
        return new TokenStreamComponents(tokenizer);
    }

}

And the tokenizer:

package luceneTwo;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apdplat.word.recognition.StopWord;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;
import org.apdplat.word.segmentation.Word;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Queue;
import java.util.concurrent.LinkedTransferQueue;

public class MyWordTokenizer extends Tokenizer {
    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    private final Queue<Word> words = new LinkedTransferQueue<>();
    private Segmentation segmentation = null;
    private BufferedReader reader = null;
    private int startOffset = 0;

    public MyWordTokenizer() {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
    }

    public MyWordTokenizer(Segmentation segmentation) {
        this.segmentation = segmentation;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // Wrap the fresh input once per stream instead of on every incrementToken() call.
        reader = new BufferedReader(input);
        words.clear();
        startOffset = 0;
    }

    private Word getWord() throws IOException {
        Word word = words.poll();
        if (word == null) {
            // Buffer exhausted: segment the remaining input line by line.
            String line;
            while ((line = reader.readLine()) != null) {
                words.addAll(segmentation.seg(line));
            }
            word = words.poll();
        }
        return word;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes(); // TokenStream contract: clear attributes before producing a token
        Word word = getWord();
        if (word != null) {
            int positionIncrement = 1;
            // Skip stop words, recording the gap in the position increment.
            while (StopWord.is(word.getText())) {
                positionIncrement++;
                startOffset += word.getText().length();
                word = getWord();
                if (word == null) {
                    return false;
                }
            }
            charTermAttribute.setEmpty().append(word.getText());
            offsetAttribute.setOffset(startOffset, startOffset + word.getText().length());
            positionIncrementAttribute.setPositionIncrement(positionIncrement);
            startOffset += word.getText().length();
            return true;
        }
        return false;
    }
}
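To sanity-check the analysis chain without building an index, you can run the analyzer directly. A small sketch; the field name and sample text are arbitrary:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new MyWordAnalyzer();
        // tokenStream() feeds the text into MyWordTokenizer; reset() must be
        // called before the first incrementToken(), and end()/close() after.
        try (TokenStream stream = analyzer.tokenStream("content", "Lucene是一个全文检索引擎工具包")) {
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term);
            }
            stream.end();
        }
        analyzer.close();
    }
}

Analyzer instances are designed to be reused across threads, so one MyWordAnalyzer can serve both indexing and querying, as long as the underlying Segmentation is thread-safe.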

Then add the dependency:

<!-- https://mvnrepository.com/artifact/org.apdplat/word -->
<dependency>
    <groupId>org.apdplat</groupId>
    <artifactId>word</artifactId>
    <version>1.3.1</version>
</dependency>

I'm using the latest version; if problems come up, I'll swap it out.

There isn't much about this on CSDN.

The word library can also compute word vectors, so you can compare approximate vector weights to get fuzzy search at the level of actual math.
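The vector math behind that idea is simple; here's a minimal sketch. The vectors are made-up placeholders, since real ones would come from a trained model:

public class CosineSimilarityDemo {
    // Cosine similarity: dot(a, b) / (|a| * |b|), close to 1 for similar terms.
    static double cosine(float[] a, float[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    public static void main(String[] args) {
        // Made-up 3-dimensional vectors; a trained model would supply real ones.
        float[] v1 = {0.8f, 0.1f, 0.3f};
        float[] v2 = {0.7f, 0.2f, 0.25f};
        // Terms whose similarity clears a threshold can be expanded into the
        // query, which is what makes the fuzzy matching "mathematical".
        System.out.println(cosine(v1, v2));
    }
}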

I've previously built retrieval on count-based statistics after segmentation; this is my first time using word vectors for retrieval, so I don't know what pitfalls lie ahead.


Reposted from blog.csdn.net/weixin_41046245/article/details/81300456