基于 Lucene 的简单检索示例

所需依赖（Gradle 配置）：

    compile group: 'org.apache.lucene', name: 'lucene-core', version: '8.3.0'
    compile group: 'org.apache.lucene', name: 'lucene-queryparser', version: '8.3.0'
    compile group: 'org.apache.lucene', name: 'lucene-highlighter', version: '8.3.0'
    compile group: 'org.apache.lucene', name: 'lucene-backward-codecs', version: '8.3.0'
    compile group: 'com.github.magese', name: 'ik-analyzer', version: '8.3.0'

package lucene_test;

import java.nio.file.FileSystems;
import java.util.UUID;

import lombok.Cleanup;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;


/**
 * @Title:EntitySearchDemo.java Created by zhangdapeng on 2019/11/27上午11:01
 * @Description:基于Lucene 8.3
 * @Author:zhangdapeng
 * @Verson:1.0
 */
public class EntitySearchDemo {
    /**
     * 索引存储路径
     */
    public static final String INDEX_PATH = "./knowledge_graph-index";

    /**
     * 创建索引
     */
    public void creatIndex() throws Exception {
        @Cleanup IndexWriter indexWriter = null;
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
        //Analyzer analyzer = new StandardAnalyzer();
        Analyzer analyzer = new IKAnalyzer(true);
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);

        indexWriter = new IndexWriter(directory, indexWriterConfig);
        indexWriter.deleteAll();// 清除以前的index

        for (int i = 0; i < 10; i++) {
            Document document=null;
            if(i==0) {
                UUID uuid = UUID.randomUUID();
                String id = uuid.toString().replace("-", "");
                document = new Document();
                document.add(new Field("id", id, TextField.TYPE_STORED));
                document.add(new Field("title", "tile" + '"' + i + '"', TextField.TYPE_STORED));
                document.add(new Field("content", "huawei牛掰啊", TextField.TYPE_STORED));
                document.add(new Field("tag", "tag" + id, TextField.TYPE_STORED));
                document.add(new Field("url", "url" + id, TextField.TYPE_STORED));
            }else{
                if(i==2){
                    UUID uuid = UUID.randomUUID();
                    String id = uuid.toString().replace("-", "");
                    document = new Document();
                    document.add(new Field("id", id, TextField.TYPE_STORED));
                    document.add(new Field("title", "tile" + '"' + i + '"', TextField.TYPE_STORED));
                    document.add(new Field("content", "上海的天空？", TextField.TYPE_STORED));
                    document.add(new Field("tag", "tag" + id, TextField.TYPE_STORED));
                    document.add(new Field("url", "url" + id, TextField.TYPE_STORED));
                }else {
                    UUID uuid = UUID.randomUUID();
                    String id = uuid.toString().replace("-", "");
                    document = new Document();
                    document.add(new Field("id", id, TextField.TYPE_STORED));
                    document.add(new Field("title", "tile" + '"' + i + '"', TextField.TYPE_STORED));
                    document.add(new Field("content", "C" + '"' + i + '"', TextField.TYPE_STORED));
                    document.add(new Field("tag", "tag" + id, TextField.TYPE_STORED));
                    document.add(new Field("url", "url" + id, TextField.TYPE_STORED));
                }
            }
            indexWriter.addDocument(document);
        }
    }

    /**
     * 搜索
     */
    public void search(String keyWord) throws Exception {
        @Cleanup DirectoryReader directoryReader = null;
        // 1、创建Directory
        @Cleanup Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
        // 2、创建IndexReader
        directoryReader = DirectoryReader.open(directory);
        // 3、根据IndexReader创建IndexSearch
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
        // 4、创建搜索的Query
        // Analyzer analyzer = new StandardAnalyzer();
        Analyzer analyzer = new IKAnalyzer(true); // 使用IK分词

        // 简单的查询，创建Query表示搜索域为content包含keyWord的文档
        //Query query = new QueryParser("content", analyzer).parse(keyWord);

        String[] fields = {"title", "content", "tag"};
        // MUST 表示and，MUST_NOT 表示not ，SHOULD表示or
        BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
        // MultiFieldQueryParser表示多个域解析， 同时可以解析含空格的字符串，如果我们搜索"上海 中国"
        Query multiFieldQuery = MultiFieldQueryParser.parse(keyWord, fields, clauses, analyzer);

        // 5、根据searcher搜索并且返回TopDocs
        TopDocs topDocs = indexSearcher.search(multiFieldQuery, 3); // 搜索前100条结果
        System.out.println("共找到匹配处：" + topDocs.totalHits);
        // 6、根据TopDocs获取ScoreDoc对象
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        System.out.println("共找到匹配文档数：" + scoreDocs.length);

        QueryScorer scorer = new QueryScorer(multiFieldQuery, "content");
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"backgroud:red\">", "</span>");
        Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
        for (ScoreDoc scoreDoc : scoreDocs) {
            // 7、根据searcher和ScoreDoc对象获取具体的Document对象
            Document document = indexSearcher.doc(scoreDoc.doc);
            String content = document.get("content");
            System.out.println("--------------------"+keyWord+"---------------------");
            System.out.println("文章标题：" + document.get("title"));
            System.out.println("文章地址：" + document.get("url"));
            System.out.println("文章内容："+document.get("content"));
            System.out.println(highlighter.getBestFragment(analyzer, "content", content));
            System.out.println("");
            // 8、根据Document对象获取需要的值
        }

    }

    public static void main(String args[]) throws Exception {
        EntitySearchDemo demo = new EntitySearchDemo();
        demo.creatIndex();
        demo.search("huawei");
        demo.search("上海");
        demo.search("上海 天空");
        demo.search("上海 南方");


    }
}

张大鹏的博客

发布了203 篇原创文章 · 获赞 68 · 访问量 72万+

他的留言板关注

基于lucene的检索的简单的例子

猜你喜欢