Lucene简介及Hello World实现

1、Lucene简介

Lucene是一个用Java编写的全文检索引擎工具包，实现了索引和搜索两大核心功能，并且两者相互独立，便于开发人员扩展。Lucene提供了丰富的API，可以方便地与存储在索引中的信息交互。需要说明的是，它并不是一个完整的全文检索应用，而是为应用程序提供索引和搜索功能；若想让Lucene真正发挥作用，还需在其基础上做一些必要的二次开发。

子包和功能：

2、架构设计

Lucene功能非常强大，但从根本上来说，主要包括两部分：一是将文本内容切分成词后建立索引并入库；二是根据查询条件返回结果，即建立索引和进行查询。

如图1-1所示，本文抛开外部接口以及信息来源，重点对网页爬取的文本内容进行索引和查询。

3、通过一个简单的例子进行初步了解

3.1、依赖jar包：

<!-- Lucene modules needed by the Indexer and Searcher examples below.
     All three artifacts are pinned to the same release (5.3.1). -->
<dependencies>
   <!-- Core library; presumably supplies the index/store/search/document
        packages imported by both example classes (verify against the jar). -->
   <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-core</artifactId>
       <version>5.3.1</version>
   </dependency>

   <!-- Query parser module; presumably supplies
        org.apache.lucene.queryparser.classic.QueryParser used by Searcher. -->
   <dependency>
   <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-queryparser</artifactId>
       <version>5.3.1</version>
   </dependency>

   <!-- Common analyzers module; presumably supplies StandardAnalyzer,
        which both example classes use for tokenizing (verify). -->
   <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-analyzers-common</artifactId>
       <version>5.3.1</version>
   </dependency>
</dependencies>

3.2、索引类：

package com.java1234.lucene;

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds a Lucene index from the regular files found directly inside a data
 * directory.
 *
 * <p>Each file becomes one document with three fields: {@code contents}
 * (tokenized from the file's text, not stored), {@code fileName} and
 * {@code fullPath} (both stored so they can be shown in search results).
 *
 * <p>The instance owns the underlying {@link Directory} and
 * {@link IndexWriter}; call {@link #close()} (or use try-with-resources)
 * to release them.
 */
public class Indexer implements AutoCloseable {

    /** Index storage location; kept so it can be closed together with the writer. */
    private final Directory directory;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored at {@code indexDir} and prepares a
     * writer that tokenizes text with {@link StandardAnalyzer}.
     *
     * @param indexDir file-system path of the directory holding the index
     * @throws Exception if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        directory = FSDirectory.open(Paths.get(indexDir));
        try {
            Analyzer analyzer = new StandardAnalyzer(); // standard tokenizer
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            writer = new IndexWriter(directory, iwc);
        } catch (Exception e) {
            // Do not leak the already-opened directory when the writer fails to open.
            try {
                directory.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws Exception if closing either resource fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            // IndexWriter.close() does not close the Directory it writes to.
            directory.close();
        }
    }

    /**
     * Indexes every readable regular file located directly in {@code dataDir}.
     * Sub-directories are skipped (they are not traversed).
     *
     * @param dataDir path of the directory whose files should be indexed
     * @return the document count reported by the writer; this is the count for
     *         the whole index, so it also includes documents added by earlier runs
     * @throws IllegalArgumentException if {@code dataDir} is not a listable directory
     * @throws Exception if reading a file or writing to the index fails
     */
    public int index(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Not a readable directory: " + dataDir);
        }
        for (File f : files) {
            // Opening a FileReader on a directory would throw and abort the whole run.
            if (f.isFile() && f.canRead()) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /**
     * Adds a single file to the index.
     *
     * @param f the file to index
     * @throws Exception if the file cannot be read or the document cannot be added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("索引文件：" + f.getCanonicalPath());
        // The reader is closed here even if addDocument fails part-way through.
        try (FileReader contents = new FileReader(f)) {
            writer.addDocument(getDocument(f, contents));
        }
    }

    /**
     * Builds the document for one file and populates its fields.
     *
     * @param f        the file being indexed; supplies the name and path fields
     * @param contents open reader over the file's text; the caller closes it
     * @return the document ready to be added to the index
     * @throws Exception if the canonical path cannot be resolved
     */
    private Document getDocument(File f, FileReader contents) throws Exception {
        Document doc = new Document();
        doc.add(new TextField("contents", contents));
        doc.add(new TextField("fileName", f.getName(), Field.Store.YES));
        doc.add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES));
        return doc;
    }

    /**
     * Demo entry point: indexes the files under a fixed data directory and
     * prints how many documents the index holds and how long the run took.
     */
    public static void main(String[] args) {
        // Where the index is stored
        String indexDir = "D:\\lucene";
        // Where the files to be indexed live
        String dataDir = "D:\\lucene\\data";
        int numIndexed = 0;
        long start = System.currentTimeMillis();
        // try-with-resources closes the indexer only if it was actually constructed,
        // so a constructor failure no longer triggers a NullPointerException on close.
        try (Indexer indexer = new Indexer(indexDir)) {
            numIndexed = indexer.index(dataDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        System.out.println("索引：" + numIndexed + " 个文件花费了" + (end - start) + " 毫秒");
    }
}
3.3、查询类：

package com.java1234.lucene;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs a keyword query against the {@code contents} field of a Lucene index
 * and prints the stored {@code fullPath} of every matching document.
 */
public class Searcher {

    /** Number of top hits fetched when the caller does not specify a limit. */
    private static final int DEFAULT_MAX_HITS = 10;

    /**
     * Searches the index at {@code indexDir} for {@code q} and prints up to
     * ten matching file paths.
     *
     * @param indexDir file-system path of the directory holding the index
     * @param q        query string in the classic query-parser syntax
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        search(indexDir, q, DEFAULT_MAX_HITS);
    }

    /**
     * Searches the index at {@code indexDir} for {@code q} and prints up to
     * {@code maxHits} matching file paths, together with the elapsed search
     * time and the total number of hits.
     *
     * @param indexDir file-system path of the directory holding the index
     * @param q        query string in the classic query-parser syntax
     * @param maxHits  maximum number of top-scoring documents to fetch
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(String indexDir, String q, int maxHits) throws Exception {
        // try-with-resources releases the reader and the directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer(); // standard tokenizer
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);

            long start = System.currentTimeMillis();
            TopDocs hits = searcher.search(query, maxHits);
            long end = System.currentTimeMillis();
            System.out.println("匹配 " + q + " ，总共花费" + (end - start) + "毫秒" + "查询到" + hits.totalHits + "个记录");

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println(doc.get("fullPath"));
            }
        }
    }

    /**
     * Demo entry point: searches a fixed index location for a fixed phrase.
     */
    public static void main(String[] args) {
        String indexDir = "D:\\lucene";
        String q = "Zygmunt Saloni";
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Lucene简介及Hello World实现

猜你喜欢