1. 索引过程
1.1 提取文本和创建文档
首先从文件中提取文本格式信息,并用这些文本信息创建Lucene文档和域。
1.2 分析文档
创建Lucene文档和域后,就可以调用IndexWriter对象的addDocument方法将数据传递给Lucene进行索引操作了。在索引操作时,Lucene首先分析文本,将文本数据分割成语汇单元串,然后对它们执行一些可选操作。如:LowerCaseFilter类实现搜索不对大小写敏感、StopFilter类从输入中去掉一些使用频繁却没有意义的词、PorterStemFilter类处理英文文本去掉它们的词干。这些先将原始数据转换为语汇单元、随后用一系列Filter来修改语汇单元的操作,统称为分析过程。
1.3 向索引添加文档
对输入数据分析完毕后,就可以将分析结果写入索引文件中。Lucene将输入数据以一种倒排索引的数据结构进行存储。Lucene使用倒排数据结构的原因是:把文档中提取出的语汇单元作为查询关键字,而不是将文档作为中心实体。换句话说,倒排索引不是回答“这个文档中包含哪些单词?”,而是经过优化后用来快速回答“哪些文档包含单词X?”这个问题。
Lucene的索引文件目录有唯一一个段结构-索引段:
Lucene索引都包含一个或多个段,每个段都是一个独立的索引,它包含整个文档索引的一个子集。每当Writer刷新缓冲区增加文档,以及处理被挂起的删除操作时,索引文件都会建立一个新段。在搜索索引时,每个段都是单独访问的,但搜索结果是合并返回的。
每个段都包含多个文件,文件格式为_X.<ext>,这里X代表段名称,<ext>代表扩展名,用来标识该文件对应索引的某个部分。如果使用混合文件格式(Lucene默认处理方式,可通过IndexWriter.setUseCompoundFile方法进行修改),则上述索引文件都会被压缩成一个单一的文件:_X.cfs。这种方式在搜索期间减少打开的文件数量。
还有一个特殊文件,叫段文件(segments file),用segments_&lt;N&gt;标识,该文件指向所有激活的段。Lucene会首先打开该文件,然后打开它指向的其他文件。值&lt;N&gt;被称为“the generation”,它是一个整数,Lucene每次向索引提交更改时都会将这个数加1。
IndexWriter会周期性地选择一些段,然后将它们合并到一个新段中,然后删除老的段。被合并段的选取策略由一个独立的MergePolicy类主导。一旦选好这些段,具体合并操作由MergeScheduler类实现。
2. 基本索引操作
package lucene.indexing;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Demonstrates basic Lucene 4.x index operations against a filesystem
 * index: adding documents, reading counts, updating and deleting by term.
 *
 * <p>The sample corpus is two documents (Amsterdam / Venice); parallel
 * arrays below supply one field value per document.
 */
public class IndexingTest {

    // Parallel sample data: element i of each array belongs to document i.
    private String[] ids = { "1", "2" };                 // "id": stored, not analyzed
    private String[] unindexed = { "netherlands", "italy" }; // "country": stored, not analyzed
    private String[] unstored = { "amsterdam has lots of bridges",
            "venice has lots of canals" };               // "contents": analyzed, NOT stored
    private String[] text = { "amsterdam", "venice" };   // "city": stored, not analyzed

    private Directory directory;

    /**
     * Opens (or creates) the on-disk index directory.
     *
     * @throws IOException if the directory cannot be opened
     */
    public IndexingTest() throws IOException {
        directory = FSDirectory.open(new File("F:/project/Lucene/index"));
    }

    /**
     * Creates an {@link IndexWriter} over {@link #directory} using a
     * whitespace analyzer (no lower-casing, no stop words, no stemming).
     */
    private IndexWriter getWriter() throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        return new IndexWriter(directory, config);
    }

    /**
     * Runs an exact {@link TermQuery} and prints the stored fields of every hit.
     *
     * <p>Parameters renamed from the original (which called the field
     * {@code searchString} and the term {@code fieldName}) — the call sites
     * always passed the field name first, matching {@code Term(field, text)}.
     *
     * @param fieldName    name of the indexed field to search (e.g. "city")
     * @param searchString exact term to look up in that field
     * @throws IOException on index read failure
     */
    private void getHitCount(String fieldName, String searchString) throws IOException {
        IndexReader reader = DirectoryReader.open(directory);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Term(field, text): field name first, then the term text.
            Query query = new TermQuery(new Term(fieldName, searchString));
            TopDocs hits = searcher.search(query, null, 10000);
            System.out.print("在" + fieldName + "中查询" + searchString + ":");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.print(doc.get("id"));
                System.out.print(",");
                System.out.print(doc.get("country"));
                System.out.print(",");
                // "contents" is indexed but not stored, so this prints null.
                System.out.print(doc.get("contents"));
                System.out.print(",");
                System.out.print(doc.get("city"));
                System.out.println(".");
            }
            System.out.println();
        } finally {
            // FIX: the original never closed the reader — leaked a file
            // handle (and a reader refcount) on every call.
            reader.close();
        }
    }

    /** Prints the live-document count as seen through a writer. */
    public void testIndexWriter() throws IOException {
        IndexWriter writer = getWriter();
        // FIX: removed a redundant writer.numDocs() call whose result
        // was discarded immediately before this print.
        System.out.println("writer.numDocs=" + writer.numDocs());
        writer.close();
    }

    /**
     * Prints maxDoc (including deleted docs not yet merged away) versus
     * numDocs (live documents only) as seen through a reader.
     */
    public void testIndexReader() throws IOException {
        IndexReader reader = DirectoryReader.open(directory);
        System.out.println("reader.maxDoc=" + reader.maxDoc());
        System.out.println("reader.numDocs=" + reader.numDocs());
        reader.close();
    }

    /**
     * Replaces document id=1 (Amsterdam) with a Den Haag document via
     * {@link IndexWriter#updateDocument}, printing hit listings before
     * and after to show the swap.
     */
    public void testUpdate() throws IOException {
        getHitCount("city", "amsterdam");
        getHitCount("city", "den haag");
        getHitCount("contents", "amsterdam");
        getHitCount("contents", "den");
        IndexWriter writer = getWriter();
        Document doc = new Document();
        doc.add(new StringField("id", "1", Store.YES));
        // FIX: was "nnetherlands" — typo, inconsistent with unindexed[0].
        doc.add(new StringField("country", "netherlands", Store.YES));
        doc.add(new TextField("contents", "den haag has a lot of museums", Store.YES));
        doc.add(new StringField("city", "den haag", Store.YES));
        // updateDocument = atomic delete-by-term + add.
        writer.updateDocument(new Term("id", "1"), doc);
        writer.close();
        getHitCount("city", "amsterdam");
        getHitCount("city", "den haag");
        getHitCount("contents", "amsterdam");
        getHitCount("contents", "den");
    }

    /**
     * Deletes document id=1 and prints hit listings before and after.
     */
    public void testDelete() throws IOException {
        getHitCount("city", "amsterdam");
        getHitCount("city", "venice");
        getHitCount("contents", "amsterdam");
        getHitCount("contents", "venice");
        IndexWriter writer = getWriter();
        writer.deleteDocuments(new Term("id", "1"));
        writer.commit();
        writer.close();
        getHitCount("city", "amsterdam");
        getHitCount("city", "venice");
        getHitCount("contents", "amsterdam");
        getHitCount("contents", "venice");
    }

    /**
     * Builds the initial two-document index from the parallel arrays.
     * "contents" uses Store.NO: searchable but not retrievable.
     */
    public void set() throws IOException {
        // directory = new RAMDirectory();
        IndexWriter writer = getWriter();
        for (int i = 0; i < ids.length; i++) {
            Document doc = new Document();
            doc.add(new StringField("id", ids[i], Store.YES));
            doc.add(new StringField("country", unindexed[i], Store.YES));
            doc.add(new TextField("contents", unstored[i], Store.NO));
            doc.add(new StringField("city", text[i], Store.YES));
            writer.addDocument(doc);
        }
        writer.close();
    }

    public static void main(String[] args) throws IOException {
        IndexingTest test = new IndexingTest();
        test.set();
        test.testIndexWriter();
        test.testIndexReader();
        test.testUpdate();
        // test.testDelete();
    }
}