有了上一篇中建立好的索引,我们就可以完成检索任务了。
在这之前,介绍一下lucene检索有关的基本概念。
- IndexReader:负责将索引文件读入内存。这里用户可以自定义索引文件编码格式,可以实现索引文件的压缩等。IndexReader实例通常通过DirectoryReader.open(Directory)获得,因此需要一个Directory对象。
- IndexSearcher:lucene对外提供检索功能的类。这个类中有多个重载的search方法,负责提供检索功能。
- TopDocs:保存lucene检索结果的类。
- ScoreDoc:保存lucene检索结果中,Document的docId及其评分。默认情况下,lucene使用其评分机制对Document进行评分,用户也可以自定义评分机制。
- Query:lucene使用的Query,相当于数据库语言中使用的sql。Query几个常用的子类:
| Query子类 | 使用场景 | 类比SQL |
| --- | --- | --- |
| TermQuery | 等值查询 | SELECT * FROM `emp` WHERE `name` = ? |
| NumericRangeQuery | (数值类型的)范围查询 | SELECT * FROM `emp` WHERE `age` BETWEEN ? AND ? |
| WildcardQuery | 通配符查询 | SELECT * FROM `emp` WHERE `name` LIKE ? |
| BooleanQuery | 组合查询 | SELECT * FROM `emp` WHERE `name` = ? AND (`age` BETWEEN ? AND ?) |
看一下我们的检索任务:
- 按照文件名检索:TermQuery
- 按照文件类型检索:TermQuery
- 按照文件大小检索:NumericRangeQuery
- 按照修改日期检索:NumericRangeQuery
- 按照文件内容检索:TermQuery
我们建立一个Searcher类,负责接收索引文件路径、检索条件Query、返回结果条数,以简化客户端操作。
Searcher类:
package cn.lym.lucene.quickstart.search; import java.io.File; import java.util.ArrayList; import java.util.List; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; /** * 提供检索的类 * * @author liuyimin * */ public class Searcher { /** * logger */ private static final Logger logger = LogManager.getLogger(Searcher.class); /** * 检索 * * @param indexDir * 索引存放目录 * @param query * 检索条件 * @param n * 返回结果数量 * @return * @throws Exception */ public List<Document> search(String indexDir, Query query, int n) throws Exception { if (logger.isDebugEnabled()) { logger.debug("Search " + indexDir + " for " + n + " documents, with query: " + query); } Directory directory = FSDirectory.open(new File(indexDir)); IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); TopDocs topDocs = searcher.search(query, n); ScoreDoc[] scoreDocs = topDocs.scoreDocs; if (logger.isDebugEnabled()) { logger.debug("Totally " + scoreDocs.length + " documents hit."); } List<Document> documents = new ArrayList<>(scoreDocs.length); for (ScoreDoc scoreDoc : scoreDocs) { documents.add(searcher.doc(scoreDoc.doc)); } return documents; } }
写单元测试进行测试:
package cn.lym.lucene.quickstart.search; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.util.Date; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.junit.Before; import org.junit.Test; public class SearcherTest { /** * 索引存放目录 */ private static final String indexDir = "E:\\Documents\\lucene-quickstart\\"; private Searcher searcher; @Before public void init() { this.searcher = new Searcher(); } /** * 按文件名搜索文件 */ @Test public void testSearchWithFileName() throws Exception { // 搜索文件名为jdk-8u60-windows-x64.exe Query query = new TermQuery(new Term("filename", "jdk-8u60-windows-x64.exe")); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertEquals(1, documents.size()); } /** * 按文件类型搜索文件 */ @Test public void testSearchWithFileType() throws Exception { // 搜索文件类型为exe的文件 Query query = new TermQuery(new Term("type", "exe")); int n = Integer.MAX_VALUE; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); assertTrue(documents.size() > 0); } /** * 按文件类型搜索文件 */ @Test public void testSearchWithFileType2() throws Exception { // 搜索文件类型为exe的文件 Query query = new TermQuery(new Term("type", "txt")); int n = Integer.MAX_VALUE; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document.get("pathname")); } assertTrue(documents.size() > 0); } /** * 按文件大小搜索文件 */ @Test public void 
testSearchWithFileSize() throws Exception { // 搜索文件大小为195,200,088字节的文件(jdk-8u60-windows-x64.exe) long size = 195_200_088L; Query query = NumericRangeQuery.newLongRange("size", size, size, true, true); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertEquals(1, documents.size()); } /** * 按文件大小搜索文件 */ @Test public void testSearchWithFileSize2() throws Exception { // 搜索文件大小在1024~2048字节之间的文件 Long min = 1024L; Long max = 2048L; Query query = NumericRangeQuery.newLongRange("size", min, max, true, true); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertTrue(documents.size() > 0); } /** * 按文件大小搜索文件 */ @Test public void testSearchWithFileSize3() throws Exception { // 搜索文件大小小于1024字节的文件 Long min = null; Long max = 1024L; Query query = NumericRangeQuery.newLongRange("size", min, max, true, true); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertTrue(documents.size() > 0); } /** * 按文件大小搜索文件 */ @Test public void testSearchWithFileSize4() throws Exception { // 搜索文件大小大于1024 * 1024 * 1024字节的文件 Long min = 1024 * 1024 * 1024L; Long max = null; Query query = NumericRangeQuery.newLongRange("size", min, max, true, true); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertTrue(documents.size() > 0); } /** * 按文件修改日期搜索文件 */ @Test public void testSearchWithModifiedTime() throws Exception { // 搜索最近一周修改的文件 Long max = new Date().getTime(); Long 
min = max - 7 * 24 * 3600 * 1000L; Query query = NumericRangeQuery.newLongRange("lastmodified", min, max, true, true); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertTrue(documents.size() > 0); } /** * 按文件内容搜索文件 */ @Test public void testSearchWithContent() throws Exception { // 搜索内容中包含success的文件 Query query = new TermQuery(new Term("content", "success")); int n = Integer.MAX_VALUE; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document.get("pathname")); } assertTrue(documents.size() > 0); } /** * 按文件名和文件大小搜索文件 */ @Test public void testSearchWithFileNameAndFileSize() throws Exception { BooleanQuery query = new BooleanQuery(); query.add(new TermQuery(new Term("filename", "jdk-8u60-windows-x64.exe")), Occur.MUST); // 搜索文件大小为195,200,088字节的文件(jdk-8u60-windows-x64.exe) long size = 195_200_088L; query.add(NumericRangeQuery.newLongRange("size", size, size, true, true), Occur.MUST); int n = 10; List<Document> documents = this.searcher.search(indexDir, query, n); System.out.println(documents.size() + " documents hit."); for (Document document : documents) { System.out.println(document); } assertEquals(1, documents.size()); } }
需要说明的是:
- NumericRangeQuery类通过静态工厂方法(如newLongRange)创建实例。工厂方法的参数含义分别为:字段名、最小值、最大值、是否包含最小值、是否包含最大值。后两个boolean类型的参数表示对应端点是闭区间(true)还是开区间(false)。当最小值或最大值为null时,表示该方向不设边界:最小值为null相当于负无穷,最大值为null相当于正无穷。
- BooleanQuery可以添加任意多个Query,而这些Query之间的逻辑关系(与、或、非),通过Occur.MUST、Occur.SHOULD、Occur.MUST_NOT表示。
本文的代码可以从 https://git.oschina.net/coding4j/lucene-quickstart 获得。