package com.test.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Minimal Lucene 3.3 demo: indexes every {@code .txt} file under
 * {@code D://luceneTest} into {@code D://luceneIndex}, then runs a
 * query against the {@code fileName} field and prints the hits.
 */
public class HelloLucene {

    /**
     * Builds a fresh full-text index over all {@code .txt} files under
     * {@code D://luceneTest}, writing index files to {@code D://luceneIndex}.
     *
     * @throws IOException if the index directory or data files cannot be read or written
     */
    @SuppressWarnings("deprecation")
    public static void createIndex() throws IOException {
        // Where the index files are stored.
        Directory indexDir = new SimpleFSDirectory(new File("D://luceneIndex"));
        // Root directory of the documents to be indexed.
        File dataDir = new File("D://luceneTest");
        // Analyzer tokenizes document content before it is indexed.
        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_33);
        // IndexWriter args: index location, analyzer,
        // create=true (build a new index instead of appending to an existing one),
        // and the maximum-field-length policy (LIMITED vs UNLIMITED).
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true,
                IndexWriter.MaxFieldLength.LIMITED);
        long startTime = new Date().getTime();
        // Recursively index every .txt file under dataDir.
        indexDirectory(indexWriter, dataDir);
        indexWriter.optimize();
        // BUG FIX: numDocs() must be read BEFORE close() — calling it on a
        // closed writer throws AlreadyClosedException.
        int numIndexed = indexWriter.numDocs();
        indexWriter.close();
        long endTime = new Date().getTime();
        System.out.println("索引数: " + numIndexed);
        System.out.println("用时" + (endTime - startTime) + "毫秒来创建文件中的索引." + dataDir.getPath());
    }

    /**
     * Recursively walks {@code dataDir} and adds one Document per {@code .txt} file,
     * with a stored, non-analyzed "path" field, a stored, analyzed "fileName" field,
     * and a tokenized (unstored) "contents" field read from the file.
     *
     * @param indexWriter writer the documents are added to
     * @param dataDir     directory (or subdirectory) to walk
     * @throws IOException if a file cannot be read
     */
    public static void indexDirectory(IndexWriter indexWriter, File dataDir) throws IOException {
        File[] dataFiles = dataDir.listFiles();
        // BUG FIX: listFiles() returns null for non-directories or on I/O error;
        // the original dereferenced it unconditionally (NPE risk).
        if (dataFiles == null) {
            return;
        }
        for (int i = 0; i < dataFiles.length; i++) {
            if (dataFiles[i].isDirectory()) {
                // Recurse into subdirectories.
                indexDirectory(indexWriter, dataFiles[i]);
            } else if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")) {
                System.out.println("files: " + dataFiles[i].getCanonicalPath());
                Document document = new Document();
                Reader txtReader = new FileReader(dataFiles[i]);
                try {
                    // Field.Store.YES = keep the value in the index;
                    // Index.NOT_ANALYZED = index the path as a single token.
                    document.add(new Field("path", dataFiles[i].getCanonicalPath(),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));
                    document.add(new Field("fileName", dataFiles[i].getName(),
                            Field.Store.YES, Field.Index.ANALYZED));
                    // Reader-based constructor: contents are tokenized but not stored.
                    document.add(new Field("contents", txtReader));
                    // addDocument consumes the reader while inverting the document.
                    indexWriter.addDocument(document);
                } finally {
                    // BUG FIX: the FileReader was never closed (resource leak).
                    txtReader.close();
                }
            }
        }
    }

    /**
     * Opens the index at {@code D:\luceneIndex} read-only, searches the
     * {@code fileName} field for "令狐冲", and prints the top hits.
     *
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    @SuppressWarnings("deprecation")
    public static void searchIndex() throws IOException, ParseException {
        // BUG FIX: documents are indexed with a "fileName" field (see
        // indexDirectory), but the original query targeted "filename".
        // Lucene field names are case-sensitive, so it could never match.
        String queryStr = "fileName:令狐冲";
        // Same analyzer family as at index time, so query terms tokenize consistently.
        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_33);
        File indexDir = new File("D:\\luceneIndex");
        FSDirectory directory = FSDirectory.open(indexDir);
        // true = open the searcher read-only.
        IndexSearcher indexSearcher = new IndexSearcher(directory, true);
        QueryParser parserFilename = new QueryParser(Version.LUCENE_33, "fileName", luceneAnalyzer);
        Query query = parserFilename.parse(queryStr);
        // Collect up to 50 top-scoring hits; docsScoredInOrder=false.
        TopScoreDocCollector collector = TopScoreDocCollector.create(50, false);
        long start = new Date().getTime();
        indexSearcher.search(query, collector);
        // The collector holds TopDocs whose scoreDocs[] carry the matching doc ids.
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        System.out.println("共有: " + hits.length + "条记录...");
        for (int i = 0; i < hits.length; i++) {
            // Rehydrate the stored Document for each hit and print its fileName field.
            Document doc = indexSearcher.doc(hits[i].doc);
            System.out.println(doc.getField("fileName") + "------------" + hits[i].toString());
        }
        indexSearcher.close();
        long end = new Date().getTime();
        System.out.println("用时" + (end - start) + "毫秒来搜索文件中的索引.");
    }

    /**
     * Builds the index, then runs the demo search against it.
     */
    public static void main(String[] args) throws ParseException {
        try {
            createIndex();
            System.out.println("--------------------------------");
            searchIndex();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
// Article: 浅尝Lucene (A first taste of Lucene)
// Reposted from: dbh0512.iteye.com/blog/1852739