使用Apache Lucene作MSWord全文检索

使用Apache Lucene可以对文本文件作全文关键字检索，加入其它库的支持可以对pdf、word、excel等作全文内容检索，建立检索索引。

以下记录对Word的两种格式（.doc与.docx）建立索引、作全文检索并高亮显示命中片段的过程，但未与直接读取文件的方式作效率对比。

版本：

Lucene：7.2.1

POI：3.17

/**
 * Demo: build a Lucene full-text index over MS Word files (.doc / .docx) and
 * search it with highlighted summaries. Word text is extracted with Apache POI;
 * Chinese text is tokenized with {@code SmartChineseAnalyzer}.
 *
 * <p>Index schema written by {@link #indexDoc}:
 * <ul>
 *   <li>{@code path} - full file path, stored, indexed as a single untokenized term</li>
 *   <li>{@code title} - file name, stored, tokenized</li>
 *   <li>{@code lastmodify} - last-modified time in millis, indexed as a point and stored</li>
 *   <li>{@code contents} - extracted text, stored, tokenized</li>
 * </ul>
 */
public class LuceneTest1 {
	
	/** File suffixes (lower case, including the dot) that will be indexed. */
	public static String[] suf = new String[]{".doc",".docx"}; //{".txt"}; //
	
	/** List view of {@link #suf}, used for the suffix lookup in {@link #indexDoc}. */
	public static List<String> lst = Arrays.asList(suf);

	/**
	 * Entry point. The commented-out calls are usage examples for building the
	 * index; the active call searches an index that was built beforehand.
	 */
	public static void main(String[] args) {
		
		//F:\worklog
		//createIndex(true,  "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_txt" );
		//search("字符串","D:\\ex_java\\lucene_test\\index_worklog_txt");//
		//createIndex(true,  "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_msword" );
		search("等值连接","D:\\ex_java\\lucene_test\\index_worklog_msword");// doc/docx indexing requires the POI library
	}

	/*
	 * ---------------------------------------------------------------
	 * Document search
	 * ---------------------------------------------------------------
	 */
	
	/**
	 * Searches the {@code contents} field of the index and prints, for each of
	 * the top 50 hits, its score, path, last-modified value and a highlighted
	 * best-matching fragment.
	 *
	 * <p>Any failure (index cannot be opened, query cannot be parsed, ...) is
	 * reported on {@code System.err}; nothing is thrown to the caller.
	 *
	 * @param qwords   query text, parsed with the classic {@code QueryParser}
	 * @param indexdir directory holding the Lucene index
	 */
	public static void search(String qwords, String indexdir ) {
		
		System.out.println("[搜索词]：【"+qwords+"】");
		
		// try-with-resources: directory, reader and analyzer are released even
		// when parsing, searching or highlighting throws.
		try (Directory dir = FSDirectory.open(Paths.get(indexdir));
				IndexReader reader = DirectoryReader.open(dir);
				SmartChineseAnalyzer anal = new SmartChineseAnalyzer()) {
			
			IndexSearcher searcher = new IndexSearcher(reader);
			
			QueryParser parser = new QueryParser("contents",anal);// available fields: lastmodify, contents, path, title
			Query qr = parser.parse(qwords);
			TopDocs tps = searcher.search(qr, 50);
			
			// Highlighter setup: matched terms are wrapped in bold red markup and
			// the fragment boundaries are chosen from the query's span scores.
			SimpleHTMLFormatter shf = new SimpleHTMLFormatter("<b><font color=\"red\">","</font></b>");
			QueryScorer scorer = new QueryScorer(qr);
			Fragmenter frgm = new SimpleSpanFragmenter(scorer);
			Highlighter hlt = new Highlighter(shf, scorer);
			hlt.setTextFragmenter(frgm);
			
			for(ScoreDoc sdoc:  tps.scoreDocs) {
				Document doc = searcher.doc(sdoc.doc);
				System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify"));
				
				// Build the summary from the stored contents. A hit without stored
				// contents gets a null summary instead of aborting the whole loop.
				String contents = doc.get("contents");
				String summary = null;
				if (contents != null) {
					try (TokenStream tsm = anal.tokenStream("contents", new StringReader(contents))) {
						summary = hlt.getBestFragment(tsm, contents);
					}
				}
				
				System.out.println();
				System.out.println("[摘要开始]------------------------------------------------------");
				System.out.println(summary);
				System.out.println("[摘要结束]------------------------------------------------------");
				System.out.println();
			}
			
		} catch (Exception e) {
			System.err.println("Directory|Parse wrong. "+ e.toString());
		}	
		
	}
	
	
	/*
	 * ---------------------------------------------------------------
	 * Index building
	 * ---------------------------------------------------------------
	 */
	
	/**
	 * Creates or updates the index for a file or directory tree.
	 *
	 * <p>Exits the JVM with status 1 when {@code docspath} is not readable.
	 * Other failures are reported on {@code System.err}. The elapsed time is
	 * always printed.
	 *
	 * @param create    {@code true} to rebuild the index from scratch,
	 *                  {@code false} to create it if missing or update it otherwise
	 * @param docspath  file or directory to index
	 * @param indexpath directory in which the index is stored
	 */
	public static void createIndex(boolean create, String docspath, String indexpath) {
		
		Path docspt = Paths.get(docspath);
		if(!Files.isReadable(docspt)) {
			System.err.println("Docs Path not readable: "+ docspt);
			System.exit(1);
		}
		long stime = System.currentTimeMillis();
		System.out.println("Begin Index ......");
		
		try (Directory dir = FSDirectory.open(Paths.get(indexpath));
				Analyzer anal = new SmartChineseAnalyzer()) {
			
			IndexWriterConfig iwc = new IndexWriterConfig(anal);
			iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
			
			// The writer holds the index write lock; closing it in all cases
			// commits pending documents and releases the lock even on failure.
			try (IndexWriter iwrt = new IndexWriter(dir,iwc)) {
				indexDocs(iwrt, docspt);
			}
			
		} catch (Exception e) {
			System.err.println(e.toString());
		}
				
		long etime = System.currentTimeMillis();
		System.out.println("End Index, total time spend: " + (etime-stime)/1000 + " seconds.");
		
	}
	
	/**
	 * Indexes a single file, or every file below a directory.
	 *
	 * <p>While walking a directory, a file that fails to index is reported on
	 * {@code System.err} and skipped so that the remaining files are still
	 * processed. For a single file the failure is propagated.
	 *
	 * @param writer open index writer
	 * @param pth    file or directory to index
	 * @throws Exception if the directory walk fails or the single file cannot be indexed
	 */
	public static void indexDocs(final IndexWriter writer, Path pth) throws Exception {
		if(!Files.isDirectory(pth)) {
			indexDoc(writer, pth, Files.getLastModifiedTime(pth).toMillis());
			return;
		}
		
		Files.walkFileTree(pth, new SimpleFileVisitor<Path>(){
			@Override
			public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
				try {
					indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
				} catch (Exception e) {
					// Best effort: one unreadable or corrupt document must not
					// stop the walk, but the failure should be visible.
					System.err.println("Skip file, index failed: " + file + ", " + e);
				}					
				return FileVisitResult.CONTINUE;
			}
		});			
		
	}
	
	/**
	 * Indexes one file if its suffix is listed in {@link #lst}; other files,
	 * including files without any suffix, are silently ignored.
	 *
	 * <p>When the writer was opened in {@code CREATE} mode the document is
	 * added; otherwise any previously indexed document with the same
	 * {@code path} term is replaced.
	 *
	 * @param writer     open index writer
	 * @param path       file to index
	 * @param lastmodify last-modified time of the file in milliseconds
	 * @throws IOException         if the file cannot be read or the index cannot be written
	 * @throws OpenXML4JException  if POI cannot open the document package
	 * @throws XmlException        if POI cannot parse the document XML
	 */
	public static void indexDoc(IndexWriter writer, Path path, long lastmodify) throws IOException, OpenXML4JException, XmlException {
		
		// Take the suffix from the file name only, and tolerate names with no dot
		// (substring(-1) would otherwise throw).
		Path fname = path.getFileName();
		String title = fname == null ? "" : fname.toString();
		int dot = title.lastIndexOf('.');
		if(dot < 0) {
			return;
		}
		String suffix = title.substring(dot).toLowerCase();
		if(!lst.contains(suffix)) {
			return;
		}
		
		Document doc = new Document();
		
		// path is the document key: it must be a single untokenized term so that
		// updateDocument(new Term("path", ...)) below can find the old version.
		doc.add(new org.apache.lucene.document.StringField("path", path.toString(), Store.YES));
		doc.add(new TextField("title", title,  Store.YES));
		
		// LongPoint makes the value searchable (range/exact queries) but is not
		// stored; the StoredField lets search results print it via doc.get().
		doc.add(new LongPoint("lastmodify",lastmodify));		
		doc.add(new org.apache.lucene.document.StoredField("lastmodify",lastmodify));
		
		doc.add(new TextField("contents", extractText(path, suffix), Store.YES));
		
		if(writer.getConfig().getOpenMode() == OpenMode.CREATE) {
			System.out.println("adding doc: " + path);
			writer.addDocument(doc);
		}else {
			System.out.println("updating doc: " + path);
			writer.updateDocument(new Term("path",path.toString()), doc);
		}
				
	}
	
	/**
	 * Reads the text of a file: via POI for .doc / .docx, otherwise as plain
	 * text decoded with the gb2312 charset. The stream and extractor are closed
	 * before returning.
	 */
	private static String extractText(Path path, String suffix) throws IOException, OpenXML4JException, XmlException {
		if(".doc".equals(suffix)) {
			try (InputStream in = Files.newInputStream(path);
					WordExtractor wd = (WordExtractor) ExtractorFactory.createExtractor(in)) {
				return wd.getText();
			}
		}
		if(".docx".equals(suffix)) {
			try (InputStream in = Files.newInputStream(path);
					XWPFWordExtractor wdx = (XWPFWordExtractor) ExtractorFactory.createExtractor(in)) {
				return wdx.getText();
			}
		}
		// Plain text: read the whole file (a single InputStream.read call may
		// return fewer bytes than requested).
		return new String(Files.readAllBytes(path), "gb2312");
	}
	
}

检索结果（部分）：

[搜索词]：【等值连接】
[8.205898]: F:\worklog\leftjoin_innerjoin_rightjoin.doc, null

[摘要开始]------------------------------------------------------
join(等值连接) 只返回两个表中联结字段相等的行
举例如下：
--------------------------------------------
表A记录如下：
aID　　　　　aNum
[摘要结束]------------------------------------------------------

扫描二维码关注公众号，回复： 224910 查看本文章

使用apache Lucene作MSWord全文检索

猜你喜欢