使用Apache Lucene可以对文本文件作全文关键字检索,加入其它库的支持可以对pdf、word、excel等作全文内容检索,建立检索索引。
以下记录对Word的两种格式(.doc、.docx)建立索引、进行全文检索并高亮显示匹配片段的过程,但未与直接读取文件的方式作效率对比。
版本:
Lucene:7.2.1
POI:3.17
public class LuceneTest1 { public static String[] suf = new String[]{".doc",".docx"}; //{".txt"}; // public static List<String> lst = new ArrayList<String>(); static { lst = Arrays.asList(suf); } public static void main(String[] args) { //F:\worklog //createIndex(true, "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_txt" ); //search("字符串","D:\\ex_java\\lucene_test\\index_worklog_txt");// //createIndex(true, "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_msword" ); search("等值连接","D:\\ex_java\\lucene_test\\index_worklog_msword");//doc/docx类型必需加入POI库支持 } /*** * * * 以下方法是【文档检索】 * * * */ public static void search(String qwords, String indexdir ) { try { System.out.println("[搜索词]:【"+qwords+"】"); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexdir))); IndexSearcher searcher = new IndexSearcher(reader); SmartChineseAnalyzer anal = new SmartChineseAnalyzer(); QueryParser parser = new QueryParser("contents",anal);//lastmodify,contents,path,title Query qr = parser.parse(qwords); TopDocs tps = searcher.search(qr, 50); /**高亮部分*/ SimpleHTMLFormatter shf = new SimpleHTMLFormatter("<b><font color=\"red\">","</font></b>"); QueryScorer scorer = new QueryScorer(qr); Fragmenter frgm = new SimpleSpanFragmenter(scorer);//根据得分计算出一个片段 Highlighter hlt = new Highlighter(shf, scorer); hlt.setTextFragmenter(frgm); /**高亮部分*/ for(ScoreDoc sdoc: tps.scoreDocs) { Document doc = searcher.doc(sdoc.doc); //System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify")+", "+ doc.get("contents")+", "+ doc.get("title")); System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify")); //摘要高亮片段(已保存的Field) TokenStream tsm = anal.tokenStream("contents", new StringReader(doc.get("contents")));////////////////////// String summary = hlt.getBestFragment(tsm, doc.get("contents")); tsm.close(); System.out.println(); System.out.println("[摘要开始]------------------------------------------------------"); 
System.out.println(summary); System.out.println("[摘要结束]------------------------------------------------------"); System.out.println(); } reader.close(); } catch (Exception e) { System.err.println("Directory|Parse wrong. "+ e.toString()); } } /*** * * * 以下方法是【建立索引】 * * * * */ //创建或更新索引 public static void createIndex(boolean create, String docspath, String indexpath) { Path docspt = Paths.get(docspath); // if(!Files.isReadable(docspt)) { System.err.println("Docs Path not readable: "+ docspt); System.exit(1); } long stime = System.currentTimeMillis(); System.out.println("Begin Index ......"); try { Directory dir = FSDirectory.open(Paths.get(indexpath)); //FSDirectory Analyzer anal = new SmartChineseAnalyzer(); //SmartChineseAnalyzer IndexWriterConfig iwc = new IndexWriterConfig(anal); //IndexWriterConfig if(create) { iwc.setOpenMode(OpenMode.CREATE); }else { iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } IndexWriter iwrt = new IndexWriter(dir,iwc); //IndexWriter indexDocs(iwrt, docspt); iwrt.close(); } catch (Exception e) { System.err.println(e.toString()); } long etime = System.currentTimeMillis(); System.out.println("End Index, total time spend: " + (etime-stime)/1000 + " seconds."); } //为文件夹内文件建立索引 public static void indexDocs(final IndexWriter writer, Path pth) throws Exception { if(Files.isDirectory(pth)) {//此处也可以递归实现 Files.walkFileTree(pth, new SimpleFileVisitor<Path>(){ @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { try { indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); } catch (Exception e) { } return FileVisitResult.CONTINUE; } }); }else { indexDoc(writer, pth, Files.getLastModifiedTime(pth).toMillis()); } } //建立索引 public static void indexDoc(IndexWriter writer, Path path, long lastmodify) throws IOException, OpenXML4JException, XmlException { String suffix = path.toString().substring(path.toString().lastIndexOf(".")).toLowerCase(); if(!lst.contains(suffix)) { return; } InputStream in = 
Files.newInputStream(path); Document doc = new Document(); Field pathfield = new TextField("path", path.toString(), Store.YES); doc.add(pathfield); doc.add(new TextField("title", path.getFileName().toString(), Store.YES)); doc.add(new LongPoint("lastmodify",lastmodify)); if(".doc".equals(suffix)) { //WordExtractor wd = new WordExtractor(in); WordExtractor wd = (WordExtractor) ExtractorFactory.createExtractor(in); doc.add(new TextField("contents",wd.getText(),Store.YES)); //wd.close(); }else if(".docx".equals(suffix)){ XWPFWordExtractor wdx = (XWPFWordExtractor) ExtractorFactory.createExtractor(in); doc.add(new TextField("contents",wdx.getText(),Store.YES)); //wdx.close(); }else { //doc.add(new TextField("contents", )); File tmpfile = Paths.get(path.toUri()).toFile(); Long len = tmpfile.length(); FileInputStream fin = new FileInputStream(tmpfile); byte[] buf = new byte[len.intValue()]; fin.read(buf); String text = new String(buf,"gb2312"); fin.close(); doc.add(new TextField("contents", text, Store.YES)); } if(writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding doc: " + path); writer.addDocument(doc); }else { System.out.println("updating doc: " + path); writer.updateDocument(new Term("path",path.toString()), doc); } } }
检索结果(部分):
[搜索词]:【等值连接】
[8.205898]: F:\worklog\leftjoin_innerjoin_rightjoin.doc, null
[摘要开始]------------------------------------------------------
join(<b><font color="red">等值</font></b><b><font color="red">连接</font></b>) 只返回两个表中联结字段相等的行
举例如下:
--------------------------------------------
表A记录如下:
aID aNum
[摘要结束]------------------------------------------------------
扫描二维码关注公众号,回复:
224910 查看本文章