With Apache Lucene you can run full-text keyword searches over plain-text files. With the help of additional libraries you can also extract the text of PDF, Word, Excel and similar documents, and build a search index over their contents.
The notes below cover indexing, full-text search and result highlighting for the two Word formats (.doc and .docx). No performance comparison (for example, against reading the files directly) is made.
Version:
Lucene:7.2.1
POI: 3.17
public class LuceneTest1 { public static String[] suf = new String[]{".doc",".docx"}; //{".txt"}; // public static List<String> lst = new ArrayList<String>(); static { lst = Arrays.asList(suf); } public static void main(String[] args) { //F:\worklog //createIndex(true, "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_txt" ); //search("字符串","D:\\ex_java\\lucene_test\\index_worklog_txt");// //createIndex(true, "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_msword" ); search("Equivalent connection","D:\\ex_java\\lucene_test\\index_worklog_msword");//doc/docx type must be supported by POI library } /*** * * * The following method is 【Document Search】 * * * */ public static void search(String qwords, String indexdir ) { try { System.out.println("[Search word]: ["+qwords+"]"); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexdir))); IndexSearcher searcher = new IndexSearcher(reader); SmartChineseAnalyzer anal = new SmartChineseAnalyzer(); QueryParser parser = new QueryParser("contents",anal);//lastmodify,contents,path,title Query qr = parser.parse(qwords); TopDocs tps = searcher.search(qr, 50); /**Highlights*/ SimpleHTMLFormatter shf = new SimpleHTMLFormatter("<b><font color=\"red\">","</font></b>"); QueryScorer scorer = new QueryScorer(qr); Fragmenter frgm = new SimpleSpanFragmenter(scorer);//Calculate a fragment based on the score Highlighter hlt = new Highlighter (shf, scorer); hlt.setTextFragmenter(frgm); /**Highlights*/ for(ScoreDoc sdoc: tps.scoreDocs) { Document doc = searcher.doc(sdoc.doc); //System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify")+", "+ doc.get("contents")+", "+ doc.get("title")); System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify")); //Summary highlight fragment (saved Field) TokenStream tsm = anal.tokenStream("contents", new StringReader(doc.get("contents")));////////////////////// String summary = hlt.getBestFragment(tsm, doc.get("contents")); 
tsm.close(); System.out.println(); System.out.println("[summary start]---------------------------------------- ---------------"); System.out.println(summary); System.out.println("[End of summary]-------------------------------------- ---------------"); System.out.println(); } reader.close(); } catch (Exception e) { System.err.println("Directory|Parse wrong. "+ e.toString()); } } /*** * * * The following method is 【Create Index】 * * * * */ //create or update index public static void createIndex(boolean create, String docspath, String indexpath) { Path docspt = Paths.get(docspath); // if(!Files.isReadable(docspt)) { System.err.println("Docs Path not readable: "+ docspt); System.exit(1); } long stime = System.currentTimeMillis(); System.out.println("Begin Index ......"); try { Directory dir = FSDirectory.open(Paths.get(indexpath)); //FSDirectory Analyzer anal = new SmartChineseAnalyzer(); //SmartChineseAnalyzer IndexWriterConfig iwc = new IndexWriterConfig(anal); //IndexWriterConfig if(create) { iwc.setOpenMode(OpenMode.CREATE); }else { iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } IndexWriter iwrt = new IndexWriter(dir,iwc); //IndexWriter indexDocs(iwrt, docspt); iwrt.close(); } catch (Exception e) { System.err.println(e.toString()); } long etime = System.currentTimeMillis(); System.out.println("End Index, total time spend: " + (etime-stime)/1000 + " seconds."); } //Create an index for the files in the folder public static void indexDocs(final IndexWriter writer, Path pth) throws Exception { if(Files.isDirectory(pth)) {//This can also be implemented recursively Files.walkFileTree(pth, new SimpleFileVisitor<Path>(){ @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { try { indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); } catch (Exception e) { } return FileVisitResult.CONTINUE; } }); }else { indexDoc(writer, pth, Files.getLastModifiedTime(pth).toMillis()); } } //create index public static void 
indexDoc(IndexWriter writer, Path path, long lastmodify) throws IOException, OpenXML4JException, XmlException { String suffix = path.toString().substring(path.toString().lastIndexOf(".")).toLowerCase(); if(!lst.contains(suffix)) { return; } InputStream in = Files.newInputStream(path); Document doc = new Document(); Field pathfield = new TextField("path", path.toString(), Store.YES); doc.add(pathfield); doc.add(new TextField("title", path.getFileName().toString(), Store.YES)); doc.add(new LongPoint("lastmodify",lastmodify)); if(".doc".equals(suffix)) { //WordExtractor wd = new WordExtractor(in); WordExtractor wd = (WordExtractor) ExtractorFactory.createExtractor(in); doc.add(new TextField("contents",wd.getText(),Store.YES)); //wd.close(); }else if(".docx".equals(suffix)){ XWPFWordExtractor wdx = (XWPFWordExtractor) ExtractorFactory.createExtractor(in); doc.add(new TextField("contents",wdx.getText(),Store.YES)); //wdx.close(); }else { //doc.add(new TextField("contents", )); File tmpfile = Paths.get(path.toUri()).toFile(); Long len = tmpfile.length(); FileInputStream fin = new FileInputStream(tmpfile); byte [] buf = new byte [len.intValue ()]; fin.read(buf); String text = new String(buf,"gb2312"); fin.close(); doc.add(new TextField("contents", text, Store.YES)); } if(writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding doc: " + path); writer.addDocument(doc); }else { System.out.println("updating doc: " + path); writer.updateDocument(new Term("path",path.toString()), doc); } } }
Search results (part):
[Search term]: [equijoin]
[8.205898]: F:\worklog\leftjoin_innerjoin_rightjoin.doc, null
[Start of abstract]-------------------------------------------------------- --------
join(<b><font color="red">equivalent</font></b><b><font color="red">join</font></b>) returns only the rows where the join fields are equal in the two tables. An
example is as follows:
---------------------------------- ----------
Table A records as follows:
aID aNum
[end of abstract]---------------------------- -------------------------