Use Apache Lucene for MS Word full-text search

With Apache Lucene you can run full-text keyword searches over plain-text files. With the help of additional libraries (such as Apache POI) you can also build a search index over the contents of PDF, Word, Excel and similar files, and run full-text retrieval on them.

The code below builds an index, runs a full-text search and highlights the hits for both Word formats (.doc and .docx). It does not compare the efficiency of the two formats (i.e. how fast each kind of file is read).

Version:

Lucene:7.2.1

POI: 3.17

 

/**
 * Demo of full-text indexing, searching and hit highlighting for MS Word files
 * (.doc and .docx) with Apache Lucene; the Word text is extracted with Apache POI.
 *
 * <p>Every indexed document carries four fields:
 * <ul>
 *   <li>{@code path} - full path, indexed as one exact term and stored</li>
 *   <li>{@code title} - file name, analyzed and stored</li>
 *   <li>{@code lastmodify} - modification time in milliseconds, indexed as a point and stored</li>
 *   <li>{@code contents} - extracted text, analyzed and stored (needed for highlighting)</li>
 * </ul>
 *
 * <p>NOTE: an index written by an earlier revision of this class (where {@code path} was an
 * analyzed text field) should be rebuilt with {@code createIndex(true, ...)}.
 */
public class LuceneTest1 {

	/** File suffixes (lower case, including the dot) that are indexed; add ".txt" for plain-text files. */
	public static String[] suf = new String[]{".doc",".docx"};

	/** List view over {@link #suf}, used for the suffix lookup in {@link #indexDoc}. */
	public static List<String> lst = Arrays.asList(suf);

	/**
	 * Runs a sample search against an existing index.
	 *
	 * <p>The index has to be built first, for example with
	 * {@code createIndex(true, docsDirectory, indexDirectory)}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		//createIndex(true,  "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_msword" );
		search("Equivalent connection","D:\\ex_java\\lucene_test\\index_worklog_msword");
	}

	/**
	 * Searches the {@code contents} field of the index and prints, for each of the best
	 * 50 hits, its score, path, modification time and a highlighted summary fragment.
	 *
	 * <p>Any failure (index cannot be opened, query cannot be parsed, ...) is reported on
	 * {@code System.err}; nothing is thrown to the caller.
	 *
	 * @param qwords   query text, parsed with Lucene's classic query parser syntax
	 * @param indexdir directory that holds the index
	 */
	public static void search(String qwords, String indexdir ) {

		System.out.println("[Search word]: ["+qwords+"]");

		// try-with-resources: directory, reader and analyzer are released on every exit path.
		try (Directory dir = FSDirectory.open(Paths.get(indexdir));
				IndexReader reader = DirectoryReader.open(dir);
				SmartChineseAnalyzer anal = new SmartChineseAnalyzer()) {

			IndexSearcher searcher = new IndexSearcher(reader);
			QueryParser parser = new QueryParser("contents", anal);
			Query qr = parser.parse(qwords);
			TopDocs tps = searcher.search(qr, 50);

			// Highlighter: wraps matched terms in red bold HTML, fragments chosen by query score.
			SimpleHTMLFormatter shf = new SimpleHTMLFormatter("<b><font color=\"red\">","</font></b>");
			QueryScorer scorer = new QueryScorer(qr);
			Highlighter hlt = new Highlighter(shf, scorer);
			hlt.setTextFragmenter(new SimpleSpanFragmenter(scorer));

			for (ScoreDoc sdoc : tps.scoreDocs) {
				Document doc = searcher.doc(sdoc.doc);
				System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify"));

				// The summary is built from the stored contents; a document without stored
				// contents simply gets a null summary instead of aborting the whole result list.
				String text = doc.get("contents");
				String summary = (text == null) ? null : hlt.getBestFragment(anal, "contents", text);

				System.out.println();
				System.out.println("[summary start]---------------------------------------- ---------------");
				System.out.println(summary);
				System.out.println("[End of summary]-------------------------------------- ---------------");
				System.out.println();
			}

		} catch (Exception e) {
			System.err.println("Directory|Parse wrong. "+ e.toString());
		}

	}

	/**
	 * Creates a new index or updates an existing one from a file or directory tree.
	 *
	 * <p>Terminates the JVM with exit status 1 when {@code docspath} is not readable.
	 * Other failures are reported on {@code System.err}; the elapsed time is printed in any case.
	 *
	 * @param create    {@code true} to replace any existing index, {@code false} to create
	 *                  the index if missing and otherwise update it
	 * @param docspath  file or directory to index
	 * @param indexpath directory that holds the index
	 */
	public static void createIndex(boolean create, String docspath, String indexpath) {

		Path docspt = Paths.get(docspath);
		if(!Files.isReadable(docspt)) {
			System.err.println("Docs Path not readable: "+ docspt);
			System.exit(1);
		}
		long stime = System.currentTimeMillis();
		System.out.println("Begin Index ......");

		try (Directory dir = FSDirectory.open(Paths.get(indexpath));
				Analyzer anal = new SmartChineseAnalyzer()) {

			IndexWriterConfig iwc = new IndexWriterConfig(anal);
			iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

			// The writer is closed even when indexing fails part-way, so it is never left open.
			try (IndexWriter iwrt = new IndexWriter(dir, iwc)) {
				indexDocs(iwrt, docspt);
			}

		} catch (Exception e) {
			System.err.println(e.toString());
		}

		long etime = System.currentTimeMillis();
		System.out.println("End Index, total time spend: " + (etime-stime)/1000 + " seconds.");

	}

	/**
	 * Indexes a single file, or every file below a directory.
	 *
	 * <p>While walking a directory, a file that cannot be read or indexed is reported on
	 * {@code System.err} and skipped so that the remaining files are still processed.
	 *
	 * @param writer index writer to add the documents to
	 * @param pth    file or directory to index
	 * @throws Exception if a single file passed as {@code pth} cannot be indexed, or the
	 *                   directory walk itself fails
	 */
	public static void indexDocs(final IndexWriter writer, Path pth) throws Exception {
		if(!Files.isDirectory(pth)) {
			indexDoc(writer, pth, Files.getLastModifiedTime(pth).toMillis());
			return;
		}

		Files.walkFileTree(pth, new SimpleFileVisitor<Path>(){
			@Override
			public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
				try {
					indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
				} catch (Exception e) {
					// Best effort: one broken document must not stop the walk, but say so.
					System.err.println("Skipping " + file + ": " + e);
				}
				return FileVisitResult.CONTINUE;
			}

			@Override
			public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
				// The default implementation rethrows and would abort the whole walk.
				System.err.println("Skipping unreadable " + file + ": " + exc);
				return FileVisitResult.CONTINUE;
			}
		});

	}

	/**
	 * Indexes one file if its suffix is listed in {@link #lst}; other files are ignored.
	 *
	 * <p>".doc" and ".docx" files are read through Apache POI. Any other listed suffix is
	 * treated as plain text encoded in gb2312.
	 *
	 * @param writer     index writer; its open mode decides between add and update
	 * @param path       file to index
	 * @param lastmodify modification time of the file in milliseconds
	 * @throws IOException         if the file cannot be read or the index cannot be written
	 * @throws OpenXML4JException  if POI cannot open the Word package
	 * @throws XmlException        if POI cannot parse the Word XML
	 */
	public static void indexDoc(IndexWriter writer, Path path, long lastmodify) throws IOException, OpenXML4JException, XmlException {

		// Take the suffix from the file name only: a dot in a parent directory must not count,
		// and a name without any dot has no suffix at all.
		Path namepart = path.getFileName();
		if(namepart == null) {
			return;
		}
		String fname = namepart.toString();
		int dot = fname.lastIndexOf('.');
		if(dot < 0) {
			return;
		}
		String suffix = fname.substring(dot).toLowerCase(java.util.Locale.ROOT);
		if(!lst.contains(suffix)) {
			return;
		}

		// Extract the text first; stream and extractor are closed as soon as the text is read.
		String text;
		if(".doc".equals(suffix)) {
			try (InputStream in = Files.newInputStream(path);
					WordExtractor wd = (WordExtractor) ExtractorFactory.createExtractor(in)) {
				text = wd.getText();
			}
		}else if(".docx".equals(suffix)){
			try (InputStream in = Files.newInputStream(path);
					XWPFWordExtractor wdx = (XWPFWordExtractor) ExtractorFactory.createExtractor(in)) {
				text = wdx.getText();
			}
		}else {
			text = new String(Files.readAllBytes(path), "gb2312");
		}

		Document doc = new Document();

		// Exact, un-analyzed term: updateDocument() below deletes by this very term.
		doc.add(new org.apache.lucene.document.StringField("path", path.toString(), Store.YES));

		doc.add(new TextField("title", fname, Store.YES));

		// A point field is index-only; the stored twin makes doc.get("lastmodify") work.
		doc.add(new LongPoint("lastmodify",lastmodify));
		doc.add(new org.apache.lucene.document.StoredField("lastmodify", lastmodify));

		doc.add(new TextField("contents", text, Store.YES));

		if(writer.getConfig().getOpenMode() == OpenMode.CREATE) {
			System.out.println("adding doc: " + path);
			writer.addDocument(doc);
		}else {
			System.out.println("updating doc: " + path);
			writer.updateDocument(new Term("path",path.toString()), doc);
		}

	}

}

 

Search results (part):

 

[Search term]: [equijoin]
[8.205898]: F:\worklog\leftjoin_innerjoin_rightjoin.doc, null

[Start of abstract]-------------------------------------------------------- --------
 join(<b><font color="red">equivalent</font></b><b><font color="red">join</font></b>) returns only the rows where the join fields are equal in the two tables. An
example is as follows:
---------------------------------- ----------
Table A records as follows:
aID aNum
[end of abstract]---------------------------- -------------------------

 

 

 

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326243826&siteId=291194637