还用lucene架了个搜索引擎,对pdf进行全文搜索(联合pdfbox)。
核心类是一个Agent,使用开源的庖丁中文分词器
代码:
QUOTE:
package gov.jsgs.ssgs.service;
import gov.jsgs.ssgs.form.PdfForm;
import gov.jsgs.ssgs.model.Ssgs_pdfModel;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.StaleReaderException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import common.Logger;
/**
 * Search-engine agent that wraps a Lucene index of PDF documents.
 * <p>
 * The index is stored in the directory returned by {@link #getIndexDir()}
 * (default <code>c:/pdf_index</code>). Call {@link #setIndexDir(String)}
 * <i>before</i> {@link #init()} to use another location; changing it after
 * init() has no effect on the already opened index.
 * <p>
 * <b>Note: must be run as a singleton. Call init() to initialise and
 * destroy() to release resources.</b>
 *
 * @author tedeyang
 *
 */
public class LuceneAgent {
    private static Logger log = Logger.getLogger(LuceneAgent.class);
    /** Guards the shared query parser and the reader/searcher pair. */
    static Object lock = new Object();
    Analyzer analyzer = null;
    /** The index directory. Despite its name it is obtained from FSDirectory. */
    Directory ramDir = null;
    IndexWriter writer = null;
    String indexDir = "c:/pdf_index";
    IndexReader reader;
    QueryParser parser;
    Searcher searcher;

    /**
     * Opens (or creates) the index and prepares the writer, reader, query
     * parser and searcher.
     *
     * @throws CorruptIndexException
     *             if an existing index cannot be read
     * @throws LockObtainFailedException
     *             if the write lock cannot be obtained
     * @throws IOException
     *             on any other low-level I/O failure
     */
    public void init() throws CorruptIndexException, LockObtainFailedException,
            IOException {
        log.info("初始化Lucene搜索引擎...");
        log.debug("初始化分词器...");
        analyzer = new PaodingAnalyzer();
        ramDir = FSDirectory.getDirectory(indexDir);
        // NOTE(review): force-clearing the lock is only safe because this
        // class is meant to be the single writer (singleton); a second live
        // writer on the same directory would be corrupted by this.
        if (ramDir.fileExists("write.lock")) {
            ramDir.deleteFile("write.lock");
            log.debug("清除引擎文件锁 ...");
        }
        // Create a fresh index only when none exists. The previous
        // "catch any Exception and retry with create=true" approach would
        // silently wipe an existing index on any unrelated failure.
        boolean create = !IndexReader.indexExists(ramDir);
        writer = new IndexWriter(ramDir, analyzer, create);
        reader = IndexReader.open(ramDir);
        parser = new QueryParser(LucenePDFDocument.CONTENT, analyzer);
        // Search through the same reader that is used for query rewriting so
        // both always see the same snapshot of the index.
        searcher = new IndexSearcher(reader);
    }

    /**
     * Releases the searcher, reader and writer. Safe to call even if
     * {@link #init()} failed part-way; a failure to close one resource does
     * not prevent closing the others.
     */
    public void destroy() {
        log.info("关闭搜索引擎...");
        try {
            if (searcher != null) {
                searcher.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            // CorruptIndexException is an IOException and is handled here too.
            e.printStackTrace();
        }
    }

    public String getIndexDir() {
        return indexDir;
    }

    /**
     * @param indexDir
     *            directory that holds the index files, default c:/pdf_index
     */
    public void setIndexDir(String indexDir) {
        this.indexDir = indexDir;
    }

    /** Not implemented; currently does nothing. */
    public void resetIndex() {
    }

    /**
     * Adds a document to the index, then optimizes and flushes the writer.
     *
     * @param doc
     *            the Lucene document to index
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void addPdf(Document doc) throws CorruptIndexException, IOException {
        writer.addDocument(doc, analyzer);
        writer.optimize();
        writer.flush();
    }

    /**
     * Indexes a PDF file. The document is built by
     * LucenePDFDocument.getDocument(pdf); the original author notes that the
     * PDF's input stream is closed once this completes.
     *
     * @param pdf
     *            the PDF model to index
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void addPdf(Ssgs_pdfModel pdf) throws CorruptIndexException,
            IOException {
        addPdf(LucenePDFDocument.getDocument(pdf));
    }

    /**
     * Runs a full-text query against the index.
     *
     * @param keyword
     *            the query string
     * @return a list of PdfForm, one per hit, or <code>null</code> when the
     *         keyword is null or contains only whitespace
     * @throws ParseException
     *             if the keyword cannot be parsed as a query
     * @throws IOException
     */
    public List search(String keyword) throws ParseException, IOException {
        if (keyword == null || keyword.matches("^\\s*$")) {
            return null;
        }
        // Hits loads documents lazily from the searcher, so the results are
        // materialised inside the lock: another thread must not swap and
        // close the searcher while they are still being read.
        synchronized (lock) {
            refreshIfStale();
            Query query = parser.parse(keyword).rewrite(reader);
            Hits hits = searcher.search(query);
            List pdfs = new ArrayList(hits.length());
            for (int i = 0; i < hits.length(); i++) {
                // Fetch the stored document once instead of once per field.
                Document doc = hits.doc(i);
                PdfForm pdf = new PdfForm();
                pdf.setFile_name(doc.get(LucenePDFDocument.FILE_NAME));
                pdf.setId(Integer.parseInt(doc.get(LucenePDFDocument.ID)));
                try {
                    pdf.setM_time(doc.get(LucenePDFDocument.MODIFIED));
                } catch (Exception e) {
                    pdf.setM_time(null);
                }
                pdf.setSummary(doc.get(LucenePDFDocument.SUMMARY));
                pdfs.add(pdf);
            }
            return pdfs;
        }
    }

    /**
     * Reopens the reader and searcher when the index has changed since they
     * were opened; without this, documents added or deleted after init()
     * would never be reflected in search results. Must be called while
     * holding {@link #lock}.
     */
    private void refreshIfStale() throws CorruptIndexException, IOException {
        if (reader.isCurrent()) {
            return;
        }
        // Open the replacement first so a failure leaves the old pair usable.
        IndexReader freshReader = IndexReader.open(ramDir);
        IndexReader staleReader = reader;
        Searcher staleSearcher = searcher;
        reader = freshReader;
        searcher = new IndexSearcher(freshReader);
        staleSearcher.close();
        staleReader.close();
    }

    /**
     * Deletes index entries by their unique primary key, then optimizes and
     * flushes the writer.
     *
     * @param id
     *            value of the ID field of the document(s) to remove
     * @throws StaleReaderException
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void delete(String id) throws StaleReaderException,
            CorruptIndexException, LockObtainFailedException, IOException {
        Term term = new Term(LucenePDFDocument.ID, id);
        // The static lock already serialises all callers, so the former
        // method-level "synchronized" was redundant.
        synchronized (lock) {
            writer.deleteDocuments(term);
            writer.optimize();
            writer.flush();
        }
    }
}
import gov.jsgs.ssgs.form.PdfForm;
import gov.jsgs.ssgs.model.Ssgs_pdfModel;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.StaleReaderException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import common.Logger;
/**
 * Search-engine agent that wraps a Lucene index of PDF documents.
 * <p>
 * The index is stored in the directory returned by {@link #getIndexDir()}
 * (default <code>c:/pdf_index</code>). Call {@link #setIndexDir(String)}
 * <i>before</i> {@link #init()} to use another location; changing it after
 * init() has no effect on the already opened index.
 * <p>
 * <b>Note: must be run as a singleton. Call init() to initialise and
 * destroy() to release resources.</b>
 *
 * @author tedeyang
 *
 */
public class LuceneAgent {
    private static Logger log = Logger.getLogger(LuceneAgent.class);
    /** Guards the shared query parser and the reader/searcher pair. */
    static Object lock = new Object();
    Analyzer analyzer = null;
    /** The index directory. Despite its name it is obtained from FSDirectory. */
    Directory ramDir = null;
    IndexWriter writer = null;
    String indexDir = "c:/pdf_index";
    IndexReader reader;
    QueryParser parser;
    Searcher searcher;

    /**
     * Opens (or creates) the index and prepares the writer, reader, query
     * parser and searcher.
     *
     * @throws CorruptIndexException
     *             if an existing index cannot be read
     * @throws LockObtainFailedException
     *             if the write lock cannot be obtained
     * @throws IOException
     *             on any other low-level I/O failure
     */
    public void init() throws CorruptIndexException, LockObtainFailedException,
            IOException {
        log.info("初始化Lucene搜索引擎...");
        log.debug("初始化分词器...");
        analyzer = new PaodingAnalyzer();
        ramDir = FSDirectory.getDirectory(indexDir);
        // NOTE(review): force-clearing the lock is only safe because this
        // class is meant to be the single writer (singleton); a second live
        // writer on the same directory would be corrupted by this.
        if (ramDir.fileExists("write.lock")) {
            ramDir.deleteFile("write.lock");
            log.debug("清除引擎文件锁 ...");
        }
        // Create a fresh index only when none exists. The previous
        // "catch any Exception and retry with create=true" approach would
        // silently wipe an existing index on any unrelated failure.
        boolean create = !IndexReader.indexExists(ramDir);
        writer = new IndexWriter(ramDir, analyzer, create);
        reader = IndexReader.open(ramDir);
        parser = new QueryParser(LucenePDFDocument.CONTENT, analyzer);
        // Search through the same reader that is used for query rewriting so
        // both always see the same snapshot of the index.
        searcher = new IndexSearcher(reader);
    }

    /**
     * Releases the searcher, reader and writer. Safe to call even if
     * {@link #init()} failed part-way; a failure to close one resource does
     * not prevent closing the others.
     */
    public void destroy() {
        log.info("关闭搜索引擎...");
        try {
            if (searcher != null) {
                searcher.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            // CorruptIndexException is an IOException and is handled here too.
            e.printStackTrace();
        }
    }

    public String getIndexDir() {
        return indexDir;
    }

    /**
     * @param indexDir
     *            directory that holds the index files, default c:/pdf_index
     */
    public void setIndexDir(String indexDir) {
        this.indexDir = indexDir;
    }

    /** Not implemented; currently does nothing. */
    public void resetIndex() {
    }

    /**
     * Adds a document to the index, then optimizes and flushes the writer.
     *
     * @param doc
     *            the Lucene document to index
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void addPdf(Document doc) throws CorruptIndexException, IOException {
        writer.addDocument(doc, analyzer);
        writer.optimize();
        writer.flush();
    }

    /**
     * Indexes a PDF file. The document is built by
     * LucenePDFDocument.getDocument(pdf); the original author notes that the
     * PDF's input stream is closed once this completes.
     *
     * @param pdf
     *            the PDF model to index
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void addPdf(Ssgs_pdfModel pdf) throws CorruptIndexException,
            IOException {
        addPdf(LucenePDFDocument.getDocument(pdf));
    }

    /**
     * Runs a full-text query against the index.
     *
     * @param keyword
     *            the query string
     * @return a list of PdfForm, one per hit, or <code>null</code> when the
     *         keyword is null or contains only whitespace
     * @throws ParseException
     *             if the keyword cannot be parsed as a query
     * @throws IOException
     */
    public List search(String keyword) throws ParseException, IOException {
        if (keyword == null || keyword.matches("^\\s*$")) {
            return null;
        }
        // Hits loads documents lazily from the searcher, so the results are
        // materialised inside the lock: another thread must not swap and
        // close the searcher while they are still being read.
        synchronized (lock) {
            refreshIfStale();
            Query query = parser.parse(keyword).rewrite(reader);
            Hits hits = searcher.search(query);
            List pdfs = new ArrayList(hits.length());
            for (int i = 0; i < hits.length(); i++) {
                // Fetch the stored document once instead of once per field.
                Document doc = hits.doc(i);
                PdfForm pdf = new PdfForm();
                pdf.setFile_name(doc.get(LucenePDFDocument.FILE_NAME));
                pdf.setId(Integer.parseInt(doc.get(LucenePDFDocument.ID)));
                try {
                    pdf.setM_time(doc.get(LucenePDFDocument.MODIFIED));
                } catch (Exception e) {
                    pdf.setM_time(null);
                }
                pdf.setSummary(doc.get(LucenePDFDocument.SUMMARY));
                pdfs.add(pdf);
            }
            return pdfs;
        }
    }

    /**
     * Reopens the reader and searcher when the index has changed since they
     * were opened; without this, documents added or deleted after init()
     * would never be reflected in search results. Must be called while
     * holding {@link #lock}.
     */
    private void refreshIfStale() throws CorruptIndexException, IOException {
        if (reader.isCurrent()) {
            return;
        }
        // Open the replacement first so a failure leaves the old pair usable.
        IndexReader freshReader = IndexReader.open(ramDir);
        IndexReader staleReader = reader;
        Searcher staleSearcher = searcher;
        reader = freshReader;
        searcher = new IndexSearcher(freshReader);
        staleSearcher.close();
        staleReader.close();
    }

    /**
     * Deletes index entries by their unique primary key, then optimizes and
     * flushes the writer.
     *
     * @param id
     *            value of the ID field of the document(s) to remove
     * @throws StaleReaderException
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void delete(String id) throws StaleReaderException,
            CorruptIndexException, LockObtainFailedException, IOException {
        Term term = new Term(LucenePDFDocument.ID, id);
        // The static lock already serialises all callers, so the former
        // method-level "synchronized" was redundant.
        synchronized (lock) {
            writer.deleteDocuments(term);
            writer.optimize();
            writer.flush();
        }
    }
}