1.从lucene官网下载需要的版本 http://lucene.apache.org/
也可以从我的百度网盘下载 lucene 6.6.0
链接:https://pan.baidu.com/s/1CMxWIWwo9ZCrs1_bhcsOZA 密码:8cx4
2.解压压缩包
3.在eclipse中右击项目->Build Path->Add External Archives
在lucene-6.6.0\core 中选择jar包 lucene-core-6.6.0.jar
在lucene-6.6.0\analysis\common 中选择jar包 lucene-analyzers-common-6.6.0.jar
在lucene-6.6.0\queryparser 中选择jar包 lucene-queryparser-6.6.0.jar
在lucene-6.6.0\demo 中选择jar包 lucene-demo-6.6.0.jar
4.NLPIR是中科院研发的中文分词系统,中文分词效果优秀
其lucene集成部分的github地址:https://github.com/NLPIR-team/nlpir-analysis-cn-ictclas
需要根据pom.xml导入jar包
lucene-analyzers-nlpir-ictclas-6.6.0.jar
log4j-1.2.17.jar
jna-4.1.0.jar
junit-3.8.1.jar(后三个均可百度下载)
nlpir分词应用示例
package cn.edu.bit.cs;
import java.nio.file.Paths;
import org.apache.log4j.BasicConfigurator;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.nlpir.lucene.cn.ictclas.NLPIRTokenizerAnalyzer;
/**
 * Minimal end-to-end demo of the NLPIR analyzer with Lucene: adds one sample
 * Chinese document to an on-disk index and then runs a query against it,
 * printing every hit to standard output.
 *
 * <p>The index is opened in {@code CREATE_OR_APPEND} mode, so each run adds
 * another copy of the sample document to whatever is already stored under
 * {@code index/}.
 */
public class Nlpirtest {

    /** Index location, resolved relative to the working directory. */
    private static final String INDEX_DIR = "index/";

    /** Name of the single field that is both indexed and searched. */
    private static final String FIELD = "contents";

    /** Sample text stored in the index. */
    private static final String SAMPLE_TEXT =
            "特朗普表示,很高兴汉堡会晤后再次同习近平主席通话。我同习主席就重大问题保持沟通和协调、两国加强各层级和各领域交往十分重要。当前,美中关系发展态势良好,我相信可以发展得更好。我期待着对中国进行国事访问。";

    /**
     * Indexes the sample document and searches it for a fixed query.
     *
     * @param args ignored
     * @throws Exception if the analyzer, the index, or the query parser fails
     */
    public static void main(String[] args) throws Exception {
        // Configure log4j before any library code runs; doing it last (as the
        // original did) leaves logging unconfigured while the work happens.
        BasicConfigurator.configure();

        // NOTE(review): the argument meanings are defined by the NLPIR analyzer
        // library (presumably data dir, encoding, licence code, user dictionary,
        // overwrite flag) -- confirm against its documentation.
        try (NLPIRTokenizerAnalyzer nta = new NLPIRTokenizerAnalyzer("", 1, "", "", false);
             FSDirectory dir = FSDirectory.open(Paths.get(INDEX_DIR))) {
            indexSample(dir, nta);
            search(dir, nta, "习近平");
        }
    }

    /** Adds the sample document to the index in {@code dir} and commits it. */
    private static void indexSample(FSDirectory dir, NLPIRTokenizerAnalyzer nta) throws Exception {
        IndexWriterConfig inconf = new IndexWriterConfig(nta);
        inconf.setOpenMode(OpenMode.CREATE_OR_APPEND);

        // try-with-resources releases the writer even if adding the document fails.
        try (IndexWriter writer = new IndexWriter(dir, inconf)) {
            Document doc = new Document();
            doc.add(new TextField(FIELD, SAMPLE_TEXT, Field.Store.YES));
            writer.addDocument(doc);
            // Commit explicitly so the document is visible to the reader opened next.
            writer.commit();
        }
    }

    /** Parses {@code queryText} against the indexed field and prints up to 100 hits. */
    private static void search(FSDirectory dir, NLPIRTokenizerAnalyzer nta, String queryText) throws Exception {
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new QueryParser(FIELD, nta).parse(queryText);

            TopDocs top = searcher.search(query, 100);
            System.out.println("总条数:" + top.totalHits);

            for (ScoreDoc hit : top.scoreDocs) {
                System.out.println("doc=" + hit.doc + " score=" + hit.score);
                Document d = searcher.doc(hit.doc);
                System.out.println(d.get(FIELD));
            }
        }
    }
}