Hadoop began as a Lucene subproject

Hadoop began as a Lucene subproject and has since taken off. So how can we use Hadoop's distributed processing power to speed up Lucene index building, and in doing so take full advantage of HDFS? The well-known catch is that HDFS handles random reads poorly, while a full-text search framework like Lucene relies on random access for almost every operation. How, then, can Lucene and Hadoop work well together? Hadoop releases have actually shipped a Lucene indexing utility in a contrib package, but few people seem to use it; I have never used it myself, so I won't comment on it here.


Solr 4.4 and later already bundle the jars for writing indexes to HDFS. If you are running Solr, putting the index on HDFS is easy: just configure the Directory implementation in solrconfig.xml to the HDFS-backed one (HdfsDirectoryFactory). The jars shipped with Solr 4.4, however, only support recent Hadoop versions, meaning 2.x and later; using them directly on Hadoop 1.x throws exceptions, because the API changed between Hadoop 1.x and 2.x. After modifying part of the source, I can now index and query on Hadoop 1.x. At the end of this article I will upload those classes; to use them, just import them into your project.
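For reference, if you take the Solr route on Hadoop 2.x, a minimal solrconfig.xml sketch might look like the following. This assumes the stock solr.HdfsDirectoryFactory that ships with Solr 4.4+; the NameNode URI and the Hadoop configuration directory are placeholders for your own environment:

<directoryFactory name="DirectoryFactory" class="solr.HdfsDirectoryFactory">
  <str name="solr.hdfs.home">hdfs://namenode:9000/solr</str>
  <str name="solr.hdfs.confdir">/etc/hadoop/conf</str>
</directoryFactory>

<indexConfig>
  <lockType>hdfs</lockType>
</indexConfig>

The lock type is worth noting: the default native locks assume a local filesystem and do not work on HDFS.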

Below is the source code of my test demo:

package indexhadoop;

import hdfs.HdfsDirectory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
 

/**
 * Demo that stores a Lucene index on HDFS.
 * Works with Hadoop 1.x.
 *
 * @author qindongliang
 */
public class MyIndex {

	
	public static void main(String[] args) throws Exception {
		//long a=System.currentTimeMillis();
		//add();
		//long b=System.currentTimeMillis();
		//System.out.println("Elapsed: "+(b-a)+" ms");
		query("中国");
		//delete("3");//delete the document with the given ID
	}
	
	
	
	/**
	 * Get an IndexWriter backed by HDFS.
	 */
	public static IndexWriter getIndexWriter() throws Exception {

		Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_46);
		IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_46, analyzer);
		Configuration conf=new Configuration();
		//Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt");
		//Path path=new Path("hdfs://10.2.143.5:9090/root/myfile");
		Path path=new Path("hdfs://192.168.75.130:9000/root/index");
		HdfsDirectory directory=new HdfsDirectory(path, conf);
		IndexWriter writer=new IndexWriter(directory, config);

		return writer;
	}
	
	/**
	 * Build the index.
	 */
	public static void add() throws Exception {

		IndexWriter writer=getIndexWriter();

//		Example of adding a single document:
//		Document doc=new Document();
//		doc.add(new StringField("id", "3", Store.YES));
//		doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架", Store.YES));
//		doc.add(new TextField("content", "今天发工资了吗", Store.YES));
//		writer.addDocument(doc);
		for(int i=6;i<10000;i++){
			Document doc=new Document();
			doc.add(new StringField("id", i+"", Store.YES));
			doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架"+i, Store.YES));
			doc.add(new TextField("content", "今天发工资了吗"+i, Store.YES));
			writer.addDocument(doc);
			if(i%1000==0){
				writer.commit();//commit every 1000 documents
			}
		}
		writer.forceMerge(1);//merge down to a single segment
		writer.commit();
		System.out.println("Finished indexing the test documents!");
		writer.close();
	}
	
	/**
	 * Add a single document to the index.
	 */
	public static void add(Document d) throws Exception {

		IndexWriter writer=getIndexWriter();
		writer.addDocument(d);
		writer.forceMerge(1);
		writer.commit();
		System.out.println("Document added to the index!");
		writer.close();
	}
	
	/**
	 * Delete the document with the given ID
	 * from the index on HDFS.
	 */
	public static void delete(String id) throws Exception {

		IndexWriter writer=getIndexWriter();
		writer.deleteDocuments(new Term("id", id));//delete the document with the given ID
		writer.forceMerge(1);//merge to reclaim space held by deleted documents
		writer.commit();//commit the change

		System.out.println("Document with id "+id+" was deleted.");
	}
	
	/**
	 * Search the index.
	 */
	public static void query(String queryTerm) throws Exception {
		System.out.println("Query: "+queryTerm);
		Configuration conf=new Configuration();
		//Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt");
		//Path path=new Path("hdfs://192.168.75.130:9000/root/index");
		Path path=new Path("hdfs://192.168.75.130:9000/root/output/map1");
		Directory directory=new HdfsDirectory(path, conf);
		IndexReader reader=DirectoryReader.open(directory);
		System.out.println("Total documents: "+reader.numDocs());
		long a=System.currentTimeMillis();
		IndexSearcher searcher=new IndexSearcher(reader);
		//the default field is "city"; the index under /root/output/map1 was built
		//separately and contains a "city" field, unlike the index built by add()
		QueryParser parse=new QueryParser(Version.LUCENE_46, "city", new SmartChineseAnalyzer(Version.LUCENE_46));

		Query query=parse.parse(queryTerm);
		TopDocs docs=searcher.search(query, 100);
		System.out.println("Hits: "+docs.totalHits);
//		for(ScoreDoc sc:docs.scoreDocs){
//			System.out.println("score: "+sc.score+"  id: "+searcher.doc(sc.doc).get("id")+"  name: "+searcher.doc(sc.doc).get("name")+"  content: "+searcher.doc(sc.doc).get("content"));
//		}
		long b=System.currentTimeMillis();
		System.out.println("First query took: "+(b-a)+" ms");
		System.out.println("============================================");
		long c=System.currentTimeMillis();
		query=parse.parse(queryTerm);
		docs=searcher.search(query, 100);//run the same query again to see the effect of caching
		System.out.println("Hits: "+docs.totalHits);
		long d=System.currentTimeMillis();
		System.out.println("Second query took: "+(d-c)+" ms");

		reader.close();
		directory.close();

		System.out.println("Search finished.");
	}
}
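To compile the listing you need Lucene 4.6 (the code targets Version.LUCENE_46) and a Hadoop 1.x client on the classpath, in addition to the modified HdfsDirectory classes attached at the end of this article. A sketch of the Maven dependencies, assuming 4.6.0 and 1.2.1 as representative versions (any Lucene 4.6.x and Hadoop 1.x release should work the same way):

<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-core</artifactId>
  <version>4.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-analyzers-smartcn</artifactId>
  <version>4.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-queryparser</artifactId>
  <version>4.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-core</artifactId>
  <version>1.2.1</version>
</dependency>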




The example above is what I tested with. In my tests, adds, deletes, updates, and queries against a Lucene index on HDFS all work. One point deserves attention, though: combining Lucene with Hadoop really can speed up index building dramatically, but it brings no advantage at search time. Searching does work, but it is slow. The current storage implementation leans on a block cache to keep search performance barely acceptable, and once the data volume gets large, search performance becomes very poor. There is still no good solution for this; unless Lucene or Solr someday adds an HBase-like storage structure, search over HDFS is unlikely to get much better.
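For completeness: when the index is served through Solr's HdfsDirectoryFactory, the block cache mentioned above can be tuned in solrconfig.xml. A sketch, assuming the stock Solr 4.x cache parameters; the sizes shown are illustrative, not recommendations:

<directoryFactory name="DirectoryFactory" class="solr.HdfsDirectoryFactory">
  <str name="solr.hdfs.home">hdfs://namenode:9000/solr</str>
  <bool name="solr.hdfs.blockcache.enabled">true</bool>
  <int name="solr.hdfs.blockcache.slab.count">1</int>
  <bool name="solr.hdfs.blockcache.direct.memory.allocation">true</bool>
  <int name="solr.hdfs.blockcache.blocksperbank">16384</int>
  <bool name="solr.hdfs.blockcache.read.enabled">true</bool>
</directoryFactory>

The cache only hides HDFS latency for blocks that are read repeatedly; it does not remove the random-access mismatch described above.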

The code above writes an index into Hadoop 1.x. In a follow-up post I will give an example of building an index on Hadoop 2.x, along with how to build indexes in parallel using MapReduce.


Reposted from weitao1026.iteye.com/blog/2266852