Hadoop began as a Lucene subproject

Hadoop began as a Lucene subproject and has since taken off. So how can we use Hadoop's distributed processing power to speed up Lucene index building, and in doing so take full advantage of HDFS? The well-known catch is that HDFS handles random reads poorly, while a full-text search framework like Lucene relies on random access for almost every operation. How, then, can Lucene and Hadoop work well together? Hadoop releases have actually shipped a Lucene indexing utility in a contrib package, but few people seem to use it; I have never used it myself, so I won't comment on it here.


Solr 4.4 and later already bundle the jars for writing indexes to HDFS. If you are running Solr, putting the index on HDFS is easy: just configure the Directory implementation in solrconfig.xml to the HDFS-backed one (HdfsDirectoryFactory). The jars shipped with Solr 4.4, however, only support recent Hadoop versions, meaning 2.x and later; using them directly on Hadoop 1.x throws exceptions, because the API changed between Hadoop 1.x and 2.x. After modifying part of the source, I can now index and query on Hadoop 1.x. At the end of this article I will upload those classes; to use them, just import them into your project.
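For reference, if you take the Solr route on Hadoop 2.x, a minimal solrconfig.xml sketch might look like the following. This assumes the stock solr.HdfsDirectoryFactory that ships with Solr 4.4+; the NameNode URI and the Hadoop configuration directory are placeholders for your own environment:

<directoryFactory name="DirectoryFactory" class="solr.HdfsDirectoryFactory">
  <str name="solr.hdfs.home">hdfs://namenode:9000/solr</str>
  <str name="solr.hdfs.confdir">/etc/hadoop/conf</str>
</directoryFactory>

<indexConfig>
  <lockType>hdfs</lockType>
</indexConfig>

The lock type is worth noting: the default native locks assume a local filesystem and do not work on HDFS.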

Below is the source code of my test demo:

package indexhadoop;

import hdfs.HdfsDirectory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
 

/**
 * Demo that stores a Lucene index on HDFS.
 * Works with Hadoop 1.x.
 *
 * @author qindongliang
 */
public class MyIndex {

	
	public static void main(String[] args) throws Exception {
		//long a=System.currentTimeMillis();
		//add();
		//long b=System.currentTimeMillis();
		//System.out.println("Elapsed: "+(b-a)+" ms");
		query("中国");
		//delete("3");//delete the document with the given ID
	}
	
	
	
	/**
	 * Get an IndexWriter backed by HDFS.
	 */
	public static IndexWriter getIndexWriter() throws Exception {

		Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_46);
		IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_46, analyzer);
		Configuration conf=new Configuration();
		//Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt");
		//Path path=new Path("hdfs://10.2.143.5:9090/root/myfile");
		Path path=new Path("hdfs://192.168.75.130:9000/root/index");
		HdfsDirectory directory=new HdfsDirectory(path, conf);
		IndexWriter writer=new IndexWriter(directory, config);

		return writer;
	}
	
	/**
	 * Build the index.
	 */
	public static void add() throws Exception {

		IndexWriter writer=getIndexWriter();

//		Example of adding a single document:
//		Document doc=new Document();
//		doc.add(new StringField("id", "3", Store.YES));
//		doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架", Store.YES));
//		doc.add(new TextField("content", "今天发工资了吗", Store.YES));
//		writer.addDocument(doc);
		for(int i=6;i<10000;i++){
			Document doc=new Document();
			doc.add(new StringField("id", i+"", Store.YES));
			doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架"+i, Store.YES));
			doc.add(new TextField("content", "今天发工资了吗"+i, Store.YES));
			writer.addDocument(doc);
			if(i%1000==0){
				writer.commit();//commit every 1000 documents
			}
		}
		writer.forceMerge(1);//merge down to a single segment
		writer.commit();
		System.out.println("Finished indexing the test documents!");
		writer.close();
	}
	
	/**
	 * Add a single document to the index.
	 */
	public static void add(Document d) throws Exception {

		IndexWriter writer=getIndexWriter();
		writer.addDocument(d);
		writer.forceMerge(1);
		writer.commit();
		System.out.println("Document added to the index!");
		writer.close();
	}
	
	/**
	 * Delete the document with the given ID
	 * from the index on HDFS.
	 */
	public static void delete(String id) throws Exception {

		IndexWriter writer=getIndexWriter();
		writer.deleteDocuments(new Term("id", id));//delete the document with the given ID
		writer.forceMerge(1);//merge to reclaim space held by deleted documents
		writer.commit();//commit the change

		System.out.println("Document with id "+id+" was deleted.");
	}
	
	/**
	 * Search the index.
	 */
	public static void query(String queryTerm) throws Exception {
		System.out.println("Query: "+queryTerm);
		Configuration conf=new Configuration();
		//Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt");
		//Path path=new Path("hdfs://192.168.75.130:9000/root/index");
		Path path=new Path("hdfs://192.168.75.130:9000/root/output/map1");
		Directory directory=new HdfsDirectory(path, conf);
		IndexReader reader=DirectoryReader.open(directory);
		System.out.println("Total documents: "+reader.numDocs());
		long a=System.currentTimeMillis();
		IndexSearcher searcher=new IndexSearcher(reader);
		//the default field is "city"; the index under /root/output/map1 was built
		//separately and contains a "city" field, unlike the index built by add()
		QueryParser parse=new QueryParser(Version.LUCENE_46, "city", new SmartChineseAnalyzer(Version.LUCENE_46));

		Query query=parse.parse(queryTerm);
		TopDocs docs=searcher.search(query, 100);
		System.out.println("Hits: "+docs.totalHits);
//		for(ScoreDoc sc:docs.scoreDocs){
//			System.out.println("score: "+sc.score+"  id: "+searcher.doc(sc.doc).get("id")+"  name: "+searcher.doc(sc.doc).get("name")+"  content: "+searcher.doc(sc.doc).get("content"));
//		}
		long b=System.currentTimeMillis();
		System.out.println("First query took: "+(b-a)+" ms");
		System.out.println("============================================");
		long c=System.currentTimeMillis();
		query=parse.parse(queryTerm);
		docs=searcher.search(query, 100);//run the same query again to see the effect of caching
		System.out.println("Hits: "+docs.totalHits);
		long d=System.currentTimeMillis();
		System.out.println("Second query took: "+(d-c)+" ms");

		reader.close();
		directory.close();

		System.out.println("Search finished.");
	}
}
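To compile the listing you need Lucene 4.6 (the code targets Version.LUCENE_46) and a Hadoop 1.x client on the classpath, in addition to the modified HdfsDirectory classes attached at the end of this article. A sketch of the Maven dependencies, assuming 4.6.0 and 1.2.1 as representative versions (any Lucene 4.6.x and Hadoop 1.x release should work the same way):

<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-core</artifactId>
  <version>4.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-analyzers-smartcn</artifactId>
  <version>4.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.lucene</groupId>
  <artifactId>lucene-queryparser</artifactId>
  <version>4.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-core</artifactId>
  <version>1.2.1</version>
</dependency>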




The example above is what I tested with. In my tests, adds, deletes, updates, and queries against a Lucene index on HDFS all work. One point deserves attention, though: combining Lucene with Hadoop really can speed up index building dramatically, but it brings no advantage at search time. Searching does work, but it is slow. The current storage implementation leans on a block cache to keep search performance barely acceptable, and once the data volume gets large, search performance becomes very poor. There is still no good solution for this; unless Lucene or Solr someday adds an HBase-like storage structure, search over HDFS is unlikely to get much better.
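For completeness: when the index is served through Solr's HdfsDirectoryFactory, the block cache mentioned above can be tuned in solrconfig.xml. A sketch, assuming the stock Solr 4.x cache parameters; the sizes shown are illustrative, not recommendations:

<directoryFactory name="DirectoryFactory" class="solr.HdfsDirectoryFactory">
  <str name="solr.hdfs.home">hdfs://namenode:9000/solr</str>
  <bool name="solr.hdfs.blockcache.enabled">true</bool>
  <int name="solr.hdfs.blockcache.slab.count">1</int>
  <bool name="solr.hdfs.blockcache.direct.memory.allocation">true</bool>
  <int name="solr.hdfs.blockcache.blocksperbank">16384</int>
  <bool name="solr.hdfs.blockcache.read.enabled">true</bool>
</directoryFactory>

The cache only hides HDFS latency for blocks that are read repeatedly; it does not remove the random-access mismatch described above.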

The code above writes an index into Hadoop 1.x. In a follow-up post I will give an example of building an index on Hadoop 2.x, along with how to build indexes in parallel using MapReduce.


Reposted from weitao1026.iteye.com/blog/2266852