Lucene 与 Tika 一起使用

1、获取某个目录下的所有文件

public class DirectoryUtil {

	/**
	 * Recursively collects the paths of everything found under {@code file}.
	 *
	 * <p>When {@code file} is a directory, each entry below it is appended to the
	 * list using {@link File#getPath()}. A sub-directory is descended into first
	 * and is then appended itself, so the result contains directory paths as well
	 * as file paths; callers that only want regular files must filter with
	 * {@link File#isFile()}. When {@code file} is not a directory, its own path
	 * is appended.
	 *
	 * @param file the file or directory to start from; must not be {@code null}
	 * @param list the list to append to, or {@code null} to have a new list created
	 * @return {@code list} (or the newly created list) with the discovered paths appended
	 */
	public static List<String> listFiles(File file, List<String> list) {
		if (list == null) {
			list = new LinkedList<String>();
		}

		if (!file.isDirectory()) {
			// Record the path, not the bare name, so the entry can be turned back
			// into a File and matches what the directory branch records.
			list.add(file.getPath());
			return list;
		}

		File[] children = file.listFiles();
		if (children == null) {
			// File.listFiles() returns null when the directory cannot be read;
			// treat that as "nothing to add" instead of failing with an NPE.
			return list;
		}

		for (File child : children) {
			if (child.isDirectory()) {
				listFiles(child, list);
			}
			list.add(child.getPath());
		}
		return list;
	}


	/**
	 * Prints every element of {@code obj} on its own line when {@code obj} is a
	 * {@link List}; any other argument (including {@code null}) is ignored.
	 *
	 * @param obj the object to print
	 */
	public void print(Object obj) {
		if (!(obj instanceof List)) {
			return;
		}
		for (Object element : (List<?>) obj) {
			System.out.println(element);
		}
	}

	/**
	 * Manual smoke test: lists a hard-coded local directory and prints the result.
	 */
	@Test
	public void listFilesTest() {
		DirectoryUtil util = new DirectoryUtil();
		File file = new File("H:/baiduyundownload");
		List<String> list = DirectoryUtil.listFiles(file, null);
		util.print(list);
	}

}

2、从文件中抽取信息，并添加到Lucene中的Document对象中

public class TikaExtractFile {

	/**
	 * Builds a Lucene {@code Document} describing {@code file}.
	 *
	 * <p>The document carries these fields:
	 * <ul>
	 *   <li>{@code filename} - {@code file.getName()}, stored and analyzed</li>
	 *   <li>{@code address} - {@code file.getPath()}, stored, not analyzed</li>
	 *   <li>{@code filetype} - the name's last extension including the dot
	 *       (e.g. {@code ".pdf"}); when the name has no extension, the part of the
	 *       detected {@code Content-Type} after the slash. The field is omitted if
	 *       neither can be determined.</li>
	 *   <li>{@code filecontent} - the {@code Reader} returned by
	 *       {@code Tika.parse(File)}</li>
	 * </ul>
	 *
	 * <p>The content reader is left open on success because it is handed to the
	 * returned document; it is closed here only when extraction fails.
	 *
	 * @param file the file to extract information from
	 * @return the populated document, or {@code null} if an
	 *         {@code IOException}, {@code SAXException} or {@code TikaException}
	 *         occurred (the stack trace is printed)
	 */
	public static Document getInformationInFile(File file){
		InputStream stream = null;
		Reader reader = null;
		// Becomes true once the reader has been attached to a document that is
		// about to be returned; the finally block must not close it in that case.
		boolean handedOff = false;
		Tika tika = new Tika();
		try {
			// Auto-detecting parser: lets Tika choose the concrete parser for the file.
			AutoDetectParser autoDetectParser = new AutoDetectParser();
			// Raw byte stream used for the metadata-producing parse below.
			stream = new FileInputStream(file);
			// Separate text reader over the same file, used as the indexed content.
			reader = tika.parse(file);
			// Receives the file's metadata (e.g. Content-Type) during parsing.
			Metadata metadata = new Metadata();
			ParseContext context = new ParseContext();
			// WriteOutContentHandler is given a limit of 1024*1024*1024 (author's note:
			// it is there to cope with large files). The handler's collected text is
			// not read afterwards; this parse is only used to fill `metadata`.
			BodyContentHandler handler = new BodyContentHandler(new WriteOutContentHandler(1024*1024*1024));
			autoDetectParser.parse(stream, handler, metadata, context);

			Document doc = new Document();
			String name = file.getName();
			doc.add(new Field("filename", name, Field.Store.YES, Field.Index.ANALYZED));
			doc.add(new Field("address", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

			// Use the LAST dot so "my.report.pdf" yields ".pdf" rather than
			// ".report.pdf". A dot at index 0 is not treated as an extension.
			int dot = name.lastIndexOf('.');
			if(dot > 0){
				doc.add(new Field("filetype", name.substring(dot), Field.Store.YES, Field.Index.NOT_ANALYZED));
			}else{
				String contentType = metadata.get("Content-Type");
				System.out.println("type : " + contentType);
				// Metadata may lack a Content-Type, or hold one without a slash;
				// skip the field instead of throwing in those cases.
				if(contentType != null){
					String[] parts = contentType.split("\\/");
					if(parts.length > 1){
						doc.add(new Field("filetype", parts[1], Field.Store.YES, Field.Index.NOT_ANALYZED));
					}
				}
			}

			doc.add(new Field("filecontent",reader));
			handedOff = true;
			return doc;
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (TikaException e) {
			e.printStackTrace();
		}finally{
			if(stream != null){
				try {
					stream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			// Extraction failed after the reader was opened: release it here,
			// since no document will ever consume it.
			if(!handedOff && reader != null){
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}

}

3、将文件信息写入到索引中

public class IndexUtil {

	/** Directory in which the Lucene index is stored by {@link #index()}. */
	private static final String DEFAULT_INDEX_PATH = "g:/mylucene";

	/** Directory whose contents are indexed by {@link #index()}. */
	private static final String DEFAULT_SOURCE_PATH = "H:/baiduyundownload";

	/** Path handed to {@code MMSegAnalyzer} by {@link #index()}. */
	private static final String DEFAULT_DICTIONARY_PATH = "G:\\c参考资料\\lucene\\mmseg4j-1.8.5\\data";

	/**
	 * Builds the index using the default index, source and dictionary paths.
	 */
	public void index(){
		index(DEFAULT_INDEX_PATH, DEFAULT_SOURCE_PATH, DEFAULT_DICTIONARY_PATH);
	}

	/**
	 * Indexes every regular file found under {@code sourcePath} into the Lucene
	 * index located at {@code indexPath}.
	 *
	 * <p>Paths are gathered with {@code DirectoryUtil.listFiles}; entries that are
	 * not regular files are skipped, as are files for which
	 * {@code TikaExtractFile.getInformationInFile} returns {@code null}. An
	 * {@code IOException} stops the run and is printed; the writer and the
	 * directory are closed in every case.
	 *
	 * @param indexPath      directory that holds the Lucene index
	 * @param sourcePath     directory whose files are indexed
	 * @param dictionaryPath path passed to the {@code MMSegAnalyzer} constructor
	 */
	public void index(String indexPath, String sourcePath, String dictionaryPath){
		Directory directory = null;
		IndexWriter writer = null;
		try {
			// Open the on-disk directory that backs the index.
			directory = FSDirectory.open(new File(indexPath));
			// new StandardAnalyzer(Version.LUCENE_35) was the alternative the
			// original author tried before switching to MMSegAnalyzer.
			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer(dictionaryPath));
			writer = new IndexWriter(directory, config);

			List<String> paths = DirectoryUtil.listFiles(new File(sourcePath), null);
			for(String path : paths){
				File candidate = new File(path);
				if(!candidate.isFile()){
					continue;
				}
				// Extract the file's information with Tika into a Document.
				Document doc = TikaExtractFile.getInformationInFile(candidate);
				if(doc == null){
					// Extraction failed (already reported by the extractor);
					// skip this file instead of passing null to the writer.
					continue;
				}
				writer.addDocument(doc);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always release the writer, including after a failure part-way through.
			if(writer != null){
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if(directory != null){
				try {
					directory.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}


	/**
	 * Scratch test showing what splitting on "/" and taking element 1 yields for
	 * a sample string.
	 */
	@Test
	public void test1(){
		String str = "type : application/xml";
		System.out.println(str.split("\\/")[1]);

	}

	/**
	 * Command-line entry point: builds the index with the default paths.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		IndexUtil util = new IndexUtil();
		util.index();
	}

}

Lucene 与 Tika 一起使用

猜你喜欢