1、获取某个目录下的所有文件
public class DirectoryUtil { /** * 记录指定目录下的所有文件 * @param file * @return */ public static List<String> listFiles(File file,List<String> list){ if(list == null){ list = new LinkedList<String>(); } if(!file.isDirectory()){ list.add(file.getName()); }else{ // System.out.println("---------------"+file.getName()+"是文件夹---------------"); File[] files = file.listFiles(); for(File tempfile : files){ if(tempfile.isDirectory()){ DirectoryUtil.listFiles(tempfile, list); } // System.out.println(tempfile.getName()); // System.out.println(tempfile.getPath()); list.add(tempfile.getPath()); } } return list; } /** * 打印对象下的数据 * @param obj */ public void print(Object obj){ if(obj instanceof List){ List list = (List)obj; Iterator it = list.iterator(); while(it.hasNext()){ Object tempObj = it.next(); System.out.println(tempObj); } } } @Test public void listFilesTest(){ DirectoryUtil util = new DirectoryUtil(); File file = new File("H:/baiduyundownload"); List list = util.listFiles(file,null); util.print(list); } }
2、从文件中抽取信息,并添加到Lucene中的Document对象中
public class TikaExtractFile { public static Document getInformationInFile(File file){ InputStream stream = null; Document doc = null; Tika tika = new Tika(); try { //自动探测解析器 —— 可以由系统判断使用什么样的parser AutoDetectParser autoDetectParser = new AutoDetectParser(); //获取文件流 stream = new FileInputStream(file); Reader reader = tika.parse(file); //创建一个Metadata对象,在parser之后用于存放文件的Metadata信息 Metadata metadata = new Metadata(); //默认使用SAX解析 ParseContext context = new ParseContext(); ////当文件大于100000时,new BodyContentHandler(1024*1024*1024); // BodyContentHandler handler = new BodyContentHandler(); // BodyContentHandler handler = new BodyContentHandler(1024*1024*1024); //WriteOutContentHandler就是用来处理文件大小的 BodyContentHandler handler = new BodyContentHandler(new WriteOutContentHandler(1024*1024*1024)); autoDetectParser.parse(stream, handler, metadata, context); doc = new Document(); doc.add(new Field("filename", file.getName(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("address", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); //如果文件名有后缀,则文件类型根据文件名后缀来判断 if(file.getName().indexOf(".")>0){ doc.add(new Field("filetype", file.getName().substring(file.getName().indexOf(".")), Field.Store.YES, Field.Index.NOT_ANALYZED)); }else{ System.out.println("type : " + metadata.get("Content-Type")); String type = metadata.get("Content-Type").split("\\/")[1]; doc.add(new Field("filetype", type, Field.Store.YES, Field.Index.NOT_ANALYZED)); } // doc.add(new Field("filecontent", new InputStreamReader(stream), Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("filecontent",reader)); return doc; } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (TikaException e) { e.printStackTrace(); }finally{ if(stream != null){ try { stream.close(); } catch (IOException e) { e.printStackTrace(); } } } return null; } }
3、将文件信息写入到索引中
public class IndexUtil { //创建索引 public void index(){ try { //lucene存放所以的路径 String dirpath = "g:/mylucene"; File file = new File(dirpath); //打开硬盘上的某个目录(创建索引) Directory directory = FSDirectory.open(file); // IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer("G:\\c参考资料\\lucene\\mmseg4j-1.8.5\\data")); //创建写索引对象 IndexWriter writer = new IndexWriter(directory, config); File dir = new File("H:/baiduyundownload"); //获取目录下的所有文件 List<String> list = DirectoryUtil.listFiles(dir, null); Iterator<String> it = list.iterator(); while(it.hasNext()){ //因为前面的Iterator对象使用了泛型,所以这里不需要使用强制类型转换 String path = it.next(); File tempfile = new File(path); Document doc = null; if(tempfile.isFile()){ //使用Tika从文件中抽取信息,返回Document对象 doc = TikaExtractFile.getInformationInFile(tempfile); //将document对象写入到文件中 writer.addDocument(doc); } } writer.close(); } catch (IOException e) { e.printStackTrace(); } } @Test public void test1(){ String str = "type : application/xml"; System.out.println(str.split("\\/")[1]); } public static void main(String[] args) { IndexUtil util = new IndexUtil(); util.index(); } }