Map Reduce运行的结果是一些列文件,通过使用如下函数将输出文件归并到一个文件中:
org.apache.hadoop.fs.FileUtil.copyMerge(FileSystem srcFS, Path srcDir, FileSystem dstFS, Path dstFile, boolean deleteSource, Configuration conf, String addString)
其内的代码如下:
/** Copy all files in a directory to one output file (merge). */ public static boolean copyMerge(FileSystem srcFS, Path srcDir, FileSystem dstFS, Path dstFile, boolean deleteSource, Configuration conf, String addString) throws IOException { dstFile = checkDest(srcDir.getName(), dstFS, dstFile, false); if (!srcFS.getFileStatus(srcDir).isDirectory()) return false; OutputStream out = dstFS.create(dstFile); try { FileStatus contents[] = srcFS.listStatus(srcDir); Arrays.sort(contents); for (int i = 0; i < contents.length; i++) { if (contents[i].isFile()) { InputStream in = srcFS.open(contents[i].getPath()); try { IOUtils.copyBytes(in, out, conf, false); if (addString!=null) out.write(addString.getBytes("UTF-8")); } finally { in.close(); } } } } finally { out.close(); } if (deleteSource) { return srcFS.delete(srcDir, true); } else { return true; } }
在第12行加入如下代码即可完成对UTF-8 BOM的写入:
out.write(new byte[]{(byte)0xEF,(byte)0xBB,(byte)0xBF});
最终的结果为:
/** Copy all files in a directory to one output file (merge). */ public static boolean copyMerge(FileSystem srcFS, Path srcDir, FileSystem dstFS, Path dstFile, boolean deleteSource, Configuration conf, String addString) throws IOException { dstFile = checkDest(srcDir.getName(), dstFS, dstFile, false); if (!srcFS.getFileStatus(srcDir).isDirectory()) return false; OutputStream out = dstFS.create(dstFile); out.write(new byte[]{(byte)0xEF,(byte)0xBB,(byte)0xBF}); // 写入BOM try { FileStatus contents[] = srcFS.listStatus(srcDir); Arrays.sort(contents); for (int i = 0; i < contents.length; i++) { if (contents[i].isFile()) { InputStream in = srcFS.open(contents[i].getPath()); try { IOUtils.copyBytes(in, out, conf, false); if (addString!=null) out.write(addString.getBytes("UTF-8")); } finally { in.close(); } } } } finally { out.close(); } if (deleteSource) { return srcFS.delete(srcDir, true); } else { return true; } }