大数据学习（三）：Java对Hadoop文件操作初步

通过URL访问Hadoop是可行的，如：

package cn.weida.hadoop.read;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLStreamHandlerFactory;

import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.io.IOUtils;

/**
 * Prints a file stored in a Hadoop filesystem to standard output by reading it
 * through a {@code java.net.URL} backed by Hadoop's URL stream handler factory.
 *
 * <p>Note: the factory registration done in the static initializer can happen
 * only once per JVM. If some other component of the program has already
 * registered a {@code URLStreamHandlerFactory}, this approach cannot be used to
 * read data from Hadoop.
 *
 * @author lone
 */
public class URLCat {

    /** Size, in bytes, of the buffer used while copying the stream. */
    private static final int BUFFER_SIZE = 4096;

    static {
        // Teach java.net.URL how to open Hadoop filesystem URLs.
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());

        // For reference, the JDK implementation of the method called above is
        // quoted below; it shows why a second registration in the same JVM fails.
        //
        //   static URLStreamHandlerFactory factory;
        //
        //   /**
        //    * Sets an application's {@code URLStreamHandlerFactory}.
        //    * This method can be called at most once in a given Java Virtual
        //    * Machine.
        //    *
        //    * <p> The {@code URLStreamHandlerFactory} instance is used to
        //    * construct a stream protocol handler from a protocol name.
        //    *
        //    * <p> If there is a security manager, this method first calls
        //    * the security manager's {@code checkSetFactory} method
        //    * to ensure the operation is allowed.
        //    * This could result in a SecurityException.
        //    *
        //    * @param fac the desired factory.
        //    * @exception Error if the application has already set a factory.
        //    * @exception SecurityException if a security manager exists and its
        //    *            {@code checkSetFactory} method doesn't allow
        //    *            the operation.
        //    * @see java.net.URL#URL(java.lang.String, java.lang.String,
        //    *      int, java.lang.String)
        //    * @see java.net.URLStreamHandlerFactory
        //    * @see SecurityManager#checkSetFactory
        //    */
        //   public static void setURLStreamHandlerFactory(URLStreamHandlerFactory fac) {
        //       synchronized (streamHandlerLock) {
        //           if (factory != null) {
        //               throw new Error("factory already defined");
        //           }
        //           SecurityManager security = System.getSecurityManager();
        //           if (security != null) {
        //               security.checkSetFactory();
        //           }
        //           handlers.clear();
        //           factory = fac;
        //       }
        //   }
    }

    /**
     * Opens the URL given as the first command-line argument and copies its
     * content to standard output.
     *
     * @param args {@code args[0]} is the URL of the file to display
     * @throws Exception if the URL cannot be opened or copying fails
     */
    public static void main(String[] args) throws Exception {
        InputStream source = null;
        try {
            URL location = new URL(args[0]);
            source = location.openStream();
            // Alternative for a quick try-out:
            // source = new URL("http://www.baidu.com").openStream();

            // Pass close=false: System.out must stay open, and the input is
            // released in the finally clause below.
            IOUtils.copyBytes(source, System.out, BUFFER_SIZE, false);
        } finally {
            // Also reached with a null stream when opening failed.
            IOUtils.closeStream(source);
        }
    }
}
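从注释中可以看到：static 块内调用的 URL.setURLStreamHandlerFactory() 在一个JVM中只能调用一次；如果别的未知组件已经设置过 URLStreamHandlerFactory，这里的调用就会抛出 Error（"factory already defined"），也就无法再用这种方式读取Hadoop中的数据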

因此我们使用Hadoop API 提供的FileSystem类来完成上面的工作

补充：LocalFileSystem类在数据传输时会在客户端进行校验和验证；如果不需要校验，可以使用RawLocalFileSystem类。

可以通过ChecksumFileSystem 向没有校验和的文件系统加入校验和功能：

FileSystem rawfs=....

FileSystem checksummedFs = new ChecksumFileSystem(rawfs);

package cn.weida.hadoop.read;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FileSystemCat {

    /** Size, in bytes, of the buffer used while copying the stream. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Reads a file through Hadoop's {@code FileSystem} API and copies it to
     * standard output.
     *
     * <p>{@code fs.open()} returns an {@code FSDataInputStream}. Its
     * {@code seek()} method moves the read position within the file, and its
     * positioned {@code read(...)} methods fetch a section of the file at a
     * given offset. This example only needs plain sequential reading, so the
     * stream is held as an {@code InputStream}.
     *
     * @param args {@code args[0]} is the URI of the file to display
     * @throws IllegalArgumentException if the argument is not a valid URI
     * @throws IOException if the file cannot be opened or copied
     */
    public static void main(String[] args) throws IllegalArgumentException, IOException {
        final String location = args[0];
        final Configuration configuration = new Configuration();
        final FileSystem fileSystem = FileSystem.get(URI.create(location), configuration);

        // Declare as FSDataInputStream instead if seek() is needed.
        InputStream contents = null;
        try {
            contents = fileSystem.open(new Path(location));
            // contents.seek(0); // go back to the start of the file
            IOUtils.copyBytes(contents, System.out, BUFFER_SIZE, false);
        } finally {
            // Also reached with a null stream when open() failed.
            IOUtils.closeStream(contents);
        }
    }

}
通过FileStatus 类获取文件信息

package cn.weida.hadoop.search;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

/**
 * File metadata example.
 *
 * <p>{@code FileStatus} encapsulates the metadata of files and directories in
 * a filesystem, including file length, block size, replication, modification
 * time, owner and permission information.
 * {@code FileSystem.getFileStatus()} returns the {@code FileStatus} of a file
 * or directory and throws {@code FileNotFoundException} when it does not
 * exist; call {@code exists()} to check whether a file or directory exists.
 *
 * @author lone
 */

public class showFileStatusTest {

    // private MiniDFSCluster cluster; // use an in-process HDFS cluster for testing

    /** Filesystem under test. NOTE(review): never assigned in the code shown here. */
    private FileSystem fs;

    /**
     * Prepares the test environment: makes sure the {@code test.build.data}
     * system property is set, defaulting it to {@code /tmp}.
     *
     * <p>The previous version also declared an unused local
     * {@code FileStatus fs} that shadowed the {@link #fs} field; it has been
     * removed so that a later {@code fs = ...} assignment targets the field.
     *
     * @throws IOException declared for the filesystem set-up this method is
     *         meant to perform; nothing in the current body throws it
     */
    public void setUp() throws IOException {
        // Not used yet; kept for the cluster/filesystem set-up still to be added.
        Configuration conf = new Configuration();
        if (System.getProperty("test.build.data") == null) {
            System.setProperty("test.build.data", "/tmp");
        }

        /*
         * Fields held by a FileStatus, for reference:
         *   private Path path;                 path
         *   private long length;               length
         *   private boolean isdir;             whether it is a directory
         *   private short block_replication;   number of replicas
         *   private long blocksize;            block size
         *   private long modification_time;    modification time
         *   private long access_time;          access time
         *   private FsPermission permission;   permissions
         *   private String owner;              owner
         *   private String group;              group
         *   private Path symlink;              symbolic link target
         */
    }

}
获取某个目录下的文件信息，如下：

package cn.weida.hadoop.search;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class ListStatus {

    /**
     * Prints every entry found at the paths given on the command line.
     *
     * <p>Background notes on the related {@code FileSystem} API:
     *
     * <p><b>Listing.</b> {@code FileSystem.listStatus()} has several overloads
     * that return the contents of a directory. When the argument is a file, the
     * result is an array of length 1 holding the {@code FileStatus} of that
     * file itself. When the argument is a directory, the result holds zero or
     * more {@code FileStatus} objects.
     *
     * <p><b>Pattern matching and filtering.</b>
     * {@code FileSystem.globStatus(Path pathPattern)} returns an array of the
     * {@code FileStatus} objects whose paths match the given pattern, sorted by
     * path. {@code FileSystem.globStatus(Path pathPattern, PathFilter filter)}
     * additionally applies a {@code PathFilter}, which can be used to exclude
     * paths matching a regular expression, for example
     * {@code fs.globStatus(new Path(pattern), new RegexExcludePathFilter(regex))}
     * with:
     * <pre>{@code
     * public class RegexExcludePathFilter implements PathFilter {
     *     private final String regex;
     *     public RegexExcludePathFilter(String regex) {
     *         this.regex = regex;
     *     }
     *     public boolean accept(Path path) {
     *         return !path.toString().matches(regex);
     *     }
     * }
     * }</pre>
     *
     * <p>Glob syntax:
     * <ul>
     *   <li>{@code *} matches zero or more characters</li>
     *   <li>{@code ?} matches a single character</li>
     *   <li>{@code [ab]} matches one character from the set {a,b}</li>
     *   <li>{@code [^ab]} matches one character not in the set {a,b}</li>
     *   <li>{@code [a-b]} matches one character in the range a to b (inclusive)</li>
     *   <li>{@code [^a-b]} matches one character outside the range a to b (inclusive)</li>
     *   <li>{@code {a,b}} alternation: matches either a or b</li>
     *   <li>{@code \c} escape: matches the metacharacter c literally</li>
     * </ul>
     *
     * <p><b>Deleting.</b> {@code FileSystem.delete(Path f, boolean recursive)}:
     * when {@code f} is a file or an empty directory, {@code recursive} is
     * ignored; a non-empty directory is deleted when {@code recursive} is
     * {@code true}, and an {@code IOException} is thrown when it is
     * {@code false}.
     *
     * @param args paths to list; {@code args[0]} also selects the filesystem
     * @throws IOException if the filesystem cannot be reached or listed
     */
    public static void main(String[] args) throws IOException {
        final String firstUri = args[0];
        final Configuration configuration = new Configuration();
        final FileSystem fileSystem = FileSystem.get(URI.create(firstUri), configuration);

        // Wrap every argument (including the first one) in a Path.
        final Path[] targets = new Path[args.length];
        int slot = 0;
        for (String argument : args) {
            targets[slot++] = new Path(argument);
        }

        final FileStatus[] entries = fileSystem.listStatus(targets);
        // stat2Paths turns the FileStatus array into an array of Path objects.
        for (Path listed : FileUtil.stat2Paths(entries)) {
            System.out.println(listed);
        }
    }
}
将本地文件复制到Hadoop

package cn.weida.hadoop.write;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Progressable;

/**
 * Copies a local file to a Hadoop filesystem, using a {@code Progressable}
 * callback to show upload progress.
 *
 * <p>Background notes:
 * <ul>
 *   <li>A {@code FileSystem} instance can also create directories with
 *       {@code mkdirs(Path f)}.</li>
 *   <li>Coherency model: a newly created file is immediately visible in the
 *       filesystem namespace, but content written to it is not guaranteed to
 *       be visible right away. {@code FSDataOutputStream.hflush()} guarantees
 *       that the data written so far has reached all the datanodes in the
 *       write pipeline and is visible to all new readers; it only guarantees
 *       that the data is in memory, not that it has been written to disk.</li>
 *   <li>{@code hadoop distcp} can be used in place of {@code hadoop fs -cp}:
 *       {@code hadoop distcp file1 file2} or {@code hadoop distcp dir1 dir2}.
 *       If {@code dir2} does not exist it is created; when {@code dir1} is
 *       copied underneath {@code dir2} the result is {@code dir2/dir1}.
 *       {@code hadoop distcp -overwrite} keeps the same directory structure
 *       while forcing existing files to be overwritten;
 *       {@code hadoop distcp -update} only updates files that have
 *       changed.</li>
 * </ul>
 *
 * @author lone
 */
public class FileCiopyWithProgress {
    /**
     * Copies the local file {@code args[0]} to the destination URI
     * {@code args[1]}, printing a dot each time Hadoop reports progress.
     *
     * <p>Both streams are released in a {@code finally} clause, so the local
     * input stream is no longer leaked when obtaining the filesystem or
     * creating the destination file fails.
     *
     * @param args {@code args[0]} local source path, {@code args[1]}
     *        destination URI
     * @throws Exception if the source cannot be read or the destination cannot
     *         be created or written
     */
    public static void main(String[] args) throws Exception {
        String localSrc = args[0];
        String dst = args[1];

        InputStream in = null;
        OutputStream out = null;
        try {
            in = new BufferedInputStream(new FileInputStream(localSrc));
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create(dst), conf);
            out = fs.create(new Path(dst), new Progressable() {

                @Override
                public void progress() {
                    System.out.println(".");
                }
            });
            // close=true: copyBytes closes both streams once the copy is done.
            IOUtils.copyBytes(in, out, 4096, true);
        } finally {
            // Safety net for failures before copyBytes took over the streams;
            // closing an already-closed (or null) stream here has no effect.
            IOUtils.closeStream(out);
            IOUtils.closeStream(in);
        }
    }

}
使用方式：hadoop cn.weida.hadoop.write.FileCiopyWithProgress input/docs/1400-8.txt hdfs://localhost/user/tom/1400-8.txt（注意：命令中的类名必须与上面代码中声明的包名和类名一致）

大数据学习（三）：Java对Hadoop文件操作初步

猜你喜欢

大数据学习（三）：Java对Hadoop文件操作初步