Hadoop basics (2)

Java client

Environment setup

Create a Maven project and add Hadoop dependencies.

<dependencies>
        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
        <!-- Add the Hadoop dependencies -->
        <!-- Add the HDFS client dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>3.1.1</version>
        </dependency>
    </dependencies>

Using FileSystem

Core class: org.apache.hadoop.fs.FileSystem, the abstract file system class.

//create the object via a static factory method
public static FileSystem newInstance(URI uri,Configuration conf,String user)
/*
      Parameter 1: URI            the resource address of the distributed file system HDFS, i.e. the NameNode address, e.g. hdfs://linux01:8020
      Parameter 2: Configuration  user-defined settings, e.g. replication factor 3, block size 128M

      Parameter 3: user           the client user name
 */
/*
      org.apache.hadoop.fs.FileSystem  the file system class (abstract)
         Static method to obtain an instance (a subclass object):
              public static FileSystem newInstance(URI uri,Configuration conf,String user)
                    URI uri: Uniform Resource Identifier   scheme://
                                a URL (Uniform Resource Locator, e.g. www.baidu.com) is one kind of URI;
                                anything network-related is a URI: Thunder downloads, Baidu Netdisk,
                                mail (mailto:), JDBC connections, and so on
                             here it is the resource address of HDFS, i.e. the NameNode address
                             hdfs://linux01:8020
                             constructor: public URI(String str)
                    Configuration conf: user-defined settings, e.g. replication factor 3, block size 128M;
                                        if nothing is set, the defaults are used
                    String user: user name, e.g. root
 */
public class Demo01_FileSystem {
    public static void main(String[] args) throws URISyntaxException, IOException, InterruptedException {
        //client connection address of the file system
        URI  uri = new URI("hdfs://linux01:8020");
        //configuration object; nothing is set, so the defaults are used
        Configuration  con = new Configuration();
        //user name
        String user = "root";
        //obtain the distributed file system object via the static method
        FileSystem fs = FileSystem.newInstance(uri, con, user);
        System.out.println(fs);
    }
}

Common methods

public void copyFromLocalFile(Path src, Path dst)  upload a local file to the file system
public void copyFromLocalFile(boolean delSrc, Path src, Path dst)
public void copyToLocalFile(Path src, Path dst)    download a file from the file system to the local disk
public void copyToLocalFile(boolean delSrc, Path src, Path dst)
public FileStatus[] listStatus(Path f) list everything under a directory, both files and directories
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) list all files under a directory
public FSDataOutputStream create(Path f, boolean overwrite)  get a byte output stream to write data to a file
public FSDataOutputStream append(Path path) append data to the file at the given path
public FSDataInputStream open(Path f) get a byte input stream to read data from a file

Utility class for getting the file system object

public class HDFSUtils {
    private HDFSUtils(){}

    public static FileSystem  getFileSystem() throws  Exception {
        //client connection address of the file system
        URI uri = new URI("hdfs://linux01:8020");
        //configuration object; nothing is set, so the defaults are used
        Configuration con = new Configuration();
        //user name
        String user = "root";
        //obtain the distributed file system object via the static method
        FileSystem fs = FileSystem.newInstance(uri, con, user);
        return fs;
    }
}

Upload files

/*
     public void copyFromLocalFile(Path src, Path dst)  upload a local file to the file system
     public void copyFromLocalFile(boolean delSrc, Path src, Path dst)
            boolean delSrc: whether to delete the source file; true deletes it, false keeps it
            Path src: data source, a file on the local system
            Path dst: data destination on the file system

     1. Create the FileSystem object
     2. Call the method to upload the file to the distributed file system
     3. Close the resources
 */
public class Demo02_FileSystem {
    public static void main(String[] args) throws Exception {
        //get the object through the utility class
        FileSystem fileSystem = HDFSUtils.getFileSystem();

        /*
            public void copyFromLocalFile(Path src, Path dst)  upload a local file to the file system
            data source: local file  d:\\mm.jpg
            data destination: the distributed file system  /
         */
        //upload and rename
        Path  src = new Path("d:\\mm.jpg");
        Path  dest = new Path("/java/meimei.jpg");

        fileSystem.copyFromLocalFile(true,src,dest);

        fileSystem.close();    
    }
}

Download files

/*
    Download a file from the file system to the local disk
    public void copyToLocalFile(Path src, Path dst)
    public void copyToLocalFile(boolean delSrc, Path src, Path dst)

                boolean delSrc: whether to delete the source; true deletes it, false keeps it
                Path src: data source on the file system
                Path dst: data destination on the local disk

    Note: because Windows and the distributed HDFS client do not work well together, a local Hadoop
        environment must be installed, otherwise files cannot be downloaded:
        unpack hadoop 3.1.1, set the HADOOP_HOME environment variable and add it to Path,
        then restart IDEA (a reboot of the machine may also be needed)
 */
public class Demo03_FileSystem {
    public static void main(String[] args) throws Exception {
        //get the object through the utility class
        FileSystem fs = HDFSUtils.getFileSystem();


//        Path  src = new Path("/mm.jpg");
//        Path  dest = new Path("d:\\");
        //download a file from the file system to the local disk; a .crc checksum file is generated automatically
//        fs.copyToLocalFile(src,dest);

        Path  src = new Path("/mm.jpg");
        Path  dest = new Path("d:\\meimei.jpg");
        //download and rename
      //  fs.copyToLocalFile(src,dest);

        /*
            public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem)
            parameter 1: whether to delete the source file
            parameter 2: data source
            parameter 3: data destination
            parameter 4: whether to use the raw local file system; true means no .crc file is generated
         */
        fs.copyToLocalFile(false,src,dest,true);
        fs.close();
    }
}

Traverse files

/*
    public FileStatus[] listStatus(Path f) list everything under the given directory, both files and directories
 */
public class Demo04_FileSystem {
    public static void main(String[] args) throws Exception {
        //get the object
        FileSystem fs = HDFSUtils.getFileSystem();
        //get everything under the root directory: files and directories
        FileStatus[] fileStatusArr = fs.listStatus(new Path("/"));
        //iterate with an enhanced for loop
        for(FileStatus file : fileStatusArr){
//            System.out.println(file);
            //check whether it is a directory
            boolean b = file.isDirectory();
            //check whether it is a file
            boolean b2 = file.isFile();
            //get the path
            Path path = file.getPath();
            if(b){
                System.out.println("directory: "+path);
            }else{
                System.out.println("file: "+path);
            }
        }
    }
}

/*
    public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive) list all files under a directory (files only)
                Parameters
                    Path f: the path
                    boolean recursive: whether to traverse recursively, true or false

                Return value
                    RemoteIterator  an iterator
                          hasNext()  whether there is another element
                          next()     get the next element


                    LocatedFileStatus  a file object on the distributed file system; exposes the file's
                                       information: size, replication factor, number and size of blocks, etc.

                                 Path getPath()          get the path
                                 long getLen()           get the file size in bytes
                                 short getReplication()  get the replication factor
                                 long getBlockSize()     get the block size
                                 BlockLocation[] getBlockLocations()  get the array of blocks

                    BlockLocation  a physical block object
                           String[] getHosts()   get the hosts the block is stored on
                           String[] getNames()   get the host:port names of the block's locations
                           long getLength()      get the size of each block
                           long getOffset()      get the offset
 */
public class Demo05_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //list all files under the root directory
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/"), true);
        while (it.hasNext()) {
            LocatedFileStatus file = it.next();
//            System.out.println(file.getPath());

            //get the path
            Path path = file.getPath();
            //get the file size in bytes
            long len = file.getLen();
            System.out.println("file size: " + len * 1.0 / 1024 / 1024 + "M");
            //get the replication factor
            short s = file.getReplication();
            System.out.println("replicas: " + s);
            //get the block size
            long blockSize = file.getBlockSize();
            System.out.println("block size: " + blockSize * 1.0 / 1024 / 1024 + "M");

            //get the array of blocks
            BlockLocation[] blockLocations = file.getBlockLocations();
            System.out.println("number of blocks: " + blockLocations.length);


            //iterate over the physical blocks
            for (BlockLocation b : blockLocations) {
                //get the hosts the block is stored on
                String[] hosts = b.getHosts();
                System.out.println(Arrays.toString(hosts));
                String[] names = b.getNames();
                System.out.println(Arrays.toString(names));
                long length = b.getLength();
                System.out.println("block length: " + length * 1.0 / 1024 / 1024 + "M");
                long offset = b.getOffset();
                System.out.println("offset: " + offset);

            }
            System.out.println("------------------------------");
        }

    }
}

Write data

/*
    public FSDataOutputStream create(Path f, boolean overwrite)  get a byte output stream to write data to a file
                boolean overwrite: if the file exists, whether to overwrite it; true overwrites, false does not
 */
public class Demo06_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //get a byte output stream for writing data
        FSDataOutputStream out = fs.create(new Path("/1.txt"), true);

        out.write("hello world\r\n".getBytes());

        out.write("hello boys  ".getBytes());
        out.write("hello girls  ".getBytes());

        out.close();
        fs.close();
    }
}

/*
    Appending data
      FSDataOutputStream append(Path path) append data to the file at the given path
 */
public class Demo07_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //append data to the file
        FSDataOutputStream out = fs.append(new Path("/1.txt"));
        out.write("hello  aaa\r\n".getBytes());
        out.write("hello  bbb\r\n".getBytes());
        out.write("hello  ccc\r\n".getBytes());

        out.close();
        fs.close();

    }
}

Read data

/*
    public FSDataInputStream open(Path f) get a byte input stream to read data from a file
 */
public class Demo08_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //get a byte input stream to read data from the file
        FSDataInputStream in = fs.open(new Path("/1.txt"));

//        byte[] bytes = new byte[1024];
//        int len =0;
//        while((len = in.read(bytes))!=-1){
//            System.out.println(new String(bytes,0,len));
//
//        }
        //read the data line by line
        InputStreamReader isr = new InputStreamReader(in);
        //create a buffered reader
        BufferedReader br = new BufferedReader(isr);
//        String s = br.readLine();
        String line = null;
        while((line = br.readLine())!=null){
            System.out.println(line);
        }
        br.close();
        fs.close();

    }
}
/*
	Additional notes on the seek and skip methods
 */
public class Demo09_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //get a byte input stream to read data from the file
        FSDataInputStream in = fs.open(new Path("/1.txt"));

        //skip a number of bytes
       // in.skip(1);
	    //position the read pointer at the given offset before reading
        in.seek(0);

        int read = in.read();
        System.out.println(read);

        in.close();

        fs.close();
    }
}

Other methods

//DistributedFileSystem
public class Demo10_FileSystem {
    public static void main(String[] args) throws Exception {
        FileSystem fileSystem = HDFSUtils.getFileSystem();

        fileSystem.delete(new Path("/1.txt"),true);  //delete
        fileSystem.mkdirs(new Path("/aaa/bbb") ); //create a directory
        fileSystem.exists(new Path("/aaa/1.txt")); //check whether the path exists

        //...
      fileSystem.close();
    }
}

Merging small files

Hadoop is good at storing large files, because a large file keeps the amount of metadata small. If a cluster holds a huge number of small files, each small file needs its own piece of metadata, which puts heavy memory pressure on the cluster's metadata management. In practice, small files should therefore be merged into large files and processed together when necessary. From the HDFS shell, many HDFS files can be merged into one large file and downloaded to the local disk with a single command.

 hdfs dfs -getmerge /aaa/* ./abc.txt  

Since small files can be merged into one large file on download, they can just as well be merged into one large file on upload.

     FileSystem fs = HDFSUtils.getFileSystem();

        //output stream to the merged target file on HDFS
        FSDataOutputStream out = fs.create(new Path("/aaa/big.txt"));

        //local file system, used to list and read the small source files
        LocalFileSystem local = FileSystem.getLocal(new Configuration());
        FileStatus[] fileStatuses = local.listStatus(new Path("file:///d:\\input"));

        for (FileStatus fileStatus : fileStatuses) {

            //copy each small file into the big HDFS file
            FSDataInputStream in = local.open(fileStatus.getPath());
            IOUtils.copy(in,out);

            IOUtils.closeQuietly(in);
        }

        IOUtils.closeQuietly(out);

Parameter configuration

Configuring with the Configuration class

       Configuration conf = new Configuration();
        // change the replication factor to 5 (name, value)
        conf.set( "dfs.replication", "5");
        // change the block size
        conf.set("dfs.blocksize", "64m");

Configuring with a configuration file

Create a configuration file hdfs-site.xml under the resources directory.

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->
<configuration>
        <property>
            <name>dfs.replication</name>
            <value>5</value>
        </property>
        <property>
            <name>dfs.blocksize</name>
            <value>64m</value>
        </property>
</configuration>
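
To check that the file on the classpath is actually picked up, one option (a sketch, not from the original demos; the class name Demo13_CheckConf is hypothetical) is to ask the FileSystem object for its effective defaults:

public class Demo13_CheckConf {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //effective defaults seen by this client; with the hdfs-site.xml above
        //these should print 5 and 64M
        Path p = new Path("/");
        System.out.println("replication: " + fs.getDefaultReplication(p));
        System.out.println("block size: " + fs.getDefaultBlockSize(p) / 1024 / 1024 + "M");

        fs.close();
    }
}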

HDFS file quota configuration

In an environment where many users share HDFS, quota settings matter. When Hadoop handles large amounts of data, a cluster without quota management can easily be filled up by one user, leaving no space for anyone else. HDFS quotas are applied to directories rather than to accounts, so the usual approach is to restrict each account to its own directory and then configure a quota on that directory.
HDFS file quotas limit either the number of files or the total amount of data that can be placed under a given directory. This is how a per-user upload limit is enforced, similar to the maximum storage allowed on a network drive such as Baidu Netdisk.

hdfs dfs -count -q -h /aaa    # view the quota information

File count quota

hdfs dfs -mkdir -p /aaa
hdfs dfsadmin -setQuota 2 /aaa # set the quota to two names under this directory; in practice only one file can be uploaded, because the directory itself counts against the quota
hdfs dfsadmin -clrQuota /aaa # clear the quota

Space quota

When setting a space quota, the quota should be at least block_size * 3 (one full block per replica with the default replication factor of 3; with a 128M block that is 384M), because writing even a small file reserves a full block on each replica against the quota.

hdfs dfsadmin -setSpaceQuota 4k /aaa # limit the space to 4KB
hdfs dfs -put  a.txt /aaa  # the upload fails

Command to generate a file of arbitrary size

dd if=/dev/zero of=1.txt bs=1M count=2 # generate a file of arbitrary size: bs * count

Clear space quota limit

hdfs dfsadmin -clrSpaceQuota /aaa
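
The same limits can also be set from the Java client. Below is a sketch under the assumption that the hdfs:// client can be cast to DistributedFileSystem and that its setQuota(Path, nameQuota, spaceQuota) method plus the HdfsConstants.QUOTA_DONT_SET / QUOTA_RESET constants are available (this is an admin-level call, so the client user needs the appropriate permission; the class name Demo14_Quota is hypothetical).

public class Demo14_Quota {
    public static void main(String[] args) throws Exception {
        //the hdfs:// client is a DistributedFileSystem under the hood
        DistributedFileSystem dfs = (DistributedFileSystem) HDFSUtils.getFileSystem();

        //name quota: at most 2 names under /aaa (same as -setQuota 2 /aaa)
        dfs.setQuota(new Path("/aaa"), 2, HdfsConstants.QUOTA_DONT_SET);

        //space quota: 4 KB under /aaa (same as -setSpaceQuota 4k /aaa)
        dfs.setQuota(new Path("/aaa"), HdfsConstants.QUOTA_DONT_SET, 4 * 1024);

        //clear both quotas again (same as -clrQuota / -clrSpaceQuota)
        dfs.setQuota(new Path("/aaa"), HdfsConstants.QUOTA_RESET, HdfsConstants.QUOTA_RESET);

        dfs.close();
    }
}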

HDFS safe mode

Safe mode is a protection mechanism in Hadoop used to guarantee the safety of the data blocks in the cluster. When the cluster starts it first enters safe mode, and while in safe mode the system checks the integrity of the data blocks.
Suppose the configured number of replicas (the dfs.replication parameter) is 3, so there should be 3 replicas of each block on the DataNodes. If only 2 replicas exist, the ratio is 2/3 = 0.666. The default replica threshold of HDFS is 0.999, and 0.666 is clearly below 0.999, so the system automatically copies replicas to other DataNodes until the ratio is no longer below 0.999. If there are 5 replicas of a block, more than the 3 we configured, the system likewise deletes the 2 extra replicas. While in safe mode, the file system only accepts read requests and rejects delete, modify and other change requests. Once the whole system meets the threshold, HDFS leaves safe mode automatically.

dfs.namenode.safemode.threshold-pct  replica threshold, default 0.999f
dfs.namenode.safemode.extension      default 30000 (ms); safe mode is left 30 seconds after the check passes

Safe Mode Operation Commands

hdfs dfsadmin -safemode get  # check the safe mode status
hdfs dfsadmin -safemode enter # enter safe mode
hdfs dfsadmin -safemode leave # leave safe mode
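
For reference, the same operations are reachable from the Java client as well; a sketch assuming the DistributedFileSystem.setSafeMode(HdfsConstants.SafeModeAction) API and admin permission for enter/leave (the class name Demo15_SafeMode is hypothetical):

public class Demo15_SafeMode {
    public static void main(String[] args) throws Exception {
        DistributedFileSystem dfs = (DistributedFileSystem) HDFSUtils.getFileSystem();

        //same as: hdfs dfsadmin -safemode get
        boolean on = dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET);
        System.out.println("safe mode on: " + on);

        //same as: hdfs dfsadmin -safemode enter / leave
        dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
        dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_LEAVE);

        dfs.close();
    }
}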

Principle introduction

Upload data process

Network topology and rack awareness

 Network topology
	Node distance: the sum of the distances from the two nodes to their nearest common parent node

Rack awareness (choosing the replica nodes)
	For example: 500 nodes, uploading my.tar.gz with a replication factor of 3;
		  rack awareness decides which nodes store the replica data.
	
    
   BlockPlacementPolicyDefault 
   official comment:
   the 1st replica is placed on the local machine, 
   otherwise a random datanode. The 2nd replica is placed on a datanode
   that is on a different rack. The 3rd replica is placed on a datanode
   which is on a different node of the rack as the second replica.

Upload process

Block
Files in HDFS are physically stored in chunks, i.e. split into Blocks.
The block size differs between Hadoop versions:
Hadoop 1.x: 64M
Hadoop 2.x: 128M


Pipeline
A pipeline is the data-transfer pattern HDFS uses when writing data during a file upload.
The client writes a block to the first DataNode; after storing the data the first DataNode copies the block to the second DataNode, which in turn stores it and copies it to the third DataNode.

Why do the DataNodes transfer data linearly through a pipeline instead of the client pushing to all three DataNodes at once in a topology?
Because sending the data sequentially along one direction through a pipeline makes full use of each machine's bandwidth, avoids network bottlenecks and high-latency links, and minimizes the delay of pushing all the data.

ACK (acknowledge character): in data communication, a control character sent from the receiver to the sender to confirm that the data sent has been received correctly.


Packet
A packet is the basic unit of data transfer from the client to a DataNode, or between DataNodes in the pipeline; the default size is 64 KB.
Chunk
A chunk is the smallest unit in Hadoop, the basic unit of data checksumming between the client and a DataNode or between DataNodes in the pipeline; the default size is 512 bytes. Because it is used for verification (each chunk checks itself), every chunk carries a 4-byte checksum.
So each chunk actually written into a packet takes 516 bytes, and the ratio of real data to checksum data is 128:1.

Download data flow

 Data loss or corruption is inevitable during I/O, and the larger the amount of data transferred, the higher the chance of errors. The most common way to detect errors is to compute a checksum before the transfer and again after it; if the two checksums differ, the data is corrupt. A commonly used error-detecting code is CRC32.
 HDFS computes checksums when data is written and recomputes them every time data is read. Note that HDFS computes a checksum for every fixed-length segment of data; the length is set by io.bytes.per.checksum and defaults to 512 bytes.
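
From the Java client, one way to look at this checksum mechanism (a sketch, not part of the original demos; the class name Demo16_Checksum is hypothetical) is to ask HDFS for a file's checksum via FileSystem.getFileChecksum, or to toggle client-side verification with setVerifyChecksum:

public class Demo16_Checksum {
    public static void main(String[] args) throws Exception {
        FileSystem fs = HDFSUtils.getFileSystem();

        //checksum computed by HDFS for the whole file (may be null for some file system types)
        FileChecksum checksum = fs.getFileChecksum(new Path("/1.txt"));
        System.out.println(checksum);

        //client-side verification during reads can be switched on or off if needed
        fs.setVerifyChecksum(true);

        fs.close();
    }
}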

How does the DataNode ensure data integrity

1. When a DataNode reads a block, it computes its checksum.
2. If the computed checksum differs from the value recorded when the block was created, the block is corrupt.
3. The client then reads the block from another DataNode.
4. Common checksum algorithms include CRC and MD5.
5. The DataNode periodically re-verifies checksums after a file is created (via the DataBlockScanner background process).

If the client finds that a block is corrupt, the bad block is recovered in several steps:

 1. Before throwing a ChecksumException, the client reports the bad block and the DataNode it lives on to the NameNode.
 2. The NameNode marks the block as corrupt, so it no longer directs clients to that block and does not replicate that block to other DataNodes.
 3. The NameNode copies a good replica of the block to another DataNode.
 4. The NameNode deletes the corrupt block.

Metadata Information Management

What is metadata information

Metadata: data that describes data.
There are two kinds of metadata in HDFS:
	1. metadata describing the attributes of the files themselves
	2. metadata describing the mapping between files and blocks

How to manage metadata information

Metadata is essentially the mapping of a virtual directory tree, so how is this information stored? By storage form it is divided into in-memory metadata and metadata files, kept in memory and on disk respectively.

In-memory metadata

To keep metadata access efficient and user operations low-latency, the NameNode stores all metadata in memory, which we call in-memory metadata. The in-memory metadata is the most complete: it includes both the files' own attribute information and the file-to-block location mapping. The fatal weakness of memory, however, is that the data is lost on a crash or power failure and is not persisted, so the NameNode also relies on metadata files to keep the metadata safe and complete.

Metadata files

There are two types: the fsimage image file and the edits edit log.

fsimage image file: a persistent checkpoint of the in-memory metadata. However, fsimage only contains the metadata about the files' own attributes in the Hadoop file system; it does not contain the block location information. Block locations live only in memory: they are obtained from the block reports the DataNodes send to the NameNode when they join the cluster at startup, and from the periodic block reports sent afterwards at a fixed interval.

Persisting is an I/O process from memory to disk, and the fsimage file becomes very large (GB sizes are common). If every update operation were written straight into the fsimage file, the system would run very slowly; frequent persistence would affect the NameNode's normal service, so it cannot be done often.

edits edit log: to avoid losing data between two persistence points, the edits log file was designed. It records the log of every HDFS change operation (file creation, deletion or modification); change operations performed by file system clients are first recorded in the edits file.

When the HDFS cluster runs for a period of time, the following problems will occur:

1. The edits logs grow very large because of the many recorded operations;
2. the fsimage becomes very stale because of the long interval since the last merge;
3. a NameNode restart takes a long time, because many changes have to be
   merged from the edits log into the fsimage file;
4. yet persisting the fsimage frequently would affect the NN's normal service,
   since I/O from memory to disk is an expensive operation.

Therefore, to overcome these problems, we need an easy-to-manage mechanism that keeps the edits log file small and produces an up-to-date fsimage file, which also reduces the pressure on the NameNode.

Checkpoint mechanism

At regular intervals the Secondary NameNode updates and backs up the NameNode's metadata, then hands the resulting fsimage back to the NameNode to be read at its next startup.

When the cluster starts, both the NN and the SNN start. After starting, the NN reads the newest fsimage file to load the most recent metadata, and also reads the newest edits log and replays the operations recorded since the last startup, so that the current metadata is complete and correct.

After some time, the SNN downloads the NN's fsimage file and the accumulated edits files to its own machine, deserializes the fsimage into memory, replays the operations from the many edits files to update and complete the metadata, then serializes the updated metadata to its local disk and sends it to the NN to be read at the next startup.

Checkpoint process

Checkpoint-related parameter settings (for reference)

hdfs-site.xml
dfs.namenode.checkpoint.period
-- fixed interval between two checkpoints; default 3600, i.e. 1 hour.

dfs.namenode.checkpoint.txns
-- number of uncheckpointed transactions; when this many transactions have accumulated, a checkpoint is also triggered; default 1,000,000.

dfs.namenode.checkpoint.check.period
-- how often the standby/secondary namenode checks whether the conditions for a checkpoint are met; default 60, i.e. once per minute.

dfs.namenode.num.checkpoints.retained
-- number of fsimage files kept on the namenode; older ones are deleted; default 2.

dfs.namenode.num.extra.edits.retained
-- number of extra edits transactions retained; default 1,000,000.

dfs.ha.tail-edits.period
-- how often the standby namenode checks for new edits files; only finished edits files are checked, not in-progress ones.

Viewing the fsimage and edits files

cd /opt/hdpdata/name/current
hdfs oiv -p <file type>  -i <image file>  -o <output path for the converted file>
hdfs oiv -p XML -i fsimage_0000000000000000434 -o /opt/fsimage.xml
hdfs oev -p XML -i edits_inprogress_0000000000000000695 -o /opt/edit.xml

Disadvantages of HDFS

1. Not suitable for low-latency data access, e.g. storing data with millisecond latency.
2. Cannot store a large number of small files efficiently:
	  1. Storing many small files uses a lot of NameNode memory for directory and block information, and the NameNode's memory is limited
	  	 (each block requires about 150 bytes of metadata).
	  2. The seek time for small files exceeds the time needed to read them, which violates HDFS's design goals.
3. No support for concurrent writes or random file modification:
	  1. A file can only have one writer; multiple threads cannot write to it at the same time.
	  2. Only appending data is supported; random modification of files is not.
