The Implementation of the HDFS NameNode

1. The Filesystem Directory Tree

1.1 INode

abstract class INode implements Comparable<byte[]> 
{
  // name of this file/directory
  protected byte[] name;
  // the parent directory
  protected INodeDirectory parent;
  // last modification time
  protected long modificationTime;
  // last access time
  protected long accessTime;
  // the 64 bits of this long hold three packed fields, laid out by
  // PermissionStatusFormat below: mode (access permissions), group and user
  private long permission;
  private static enum PermissionStatusFormat 
  {
    // access permissions (mode)
    MODE(0, 16),
    // group serial number
    GROUP(MODE.OFFSET + MODE.LENGTH, 25),
    // owner (user) serial number
    USER(GROUP.OFFSET + GROUP.LENGTH, 23);

    // bit offset of this field
    final int OFFSET;
    final int LENGTH; //bit length
    final long MASK;

    PermissionStatusFormat(int offset, int length) {
      OFFSET = offset;
      LENGTH = length;
      MASK = ((-1L) >>> (64 - LENGTH)) << OFFSET;
    }

    // AND with the mask, then shift right, to extract this field's value
    long retrieve(long record) {
      return (record & MASK) >>> OFFSET;
    }

    long combine(long bits, long record) {
      return (record & ~MASK) | (bits << OFFSET);
    }
  }
  
  /** Get user name */
  public String getUserName() {
    int n = (int)PermissionStatusFormat.USER.retrieve(permission);
    // look the name up in SerialNumberManager by its integer serial number;
    // this avoids the memory cost of storing a string in every inode
    return SerialNumberManager.INSTANCE.getUser(n);
  }
  
   /**
   * Check whether this is the root inode.
   * The root is identified by a zero-length name.
   */
  boolean isRoot() {
    return name.length == 0;
  }
  
   // remove this inode by detaching it from its parent
  boolean removeNode() 
  {
    if (parent == null) 
    {
      return false;
    }
    else 
    {  
     // delegate to the parent directory's removeChild()
      parent.removeChild(this);
      parent = null;
      return true;
    }
  }
  
  // collect into v the blocks of all files in the subtree rooted at this
  // inode; returns the number of inodes processed
  abstract int collectSubtreeBlocksAndClear(List<Block> v);
  
 }
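
To make the bit layout concrete, here is a small self-contained demo (not Hadoop source code; the offsets and widths are copied from the enum above: MODE is 16 bits at offset 0, GROUP 25 bits at offset 16, USER 23 bits at offset 41) showing how combine() packs a field into the long and retrieve() extracts it again:

public class PermissionPackingDemo {
  // field layout copied from PermissionStatusFormat above
  static final int MODE_OFFSET = 0,   MODE_LEN = 16;
  static final int GROUP_OFFSET = 16, GROUP_LEN = 25;
  static final int USER_OFFSET = 41,  USER_LEN = 23;

  static long mask(int offset, int len) {
    return ((-1L) >>> (64 - len)) << offset;
  }

  // same arithmetic as PermissionStatusFormat.retrieve()
  static long retrieve(long record, int offset, int len) {
    return (record & mask(offset, len)) >>> offset;
  }

  // same arithmetic as PermissionStatusFormat.combine()
  static long combine(long bits, long record, int offset, int len) {
    return (record & ~mask(offset, len)) | (bits << offset);
  }

  public static void main(String[] args) {
    long permission = 0L;
    // pack mode 0644 plus a (hypothetical) group serial 7 and user serial 42
    permission = combine(0644, permission, MODE_OFFSET, MODE_LEN);
    permission = combine(7,    permission, GROUP_OFFSET, GROUP_LEN);
    permission = combine(42,   permission, USER_OFFSET, USER_LEN);
    // unpack: prints 420 (0644 in decimal), 7 and 42
    System.out.println(retrieve(permission, MODE_OFFSET, MODE_LEN));
    System.out.println(retrieve(permission, GROUP_OFFSET, GROUP_LEN));
    System.out.println(retrieve(permission, USER_OFFSET, USER_LEN));
  }
}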

1.2 INodeDirectory

/**
 * Directory INode class.
 */
class INodeDirectory extends INode 
{
	// the children (sub-directories and files) of this directory
	private List<INode> children;

	// remove a child node
  INode removeChild(INode node) 
  {
    	assert children != null;
	    // binary-search the children list for the node's name
    	int low = Collections.binarySearch(children, node.name);
	    if (low >= 0) 
	    {
    	  return children.remove(low);
	    }
	    else 
	    {
    	  return null;
    	}
  }

	  // recursively collect (for deletion) all blocks under this directory
	  int collectSubtreeBlocksAndClear(List<Block> v) 
	  {
		    int total = 1;
		    // an empty directory: nothing more to collect
		    if (children == null) 
		    {
		      return total;
		    }
		    for (INode child : children) 
		    {
		      // recurse into each child
		      total += child.collectSubtreeBlocksAndClear(v);
		    }
	    
		    // when done, detach this directory and return the inode count
		    parent = null;
	    	children = null;
		    return total;
	  }

	  /**
	   * Recursively compute the space consumed by this directory tree.
	   */
	  DirCounts spaceConsumedInTree(DirCounts counts) {
	    counts.nsCount += 1;
	    if (children != null) {
	      for (INode child : children) {
	        child.spaceConsumedInTree(counts);
	      }
	    }
	    return counts;    
	  }

}

1.3 INodeDirectoryWithQuota

HDFS lets administrators set quotas on each directory. There are two kinds:
(1) Namespace quota: limits the number of names under the directory; creating a file or directory beyond this quota fails. It controls how much of the NameNode's resources a user can occupy and is kept in the member variable nsQuota.
(2) Diskspace quota: limits the total size of all files stored in the directory tree, so that a user cannot occupy too many DataNode resources; it is kept in the dsQuota variable.
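
The class itself is not shown in this post; the following simplified sketch (field names follow the description above, while the exception type and update logic are illustrative, not the verbatim Hadoop source) conveys how the two quotas would be checked before a namespace change is applied:

// Simplified sketch of INodeDirectoryWithQuota: the directory caches the
// current usage of its subtree and rejects updates that would exceed a quota.
class INodeDirectoryWithQuotaSketch /* extends INodeDirectory */ {
  private long nsQuota;   // namespace quota: max number of names in the subtree
  private long dsQuota;   // diskspace quota: max total bytes in the subtree
  private long nsCount;   // current number of names in the subtree
  private long diskspace; // current bytes consumed by the subtree

  // called before a change of (nsDelta, dsDelta) is applied to the subtree;
  // a negative quota means "no limit" (the real code throws QuotaExceededException)
  void verifyQuota(long nsDelta, long dsDelta) throws java.io.IOException {
    long newNsCount = nsCount + nsDelta;
    long newDiskspace = diskspace + dsDelta;
    if ((nsQuota >= 0 && newNsCount > nsQuota) ||
        (dsQuota >= 0 && newDiskspace > dsQuota)) {
      throw new java.io.IOException("quota exceeded: ns " + newNsCount + "/"
          + nsQuota + ", ds " + newDiskspace + "/" + dsQuota);
    }
  }

  // applied after the change succeeds
  void updateNumItemsInTree(long nsDelta, long dsDelta) {
    nsCount += nsDelta;
    diskspace += dsDelta;
  }
}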

1.4 INodeFile

class INodeFile extends INode 
{
	/**
	* the top 16 bits hold the replication factor;
	* the low 48 bits hold the preferred block size
	**/
	  protected long header;
	  // the file's data blocks
	  protected BlockInfo blocks[] = null;
	
	  // append this file's blocks to the list v, then detach the file
	  int collectSubtreeBlocksAndClear(List<Block> v) 
	  {
	    parent = null;
	    for (Block blk : blocks) 
	    {
	      v.add(blk);
	    }
	    blocks = null;
	    return 1;
	  }
	
	

}
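
A sketch of the accessors this layout implies (written from the comment above; it mirrors the HEADERMASK idiom of the real class, but is an illustration rather than the verbatim source):

// header packing in INodeFile: top 16 bits = replication, low 48 bits = block size
class FileHeaderSketch {
  static final int BLOCKBITS = 48;
  static final long HEADERMASK = 0xffffL << BLOCKBITS;

  static short getReplication(long header) {
    return (short) ((header & HEADERMASK) >> BLOCKBITS);
  }

  static long getPreferredBlockSize(long header) {
    return header & ~HEADERMASK;
  }

  // assumes preferredBlockSize fits in 48 bits
  static long makeHeader(short replication, long preferredBlockSize) {
    return ((long) replication << BLOCKBITS) | (preferredBlockSize & ~HEADERMASK);
  }
}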

1.5 INodeFileUnderConstruction

// a file inode that is under construction (still being written)
class INodeFileUnderConstruction extends INodeFile
{
	  // name of the writing client, which is also the holder of the lease
	  String clientName;         // lease holder
	  // the host the client runs on
	  private final String clientMachine;
	  // if the client is itself a cluster node, the datanode it runs on
	  private final DatanodeDescriptor clientNode; // if client is a cluster node too.
	  
	  // index of the primary datanode during lease recovery
	  private int primaryNodeIndex = -1; //the node working on lease recovery
	  // datanodes holding the last block, i.e. the members of the data pipeline
	  private DatanodeDescriptor[] targets = null;   //locations for last block
	  // time of the most recent lease recovery
	  private long lastRecoveryTime = 0;
}

2. The Namespace Image

The NameNode keeps the namespace image and the edit log in its "current" directory, which normally holds four files:
(1) fsimage: the metadata image file
(2) edits: the edit log; together with the image file it yields the complete HDFS directory tree and its metadata
(3) fstime: the time of the most recent checkpoint; a checkpoint is normally produced by the SecondaryNameNode and is the result of merging fsimage and edits
(4) VERSION: as on a DataNode, this file records some attributes of the NameNode's storage
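
On disk, a storage directory configured via dfs.name.dir therefore typically looks like this (an illustrative listing, not output copied from a real cluster):

${dfs.name.dir}/current/VERSION
${dfs.name.dir}/current/edits
${dfs.name.dir}/current/fsimage
${dfs.name.dir}/current/fstime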

2.1 saveFSImage

FSImage.saveFSImage() saves the image as of the current moment to the file given by newFile. The file header is written first.

/**
   * Save the contents of the FS image to the file.
   */
  void saveFSImage(File newFile) throws IOException 
  {
	    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
	    FSDirectory fsDir = fsNamesys.dir;
	    long startTime = FSNamesystem.now();
	    /**
	    * build the output stream on top of the target file
	    **/
	    DataOutputStream out = new DataOutputStream(
	                                                new BufferedOutputStream(
	                                                                         new FileOutputStream(newFile)));
	    try {
	    	/**
	    	* the next four writes form the file header
	    	**/
	      // layout version
	      out.writeInt(FSConstants.LAYOUT_VERSION);
	      // namespace ID
	      out.writeInt(namespaceID);
	      // total number of inodes in the directory tree
	      out.writeLong(fsDir.rootDir.numItemsInTree());
	      // generation stamp
	      out.writeLong(fsNamesys.getGenerationStamp());
	      
	      /*
	      * The absolute path of an HDFS file or directory cannot exceed 8000
	      * bytes, so the buffer is 4 * 8000 = 32000 bytes (31.25KB)
	      */
	      byte[] byteStore = new byte[4*FSConstants.MAX_PATH_LENGTH];
	      ByteBuffer strbuf = ByteBuffer.wrap(byteStore);
	      
	      /**
	      * save the root
	      * the root's name has length 0, so it needs special handling
	      **/ 
	      saveINode2Image(strbuf, fsDir.rootDir, out);
	      // save the rest of the nodes
	      saveImage(strbuf, 0, fsDir.rootDir, out);
	      // save the files that are under construction
	      fsNamesys.saveFilesUnderConstruction(out);
	      // save the secret manager (security) state
	      fsNamesys.saveSecretManagerState(out);
	      strbuf = null;
	    } 
	    finally 
	    {
	      out.close();
	    }
	
	    LOG.info("Image file of size " + newFile.length() + " saved in " 
	        + (FSNamesystem.now() - startTime)/1000 + " seconds.");
  }

2.2 saveImage

 /**
   * Save file tree image starting from the given root.
   * This is a recursive procedure, which first saves all children of
   * a current directory and then moves inside the sub-directories.
   */
  private static void saveImage(ByteBuffer parentPrefix,
                                int prefixLength,
                                INodeDirectory current,
                                DataOutputStream out) throws IOException
 {
	    int newPrefixLength = prefixLength;
	    // empty directory: nothing to do
	    if (current.getChildrenRaw() == null)
	      return;
	    // first, write records for all children of the current node
	    for(INode child : current.getChildren()) 
	    {
	      // rewind the buffer to the parent prefix
	      parentPrefix.position(prefixLength);
	      // append the separator and the child's name to the prefix
	      parentPrefix.put(PATH_SEPARATOR).put(child.getLocalNameBytes());
	      // write the inode record
	      saveINode2Image(parentPrefix, child, out);
	    }
	    // then recurse into the children that are directories
	    for(INode child : current.getChildren()) 
	    {
	    	// skip files
	       if(!child.isDirectory())
	         continue;
	      
	      // rebuild the prefix for the recursive call
	      parentPrefix.position(prefixLength);
	      parentPrefix.put(PATH_SEPARATOR).put(child.getLocalNameBytes());
	      newPrefixLength = parentPrefix.position();
	      
	      // recurse
	      saveImage(parentPrefix, newPrefixLength, (INodeDirectory)child, out);
	    }
	    parentPrefix.position(prefixLength);
  }
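
For example, for a namespace containing a directory /a with one file /a/x plus a top-level file /f, the records are written in the order "" (the root, handled by saveINode2Image() in saveFSImage()), then "/a" and "/f" (the first loop over the root's children), and finally "/a/x" (the recursive call into /a). parentPrefix always holds the absolute path of the directory being traversed, so every record carries the inode's full path.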

2.3 saveINode2Image

/*
   * Save one inode's attributes to the image.
   */
  private static void saveINode2Image(ByteBuffer name,
                                      INode node,
                                      DataOutputStream out) throws IOException 
{
    int nameLen = name.position();
    out.writeShort(nameLen);
    out.write(name.array(), name.arrayOffset(), nameLen);
    
    // write file inode
    if (!node.isDirectory()) 
    {  
      INodeFile fileINode = (INodeFile)node;
      // attributes written: replication, modification time, access time, preferred block size
      out.writeShort(fileINode.getReplication());
      out.writeLong(fileINode.getModificationTime());
      out.writeLong(fileINode.getAccessTime());
      out.writeLong(fileINode.getPreferredBlockSize());
      Block[] blocks = fileINode.getBlocks();
      out.writeInt(blocks.length);
      for (Block blk : blocks)
        // write each block as well
        blk.write(out);
      FILE_PERM.fromShort(fileINode.getFsPermissionShort());
      PermissionStatus.write(out, fileINode.getUserName(),
                             fileINode.getGroupName(),
                             FILE_PERM);
    } 
    else  // write directory inode
    {   
      // for a directory, the quota values are also written
      out.writeShort(0);  // replication
      out.writeLong(node.getModificationTime());
      out.writeLong(0);   // access time
      out.writeLong(0);   // preferred block size
	 // # of blocks: -1 marks a directory
      out.writeInt(-1);    
      out.writeLong(node.getNsQuota());
      out.writeLong(node.getDsQuota());
      FILE_PERM.fromShort(node.getFsPermissionShort());
      PermissionStatus.write(out, node.getUserName(),
                             node.getGroupName(),
                             FILE_PERM);
    }
  }
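
Putting saveINode2Image() together, each image record is therefore laid out as:
(1) For a file: path length (short) and path bytes, replication (short), modification time (long), access time (long), preferred block size (long), block count (int), the blocks themselves, then the PermissionStatus (user, group, mode).
(2) For a directory: the same header with replication = 0, access time = 0, block size = 0 and block count = -1 (the marker that distinguishes a directory), followed by nsQuota (long), dsQuota (long) and the PermissionStatus.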

2.4 FSNamesystem.saveFilesUnderConstruction

/**
   * Serializes leases. 
   */
  void saveFilesUnderConstruction(DataOutputStream out) throws IOException 
  {
    synchronized (leaseManager) 
    {
      /**
      * write the size:
      * the number of paths held by the lease manager
      **/ 
      out.writeInt(leaseManager.countPath()); 

      for (Lease lease : leaseManager.getSortedLeases()) 
      {
        for(String path : lease.getPaths()) 
        {
          // verify that path exists in namespace
          INode node = dir.getFileINode(path);
          if (node == null) 
          {
            throw new IOException("saveLeases found path " + path +
                                  " but no matching entry in namespace.");
          }
          // verify the file is under construction
          if (!node.isUnderConstruction()) 
          {
            throw new IOException("saveLeases found path " + path +
                                  " but is not under construction.");
          }
          INodeFileUnderConstruction cons = (INodeFileUnderConstruction) node;
          FSImage.writeINodeUnderConstruction(out, cons, path);
        }
      }
    }
  }

3. The Edit Log

/**
 * A generic abstract class to support journaling of edits logs into 
 * a persistent storage.
 */
abstract class EditLogOutputStream extends OutputStream 
{
	  // two statistics:
	  // the number of syncs to disk, i.e. the number of buffer flushes
	  private long numSync;        // number of sync(s) to disk
	  // total time spent syncing
	  private long totalTimeSync;  // total time to sync

	 /**
	  * Create and initialize new edits log storage.
	  * @throws IOException
	  */
	  abstract void create() throws IOException;

	 /**
	  * Flush data to persistent store.
	  * Collect sync metrics.
	  */
    public void flush() throws IOException 
   {
    	// one more sync
	    numSync++;
	    long start = FSNamesystem.now();
	    // flushAndSync() is abstract; the concrete flush is implemented by subclasses
	    flushAndSync();
	    long end = FSNamesystem.now();
	    // accumulate the elapsed time
	    totalTimeSync += (end - start);
  	}

 /**
   * Return the size of the current edits log.
   * Length is used to check when it is large enough to start a checkpoint.
   */
  abstract long length() throws IOException;

}

EditLogFileOutputStream has two working buffers: (1) bufCurrent, the buffer log records are written into, and (2) bufReady, the buffer being written out to the file.

Log records written via write() go into bufCurrent. When the contents of bufCurrent need to be written to the file, EditLogFileOutputStream swaps the two buffers: the former log-write buffer becomes the file-write buffer, and the former file-write buffer becomes the log-write buffer.

After an EditLogFileOutputStream is constructed, its create() method must be called before data can be written to the stream.

In general, writing log data to the file follows the call order setReadyToFlush() first, then flush().
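
A minimal sketch of that double-buffer scheme (illustrative only; the real EditsDoubleBuffer does more bookkeeping):

// Writers always append to bufCurrent; setReadyToFlush() swaps the buffers so
// a flush can drain bufReady while new records keep arriving in bufCurrent.
class DoubleBufferSketch {
  private java.io.ByteArrayOutputStream bufCurrent = new java.io.ByteArrayOutputStream();
  private java.io.ByteArrayOutputStream bufReady = new java.io.ByteArrayOutputStream();

  synchronized void write(byte[] record) throws java.io.IOException {
    bufCurrent.write(record); // log records go into the write buffer
  }

  synchronized void setReadyToFlush() {
    // swap roles: the write buffer becomes the flush buffer and vice versa
    java.io.ByteArrayOutputStream tmp = bufReady;
    bufReady = bufCurrent;
    bufCurrent = tmp;
  }

  void flushTo(java.io.OutputStream out) throws java.io.IOException {
    bufReady.writeTo(out); // drain the ready buffer to the file
    bufReady.reset();
  }
}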

3.1 EditLogFileOutputStream

public class EditLogFileOutputStream extends EditLogOutputStream 
{
  	  private static Log LOG = LogFactory.getLog(EditLogFileOutputStream.class);

	  private File file;
	  private FileOutputStream fp; // file stream for storing edit logs
	  // the file channel of the output file
	  private FileChannel fc; // channel of the file stream for sync
	  private EditsDoubleBuffer doubleBuf;

	  static ByteBuffer fill = ByteBuffer.allocateDirect(1024 * 1024); // preallocation, 1MB

	 /**
	   * Create an empty edits log file.
	   */
	  @Override
	  public void create() throws IOException 
	  {
	  	// wipe any existing content; fc is the file channel
	    fc.truncate(0);
	    fc.position(0);
	    doubleBuf.getCurrentBuf().writeInt(FSConstants.LAYOUT_VERSION);
	    setReadyToFlush();
	    // flush() is the base-class method shown above
	    flush();
	  }

	 /**
	   * All data that has been written to the stream so far will be flushed. New
	   * data can be still written to the stream while flushing is performed.
	   */
	  @Override
	  public void setReadyToFlush() throws IOException 
	  {
	    // insert the end-of-log marker OP_INVALID
	    doubleBuf.getCurrentBuf().write(FSEditLogOpCodes.OP_INVALID.getOpCode()); // insert eof marker
	    doubleBuf.setReadyToFlush();
	  }
	
	 /**
	   * Flush ready buffer to persistent store. currentBuffer is not flushed as it
	   * accumulates new log records while readyBuffer will be flushed and synced.
	   */
	  @Override
	  protected void flushAndSync() throws IOException {
	    if (fp == null) {
	      throw new IOException("Trying to use aborted output stream");
	    }
	    preallocate(); // preallocate file if necessary
	    if (doubleBuf.isFlushed()) {
	      return;
	    }
	    doubleBuf.flushTo(fp);
	    // persist the log data; file-size metadata updates are not needed
	    fc.force(false);
	    // skip back over the end-of-file marker, ready for the next write
	    fc.position(fc.position() - 1);
	  }
  }
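
preallocate() is called by flushAndSync() above but its body is not shown. A sketch of how it might look inside EditLogFileOutputStream, consistent with the 1 MB fill buffer declared earlier (the threshold and exact behavior are assumptions, not the verbatim Hadoop code):

  // When less than 4KB of preallocated space remains, extend the file by
  // another 1MB of zeros, so force(false) never needs to sync a size change.
  private void preallocate() throws IOException {
    long position = fc.position();
    if (position + 4096 >= fc.size()) {
      fill.position(0);
      fc.write(fill, fc.size()); // append the 1MB fill buffer at end of file
    }
  }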

3.2 FSEditLog

 public void logOpenFile(String path, INodeFileUnderConstruction newNode) 
                   throws IOException 
 {
    // the absolute path
    UTF8 nameReplicationPair[] = new UTF8[] { 
      new UTF8(path), 
      // replication factor
      FSEditLog.toLogReplication(newNode.getReplication()),
      // modification time
      FSEditLog.toLogLong(newNode.getModificationTime()),
      // access time
      FSEditLog.toLogLong(newNode.getAccessTime()),
      // preferred block size
      FSEditLog.toLogLong(newNode.getPreferredBlockSize())};
    logEdit(OP_ADD,
            new ArrayWritable(UTF8.class, nameReplicationPair), 
            new ArrayWritable(Block.class, newNode.getBlocks()),
            newNode.getPermissionStatus(),
            new UTF8(newNode.getClientName()),
            new UTF8(newNode.getClientMachine()));
  }
 // stores the most current transactionId of this thread.
 // ThreadLocal keeps this state private to each thread: every thread
 // remembers the txid of its own most recent log operation
  private static final ThreadLocal<TransactionId> myTransactionId = new ThreadLocal<TransactionId>() 
  {
    protected synchronized TransactionId initialValue() {
      return new TransactionId(Long.MAX_VALUE);
    }
  };


/**
   * Write an operation to the edit log. Do not sync to persistent
   * store yet.
   */
  synchronized void logEdit(byte op, Writable ... writables) {
    if (getNumEditStreams() < 1) {
      throw new AssertionError("No edit streams to log to");
    }
    long start = FSNamesystem.now();
    for (int idx = 0; idx < editStreams.size(); idx++) {
      EditLogOutputStream eStream = editStreams.get(idx);
      try {
        // write the operation to every output stream
        eStream.write(op, writables);
      } catch (IOException ioe) {
        removeEditsAndStorageDir(idx);
        idx--; 
      }
    }
    exitIfNoStreams();
    // get a new transactionId
    txid++;

    //
    // record the transactionId when new data was written to the edits log
    //
    TransactionId id = myTransactionId.get();
    id.txid = txid;

    // update statistics
    long end = FSNamesystem.now();
    // every logEdit call increments the transaction count and accumulates the elapsed time
    numTransactions++;
    totalTimeTransactions += (end-start);
    if (metrics != null) // Metrics is non-null only when used inside name node
      metrics.addTransaction(end-start);
  }

3.2.3 logSync

The logic of logSync():
(1) Prepare to sync the log records. The first step is to ensure that no other thread is currently syncing.
Once the current thread may perform the sync, it records the relevant state and calls EditLogOutputStream.setReadyToFlush() to swap each output stream's log-write buffer with its file-write buffer.
(2) With the preparation done, logSync() calls flush() on all output streams, flushing and syncing the log records.
(3) Finally, logSync() sets FSEditLog's synctxid and isSyncRunning members, ending this logSync() call and preparing for the next one.

public void logSync() throws IOException 
{
    ArrayList<EditLogOutputStream> errorStreams = null;
    long syncStart = 0;

    // Fetch the transactionId of this thread. 
    long mytxid = myTransactionId.get().txid;

    ArrayList<EditLogOutputStream> streams = new ArrayList<EditLogOutputStream>();
    boolean sync = false;
    try {
      synchronized (this) {
        printStatistics(false);

        /**
        * if somebody is already syncing, then wait
        * (another thread is performing a log sync)
        **/ 
        while (mytxid > synctxid && isSyncRunning) {
          try {
            wait(1000);
          } catch (InterruptedException ie) { 
          }
        }

        
        // If this transaction was already flushed, then nothing to do
        // (the record was synced by another thread, so return)
        if (mytxid <= synctxid) 
        {
          // a txid no greater than the synced id still counts toward the batched-sync statistic
          numTransactionsBatchedInSync++;
          if (metrics != null) // Metrics is non-null only when used inside name node
            metrics.incrTransactionsBatchedInSync();
          return;
        }

        // now, this thread will do the sync; record the related state
        syncStart = txid;
        isSyncRunning = true;
        sync = true;

        // swap buffers
        exitIfNoStreams();
        for(EditLogOutputStream eStream : editStreams) 
        {
          try {
          	// swap the buffers
            eStream.setReadyToFlush();
            streams.add(eStream);
          } catch (IOException ie) {
            FSNamesystem.LOG.error("Unable to get ready to flush.", ie);
            //
            // remember the streams that encountered an error.
            //
            if (errorStreams == null) {
              errorStreams = new ArrayList<EditLogOutputStream>(1);
            }
            errorStreams.add(eStream);
          }
        }
      }

      // do the sync
      long start = FSNamesystem.now();
      for (EditLogOutputStream eStream : streams) {
        try {
          // flush the ready buffer to the persistent store
          eStream.flush();
        } catch (IOException ie) {
          FSNamesystem.LOG.error("Unable to sync edit log.", ie);
          //
          // remember the streams that encountered an error.
          //
          if (errorStreams == null) {
            errorStreams = new ArrayList<EditLogOutputStream>(1);
          }
          errorStreams.add(eStream);
        }
      }
      long elapsed = FSNamesystem.now() - start;
      removeEditsStreamsAndStorageDirs(errorStreams);
      exitIfNoStreams();

      if (metrics != null) // Metrics is non-null only when used inside name node
        metrics.addSync(elapsed);

    } finally {
      synchronized (this) {
        if(sync) {
          synctxid = syncStart;
          isSyncRunning = false;
        }
        this.notifyAll();
      }
    }
  }

3.3 loadFSImage

  /**
   * Load in the filesystem image from file. It's a big list of
   * filenames and blocks.  Return whether we should
   * "re-save" and consolidate the edit-logs
   */
  boolean loadFSImage(File curFile) throws IOException {
    assert this.getLayoutVersion() < 0 : "Negative layout version is expected.";
    assert curFile != null : "curFile is null";

    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
    FSDirectory fsDir = fsNamesys.dir;

    //
    // Load in bits
    //
    // read the image file data through an input stream
    boolean needToSave = true;
    DataInputStream in = new DataInputStream(new BufferedInputStream(
                              new FileInputStream(curFile)));
    try {
      /*
       * Note: Remove any checks for version earlier than 
       * Storage.LAST_UPGRADABLE_LAYOUT_VERSION since we should never get 
       * to here with older images.
       */
      
      /*
       * TODO we need to change format of the image file
       * it should not contain version and namespace fields
       */
      // read image version: first appeared in version -1
      int imgVersion = in.readInt();
      // read namespaceID: first appeared in version -2
      this.namespaceID = in.readInt();

      // read number of files
      // (the number of items in the directory tree)
      long numFiles;
      if (imgVersion <= -16) {
        numFiles = in.readLong();
      } else {
        numFiles = in.readInt();
      }

      this.layoutVersion = imgVersion;
      // read in the last generation stamp.
      if (imgVersion <= -12) {
        long genstamp = in.readLong();
        fsNamesys.setGenerationStamp(genstamp); 
      }

      needToSave = (imgVersion != FSConstants.LAYOUT_VERSION);

      // read file info
      short replication = FSNamesystem.getFSNamesystem().getDefaultReplication();

      LOG.info("Number of files = " + numFiles);

      String path;
      String parentPath = "";
      INodeDirectory parentINode = fsDir.rootDir;
      for (long i = 0; i < numFiles; i++) {
        long modificationTime = 0;
        long atime = 0;
        long blockSize = 0;
        path = readString(in);
        replication = in.readShort();
        replication = FSEditLog.adjustReplication(replication);
        modificationTime = in.readLong();
        if (imgVersion <= -17) {
          atime = in.readLong();
        }
        if (imgVersion <= -8) {
          blockSize = in.readLong();
        }
        int numBlocks = in.readInt();
        Block blocks[] = null;

        // for older versions, a blocklist of size 0
        // indicates a directory.
        if ((-9 <= imgVersion && numBlocks > 0) ||
            (imgVersion < -9 && numBlocks >= 0)) {
          blocks = new Block[numBlocks];
          for (int j = 0; j < numBlocks; j++) {
            blocks[j] = new Block();
            if (-14 < imgVersion) {
              // restore the blocks one by one
              blocks[j].set(in.readLong(), in.readLong(), 
                            Block.GRANDFATHER_GENERATION_STAMP);
            } else {
              blocks[j].readFields(in);
            }
          }
        }
        // Older versions of HDFS do not store the block size in inode.
        // If the file has more than one block, use the size of the 
        // first block as the blocksize. Otherwise use the default block size.
        //
        if (-8 <= imgVersion && blockSize == 0) {
          if (numBlocks > 1) {
            blockSize = blocks[0].getNumBytes();
          } else {
            long first = ((numBlocks == 1) ? blocks[0].getNumBytes(): 0);
            blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
          }
        }
        
        // get quota only when the node is a directory
        long nsQuota = -1L;
        if (imgVersion <= -16 && blocks == null) {
          nsQuota = in.readLong();
        }
        long dsQuota = -1L;
        if (imgVersion <= -18 && blocks == null) {
          dsQuota = in.readLong();
        }
        
        PermissionStatus permissions = fsNamesys.getUpgradePermission();
        if (imgVersion <= -11) {
          permissions = PermissionStatus.read(in);
        }
        if (path.length() == 0) { // it is the root
          // update the root's attributes
          if (nsQuota != -1 || dsQuota != -1) {
            fsDir.rootDir.setQuota(nsQuota, dsQuota);
          }
          fsDir.rootDir.setModificationTime(modificationTime);
          fsDir.rootDir.setPermissionStatus(permissions);
          continue;
        }
        // check if the new inode belongs to the same parent
        if(!isParent(path, parentPath)) {
          parentINode = null;
          parentPath = getParent(path);
        }
        // add new inode
        // attach the inode just read to the directory tree
        parentINode = fsDir.addToParent(path, parentINode, permissions,
                                        blocks, replication, modificationTime, 
                                        atime, nsQuota, dsQuota, blockSize);
      }
      
      // load datanode info
      this.loadDatanodes(imgVersion, in);

      // load Files Under Construction
      this.loadFilesUnderConstruction(imgVersion, in, fsNamesys);
      
      this.loadSecretManagerState(imgVersion, in, fsNamesys);
      
    } finally {
      in.close();
    }
    
    return needToSave;
  }

  /**
   * Return string representing the parent of the given path.
   */
  String getParent(String path) {
    return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
  }

  private boolean isParent(String path, String parent) {
    return parent != null && path != null
          && path.indexOf(parent) == 0
          && path.lastIndexOf(Path.SEPARATOR) == parent.length();
  }
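
A quick illustration of these two helpers on hypothetical paths:

// getParent("/a/b/c")         -> "/a/b"  (substring up to the last separator)
// isParent("/a/b/c", "/a/b")  -> true    (prefix matches, and the last '/' sits
//                                         exactly at parent.length() == 4)
// isParent("/a/bc/d", "/a/b") -> false   (prefix matches, but the last '/' is
//                                         at index 5, not 4)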

  /**
   * Load and merge edits from two edits files
   * 
   * @param sd storage directory
   * @return number of edits loaded
   * @throws IOException
   */
  int loadFSEdits(StorageDirectory sd) throws IOException {
    int numEdits = 0;
    EditLogFileInputStream edits = 
      new EditLogFileInputStream(getImageFile(sd, NameNodeFile.EDITS));
    numEdits = FSEditLog.loadFSEdits(edits);
    edits.close();
    File editsNew = getImageFile(sd, NameNodeFile.EDITS_NEW);
    if (editsNew.exists() && editsNew.length() > 0) {
      edits = new EditLogFileInputStream(editsNew);
      numEdits += FSEditLog.loadFSEdits(edits);
      edits.close();
    }
    // update the counts.
    FSNamesystem.getFSNamesystem().dir.updateCountForINodeWithQuota();    
    return numEdits;
  }

3.3.1 loadFilesUnderConstruction

private void loadFilesUnderConstruction(int version, DataInputStream in, 
                                  FSNamesystem fs) throws IOException 
{
    FSDirectory fsDir = fs.dir;
    if (version > -13) // pre lease image version
      return;
    int size = in.readInt();

    LOG.info("Number of files under construction = " + size);

    for (int i = 0; i < size; i++) 
    {
    	// read one INodeFileUnderConstruction record
      INodeFileUnderConstruction cons = readINodeUnderConstruction(in);

      // verify that file exists in namespace
      String path = cons.getLocalName();
      INode old = fsDir.getFileINode(path);
      if (old == null) {
        throw new IOException("Found lease for non-existent file " + path);
      }
      if (old.isDirectory()) {
        throw new IOException("Found lease for directory " + path);
      }
      INodeFile oldnode = (INodeFile) old;
      fsDir.replaceNode(path, oldnode, cons);
      fs.leaseManager.addLease(cons.clientName, path); 
    }
  }

3.3.2 loadFSEdits

  /**
   * Load an edit log, and apply the changes to the in-memory structure.
   * This is where we apply edits that we've been writing to disk all
   * along.
   */
  static int loadFSEdits(EditLogInputStream edits) throws IOException {
    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
    // FSDirectory is a facade: the operations invoked on it are delegated to the subsystems it contains
    FSDirectory fsDir = fsNamesys.dir;
    int numEdits = 0;
    int logVersion = 0;
    String clientName = null;
    String clientMachine = null;
    String path = null;
    int numOpAdd = 0, numOpClose = 0, numOpDelete = 0,
        numOpRename = 0, numOpSetRepl = 0, numOpMkDir = 0,
        numOpSetPerm = 0, numOpSetOwner = 0, numOpSetGenStamp = 0,
        numOpTimes = 0, numOpGetDelegationToken = 0,
        numOpRenewDelegationToken = 0, numOpCancelDelegationToken = 0,
        numOpUpdateMasterKey = 0, numOpOther = 0;

    long startTime = FSNamesystem.now();

    DataInputStream in = new DataInputStream(new BufferedInputStream(edits));
    try {
      // Read log file version. Could be missing. 
      in.mark(4);
      // If edits log is greater than 2G, available method will return negative
      // numbers, so we avoid having to call available
      boolean available = true;
      try {
        // first, read the log version
        logVersion = in.readByte();
      } catch (EOFException e) {
        available = false;
      }
      if (available) {
        in.reset();
        logVersion = in.readInt();
        if (logVersion < FSConstants.LAYOUT_VERSION) // future version
          throw new IOException(
                          "Unexpected version of the file system log file: "
                          + logVersion + ". Current version = " 
                          + FSConstants.LAYOUT_VERSION + ".");
      }
      assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION :
                            "Unsupported version " + logVersion;

      while (true) {
        long timestamp = 0;
        long mtime = 0;
        long atime = 0;
        long blockSize = 0;
        byte opcode = -1;
        try {
          // read the opcode
          opcode = in.readByte();
          // OP_INVALID marks the end of the log, so stop the loop
          if (opcode == OP_INVALID) {
            FSNamesystem.LOG.info("Invalid opcode, reached end of edit log " +
                                   "Number of transactions found " + numEdits);
            break; // no more transactions
          }
        } catch (EOFException e) {
          break; // no more transactions
        }
        // one more edit record
        numEdits++;
        
        // dispatch on the operation type
        switch (opcode) {
        case OP_ADD:
        case OP_CLOSE: {
          // versions > 0 support per file replication
          // get name and replication
          int length = in.readInt();
          if (-7 == logVersion && length != 3||
              -17 < logVersion && logVersion < -7 && length != 4 ||
              logVersion <= -17 && length != 5) {
              throw new IOException("Incorrect data format."  +
                                    " logVersion is " + logVersion +
                                    " but writables.length is " +
                                    length + ". ");
          }
          path = FSImage.readString(in);
          short replication = adjustReplication(readShort(in));
          mtime = readLong(in);
          if (logVersion <= -17) {
            atime = readLong(in);
          }
          if (logVersion < -7) {
            blockSize = readLong(in);
          }
          // get blocks
          Block blocks[] = null;
          if (logVersion <= -14) {
            blocks = readBlocks(in);
          } else {
            BlockTwo oldblk = new BlockTwo();
            int num = in.readInt();
            blocks = new Block[num];
            for (int i = 0; i < num; i++) {
              oldblk.readFields(in);
              blocks[i] = new Block(oldblk.blkid, oldblk.len, 
                                    Block.GRANDFATHER_GENERATION_STAMP);
            }
          }

          // Older versions of HDFS do not store the block size in inode.
          // If the file has more than one block, use the size of the
          // first block as the blocksize. Otherwise use the default
          // block size.
          if (-8 <= logVersion && blockSize == 0) {
            if (blocks.length > 1) {
              blockSize = blocks[0].getNumBytes();
            } else {
              long first = ((blocks.length == 1)? blocks[0].getNumBytes(): 0);
              blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
            }
          }
           
          PermissionStatus permissions = fsNamesys.getUpgradePermission();
          if (logVersion <= -11) {
            permissions = PermissionStatus.read(in);
          }

          // clientname, clientMachine and block locations of last block.
          if (opcode == OP_ADD && logVersion <= -12) {
            clientName = FSImage.readString(in);
            clientMachine = FSImage.readString(in);
            if (-13 <= logVersion) {
              readDatanodeDescriptorArray(in);
            }
          } else {
            clientName = "";
            clientMachine = "";
          }

          // The open lease transaction re-creates a file if necessary.
          // Delete the file if it already exists.
          if (FSNamesystem.LOG.isDebugEnabled()) {
            FSNamesystem.LOG.debug(opcode + ": " + path + 
                                   " numblocks : " + blocks.length +
                                   " clientHolder " +  clientName +
                                   " clientMachine " + clientMachine);
          }

          fsDir.unprotectedDelete(path, mtime);

          // add to the file tree
          INodeFile node = (INodeFile)fsDir.unprotectedAddFile(
                                                    path, permissions,
                                                    blocks, replication, 
                                                    mtime, atime, blockSize);
          if (opcode == OP_ADD) {
            numOpAdd++;
            //
            // Replace current node with an INodeFileUnderConstruction.
            // Recreate in-memory lease record.
            //
            INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
                                      node.getLocalNameBytes(),
                                      node.getReplication(), 
                                      node.getModificationTime(),
                                      node.getPreferredBlockSize(),
                                      node.getBlocks(),
                                      node.getPermissionStatus(),
                                      clientName, 
                                      clientMachine, 
                                      null);
            fsDir.replaceNode(path, node, cons);
            fsNamesys.leaseManager.addLease(cons.clientName, path);
          }
          break;
        } 
        case OP_SET_REPLICATION: {
          numOpSetRepl++;
          path = FSImage.readString(in);
          short replication = adjustReplication(readShort(in));
          fsDir.unprotectedSetReplication(path, replication, null);
          break;
        } 
        case OP_RENAME: {
          numOpRename++;
          int length = in.readInt();
          if (length != 3) {
            throw new IOException("Incorrect data format. " 
                                  + "Mkdir operation.");
          }
          // read the contents of the rename record
          String s = FSImage.readString(in);
          String d = FSImage.readString(in);
          timestamp = readLong(in);
          HdfsFileStatus dinfo = fsDir.getFileInfo(d);
          // perform the rename
          fsDir.unprotectedRenameTo(s, d, timestamp);
          // update the records in the lease manager
          fsNamesys.changeLease(s, d, dinfo);
          break;
        }
        case OP_DELETE: {
          numOpDelete++;
          int length = in.readInt();
          if (length != 2) {
            throw new IOException("Incorrect data format. " 
                                  + "delete operation.");
          }
          path = FSImage.readString(in);
          timestamp = readLong(in);
          fsDir.unprotectedDelete(path, timestamp);
          break;
        }
        case OP_MKDIR: {
          numOpMkDir++;
          PermissionStatus permissions = fsNamesys.getUpgradePermission();
          int length = in.readInt();
          if (-17 < logVersion && length != 2 ||
              logVersion <= -17 && length != 3) {
            throw new IOException("Incorrect data format. " 
                                  + "Mkdir operation.");
          }
          path = FSImage.readString(in);
          timestamp = readLong(in);

          // The disk format stores atimes for directories as well.
          // However, currently this is not being updated/used because of
          // performance reasons.
          if (logVersion <= -17) {
            atime = readLong(in);
          }

          if (logVersion <= -11) {
            permissions = PermissionStatus.read(in);
          }
          fsDir.unprotectedMkdir(path, permissions, timestamp);
          break;
        }
        case OP_SET_GENSTAMP: {
          numOpSetGenStamp++;
          long lw = in.readLong();
          fsDir.namesystem.setGenerationStamp(lw);
          break;
        } 
        case OP_DATANODE_ADD: {
          numOpOther++;
          FSImage.DatanodeImage nodeimage = new FSImage.DatanodeImage();
          nodeimage.readFields(in);
          //Datanodes are not persistent any more.
          break;
        }
        case OP_DATANODE_REMOVE: {
          numOpOther++;
          DatanodeID nodeID = new DatanodeID();
          nodeID.readFields(in);
          //Datanodes are not persistent any more.
          break;
        }
        case OP_SET_PERMISSIONS: {
          numOpSetPerm++;
          if (logVersion > -11)
            throw new IOException("Unexpected opcode " + opcode
                                  + " for version " + logVersion);
          fsDir.unprotectedSetPermission(
              FSImage.readString(in), FsPermission.read(in));
          break;
        }
        case OP_SET_OWNER: {
          numOpSetOwner++;
          if (logVersion > -11)
            throw new IOException("Unexpected opcode " + opcode
                                  + " for version " + logVersion);
          fsDir.unprotectedSetOwner(FSImage.readString(in),
              FSImage.readString_EmptyAsNull(in),
              FSImage.readString_EmptyAsNull(in));
          break;
        }
        case OP_SET_NS_QUOTA: {
          if (logVersion > -16) {
            throw new IOException("Unexpected opcode " + opcode
                + " for version " + logVersion);
          }
          fsDir.unprotectedSetQuota(FSImage.readString(in), 
                                    readLongWritable(in), 
                                    FSConstants.QUOTA_DONT_SET);
          break;
        }
        case OP_CLEAR_NS_QUOTA: {
          if (logVersion > -16) {
            throw new IOException("Unexpected opcode " + opcode
                + " for version " + logVersion);
          }
          fsDir.unprotectedSetQuota(FSImage.readString(in),
                                    FSConstants.QUOTA_RESET,
                                    FSConstants.QUOTA_DONT_SET);
          break;
        }

        case OP_SET_QUOTA:
          fsDir.unprotectedSetQuota(FSImage.readString(in),
                                    readLongWritable(in),
                                    readLongWritable(in));
                                      
          break;

        case OP_TIMES: {
          numOpTimes++;
          int length = in.readInt();
          if (length != 3) {
            throw new IOException("Incorrect data format. " 
                                  + "times operation.");
          }
          path = FSImage.readString(in);
          mtime = readLong(in);
          atime = readLong(in);
          fsDir.unprotectedSetTimes(path, mtime, atime, true);
          break;
        }
        case OP_GET_DELEGATION_TOKEN: {
          if (logVersion > -19) {
            throw new IOException("Unexpected opcode " + opcode
                + " for version " + logVersion);
          }
          numOpGetDelegationToken++;
          DelegationTokenIdentifier delegationTokenId = 
              new DelegationTokenIdentifier();
          delegationTokenId.readFields(in);
          long expiryTime = readLong(in);
          fsNamesys.getDelegationTokenSecretManager()
              .addPersistedDelegationToken(delegationTokenId, expiryTime);
          break;
        }
        case OP_RENEW_DELEGATION_TOKEN: {
          if (logVersion > -19) {
            throw new IOException("Unexpected opcode " + opcode
                + " for version " + logVersion);
          }
          numOpRenewDelegationToken++;
          DelegationTokenIdentifier delegationTokenId = 
              new DelegationTokenIdentifier();
          delegationTokenId.readFields(in);
          long expiryTime = readLong(in);
          fsNamesys.getDelegationTokenSecretManager()
              .updatePersistedTokenRenewal(delegationTokenId, expiryTime);
          break;
        }
        case OP_CANCEL_DELEGATION_TOKEN: {
          if (logVersion > -19) {
            throw new IOException("Unexpected opcode " + opcode
                + " for version " + logVersion);
          }
          numOpCancelDelegationToken++;
          DelegationTokenIdentifier delegationTokenId = 
              new DelegationTokenIdentifier();
          delegationTokenId.readFields(in);
          fsNamesys.getDelegationTokenSecretManager()
              .updatePersistedTokenCancellation(delegationTokenId);
          break;
        }
        case OP_UPDATE_MASTER_KEY: {
          if (logVersion > -19) {
            throw new IOException("Unexpected opcode " + opcode
                + " for version " + logVersion);
          }
          numOpUpdateMasterKey++;
          DelegationKey delegationKey = new DelegationKey();
          delegationKey.readFields(in);
          fsNamesys.getDelegationTokenSecretManager().updatePersistedMasterKey(
              delegationKey);
          break;
        }
        default: {
          throw new IOException("Never seen opcode " + opcode);
        }
        }
      }
    } catch (IOException ex) {
      // Failed to load 0.20.203 version edits during upgrade. This version has
      // conflicting opcodes with the later releases. The editlog must be 
      // emptied by restarting the namenode, before proceeding with the upgrade.
      if (Storage.is203LayoutVersion(logVersion) &&
          logVersion != FSConstants.LAYOUT_VERSION) {
        String msg = "During upgrade, failed to load the editlog version " + 
        logVersion + " from release 0.20.203. Please go back to the old " + 
        " release and restart the namenode. This empties the editlog " +
        " and saves the namespace. Resume the upgrade after this step.";
        throw new IOException(msg, ex);
      } else {
        throw ex;
      }
      
    } finally {
      in.close();
    }
    FSImage.LOG.info("Edits file " + edits.getName() 
        + " of size " + edits.length() + " edits # " + numEdits 
        + " loaded in " + (FSNamesystem.now()-startTime)/1000 + " seconds.");

    if (FSImage.LOG.isDebugEnabled()) {
      FSImage.LOG.debug("numOpAdd = " + numOpAdd + " numOpClose = " + numOpClose 
          + " numOpDelete = " + numOpDelete + " numOpRename = " + numOpRename 
          + " numOpSetRepl = " + numOpSetRepl + " numOpMkDir = " + numOpMkDir
          + " numOpSetPerm = " + numOpSetPerm 
          + " numOpSetOwner = " + numOpSetOwner
          + " numOpSetGenStamp = " + numOpSetGenStamp 
          + " numOpTimes = " + numOpTimes
          + " numOpGetDelegationToken = " + numOpGetDelegationToken
          + " numOpRenewDelegationToken = " + numOpRenewDelegationToken
          + " numOpCancelDelegationToken = " + numOpCancelDelegationToken
          + " numOpUpdateMasterKey = " + numOpUpdateMasterKey
          + " numOpOther = " + numOpOther);
    }

    if (logVersion != FSConstants.LAYOUT_VERSION) // other version
      numEdits++; // save this image asap
    return numEdits;
  }

Reprinted from blog.csdn.net/kaikai_sk/article/details/88661877