1. 文件系统的目录树
1.1. INode
/**
 * Base class of every entry in the namespace tree; concrete subclasses in
 * this file are {@code INodeDirectory} and {@code INodeFile}.
 *
 * NOTE(review): {@code permission} is read by {@link #getUserName()} but is
 * not declared in this excerpt; presumably it is a {@code long} field declared
 * elsewhere in the full class.
 */
abstract class INode implements Comparable<byte[]> {
  /** Name of this file or directory, as raw bytes. */
  protected byte[] name;
  /** Directory containing this inode; {@code null} once detached. */
  protected INodeDirectory parent;
  /** Time of the most recent modification. */
  protected long modificationTime;
  /** Time of the most recent access. */
  protected long accessTime;

  /**
   * Describes how three fields are packed into a single 64-bit word:
   * the access mode (16 bits), the group id (25 bits) and the user id
   * (23 bits), laid out from the least significant bit upwards.
   */
  private static enum PermissionStatusFormat {
    /** Access-permission bits. */
    MODE(0, 16),
    /** Group identifier. */
    GROUP(MODE.OFFSET + MODE.LENGTH, 25),
    /** Owner (user) identifier. */
    USER(GROUP.OFFSET + GROUP.LENGTH, 23);

    /** Position of the field's lowest bit inside the packed word. */
    final int OFFSET;
    /** Width of the field in bits. */
    final int LENGTH;
    /** Mask selecting exactly this field's bits inside the packed word. */
    final long MASK;

    PermissionStatusFormat(int bitOffset, int bitLength) {
      OFFSET = bitOffset;
      LENGTH = bitLength;
      // LENGTH one-bits in the low positions, then shifted up to OFFSET.
      final long lowOnes = (-1L) >>> (64 - LENGTH);
      MASK = lowOnes << OFFSET;
    }

    /** Extracts this field from {@code record}: mask, then shift down. */
    long retrieve(long record) {
      final long isolated = record & MASK;
      return isolated >>> OFFSET;
    }

    /** Returns {@code record} with this field replaced by {@code bits}. */
    long combine(long bits, long record) {
      final long cleared = record & ~MASK;
      final long shifted = bits << OFFSET;
      return cleared | shifted;
    }
  }

  /**
   * Get user name.
   *
   * The packed word only holds an integer id; the id is resolved to a name
   * through {@code SerialNumberManager} (per the original note, to avoid
   * storing a string in every inode).
   */
  public String getUserName() {
    final long userId = PermissionStatusFormat.USER.retrieve(permission);
    return SerialNumberManager.INSTANCE.getUser((int) userId);
  }

  /**
   * Check whether this is the root inode.
   * The root is recognised by its zero-length name.
   */
  boolean isRoot() {
    return 0 == name.length;
  }

  /**
   * Detaches this inode from its parent directory.
   *
   * @return {@code false} if there is no parent, {@code true} after the
   *         parent's {@code removeChild} has been called and the parent
   *         reference cleared
   */
  boolean removeNode() {
    if (parent == null) {
      return false;
    }
    parent.removeChild(this);
    parent = null;
    return true;
  }

  /**
   * Collects into {@code v} the blocks owned by all files in the subtree
   * rooted at this inode.
   */
  abstract int collectSubtreeBlocksAndClear(List<Block> v);
}
1.2 INodeDirectory
/**
 * Directory INode class: an inode that holds a list of child inodes.
 */
class INodeDirectory extends INode {
  /** Child files and directories; {@code null} is treated as "no children". */
  private List<INode> children;

  /**
   * Removes {@code node} from this directory's child list.
   *
   * The child is located with a binary search keyed on the node's name.
   *
   * @return the removed child, or {@code null} if no child has that name
   */
  INode removeChild(INode node) {
    assert children != null;
    final int position = Collections.binarySearch(children, node.name);
    return position >= 0 ? children.remove(position) : null;
  }

  /**
   * Recursively collects the blocks of every file below this directory into
   * {@code v} and clears the visited inodes.
   *
   * @return the number of inodes visited, counting this directory
   */
  int collectSubtreeBlocksAndClear(List<Block> v) {
    // Nothing to recurse into: count only this directory and leave it as is.
    if (children == null) {
      return 1;
    }
    int visited = 1;
    for (INode entry : children) {
      visited += entry.collectSubtreeBlocksAndClear(v);
    }
    // Subtree is collected; drop the references held by this directory.
    parent = null;
    children = null;
    return visited;
  }

  /**
   * Recursively accumulates the usage of this subtree into {@code counts};
   * this directory itself contributes one to {@code nsCount}.
   */
  DirCounts spaceConsumedInTree(DirCounts counts) {
    counts.nsCount += 1;
    if (children == null) {
      return counts;
    }
    for (INode entry : children) {
      entry.spaceConsumedInTree(counts);
    }
    return counts;
  }
}
1.3 INodeDirectoryWithQuota
HDFS允许管理员为每个目录设置配额,配额有两种:
(1)节点配额: 用于限制目录下的名字数量,如果创建文件或者目录时超过了该配额,操作会失败。这个配额用于控制用户对名字节点资源的占用,保存在成员变量nsQuota中
(2)空间配额:限制存在目录树中的所有文件的总规模,空间配额保证用户不会过多占用数据节点的资源,该配额由dsQuota变量保存。
1.4 INodeFile
/**
 * File inode: carries a packed header and the list of blocks of the file.
 */
class INodeFile extends INode {
  /**
   * Packed header: the high 16 bits store the replication factor and the
   * low 48 bits store the block size.
   */
  protected long header;

  /** Blocks that make up the file; {@code null} until assigned or after clearing. */
  protected BlockInfo blocks[] = null;

  /**
   * Appends every block owned by this file to {@code v}, then detaches the
   * file from its parent and drops its block list.
   *
   * The block array is {@code null} by default and is set to {@code null}
   * by this very method, so it is checked before iterating; a file without
   * blocks simply contributes nothing instead of failing with a
   * {@link NullPointerException}.
   *
   * @param v list receiving the blocks of this file
   * @return always 1 (this file is the only inode visited)
   */
  int collectSubtreeBlocksAndClear(List<Block> v) {
    parent = null;
    if (blocks != null) {
      for (Block blk : blocks) {
        v.add(blk);
      }
      blocks = null;
    }
    return 1;
  }
}
1.5 INodeFileUnderConstruction
// Inode of a file that is still being written (under construction).
// NOTE(review): the final fields below have no visible constructor in this
// excerpt; presumably one is declared elsewhere in the full class.
class INodeFileUnderConstruction extends INodeFile
{
// Name of the client writing the file; it is also the holder of the lease.
String clientName; // lease holder
// Host the writing client runs on.
private final String clientMachine;
// Set when the client is itself a node of the cluster: that node's descriptor.
private final DatanodeDescriptor clientNode; // if client is a cluster node too.
// Index of the node in charge of lease recovery; starts at -1.
private int primaryNodeIndex = -1; //the node working on lease recovery
// Datanodes holding the last block of the file (the write-pipeline members).
private DatanodeDescriptor[] targets = null; //locations for last block
// Time of the most recent lease recovery; starts at 0.
private long lastRecoveryTime = 0;
}
2 命名空间镜像
名字节点通常把命名空间镜像和编辑日志保存在“current”目录下。目录下通常有四个文件:
(1)fsimage:元数据镜像文件
(2)edits:日志文件,和元数据镜像文件一起,提供一个完整的HDFS目录树和元信息
(3)fstime:保存最近一次检查点的时间,检查点一般由SecondaryNameNode产生,是一次fsimage和edits合并的结果
(4)VERSION:和数据节点类似,该文件保存了NameNode存储的一些属性
2.1 saveFSImage
FSImage.saveFSImage()会将当前时刻的fsimage保存到newFile指定的文件中
*先输出文件头
/**
* Save the contents of the FS image to the file.
*
* Writes, in this order: a four-field header, the root inode, the rest of
* the tree, the files under construction and the secret-manager state.
* The stream is closed in all cases, and the elapsed time is logged.
*
* @param newFile file that receives the image
* @throws IOException if writing to the file fails
*/
void saveFSImage(File newFile) throws IOException
{
FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
FSDirectory fsDir = fsNamesys.dir;
long startTime = FSNamesystem.now();
//
/**
* Build a buffered data output stream on top of the target file.
**/
DataOutputStream out = new DataOutputStream(
new BufferedOutputStream(
new FileOutputStream(newFile)));
try {
/**
* The next four writes make up the file header.
**/
// layout version
out.writeInt(FSConstants.LAYOUT_VERSION);
// namespace ID
out.writeInt(namespaceID);
// number of items in the tree rooted at the root directory
out.writeLong(fsDir.rootDir.numItemsInTree());
// current generation stamp (not a wall-clock time)
out.writeLong(fsNamesys.getGenerationStamp());
/*
* Scratch buffer for path names, sized at four times MAX_PATH_LENGTH.
* NOTE(review): the original note says absolute paths are limited to
* 8000 bytes, which would make this buffer 32000 bytes (31.25 KB);
* confirm against FSConstants.MAX_PATH_LENGTH.
*/
byte[] byteStore = new byte[4*FSConstants.MAX_PATH_LENGTH];
ByteBuffer strbuf = ByteBuffer.wrap(byteStore);
/**
* save the root
* The buffer position is still 0 here, so the root is written with a
* zero-length name; that is why it is handled separately.
**/
saveINode2Image(strbuf, fsDir.rootDir, out);
// save the rest of the nodes
saveImage(strbuf, 0, fsDir.rootDir, out);
// save the files that are under construction
fsNamesys.saveFilesUnderConstruction(out);
// save the secret-manager (security) state
fsNamesys.saveSecretManagerState(out);
strbuf = null;
}
finally
{
out.close();
}
LOG.info("Image file of size " + newFile.length() + " saved in "
+ (FSNamesystem.now() - startTime)/1000 + " seconds.");
}
2.2 saveImage
/**
 * Save file tree image starting from the given root.
 * This is a recursive procedure, which first saves all children of
 * a current directory and then moves inside the sub-directories.
 *
 * @param parentPrefix buffer holding the path of {@code current} in its
 *                     first {@code prefixLength} bytes; reused as scratch
 *                     space for the children's paths
 * @param prefixLength number of valid path bytes in {@code parentPrefix}
 * @param current      directory whose children are written
 * @param out          destination stream
 */
private static void saveImage(ByteBuffer parentPrefix,
                              int prefixLength,
                              INodeDirectory current,
                              DataOutputStream out) throws IOException {
  // Directory without children: nothing to write.
  if (current.getChildrenRaw() == null) {
    return;
  }

  // Pass 1: write every direct child, file or directory.
  for (INode entry : current.getChildren()) {
    // Rewind to the end of the parent's path, then append "<sep><name>".
    parentPrefix.position(prefixLength);
    parentPrefix.put(PATH_SEPARATOR);
    parentPrefix.put(entry.getLocalNameBytes());
    saveINode2Image(parentPrefix, entry, out);
  }

  // Pass 2: descend into each child that is a directory.
  for (INode entry : current.getChildren()) {
    if (entry.isDirectory()) {
      parentPrefix.position(prefixLength);
      parentPrefix.put(PATH_SEPARATOR);
      parentPrefix.put(entry.getLocalNameBytes());
      final int childPrefixLength = parentPrefix.position();
      saveImage(parentPrefix, childPrefixLength, (INodeDirectory) entry, out);
    }
  }

  // Leave the buffer positioned at the end of this directory's own path.
  parentPrefix.position(prefixLength);
}
2.3 saveINode2Image
/*
* Save one inode's attributes to the image.
*
* The record starts with the path: a short length followed by the first
* name.position() bytes of the buffer. The remaining fields depend on
* whether the inode is a file or a directory; a directory is marked by a
* block count of -1.
*
* name - buffer whose bytes up to its current position are the inode path
* node - inode to serialize
* out  - destination stream
*/
private static void saveINode2Image(ByteBuffer name,
INode node,
DataOutputStream out) throws IOException
{
int nameLen = name.position();
out.writeShort(nameLen);
out.write(name.array(), name.arrayOffset(), nameLen);
// write file inode
if (!node.isDirectory())
{
INodeFile fileINode = (INodeFile)node;
// file attributes: replication, modification time, access time,
// preferred block size
out.writeShort(fileINode.getReplication());
out.writeLong(fileINode.getModificationTime());
out.writeLong(fileINode.getAccessTime());
out.writeLong(fileINode.getPreferredBlockSize());
Block[] blocks = fileINode.getBlocks();
out.writeInt(blocks.length);
for (Block blk : blocks)
// each block serializes itself into the stream
blk.write(out);
// NOTE(review): FILE_PERM is not declared in this excerpt; it is updated
// in place here and then passed to PermissionStatus.write.
FILE_PERM.fromShort(fileINode.getFsPermissionShort());
PermissionStatus.write(out, fileINode.getUserName(),
fileINode.getGroupName(),
FILE_PERM);
}
else // write directory inode
{
// a directory writes zeros for the file-only fields and additionally
// writes its two quota values
out.writeShort(0); // replication
out.writeLong(node.getModificationTime());
out.writeLong(0); // access time
out.writeLong(0); // preferred block size
// # of blocks; -1 marks this record as a directory
out.writeInt(-1);
out.writeLong(node.getNsQuota());
out.writeLong(node.getDsQuota());
FILE_PERM.fromShort(node.getFsPermissionShort());
PermissionStatus.write(out, node.getUserName(),
node.getGroupName(),
FILE_PERM);
}
}
2.4 FSNamesystem.saveFilesUnderConstruction
/**
 * Serializes leases.
 *
 * While holding the lease manager's monitor, writes the number of leased
 * paths followed by one under-construction record per leased path.
 *
 * @param out destination stream
 * @throws IOException if a leased path is missing from the namespace, is
 *                     not under construction, or the write fails
 */
void saveFilesUnderConstruction(DataOutputStream out) throws IOException {
  synchronized (leaseManager) {
    // write the size: number of paths tracked by the lease manager
    out.writeInt(leaseManager.countPath());

    for (Lease lease : leaseManager.getSortedLeases()) {
      for (String path : lease.getPaths()) {
        final INode node = dir.getFileINode(path);

        // verify that path exists in namespace
        if (node == null) {
          throw new IOException("saveLeases found path " + path +
              " but no matching entry in namespace.");
        }
        // confirm that the file really is under construction
        if (!node.isUnderConstruction()) {
          throw new IOException("saveLeases found path " + path +
              " but is not under construction.");
        }

        FSImage.writeINodeUnderConstruction(
            out, (INodeFileUnderConstruction) node, path);
      }
    }
  }
}
3.编辑日志
/**
 * A generic abstract class to support journaling of edits logs into
 * a persistent storage.
 *
 * NOTE(review): {@code flushAndSync()} is called by {@link #flush()} but is
 * not declared in this excerpt; presumably it is an abstract method declared
 * elsewhere in the full class.
 */
abstract class EditLogOutputStream extends OutputStream {
  // Two statistics kept per stream.
  /** Number of sync(s) to disk, i.e. how many times flush() was invoked. */
  private long numSync;
  /** Total time spent inside flushAndSync(), accumulated over all syncs. */
  private long totalTimeSync;

  /**
   * Create and initialize new edits log storage.
   *
   * @throws IOException
   */
  abstract void create() throws IOException;

  /**
   * Flush data to persistent store.
   * Collect sync metrics.
   *
   * The sync counter is bumped before the flush is attempted; the elapsed
   * time is added only when the flush returns normally.
   */
  public void flush() throws IOException {
    numSync++;
    final long begin = FSNamesystem.now();
    // The actual flush-and-sync is supplied by the concrete subclass.
    flushAndSync();
    totalTimeSync += FSNamesystem.now() - begin;
  }

  /**
   * Return the size of the current edits log.
   * Length is used to check when it is large enough to start a checkpoint.
   */
  abstract long length() throws IOException;
}
EditLogFileOutputStream有两个工作缓冲区:(1)bufCurrent:日志写入缓冲区(2)bufReady写文件缓冲区。
通过write()输出的日志记录会写到缓冲区bufCurrent中,当bufCurrent中的内容需要写往文件时,EditLogFileOutputStream会交换两个缓冲区,原来的日志写入缓冲区会变成写文件缓冲区,原来的写文件缓冲区会变成日志写入缓冲区。
EditLogFileOutputStream对象构造后,还需要调用对象的create()方法,才能开始往输出流中写入数据。
一般来说,往日志文件中写数据按照先setReadyToFlush()后flush()的调用顺序进行。
3.1 EditLogFileOutputStream
/**
* Edit-log output stream backed by a file and a double buffer.
*
* NOTE(review): flushAndSync here takes a boolean, while the base-class
* flush() shown earlier in this file calls a no-argument flushAndSync();
* the two excerpts look like they come from different versions - confirm.
* NOTE(review): preallocate() and the initialization of file/fp/fc/doubleBuf
* are not part of this excerpt.
*/
public class EditLogFileOutputStream extends EditLogOutputStream
{
private static Log LOG = LogFactory.getLog(EditLogFileOutputStream.class);
private File file;
private FileOutputStream fp; // file stream for storing edit logs
// file channel belonging to the output file
private FileChannel fc; // channel of the file stream for sync
private EditsDoubleBuffer doubleBuf;
static ByteBuffer fill = ByteBuffer.allocateDirect(1024 * 1024); // preallocation, 1MB
/**
* Create empty edits logs file.
*
* Truncates the file, writes the layout version into the current buffer,
* then swaps buffers and flushes so the header reaches the file.
*/
@Override
public void create() throws IOException
{
// discard any existing content; fc is the channel of the log file
fc.truncate(0);
fc.position(0);
doubleBuf.getCurrentBuf().writeInt(FSConstants.LAYOUT_VERSION);
setReadyToFlush();
// flush() is the method inherited from the base class
flush();
}
/**
* All data that has been written to the stream so far will be flushed. New
* data can be still written to the stream while flushing is performed.
*/
@Override
public void setReadyToFlush() throws IOException
{
// append the end-of-log marker OP_INVALID before handing the buffer over
doubleBuf.getCurrentBuf().write(FSEditLogOpCodes.OP_INVALID.getOpCode()); // insert eof marker
doubleBuf.setReadyToFlush();
}
/**
* Flush ready buffer to persistent store. currentBuffer is not flushed as it
* accumulates new log records while readyBuffer will be flushed and synced.
*
* When durable is true the channel is additionally forced to the device.
* Throws IOException if the underlying file stream is null.
*/
@Override
protected void flushAndSync(boolean durable) throws IOException {
if (fp == null) {
throw new IOException("Trying to use aborted output stream");
}
preallocate(); // preallocate file if necessary
if (doubleBuf.isFlushed()) {
return;
}
doubleBuf.flushTo(fp);
if (durable)
{
// make the log data durable
fc.force(false); // metadata updates not needed
}
// step back over the end-of-log marker so the next write overwrites it
fc.position(fc.position() - 1); // skip back the end-of-file marker
}
}
3.2 FSEditLog
/**
 * Records an OP_ADD entry in the edit log for a file being opened.
 *
 * The entry carries the file attributes (path, replication, modification
 * time, access time, preferred block size), the block list, the permission
 * status, and the name and machine of the writing client.
 *
 * @param path    absolute path of the file
 * @param newNode under-construction inode describing the file
 */
public void logOpenFile(String path, INodeFileUnderConstruction newNode)
    throws IOException {
  final UTF8[] fileAttributes = {
      // absolute path
      new UTF8(path),
      // replication factor
      FSEditLog.toLogReplication(newNode.getReplication()),
      // modification time
      FSEditLog.toLogLong(newNode.getModificationTime()),
      // access time
      FSEditLog.toLogLong(newNode.getAccessTime()),
      // preferred block size
      FSEditLog.toLogLong(newNode.getPreferredBlockSize())
  };
  final ArrayWritable attributeRecord =
      new ArrayWritable(UTF8.class, fileAttributes);
  final ArrayWritable blockRecord =
      new ArrayWritable(Block.class, newNode.getBlocks());

  logEdit(OP_ADD,
      attributeRecord,
      blockRecord,
      newNode.getPermissionStatus(),
      new UTF8(newNode.getClientName()),
      new UTF8(newNode.getClientMachine()));
}
// stores the most current transactionId of this thread.
// ThreadLocal keeps this state private to each thread: every thread sees its
// own TransactionId, holding the id of the last edit that thread logged.
// A thread's initial value wraps Long.MAX_VALUE.
private static final ThreadLocal<TransactionId> myTransactionId = new ThreadLocal<TransactionId>()
{
protected synchronized TransactionId initialValue() {
return new TransactionId(Long.MAX_VALUE);
}
};
/**
 * Write an operation to the edit log. Do not sync to persistent
 * store yet.
 *
 * The operation is written to every edit stream; a stream that fails with
 * an IOException is removed together with its storage directory. A new
 * transaction id is then allocated and remembered for the calling thread,
 * and the transaction statistics are updated.
 *
 * @param op        opcode of the operation
 * @param writables payload of the operation
 */
synchronized void logEdit(byte op, Writable ... writables) {
  if (getNumEditStreams() < 1) {
    throw new AssertionError("No edit streams to log to");
  }
  final long begin = FSNamesystem.now();

  // Write the operation into each output stream. The index only advances
  // after a successful write, so after a failed stream has been removed the
  // same index is visited again.
  int streamIndex = 0;
  while (streamIndex < editStreams.size()) {
    EditLogOutputStream stream = editStreams.get(streamIndex);
    try {
      stream.write(op, writables);
      streamIndex++;
    } catch (IOException ioe) {
      removeEditsAndStorageDir(streamIndex);
    }
  }
  exitIfNoStreams();

  // get a new transactionId
  txid++;

  // record the transactionId when new data was written to the edits log
  myTransactionId.get().txid = txid;

  // update statistics: every logEdit call adds one transaction and its
  // elapsed time
  final long elapsed = FSNamesystem.now() - begin;
  numTransactions++;
  totalTimeTransactions += elapsed;
  if (metrics != null) { // Metrics is non-null only when used inside name node
    metrics.addTransaction(elapsed);
  }
}
3.2.3 logSync
logSync的逻辑:
(1)为同步日志记录做准备。准备动作的第一步就是保证没有其它线程在执行日志记录同步工作。
如果当前线程可以执行日志记录同步操作,在记录相关信息后,调用EditLogOutputStream.setReadyToFlush(),交换输出流的日志写入缓冲区和写文件缓冲区。
(2)准备工作完成后,logSync()调用所有输出流的flush()方法,刷新并同步日志记录。
(3) 最后,logSync()设置FSEditLog的成员变量synctxid和isSyncRunning,结束这次logSync()调用并为下次调用做好准备。
/**
* Syncs the edit log up to the calling thread's last transaction.
*
* Phase 1 (under this object's monitor): wait while another thread is
* syncing; return early if this thread's transaction is already synced;
* otherwise mark a sync as running and swap the buffers of every stream.
* Phase 2 (outside the monitor): flush each stream whose swap succeeded.
* Phase 3 (finally, under the monitor): if this thread did the sync, record
* the synced transaction id and clear the running flag; always notify
* waiting threads.
*
* Streams that fail in phase 1 or 2 are collected and removed afterwards.
*/
public void logSync() throws IOException
{
ArrayList<EditLogOutputStream> errorStreams = null;
long syncStart = 0;
// Fetch the transactionId of this thread.
long mytxid = myTransactionId.get().txid;
ArrayList<EditLogOutputStream> streams = new ArrayList<EditLogOutputStream>();
boolean sync = false;
try {
synchronized (this) {
printStatistics(false);
/**
* if somebody is already syncing, then wait
* (another thread is currently performing the log sync)
**/
while (mytxid > synctxid && isSyncRunning) {
try {
wait(1000);
} catch (InterruptedException ie) {
// NOTE(review): the interrupt is swallowed here and the loop simply
// re-checks its condition; the thread's interrupt status is not restored.
}
}
// If this transaction was already flushed, then nothing to do
// (another thread has already synced this thread's edits)
if (mytxid <= synctxid)
{
// the transaction rode along in somebody else's sync; count it as batched
numTransactionsBatchedInSync++;
if (metrics != null) // Metrics is non-null only when used inside name node
metrics.incrTransactionsBatchedInSync();
return;
}
// now, this thread will do the sync
// remember the newest transaction id that this sync will cover
syncStart = txid;
isSyncRunning = true;
sync = true;
// swap buffers
exitIfNoStreams();
for(EditLogOutputStream eStream : editStreams)
{
try {
// swap the stream's two buffers
eStream.setReadyToFlush();
streams.add(eStream);
} catch (IOException ie) {
FSNamesystem.LOG.error("Unable to get ready to flush.", ie);
//
// remember the streams that encountered an error.
//
if (errorStreams == null) {
errorStreams = new ArrayList<EditLogOutputStream>(1);
}
errorStreams.add(eStream);
}
}
}
// do the sync
// performed outside the monitor, on the streams that were swapped above
long start = FSNamesystem.now();
for (EditLogOutputStream eStream : streams) {
try {
// write the ready buffer of this stream out
eStream.flush();
} catch (IOException ie) {
FSNamesystem.LOG.error("Unable to sync edit log.", ie);
//
// remember the streams that encountered an error.
//
if (errorStreams == null) {
errorStreams = new ArrayList<EditLogOutputStream>(1);
}
errorStreams.add(eStream);
}
}
long elapsed = FSNamesystem.now() - start;
removeEditsStreamsAndStorageDirs(errorStreams);
exitIfNoStreams();
if (metrics != null) // Metrics is non-null only when used inside name node
metrics.addSync(elapsed);
} finally {
synchronized (this) {
// only the thread that actually ran the sync publishes its result
if(sync) {
synctxid = syncStart;
isSyncRunning = false;
}
this.notifyAll();
}
}
}
3.3 loadFSImage
/**
* Load in the filesystem image from file. It's a big list of
* filenames and blocks. Return whether we should
* "re-save" and consolidate the edit-logs
*
* Reads the header (image version, namespace id, number of files and, for
* versions <= -12, the generation stamp), then one record per inode, then
* the datanode section, the files under construction and the secret-manager
* state. Which fields a record contains depends on the image version.
*
* @param curFile image file to read
* @return true when the image version differs from
*         FSConstants.LAYOUT_VERSION
* @throws IOException if reading the file fails
*/
boolean loadFSImage(File curFile) throws IOException {
assert this.getLayoutVersion() < 0 : "Negative layout version is expected.";
assert curFile != null : "curFile is null";
FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
FSDirectory fsDir = fsNamesys.dir;
//
// Load in bits
//
// the image file is read through a buffered data input stream
boolean needToSave = true;
DataInputStream in = new DataInputStream(new BufferedInputStream(
new FileInputStream(curFile)));
try {
/*
* Note: Remove any checks for version earlier than
* Storage.LAST_UPGRADABLE_LAYOUT_VERSION since we should never get
* to here with older images.
*/
/*
* TODO we need to change format of the image file
* it should not contain version and namespace fields
*/
// read image version: first appeared in version -1
int imgVersion = in.readInt();
// read namespaceID: first appeared in version -2
this.namespaceID = in.readInt();
// read number of files
// (number of entries in the directory tree; a long from version -16 on,
// an int before that)
long numFiles;
if (imgVersion <= -16) {
numFiles = in.readLong();
} else {
numFiles = in.readInt();
}
this.layoutVersion = imgVersion;
// read in the last generation stamp.
if (imgVersion <= -12) {
long genstamp = in.readLong();
fsNamesys.setGenerationStamp(genstamp);
}
needToSave = (imgVersion != FSConstants.LAYOUT_VERSION);
// read file info
short replication = FSNamesystem.getFSNamesystem().getDefaultReplication();
LOG.info("Number of files = " + numFiles);
String path;
String parentPath = "";
INodeDirectory parentINode = fsDir.rootDir;
for (long i = 0; i < numFiles; i++) {
long modificationTime = 0;
long atime = 0;
long blockSize = 0;
path = readString(in);
replication = in.readShort();
replication = FSEditLog.adjustReplication(replication);
modificationTime = in.readLong();
if (imgVersion <= -17) {
atime = in.readLong();
}
if (imgVersion <= -8) {
blockSize = in.readLong();
}
int numBlocks = in.readInt();
Block blocks[] = null;
// for older versions, a blocklist of size 0
// indicates a directory.
if ((-9 <= imgVersion && numBlocks > 0) ||
(imgVersion < -9 && numBlocks >= 0)) {
blocks = new Block[numBlocks];
for (int j = 0; j < numBlocks; j++) {
blocks[j] = new Block();
if (-14 < imgVersion) {
// older format: only two longs are stored per block, and the
// generation stamp is set to the grandfather value
blocks[j].set(in.readLong(), in.readLong(),
Block.GRANDFATHER_GENERATION_STAMP);
} else {
blocks[j].readFields(in);
}
}
}
// Older versions of HDFS does not store the block size in inode.
// If the file has more than one block, use the size of the
// first block as the blocksize. Otherwise use the default block size.
//
if (-8 <= imgVersion && blockSize == 0) {
if (numBlocks > 1) {
blockSize = blocks[0].getNumBytes();
} else {
long first = ((numBlocks == 1) ? blocks[0].getNumBytes(): 0);
blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
}
}
// get quota only when the node is a directory
// (blocks == null is what identifies a directory record here)
long nsQuota = -1L;
if (imgVersion <= -16 && blocks == null) {
nsQuota = in.readLong();
}
long dsQuota = -1L;
if (imgVersion <= -18 && blocks == null) {
dsQuota = in.readLong();
}
PermissionStatus permissions = fsNamesys.getUpgradePermission();
if (imgVersion <= -11) {
permissions = PermissionStatus.read(in);
}
if (path.length() == 0) { // it is the root
// update the root's attributes
if (nsQuota != -1 || dsQuota != -1) {
fsDir.rootDir.setQuota(nsQuota, dsQuota);
}
fsDir.rootDir.setModificationTime(modificationTime);
fsDir.rootDir.setPermissionStatus(permissions);
continue;
}
// check if the new inode belongs to the same parent
// (if not, forget the cached parent inode and remember the new parent path)
if(!isParent(path, parentPath)) {
parentINode = null;
parentPath = getParent(path);
}
// add new inode
// the inode just read is attached to the directory tree; the returned
// parent is kept for the next record
parentINode = fsDir.addToParent(path, parentINode, permissions,
blocks, replication, modificationTime,
atime, nsQuota, dsQuota, blockSize);
}
// load datanode info
this.loadDatanodes(imgVersion, in);
// load Files Under Construction
this.loadFilesUnderConstruction(imgVersion, in, fsNamesys);
this.loadSecretManagerState(imgVersion, in, fsNamesys);
} finally {
in.close();
}
return needToSave;
}
/**
 * Return string representing the parent of the given path, i.e. everything
 * before the last {@code Path.SEPARATOR}.
 */
String getParent(String path) {
  final int lastSeparator = path.lastIndexOf(Path.SEPARATOR);
  return path.substring(0, lastSeparator);
}
/**
 * Tells whether {@code parent} is the direct parent path of {@code path}:
 * {@code path} must begin with {@code parent} and its last separator must
 * sit immediately after that prefix. Either argument being {@code null}
 * yields {@code false}.
 */
private boolean isParent(String path, String parent) {
  if (parent == null || path == null) {
    return false;
  }
  if (!path.startsWith(parent)) {
    return false;
  }
  return path.lastIndexOf(Path.SEPARATOR) == parent.length();
}
/**
 * Load and merge edits from two edits files
 *
 * The EDITS file is always loaded; the EDITS_NEW file is loaded as well
 * when it exists and is not empty. Afterwards the quota counts of the
 * directory tree are refreshed.
 *
 * @param sd storage directory
 * @return number of edits loaded
 * @throws IOException
 */
int loadFSEdits(StorageDirectory sd) throws IOException {
  final File editsFile = getImageFile(sd, NameNodeFile.EDITS);
  EditLogFileInputStream edits = new EditLogFileInputStream(editsFile);
  int numEdits = FSEditLog.loadFSEdits(edits);
  edits.close();

  final File editsNew = getImageFile(sd, NameNodeFile.EDITS_NEW);
  final boolean hasNewEdits = editsNew.exists() && editsNew.length() > 0;
  if (hasNewEdits) {
    edits = new EditLogFileInputStream(editsNew);
    numEdits += FSEditLog.loadFSEdits(edits);
    edits.close();
  }

  // update the counts.
  FSNamesystem.getFSNamesystem().dir.updateCountForINodeWithQuota();
  return numEdits;
}
3.3.1 loadFilesUnderConstruction
/**
 * Reads the "files under construction" section of the image and restores
 * the corresponding inodes and leases.
 *
 * Images with a version greater than -13 have no such section and are
 * skipped. For every record, the existing file inode at that path is
 * replaced by the under-construction inode and a lease is added for its
 * client.
 *
 * @param version image layout version
 * @param in      image input stream, positioned at the section
 * @param fs      namesystem whose directory and lease manager are updated
 * @throws IOException if a record refers to a missing path or a directory
 */
private void loadFilesUnderConstruction(int version, DataInputStream in,
    FSNamesystem fs) throws IOException {
  final FSDirectory fsDir = fs.dir;
  if (version > -13) { // pre lease image version
    return;
  }

  final int size = in.readInt();
  LOG.info("Number of files under construction = " + size);

  for (int remaining = size; remaining > 0; remaining--) {
    // read one under-construction inode record
    final INodeFileUnderConstruction cons = readINodeUnderConstruction(in);

    // verify that file exists in namespace
    final String path = cons.getLocalName();
    final INode existing = fsDir.getFileINode(path);
    if (existing == null) {
      throw new IOException("Found lease for non-existent file " + path);
    }
    if (existing.isDirectory()) {
      throw new IOException("Found lease for directory " + path);
    }

    fsDir.replaceNode(path, (INodeFile) existing, cons);
    fs.leaseManager.addLease(cons.clientName, path);
  }
}
3.3.2 loadFSEdits
* Load an edit log, and apply the changes to the in-memory structure
* This is where we apply edits that we've been writing to disk all
* along.
* 导入编辑日志文件,并在内存中构建此时状态
*/
static int loadFSEdits(EditLogInputStream edits) throws IOException {
FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
//FSDirectory是一个门面模式的体现,所有的操作都是在这个类中分给里面的子系数实现
FSDirectory fsDir = fsNamesys.dir;
int numEdits = 0;
int logVersion = 0;
String clientName = null;
String clientMachine = null;
String path = null;
int numOpAdd = 0, numOpClose = 0, numOpDelete = 0,
numOpRename = 0, numOpSetRepl = 0, numOpMkDir = 0,
numOpSetPerm = 0, numOpSetOwner = 0, numOpSetGenStamp = 0,
numOpTimes = 0, numOpGetDelegationToken = 0,
numOpRenewDelegationToken = 0, numOpCancelDelegationToken = 0,
numOpUpdateMasterKey = 0, numOpOther = 0;
long startTime = FSNamesystem.now();
DataInputStream in = new DataInputStream(new BufferedInputStream(edits));
try {
// Read log file version. Could be missing.
in.mark(4);
// If edits log is greater than 2G, available method will return negative
// numbers, so we avoid having to call available
boolean available = true;
try {
// 首先读入日志版本号
logVersion = in.readByte();
} catch (EOFException e) {
available = false;
}
if (available) {
in.reset();
logVersion = in.readInt();
if (logVersion < FSConstants.LAYOUT_VERSION) // future version
throw new IOException(
"Unexpected version of the file system log file: "
+ logVersion + ". Current version = "
+ FSConstants.LAYOUT_VERSION + ".");
}
assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION :
"Unsupported version " + logVersion;
while (true) {
long timestamp = 0;
long mtime = 0;
long atime = 0;
long blockSize = 0;
byte opcode = -1;
try {
//读入操作参数
opcode = in.readByte();
//如果读入的是无效参数,则表明已经读到日志的尾部了,可以跳出循环
if (opcode == OP_INVALID) {
FSNamesystem.LOG.info("Invalid opcode, reached end of edit log " +
"Number of transactions found " + numEdits);
break; // no more transactions
}
} catch (EOFException e) {
break; // no more transactions
}
//进行记录数的累加
numEdits++;
//下面根据操作类型进行值的设置
switch (opcode) {
case OP_ADD:
case OP_CLOSE: {
// versions > 0 support per file replication
// get name and replication
int length = in.readInt();
if (-7 == logVersion && length != 3||
-17 < logVersion && logVersion < -7 && length != 4 ||
logVersion <= -17 && length != 5) {
throw new IOException("Incorrect data format." +
" logVersion is " + logVersion +
" but writables.length is " +
length + ". ");
}
path = FSImage.readString(in);
short replication = adjustReplication(readShort(in));
mtime = readLong(in);
if (logVersion <= -17) {
atime = readLong(in);
}
if (logVersion < -7) {
blockSize = readLong(in);
}
// get blocks
Block blocks[] = null;
if (logVersion <= -14) {
blocks = readBlocks(in);
} else {
BlockTwo oldblk = new BlockTwo();
int num = in.readInt();
blocks = new Block[num];
for (int i = 0; i < num; i++) {
oldblk.readFields(in);
blocks[i] = new Block(oldblk.blkid, oldblk.len,
Block.GRANDFATHER_GENERATION_STAMP);
}
}
// Older versions of HDFS does not store the block size in inode.
// If the file has more than one block, use the size of the
// first block as the blocksize. Otherwise use the default
// block size.
if (-8 <= logVersion && blockSize == 0) {
if (blocks.length > 1) {
blockSize = blocks[0].getNumBytes();
} else {
long first = ((blocks.length == 1)? blocks[0].getNumBytes(): 0);
blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
}
}
PermissionStatus permissions = fsNamesys.getUpgradePermission();
if (logVersion <= -11) {
permissions = PermissionStatus.read(in);
}
// clientname, clientMachine and block locations of last block.
if (opcode == OP_ADD && logVersion <= -12) {
clientName = FSImage.readString(in);
clientMachine = FSImage.readString(in);
if (-13 <= logVersion) {
readDatanodeDescriptorArray(in);
}
} else {
clientName = "";
clientMachine = "";
}
// The open lease transaction re-creates a file if necessary.
// Delete the file if it already exists.
if (FSNamesystem.LOG.isDebugEnabled()) {
FSNamesystem.LOG.debug(opcode + ": " + path +
" numblocks : " + blocks.length +
" clientHolder " + clientName +
" clientMachine " + clientMachine);
}
fsDir.unprotectedDelete(path, mtime);
// add to the file tree
INodeFile node = (INodeFile)fsDir.unprotectedAddFile(
path, permissions,
blocks, replication,
mtime, atime, blockSize);
if (opcode == OP_ADD) {
numOpAdd++;
//
// Replace current node with a INodeUnderConstruction.
// Recreate in-memory lease record.
// 构造出处于构建状态的的文件对象
//
INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
node.getLocalNameBytes(),
node.getReplication(),
node.getModificationTime(),
node.getPreferredBlockSize(),
node.getBlocks(),
node.getPermissionStatus(),
clientName,
clientMachine,
null);
fsDir.replaceNode(path, node, cons);
fsNamesys.leaseManager.addLease(cons.clientName, path);
}
break;
}
case OP_SET_REPLICATION: {
numOpSetRepl++;
path = FSImage.readString(in);
short replication = adjustReplication(readShort(in));
fsDir.unprotectedSetReplication(path, replication, null);
break;
}
case OP_RENAME: {
numOpRename++;
int length = in.readInt();
if (length != 3) {
throw new IOException("Incorrect data format. "
+ "Mkdir operation.");
}
//读入改名操作的日志记录内容
String s = FSImage.readString(in);
String d = FSImage.readString(in);
timestamp = readLong(in);
HdfsFileStatus dinfo = fsDir.getFileInfo(d);
//改名
fsDir.unprotectedRenameTo(s, d, timestamp);
//修改租约管理器中的记录
fsNamesys.changeLease(s, d, dinfo);
break;
}
case OP_DELETE: {
numOpDelete++;
int length = in.readInt();
if (length != 2) {
throw new IOException("Incorrect data format. "
+ "delete operation.");
}
path = FSImage.readString(in);
timestamp = readLong(in);
fsDir.unprotectedDelete(path, timestamp);
break;
}
case OP_MKDIR: {
numOpMkDir++;
PermissionStatus permissions = fsNamesys.getUpgradePermission();
int length = in.readInt();
if (-17 < logVersion && length != 2 ||
logVersion <= -17 && length != 3) {
throw new IOException("Incorrect data format. "
+ "Mkdir operation.");
}
path = FSImage.readString(in);
timestamp = readLong(in);
// The disk format stores atimes for directories as well.
// However, currently this is not being updated/used because of
// performance reasons.
if (logVersion <= -17) {
atime = readLong(in);
}
if (logVersion <= -11) {
permissions = PermissionStatus.read(in);
}
fsDir.unprotectedMkdir(path, permissions, timestamp);
break;
}
case OP_SET_GENSTAMP: {
numOpSetGenStamp++;
long lw = in.readLong();
fsDir.namesystem.setGenerationStamp(lw);
break;
}
case OP_DATANODE_ADD: {
numOpOther++;
FSImage.DatanodeImage nodeimage = new FSImage.DatanodeImage();
nodeimage.readFields(in);
//Datnodes are not persistent any more.
break;
}
case OP_DATANODE_REMOVE: {
numOpOther++;
DatanodeID nodeID = new DatanodeID();
nodeID.readFields(in);
//Datanodes are not persistent any more.
break;
}
case OP_SET_PERMISSIONS: {
numOpSetPerm++;
if (logVersion > -11)
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
fsDir.unprotectedSetPermission(
FSImage.readString(in), FsPermission.read(in));
break;
}
case OP_SET_OWNER: {
numOpSetOwner++;
if (logVersion > -11)
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
fsDir.unprotectedSetOwner(FSImage.readString(in),
FSImage.readString_EmptyAsNull(in),
FSImage.readString_EmptyAsNull(in));
break;
}
case OP_SET_NS_QUOTA: {
if (logVersion > -16) {
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
}
fsDir.unprotectedSetQuota(FSImage.readString(in),
readLongWritable(in),
FSConstants.QUOTA_DONT_SET);
break;
}
case OP_CLEAR_NS_QUOTA: {
if (logVersion > -16) {
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
}
fsDir.unprotectedSetQuota(FSImage.readString(in),
FSConstants.QUOTA_RESET,
FSConstants.QUOTA_DONT_SET);
break;
}
case OP_SET_QUOTA:
fsDir.unprotectedSetQuota(FSImage.readString(in),
readLongWritable(in),
readLongWritable(in));
break;
case OP_TIMES: {
numOpTimes++;
int length = in.readInt();
if (length != 3) {
throw new IOException("Incorrect data format. "
+ "times operation.");
}
path = FSImage.readString(in);
mtime = readLong(in);
atime = readLong(in);
fsDir.unprotectedSetTimes(path, mtime, atime, true);
break;
}
case OP_GET_DELEGATION_TOKEN: {
if (logVersion > -19) {
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
}
numOpGetDelegationToken++;
DelegationTokenIdentifier delegationTokenId =
new DelegationTokenIdentifier();
delegationTokenId.readFields(in);
long expiryTime = readLong(in);
fsNamesys.getDelegationTokenSecretManager()
.addPersistedDelegationToken(delegationTokenId, expiryTime);
break;
}
case OP_RENEW_DELEGATION_TOKEN: {
if (logVersion > -19) {
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
}
numOpRenewDelegationToken++;
DelegationTokenIdentifier delegationTokenId =
new DelegationTokenIdentifier();
delegationTokenId.readFields(in);
long expiryTime = readLong(in);
fsNamesys.getDelegationTokenSecretManager()
.updatePersistedTokenRenewal(delegationTokenId, expiryTime);
break;
}
case OP_CANCEL_DELEGATION_TOKEN: {
if (logVersion > -19) {
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
}
numOpCancelDelegationToken++;
DelegationTokenIdentifier delegationTokenId =
new DelegationTokenIdentifier();
delegationTokenId.readFields(in);
fsNamesys.getDelegationTokenSecretManager()
.updatePersistedTokenCancellation(delegationTokenId);
break;
}
case OP_UPDATE_MASTER_KEY: {
if (logVersion > -19) {
throw new IOException("Unexpected opcode " + opcode
+ " for version " + logVersion);
}
numOpUpdateMasterKey++;
DelegationKey delegationKey = new DelegationKey();
delegationKey.readFields(in);
fsNamesys.getDelegationTokenSecretManager().updatePersistedMasterKey(
delegationKey);
break;
}
default: {
throw new IOException("Never seen opcode " + opcode);
}
}
}
} catch (IOException ex) {
// Failed to load 0.20.203 version edits during upgrade. This version has
// conflicting opcodes with the later releases. The editlog must be
// emptied by restarting the namenode, before proceeding with the upgrade.
if (Storage.is203LayoutVersion(logVersion) &&
logVersion != FSConstants.LAYOUT_VERSION) {
String msg = "During upgrade, failed to load the editlog version " +
logVersion + " from release 0.20.203. Please go back to the old " +
" release and restart the namenode. This empties the editlog " +
" and saves the namespace. Resume the upgrade after this step.";
throw new IOException(msg, ex);
} else {
throw ex;
}
} finally {
in.close();
}
FSImage.LOG.info("Edits file " + edits.getName()
+ " of size " + edits.length() + " edits # " + numEdits
+ " loaded in " + (FSNamesystem.now()-startTime)/1000 + " seconds.");
if (FSImage.LOG.isDebugEnabled()) {
FSImage.LOG.debug("numOpAdd = " + numOpAdd + " numOpClose = " + numOpClose
+ " numOpDelete = " + numOpDelete + " numOpRename = " + numOpRename
+ " numOpSetRepl = " + numOpSetRepl + " numOpMkDir = " + numOpMkDir
+ " numOpSetPerm = " + numOpSetPerm
+ " numOpSetOwner = " + numOpSetOwner
+ " numOpSetGenStamp = " + numOpSetGenStamp
+ " numOpTimes = " + numOpTimes
+ " numOpGetDelegationToken = " + numOpGetDelegationToken
+ " numOpRenewDelegationToken = " + numOpRenewDelegationToken
+ " numOpCancelDelegationToken = " + numOpCancelDelegationToken
+ " numOpUpdateMasterKey = " + numOpUpdateMasterKey
+ " numOpOther = " + numOpOther);
}
if (logVersion != FSConstants.LAYOUT_VERSION) // other version
numEdits++; // save this image asap
return numEdits;
}