drivers/mtd/ubi/ubi-media.h
EC: Erase Count,记录块的擦除次数
/*
 * Erase counter (EC) header.
 * Multi-byte fields are declared with big-endian types (__be32/__be64) and
 * the structure is __packed, so the compiler inserts no implicit padding.
 */
struct ubi_ec_hdr {
__be32 magic; /* presumably UBI_EC_HDR_MAGIC ("UBI#") - confirm against the reader code */
__u8 version;
__u8 padding1[3];
__be64 ec; /* Warning: the current limit is 31-bit anyway! */
__be32 vid_hdr_offset;
__be32 data_offset;
__be32 image_seq;
__u8 padding2[32];
__be32 hdr_crc;
} __packed;
在ubiattach的时候指定一个mtd,如果PEB上没有EC,则用平均的EC值写入
EC值只有在擦除的时候才会增加1
/* Erase counter header magic number (ASCII "UBI#": 0x55 0x42 0x49 0x23) */
#define UBI_EC_HDR_MAGIC 0x55424923
/* Volume identifier header magic number (ASCII "UBI!": 0x55 0x42 0x49 0x21) */
#define UBI_VID_HDR_MAGIC 0x55424921
VID:volume identifier
/*
 * Volume identifier (VID) header.
 * Multi-byte fields are declared with big-endian types (__be32/__be64) and
 * the structure is __packed.
 */
struct ubi_vid_hdr {
__be32 magic;
__u8 version;
__u8 vol_type; // volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC)
__u8 copy_flag;
__u8 compat;
__be32 vol_id; // ID of this volume
__be32 lnum; // logical eraseblock number
__u8 padding1[4];
__be32 data_size;
__be32 used_ebs;
__be32 data_pad;
__be32 data_crc;
__u8 padding2[4];
__be64 sqnum; // sequence number (64-bit)
__u8 padding3[12];
__be32 hdr_crc;
} __packed;
/*
 * The @sqnum is the value of the global sequence counter at the time when this
 * VID header was created. The global sequence counter is incremented each time
 * UBI writes a new VID header to the flash, i.e. when it maps a logical
 * eraseblock to a new physical eraseblock. The global sequence counter is an
 * unsigned 64-bit integer and we assume it never overflows. The @sqnum
 * (sequence number) is used to distinguish between older and newer versions of
 * logical eraseblocks.
 */
scrub: //发生bitflip时进行清洗
/*
 * Abridged excerpt of ubi_io_read(): only the bit-flip handling after
 * mtd_read() is shown. Declarations (err, addr, read), the remaining error
 * paths and the end of the function are omitted.
 */
int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
int len)
{
err = mtd_read(ubi->mtd, addr, len, &read, buf);
if (err) {
const char *errstr = mtd_is_eccerr(err) ? " (ECC error)" : "";
if (mtd_is_bitflip(err)) { // a bit-flip was corrected by ECC, so the data that was read is still correct
/*
 * -EUCLEAN is reported if there was a bit-flip which
 * was corrected, so this is harmless.
 *
 * We do not report about it here unless debugging is
 * enabled. A corresponding message will be printed
 * later, when it has been scrubbed.
 */
ubi_msg(ubi, "fixable bit-flip detected at PEB %d",
pnum);
ubi_assert(len == read);
return UBI_IO_BITFLIPS;
}
}
当发生bitflip时,调用ubi_wl_scrub_peb()函数进行scrub
找到volume table:
/*
 * Starting ID of internal volumes: 0x7fffefff (0x7FFFFFFF - 4096).
 * There is reserved room for 4096 internal volumes.
 */
#define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096)
/* The layout volume contains the volume table */
#define UBI_LAYOUT_VOLUME_ID UBI_INTERNAL_VOL_START
#define UBI_LAYOUT_VOLUME_TYPE UBI_VID_DYNAMIC
#define UBI_LAYOUT_VOLUME_ALIGN 1
#define UBI_LAYOUT_VOLUME_EBS 2
#define UBI_LAYOUT_VOLUME_NAME "layout volume"
#define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT
/* The maximum number of volumes per one UBI device */
#define UBI_MAX_VOLUMES 128
/* The maximum volume name length */
#define UBI_VOL_NAME_MAX 127
/* Size of the volume table record (sizeof the packed struct ubi_vtbl_record) */
#define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record)
/*
 * One record of the volume table.
 * Multi-byte fields use big-endian types (__be16/__be32) and the structure is
 * __packed. The name buffer has UBI_VOL_NAME_MAX+1 bytes.
 */
struct ubi_vtbl_record {
__be32 reserved_pebs;
__be32 alignment;
__be32 data_pad;
__u8 vol_type;
__u8 upd_marker;
__be16 name_len;
__u8 name[UBI_VOL_NAME_MAX+1];
__u8 flags;
__u8 padding[23];
__be32 crc;
} __packed;
ubi_read_volume_table()读取volume table
process_lvol()
读取vid,如果里面的vol_id==UBI_LAYOUT_VOLUME_ID,则说明这是一个layout volume
/* Read ubi->vtbl_size bytes, starting at data offset 0 of PEB aeb->pnum, into leb[aeb->lnum]. */
err = ubi_io_read_data(ubi, leb[aeb->lnum], aeb->pnum, 0,
ubi->vtbl_size);
读取对应的LEB,存放的是结构化的struct ubi_vtbl_record 数据,这个结构记录的就是用户的分卷信息
ubi设备:
/*
 * UBI device description object (abridged: the "..." line marks fields that
 * were left out of these notes).
 */
struct ubi_device {
struct cdev cdev;
struct device dev;
int ubi_num; // UBI device number
char ubi_name[sizeof(UBI_NAME_STR)+5]; // UBI device name
int vol_count; // number of volumes on this device
struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT]; // the volumes that live on this device
spinlock_t volumes_lock;
int ref_count;
int image_seq;
int rsvd_pebs;
int avail_pebs;
int beb_rsvd_pebs;
int beb_rsvd_level;
int bad_peb_limit;
int autoresize_vol_id;
int vtbl_slots;
int vtbl_size;
struct ubi_vtbl_record *vtbl; // volume table records
struct mutex device_mutex;
int max_ec;
/* Note, mean_ec is not updated run-time - should be fixed */
int mean_ec;
...
/* Wear-leveling sub-system's stuff */
struct rb_root used;
struct rb_root erroneous;
struct rb_root free;
int free_count;
struct rb_root scrub;
struct list_head pq[UBI_PROT_QUEUE_LEN];
int pq_head;
spinlock_t wl_lock;
struct mutex move_mutex;
struct rw_semaphore work_sem;
int wl_scheduled;
struct ubi_wl_entry **lookuptbl;
struct ubi_wl_entry *move_from;
struct ubi_wl_entry *move_to;
int move_to_put;
struct list_head works;
int works_count;
struct task_struct *bgt_thread;
int thread_enabled;
char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2];
/* I/O sub-system's stuff */
long long flash_size;
int peb_count; // number of physical eraseblocks on the attached MTD device
int peb_size; // physical eraseblock size
int bad_peb_count; // number of bad physical eraseblocks
int good_peb_count;
int corr_peb_count;
int erroneous_peb_count;
int max_erroneous;
int min_io_size;
int hdrs_min_io_size;
int ro_mode;
int leb_size; // logical eraseblock size
int leb_start;
int ec_hdr_alsize;
int vid_hdr_alsize;
int vid_hdr_offset;
int vid_hdr_aloffset;
int vid_hdr_shift;
unsigned int bad_allowed:1;
unsigned int nor_flash:1;
int max_write_size;
struct mtd_info *mtd; // the MTD device this UBI device is attached to
void *peb_buf;
struct mutex buf_mutex;
struct mutex ckvol_mutex;
struct ubi_debug_info dbg;
};
ubi卷设备:
/* UBI volume description object: one instance per volume of a UBI device. */
struct ubi_volume {
struct device dev;
struct cdev cdev;
struct ubi_device *ubi; // the UBI device this volume belongs to
int vol_id; // ID of this volume
int ref_count;
int readers;
int writers;
int exclusive;
int metaonly;
int reserved_pebs;
int vol_type; // whether this is a static or a dynamic volume
int usable_leb_size;
int used_ebs;
int last_eb_bytes;
long long used_bytes;
int alignment;
int data_pad;
int name_len;
char name[UBI_VOL_NAME_MAX + 1]; // name of this volume
int upd_ebs;
int ch_lnum;
long long upd_bytes;
long long upd_received;
void *upd_buf;
struct ubi_eba_table *eba_tbl; // LEB -> PEB mapping table
unsigned int checked:1;
unsigned int corrupted:1;
unsigned int upd_marker:1;
unsigned int updating:1;
unsigned int changing_leb:1;
unsigned int direct_writes:1;
};
ubifs和ubi直接交互的接口 fs/ubifs/io.c :
/*
 * Abridged excerpt of ubifs_leb_read(): only the call that hands the read
 * over to UBI is shown. The declaration of err, the error handling and the
 * return statement are omitted.
 */
int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
int len, int even_ebadmsg)
{
err = ubi_read(c->ubi, lnum, buf, offs, len);
}
/*
 * ubifs_leb_write - write @len bytes from @buf to LEB @lnum at offset @offs.
 *
 * Returns -EROFS straight away when c->ro_error is set. Otherwise the write
 * goes through ubi_leb_write(), or through dbg_leb_write() when
 * dbg_is_tst_rcvry() is true. On failure the error is logged, the file system
 * is switched to read-only mode via ubifs_ro_mode(), a stack dump is printed
 * and the error code is returned; on success 0 is returned.
 */
int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
		    int len)
{
	int ret;

	ubifs_assert(!c->ro_media && !c->ro_mount);
	if (c->ro_error)
		return -EROFS;

	if (dbg_is_tst_rcvry(c))
		ret = dbg_leb_write(c, lnum, buf, offs, len);
	else
		ret = ubi_leb_write(c->ubi, lnum, buf, offs, len);

	if (!ret)
		return 0;

	ubifs_err(c, "writing %d bytes to LEB %d:%d failed, error %d",
		  len, lnum, offs, ret);
	ubifs_ro_mode(c, ret);
	dump_stack();
	return ret;
}
ubi模块对外提供的接口drivers/mtd/ubi/kapi.c:
int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, ubi_leb_unmap
int len, int check)
{
err = ubi_eba_read_leb(ubi, vol, lnum, buf, offset, len, check);
}
//drivers/mtd/ubi/eba.c
/*
* The UBI Eraseblock Association (EBA) sub-system.
*
* This sub-system is responsible for I/O to/from logical eraseblock.
*/
int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, +entries
void *buf, int offset, int len, int check)
{
pnum = vol->eba_tbl->entries[lnum].pnum; //由leb到peb的映射
...
err = ubi_io_read_data(ubi, buf, pnum, offset, len); //读取映射的peb块
}
/*
 * ubi_io_read_data - read from the data area of a physical eraseblock.
 *
 * @offset is relative to the start of the data area; it is shifted by
 * ubi->leb_start to obtain the offset inside PEB @pnum before the request is
 * passed on to ubi_io_read(), whose return value is returned unchanged.
 */
static inline int ubi_io_read_data(const struct ubi_device *ubi, void *buf,
				   int pnum, int offset, int len)
{
	int peb_offset;

	ubi_assert(offset >= 0);
	peb_offset = offset + ubi->leb_start;
	return ubi_io_read(ubi, buf, pnum, peb_offset, len);
}
/*
 * Abridged excerpt of ubi_io_read(): only the address computation and the
 * MTD read are shown; declarations, error handling and the return statement
 * are omitted.
 */
int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
int len)
{
/* Byte address on the MTD device; pnum is cast to loff_t before the multiplication so it is not done in int. */
addr = (loff_t)pnum * ubi->peb_size + offset;
err = mtd_read(ubi->mtd, addr, len, &read, buf);
}
ubifs存储的基本单位是node:
/**
 * struct ubifs_ch - common header node.
 * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
 * @crc: CRC-32 checksum; ubifs_prepare_node() computes it over the node
 *       starting at byte 8, i.e. skipping @magic and @crc themselves
 * @sqnum: sequence number
 * @len: full node length
 * @node_type: node type
 * @group_type: node group type
 * @padding: reserved for future, zeroes
 *
 * Every UBIFS node starts with this common part. If the node has a key, the
 * key always goes next.
 */
struct ubifs_ch {
__le32 magic; /* #define UBIFS_NODE_MAGIC 0x06101831 */
__le32 crc;
__le64 sqnum;
__le32 len;
__u8 node_type;
__u8 group_type;
__u8 padding[2];
} __packed;
所有的node都包含这个公共的node 头
superblock node,存放在LEB0: //#define UBIFS_SB_LNUM 0
/*
 * Superblock node. Starts with the common header; multi-byte fields use
 * little-endian types (__le16/__le32/__le64) and the structure is __packed.
 */
struct ubifs_sb_node { /* #define UBIFS_SB_LEBS 1 */
struct ubifs_ch ch;
__u8 padding[2];
__u8 key_hash;
__u8 key_fmt;
__le32 flags;
__le32 min_io_size; // minimal I/O unit size
__le32 leb_size;
__le32 leb_cnt; // size of the whole file system, counted in LEBs
__le32 max_leb_cnt; // author's note: number of LEBs left in the mounted volume once bad blocks are excluded
__le64 max_bud_bytes;
__le32 log_lebs;
__le32 lpt_lebs;
__le32 orph_lebs;
__le32 jhead_cnt;
__le32 fanout;
__le32 lsave_cnt;
__le32 fmt_version;
__le16 default_compr;
__u8 padding1[2];
__le32 rp_uid;
__le32 rp_gid;
__le64 rp_size;
__le32 time_gran;
__u8 uuid[16];
__le32 ro_compat_version;
__u8 padding2[3968];
} __packed;
master node,存放在LEB1,LEB2: //#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS)
/*
 * Master node. Starts with the common header; multi-byte fields use
 * little-endian types (__le32/__le64) and the structure is __packed.
 */
struct ubifs_mst_node { /* #define UBIFS_MST_LEBS 2 */
struct ubifs_ch ch;
__le64 highest_inum;
__le64 cmt_no;
__le32 flags;
__le32 log_lnum;
__le32 root_lnum; // location of the root node (LEB number; offset and length follow)
__le32 root_offs;
__le32 root_len;
__le32 gc_lnum;
__le32 ihead_lnum;
__le32 ihead_offs;
__le64 index_size;
__le64 total_free;
__le64 total_dirty;
__le64 total_used;
__le64 total_dead;
__le64 total_dark;
__le32 lpt_lnum;
__le32 lpt_offs;
__le32 nhead_lnum;
__le32 nhead_offs;
__le32 ltab_lnum;
__le32 ltab_offs;
__le32 lsave_lnum;
__le32 lsave_offs;
__le32 lscan_lnum;
__le32 empty_lebs;
__le32 idx_lebs;
__le32 leb_cnt;
__u8 padding[344];
} __packed;
index对应的node:
/* Index node: common header followed by a variable-length array of branches. */
struct ubifs_idx_node {
struct ubifs_ch ch;
__le16 child_cnt; // number of keys in this node; in the B+tree each key corresponds to one branch
__le16 level;
__u8 branches[]; // raw bytes; the notes describe them as struct ubifs_branch entries
} __packed;
其中branches指向:struct ubifs_branch,每个结构体描述一个key
/* One branch of an index node: where the child node lives, plus its key. */
struct ubifs_branch {
__le32 lnum; // LEB number of the child node
__le32 offs; // offset of the child node inside that LEB
__le32 len;
__u8 key[]; // the key; keys of the child lie between two adjacent branch keys
} __packed;
其中key指向:
/*
 * In-memory key: the same UBIFS_SK_LEN bytes viewed as 8-, 32- or 64-bit
 * words, or as little-endian 32-bit words (j32).
 */
union ubifs_key {
uint8_t u8[UBIFS_SK_LEN];
uint32_t u32[UBIFS_SK_LEN/4];
uint64_t u64[UBIFS_SK_LEN/8];
__le32 j32[UBIFS_SK_LEN/4];
};
/**
 * key_inum - fetch inode number from key.
 * @c: UBIFS file-system description object (not referenced by this helper)
 * @k: key to fetch inode number from; interpreted as a union ubifs_key
 *
 * The inode number is the first 32-bit word of the key.
 */
static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
{
	return ((const union ubifs_key *)k)->u32[0];
}
ubifs采用的是B+Tree,正如下图所示:
一个节点包含15,56,77三个key,三个branch,中间节点就是index node,是不带数据的。
inode对应的node:
/*
 * Inode node. Starts with the common header and the key; multi-byte fields
 * use little-endian types and the structure is __packed. Variable-length
 * data follows the fixed part.
 */
struct ubifs_ino_node {
struct ubifs_ch ch;
__u8 key[UBIFS_MAX_KEY_LEN]; // key of this inode
__le64 creat_sqnum;
__le64 size;
__le64 atime_sec;
__le64 ctime_sec;
__le64 mtime_sec;
__le32 atime_nsec;
__le32 ctime_nsec;
__le32 mtime_nsec;
__le32 nlink;
__le32 uid;
__le32 gid;
__le32 mode;
__le32 flags;
__le32 data_len;
__le32 xattr_cnt;
__le32 xattr_size;
__u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
__le32 xattr_names;
__le16 compr_type;
__u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
__u8 data[];
} __packed;
data对应的node:
/* Data node: common header, key, then the variable-length data payload. */
struct ubifs_data_node {
struct ubifs_ch ch;
__u8 key[UBIFS_MAX_KEY_LEN]; // key of this data node
__le32 size;
__le16 compr_type;
__u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
__u8 data[];
} __packed;
ubifs_data_node是ubifs文件数据的载体,对数据的访问,需要首先生成待访问数据所对应节点的key,然后根据这个key到UBIFS wandering 树中找到这个ubifs_data_node。
directory entry node:
/* Directory entry node: common header, key, target inode number and the name. */
struct ubifs_dent_node {
struct ubifs_ch ch;
__u8 key[UBIFS_MAX_KEY_LEN]; // key of this directory entry
__le64 inum; // inode number of the file this directory entry refers to
__u8 padding1;
__u8 type;
__le16 nlen;
__u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
__u8 name[];
} __packed;
非叶子节点包括index node(包含多个key),叶子节点包括inode node,Dent node,data node(包含一个key)
tnc: tree node cache,是在内存中维护的一个tree node,构建的依据是Flash上的node
ubifs_tnc_add()
ubifs_tnc_lookup(c, key, node); 根据key在tnc中查找对应的node
/*
 * Abridged excerpt of ubifs_lookup(): shows how a name is resolved through
 * the TNC. The rest of the function, including the 'done' and 'out' labels
 * and the return statement, is omitted.
 */
static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
int err;
union ubifs_key key;
struct inode *inode = NULL;
struct ubifs_dent_node *dent;
struct ubifs_info *c = dir->i_sb->s_fs_info;
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
if (dentry->d_name.len > UBIFS_MAX_NLEN)
return ERR_PTR(-ENAMETOOLONG);
dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
if (!dent)
return ERR_PTR(-ENOMEM);
dent_key_init(c, &key, dir->i_ino, &dentry->d_name); // build the lookup key from the parent directory's inode number and the entry name
err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); // look up the directory entry node for this key and name; the result is stored in dent
if (err) {
if (err == -ENOENT) {
dbg_gen("not found");
goto done;
}
goto out;
}
inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); // get the inode of the file the directory entry refers to
}
/*
 * Abridged excerpt of read_block(): shows how a data node is located through
 * the TNC. The code that handles the found data node and the final return are
 * omitted.
 */
static int read_block(struct inode *inode, void *addr, unsigned int block,
struct ubifs_data_node *dn)
{
struct ubifs_info *c = inode->i_sb->s_fs_info;
int err, len, out_len;
union ubifs_key key;
unsigned int dlen;
data_key_init(c, &key, inode->i_ino, block); // build the data key from the file's inode number and the block number
err = ubifs_tnc_lookup(c, &key, dn); // fetch the data node for that key into dn
if (err) {
if (err == -ENOENT)
/* Not found, so it must be a hole */
memset(addr, 0, UBIFS_BLOCK_SIZE);
return err;
}
}
padding node:
/*
 * Padding node: common header plus the number of padding bytes that follow
 * the node (ubifs_pad() stores the remaining pad length in pad_len).
 */
struct ubifs_pad_node {
struct ubifs_ch ch;
__le32 pad_len;
} __packed;
* The flash media obliges us to write only in chunks of %c->min_io_size and
* when we have to write less data we add padding node to the write-buffer and
* pad it to the next minimal I/O unit's boundary. Padding nodes help when the
* media is being scanned. If the amount of wasted space is not enough to fit a
* padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes
* pattern (%UBIFS_PADDING_BYTE).
*
* Padding nodes are also used to fill gaps when the "commit-in-gaps" method is
* used.
*/
void ubifs_pad(const struct ubifs_info *c, void *buf, int pad)
{
uint32_t crc;
ubifs_assert(pad >= 0 && !(pad & 7));
if (pad >= UBIFS_PAD_NODE_SZ) { //填充的大小是否超过一个pad node节点的大小
struct ubifs_ch *ch = buf;
struct ubifs_pad_node *pad_node = buf;
ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
ch->node_type = UBIFS_PAD_NODE;
ch->group_type = UBIFS_NO_NODE_GROUP;
ch->padding[0] = ch->padding[1] = 0;
ch->sqnum = 0;
ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ);
pad -= UBIFS_PAD_NODE_SZ;
pad_node->pad_len = cpu_to_le32(pad); //记录后面还有多少个0
crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8);
ch->crc = cpu_to_le32(crc);
memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); //pad node后面填充0
} else if (pad > 0)
/* Too little space, padding node won't fit */
memset(buf, UBIFS_PADDING_BYTE, pad); //#define UBIFS_PADDING_BYTE 0xCE
}
/**
 * ubifs_prepare_node - fill in the common header of a node before writing.
 * @c: UBIFS file-system description object
 * @node: the node to prepare; it starts with a struct ubifs_ch
 * @len: node length in bytes, asserted to be at least %UBIFS_CH_SZ
 * @pad: non-zero to pad the space after the node
 *
 * Takes the next sequence number, fills in the magic, length, group type,
 * sequence number and header padding, and then stores the CRC-32 of the node
 * starting at byte 8. The node type is not touched here. If @pad is set, the
 * length is rounded up to 8 bytes and the gap up to the next %c->min_io_size
 * boundary is filled by ubifs_pad().
 */
void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
{
	struct ubifs_ch *hdr = node;
	/* One sequence number is taken for every prepared node. */
	unsigned long long seq = next_sqnum(c);
	uint32_t csum;

	ubifs_assert(len >= UBIFS_CH_SZ);

	hdr->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
	hdr->sqnum = cpu_to_le64(seq);
	hdr->len = cpu_to_le32(len);
	hdr->group_type = UBIFS_NO_NODE_GROUP;
	hdr->padding[0] = hdr->padding[1] = 0;

	/* The header must be complete before the checksum is taken. */
	csum = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
	hdr->crc = cpu_to_le32(csum);

	if (!pad)
		return;

	len = ALIGN(len, 8);
	ubifs_pad(c, node + len, ALIGN(len, c->min_io_size) - len);
}
/**
 * ubifs_write_node - prepare a node and write it to a LEB.
 * @c: UBIFS file-system description object
 * @buf: buffer holding the node; it is written out padded, so it must have
 *       room for the length rounded up to %c->min_io_size
 * @len: node length
 * @lnum: logical eraseblock number
 * @offs: offset within the logical eraseblock, a multiple of %c->min_io_size
 *
 * Returns -EROFS when c->ro_error is set. Otherwise the node is prepared with
 * padding enabled and written through ubifs_leb_write(); on failure the node
 * is dumped and the error code is returned, on success 0 is returned.
 */
int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
		     int offs)
{
	int ret;
	int aligned_len = ALIGN(len, c->min_io_size);

	dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
	       lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
	       aligned_len);
	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
	ubifs_assert(!c->ro_media && !c->ro_mount);
	ubifs_assert(!c->space_fixup);

	if (c->ro_error)
		return -EROFS;

	/*
	 * The last argument is a flag, not a byte count: ubifs_prepare_node()
	 * pads the node up to the next min_io_size boundary when it is set.
	 */
	ubifs_prepare_node(c, buf, len, 1);

	ret = ubifs_leb_write(c, lnum, buf, offs, aligned_len);
	if (!ret)
		return 0;

	ubifs_dump_node(c, buf);
	return ret;
}
如果是空的ubi卷,这在挂载的时候是不会报错的,会调用create_default_filesystem()创建一个空的ubifs,建议看看这个函数的实现,可以更好的理解一些字段的含义。
lpt:leb properties tree,记录leb的使用情况, 包括lpt区域自己使用情况的记录和main分区使用情况的记录
/*
 * Abridged excerpt of lpt_init_wr(): only the allocation of c->ltab_cmt is
 * shown (one struct ubifs_lpt_lprops per LPT LEB, c->lpt_lebs entries). The
 * allocation check, the remaining setup and the return statement are omitted.
 */
static int lpt_init_wr(struct ubifs_info *c)
{
c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
}
/* Usage record kept for each LEB. */
struct ubifs_lpt_lprops {
int free; // space that can be used
int dirty; // space that was released but not erased yet, so it cannot be used yet
unsigned tgc:1;
unsigned cmt:1;
};
因为ubifs是工作在raw Flash之上的,在写之前必须先擦除。dirty记录的是已经释放但还没有擦除的空间,因此这部分空间暂时也不能使用。GC负责垃圾回收:当发现lpt中记录的整个leb都是dirty时,调用ubifs_leb_unmap(),该函数负责解除leb到peb的映射,同时将对应的peb擦除,这样这个leb就可以重新使用了。
mkfs.ubifs打包rootfs.ubifs的时候,leb的尾部可能存在空白页,即从某个min_io边界开始直到leb末尾全是0xFF,这部分空间在lpt中记录为free。ubinize生成rootfs.ubi之后,如果直接用nandwrite来写rootfs.ubi,这些空白页也会被实际写入,同时其OOB也会被更新。因为这些页在lpt中记录的仍是free,所以ubifs之后会直接分配并再次写入这些页,这显然是有问题的:OOB之前已经被写过,而nandflash的位只能由1变为0,所以再次写入OOB肯定会出错。有两个方法来解决这个问题:
1.nandwrite的时候leb中后面的空白页直接跳过;
2.在mkfs.ubifs的时候加上-F选项(space fixup),这需要比较新的内核支持。它的原理是:首次挂载时,对于lpt中记录含有free空间的leb,先读出leb中的有效数据,然后擦除该leb(ubifs_leb_unmap),再把有效数据重新写入,free部分不再写入。
参考文档:
http://www.linux-mtd.infradead.org/doc/ubifs.html
http://www.linux-mtd.infradead.org/doc/ubifs_whitepaper.pdf
http://linux-mtd.infradead.org/doc/ubidesign/ubidesign.pdf
http://www.sourceware.org/jffs2/jffs2.pdf