f2fs系列文章gc

这篇文章将讲述f2fs的gc，其主要的步骤应该是分为两步，首先select一个合适的section，然后将section中的数据全部迁移。

f2fs_gc：这个函数主要有两个调用者：gc线程和f2fs_balance_fs。首先检查super_block是否设置了MS_ACTIVE，也就是super_block是否处于活动状态（目前不清楚具体是什么状态），如果没有设置就不做gc了。然后再检查是否设置了CP_ERROR_FLAG（这个表示文件系统没有稳定的cp pack），如果设置了，也不做gc，并返回-EIO。如果此时是BG_GC并且已经没有足够的section了，那么将gc_type设置成FG_GC。在这种情况下，如果没有废弃的prefree segment，调用get_victim函数也没有获得section，但是还有足够的section，那么此时不需要做write_checkpoint；其他情况下都要进行write_checkpoint，并且选择的segno被设置成NULL_SEGNO。接着，如果segno仍然是NULL_SEGNO，就调用__get_victim（即下文的get_victim_by_default）来选择victim section进行垃圾回收，选不到就直接结束。然后就调用do_garbage_collect对选择的section进行数据的迁移。如果不是以sync方式进入的f2fs_gc（即最初是BG_GC），这个时候还需要检查有没有足够的section，如果没有的话，继续回去进行新一轮的回收；另外如果此时的gc_type是FG_GC，那么进行write_checkpoint操作。

/*
 * f2fs_gc - pick victim sections and migrate their valid blocks.
 * @sbi:  f2fs superblock info
 * @sync: true runs one foreground pass (FG_GC); false starts as background
 *        GC (BG_GC) and keeps looping while free sections are insufficient.
 *
 * Return: for sync callers, 0 if at least one section was freed, otherwise
 * -EAGAIN.  For non-sync callers: -EINVAL if the superblock is not active or
 * no victim was found on the first pass, -EIO on a checkpoint error, the
 * write_checkpoint() result where a checkpoint is issued, otherwise 0.
 *
 * Unlocks sbi->gc_mutex on every exit path.
 * NOTE(review): the mutex is not taken in this function, so callers
 * presumably hold it on entry -- confirm against the call sites.
 */
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
	unsigned int segno;
	int gc_type = sync ? FG_GC : BG_GC;
	int sec_freed = 0;	/* sections fully freed by FG_GC so far */
	int ret = -EINVAL;
	struct cp_control cpc;
	/* inodes pinned during data migration; released at "stop" */
	struct gc_inode_list gc_list = {
		.ilist = LIST_HEAD_INIT(gc_list.ilist),
		.iroot = RADIX_TREE_INIT(GFP_NOFS),
	};

	cpc.reason = __get_cp_reason(sbi);
gc_more:
	segno = NULL_SEGNO;

	/* do nothing unless the superblock is active */
	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
		goto stop;
	if (unlikely(f2fs_cp_error(sbi))) {
		ret = -EIO;
		goto stop;
	}

	/* background GC is escalated to foreground when space runs short */
	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) {
		gc_type = FG_GC;
		/*
		 * A checkpoint is written if a victim or prefree segments
		 * exist, or if neither exists but free sections are still
		 * insufficient.  In the first case the victim is dropped so
		 * that it is re-selected after the checkpoint.
		 */
		if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) {
			ret = write_checkpoint(sbi, &cpc);
			if (ret)
				goto stop;
			segno = NULL_SEGNO;
		} else if (has_not_enough_free_secs(sbi, 0, 0)) {
			ret = write_checkpoint(sbi, &cpc);
			if (ret)
				goto stop;
		}
	}

	/* select a victim unless one is already held; give up if none */
	if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
		goto stop;
	ret = 0;

	/* only foreground passes count towards freed sections */
	if (do_garbage_collect(sbi, segno, &gc_list, gc_type) &&
			gc_type == FG_GC)
		sec_freed++;

	if (gc_type == FG_GC)
		sbi->cur_victim_sec = NULL_SEGNO;

	if (!sync) {
		/* keep collecting until enough sections are free */
		if (has_not_enough_free_secs(sbi, sec_freed, 0))
			goto gc_more;

		if (gc_type == FG_GC)
			ret = write_checkpoint(sbi, &cpc);
	}
stop:
	mutex_unlock(&sbi->gc_mutex);

	put_gc_inode(&gc_list);

	/* sync callers only learn whether any section was freed */
	if (sync)
		ret = sec_freed ? 0 : -EAGAIN;
	return ret;
}

get_victim_by_default：首先初始化选择过程中使用到的数据结构victim_sel_policy，先说一下其字段的含义：alloc_mode，可以取值LFS和SSR，选择过程中这两种的处理模式是不同的。gc_mode这个是计算cost的算法，取值GC_CB和GC_GREEDY。dirty_segmap是记录dirty的segment的位图，选择过程中需要在dirty的segment的section中选择。max_search表示查找过程中的最多的segment的数量，实际就是上述位图的dirty的segment的数量。offset表示在遍历过程中的当前的查找偏移。ofs_unit表示在查找过程中每次查找跨越的单元，SSR是以1个segment为单元，LFS是以1个section为单元。min_cost记录查找过程中的最小cost。min_segno记录的是查找过程中最小cost所对应的segno。

/* Working state for one victim-selection scan over the dirty segment bitmap. */
struct victim_sel_policy {
	int alloc_mode;	/* LFS or SSR */
	int gc_mode;	/* cost algorithm: GC_CB or GC_GREEDY */
	unsigned long *dirty_segmap;	/* bitmap of dirty segments to scan */
	unsigned int max_search;	/* maximum number of dirty segments to examine */
	unsigned int offset;	/* current scan offset in the bitmap */
	unsigned int ofs_unit;	/* scan stride: one segment (SSR) or one section (LFS) */
	unsigned int min_cost;	/* lowest cost seen so far */
	unsigned int min_segno;	/* segment number that produced min_cost */
};

其初始化是通过函数select_policy完成的。然后检查是不是以LFS并且是FG_GC的方式进行select，如果是就直接使用之前BG_GC选择过的section（这样选择的section其有效块数比较少，具体原因不清楚）。如果对应的segno不是NULL_SEGNO，那就找到了。否则需要跟其他的一样对所有的有脏的segment的section进行计算cost，然后选择出最小的cost作为最后的结果。可能是为了均匀的原因，这个遍历不是从头开始的，而是从上次的选择开始的，然后遍历整个循环。首先通过函数find_next_bit在dirty_segmap中找到dirty的segment，接下来的判断是为了完成循环的掉头。然后调用函数count_bits计算这个单元中的dirty的segment的数量。对找到的dirty的segment所在的单元检查，调用sec_usage_check检查这个segment所在的section是否为current segment所在的section，如果是就跳过这个单元。另外为了平均一下被选中的情况，victim_secmap记录了BG_GC情况下被选择过的section，如果是被选过，那也跳过这个选择。没有问题了就调用get_gc_cost计算整个单元中的cost，如果这个cost比之前的所有的cost都小，那么修改victim_sel_policy来记录当前的单元是cost最小的单元。然后比较检查过的dirty的segment和dirty_segmap中的dirty的segment的个数，如果已经达到了，那就直接可以停止查找了。查找结束之后，检查p.min_segno==NULL_SEGNO，如果是，那就是在查找过程中没有找到相应的segno，返回0表示没有找到。如果不是，对于FG_GC那就将cur_victim_sec设置为选择的segment（根据上面的查找可以看出，这里虽然是选择的segment，实际上计算的cost是以section计算的，所以这个segment指的是代表了它所在的section的segment）的对应的section。对于BG_GC，将victim_secmap对应的section置位。接下来就将结果置为segment所在的section的起始segment。

/*
 * get_victim_by_default - choose the lowest-cost dirty unit as GC victim.
 * @sbi:        f2fs superblock info
 * @result:     out; written only on success, receives the first segment
 *              number of the selected unit (aligned down to p.ofs_unit)
 * @gc_type:    FG_GC or BG_GC
 * @type:       passed through to select_policy() and the tracepoint
 * @alloc_mode: LFS or SSR
 *
 * Runs with dirty_i->seglist_lock held for the whole scan.
 *
 * Return: 1 if a victim was selected, 0 otherwise.
 */
static int get_victim_by_default(struct f2fs_sb_info *sbi,
		unsigned int *result, int gc_type, int type, char alloc_mode)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment = MAIN_SEGS(sbi);	/* scan upper bound */
	unsigned int nsearched = 0;	/* dirty segments examined so far */

	mutex_lock(&dirty_i->seglist_lock);

	p.alloc_mode = alloc_mode;
	select_policy(sbi, gc_type, type, &p);

	p.min_segno = NULL_SEGNO;
	p.min_cost = get_max_cost(sbi, &p);
	/* nothing to search */
	if (p.max_search == 0)
		goto out;
	
	/* remember where the previous scan stopped before it gets modified */
	last_victim = sbi->last_victim[p.gc_mode];
	/* foreground LFS GC first tries a victim returned by check_bg_victims() */
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

	while (1) {
		unsigned long cost;
		unsigned int segno;
		segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
		if (segno >= last_segment) {
			/*
			 * Reached the upper bound.  If a resume point is
			 * recorded, wrap around once: rescan from 0 up to that
			 * point and clear it so the next overrun ends the loop.
			 */
			if (sbi->last_victim[p.gc_mode]) {
				last_segment = sbi->last_victim[p.gc_mode];
				sbi->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}
		/* advance the offset to the start of the next unit */
		p.offset = segno + p.ofs_unit;
		if (p.ofs_unit > 1) {
			p.offset -= segno % p.ofs_unit;
			/* account for every dirty segment in the unit just covered */
			nsearched += count_bits(p.dirty_segmap, p.offset - p.ofs_unit, p.ofs_unit);
		} else {
			nsearched++;
		}
		secno = GET_SECNO(sbi, segno);
		/* skip sections rejected by sec_usage_check() */
		if (sec_usage_check(sbi, secno))
			goto next;
		/* background GC skips sections it has already marked as victims */
		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;
		cost = get_gc_cost(sbi, segno, &p);
		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		/* search budget exhausted: record where the next scan resumes */
		if (nsearched >= p.max_search) {
			if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
				sbi->last_victim[p.gc_mode] = last_victim + 1;
			else
				sbi->last_victim[p.gc_mode] = segno + 1;
			break;
		}
	}
	if (p.min_segno != NULL_SEGNO) {
got_it:
		if (p.alloc_mode == LFS) {
			secno = GET_SECNO(sbi, p.min_segno);
			/* FG records the current victim; BG marks it in victim_secmap */
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}
		/* report the first segment of the selected unit */
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;

		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi));
	}
out:
	mutex_unlock(&dirty_i->seglist_lock);
	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}

do_garbage_collect：首先预读需要gc的section中的所有的segment的f2fs_summary，然后对section中的每个segment进行遍历。首先检查segment中的有效块数是否为零，如果是零就直接跳到下一个segment，另外如果segment对应的summary不是最新的，或者f2fs_checkpoint设置了CP_ERROR_FLAG，也是跳到下一个segment。如果不满足上述的条件，那么根据需要迁移的segment的类型来进行node或者data的迁移，这个是通过两个不同的函数gc_node_segment和gc_data_segment实现的。回收完成之后，如果是FG_GC，那就马上提交IO，另外如果FG_GC下经过回收后的section中的有效块数变为了0，那么就返回1，作为释放的section的个数，可能用于接下来继续回收。

/*
 * do_garbage_collect - migrate the valid blocks of one victim section.
 * @sbi:         f2fs superblock info
 * @start_segno: first segment of the victim section
 * @gc_list:     inode list handed to gc_data_segment()
 * @gc_type:     FG_GC or BG_GC
 *
 * Return: 1 if this was FG_GC and the section has no valid blocks left
 * afterwards, otherwise 0.
 */
static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno,
				struct gc_inode_list *gc_list, int gc_type)
{
	struct page *sum_page;
	struct f2fs_summary_block *sum;
	struct blk_plug plug;
	unsigned int segno = start_segno;
	unsigned int end_segno = start_segno + sbi->segs_per_sec;
	int sec_freed = 0;
	/* the type of the first segment is used for the whole section */
	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE;

	/* read ahead the summary blocks of every segment in the section */
	if (sbi->segs_per_sec > 1)
		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, META_SSA, true);

	/*
	 * Get every summary page and unlock it without putting it.
	 * NOTE(review): the reference taken here is presumably the one dropped
	 * at the "next" label below -- confirm.
	 */
	while (segno < end_segno) {
		sum_page = get_sum_page(sbi, segno++);
		unlock_page(sum_page);
	}

	blk_start_plug(&plug);

	for (segno = start_segno; segno < end_segno; segno++) {
		/* look the summary page up again and drop this lookup's reference */
		sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		/* skip empty segments, stale summaries and checkpoint errors */
		if (get_valid_blocks(sbi, segno, 1) == 0 || !PageUptodate(sum_page) ||
				unlikely(f2fs_cp_error(sbi)))
			goto next;

		sum = page_address(sum_page);
		f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));

		if (type == SUM_TYPE_NODE)
			gc_node_segment(sbi, sum->entries, segno, gc_type);
		else
			gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type);

		stat_inc_seg_count(sbi, type, gc_type);
next:
		f2fs_put_page(sum_page, 0);
	}

	/* foreground GC submits the merged writes immediately */
	if (gc_type == FG_GC)
		f2fs_submit_merged_bio(sbi, (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE);

	blk_finish_plug(&plug);

	/* the section counts as freed only if FG_GC left no valid block in it */
	if (gc_type == FG_GC && get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
		sec_freed = 1;

	stat_inc_call_count(sbi->stat_info);

	return sec_freed;
}

gc_node_segment：首先计算segment的起始块地址，然后再对segment中的每个block遍历（整个遍历会分三个阶段重复进行）。如果是BG_GC并且没有足够的section了，那么就直接返回停止BG_GC。然后检查当前block的sit是否为有效，如果无效，那么这块是不用迁移的，直接跨过。第一阶段预读当前node所在的f2fs_nat_block，第二阶段预读该block所放置的node，第三阶段获取该f2fs_node，再次检查当前block的sit是否有效，如果无效也是直接跨过。接着获取该node的node_info，然后检查node_info中的最新的块地址是否跟当前块地址相同，如果不同说明当前block盛放的不是该node的最新数据，所以跳过当前block。否则调用move_node_page进行真正的迁移动作。

static void gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct page *node_page;
		struct node_info ni;

		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return;
		if (check_valid_map(sbi, segno, off) == 0)
			continue;
		if (phase == 0) {
			ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true);
			continue;
		}
		if (phase == 1) {
			ra_node_page(sbi, nid);
			continue;
		}

		node_page = get_node_page(sbi, nid);
		if (IS_ERR(node_page))
			continue;

		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_put_page(node_page, 1);
			continue;
		}
		get_node_info(sbi, nid, &ni);
		if (ni.blk_addr != start_addr + off) {
			f2fs_put_page(node_page, 1);
			continue;
		}
		move_node_page(node_page, gc_type);
		stat_inc_node_blk_count(sbi, 1, gc_type);
	}

	if (++phase < 3)
		goto next_step;
}

move_node_page：对于FG_GC ，首先将f2fs_node置成dirty，然后调用node的写函数f2fs_write_node_page来进行真正的操作。对于BG_GC，由于不着急，那么仅仅将当前的f2fs_node置为dirty就行了。

/*
 * move_node_page - relocate one node page on behalf of GC.
 * @node_page: locked node page; it is unlocked and its reference dropped
 *             before returning
 * @gc_type:   FG_GC writes the page out synchronously through ->writepage;
 *             any other type only marks it dirty (unless it is already
 *             under writeback) and leaves the write for later.
 */
void move_node_page(struct page *node_page, int gc_type)
{
	if (gc_type == FG_GC) {
		struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_ALL,
			.nr_to_write = 1,
			.for_reclaim = 0,
		};

		set_page_dirty(node_page);
		f2fs_wait_on_page_writeback(node_page, NODE, true);
		f2fs_bug_on(sbi, PageWriteback(node_page));

		/*
		 * Unlock here if the page turned out not to be dirty, or if
		 * ->writepage returned non-zero.  On a zero return the page is
		 * not unlocked by this function (presumably ->writepage has
		 * already done so -- confirm).
		 */
		if (!clear_page_dirty_for_io(node_page) ||
		    NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
			unlock_page(node_page);
	} else {
		if (!PageWriteback(node_page))
			set_page_dirty(node_page);
		unlock_page(node_page);
	}

	f2fs_put_page(node_page, 0);
}

gc_data_segment：首先计算segment的起始块地址，然后再对segment中的每个block遍历（整个遍历会分五个阶段重复进行）。如果是BG_GC并且没有足够的section了，那么就直接返回停止BG_GC。然后检查当前block的sit是否为有效，如果无效，那么这块是不用迁移的，直接跨过。然后预读当前data block的dnode所在的f2fs_nat_block，接着预读dnode本身。然后调用is_alive函数检查当前的数据块是不是有效的（可能在dnode中相应的位置已经有新的数据填充了，这个新的数据在新的位置），如果无效就跨过。接着预读对应的f2fs_inode，接着获取inode，如果该inode是加密的并且是普通文件（S_ISREG），那么调用add_gc_inode将inode加入到gc管理的链表和radix tree中；如果不满足这个条件，调用get_read_data_page来读取该block并且调用函数add_gc_inode将inode加入到gc管理的链表和radix tree中。接着对每个块的inode，都在gc管理的radix tree中寻找，然后计算block在文件中的index，然后根据是否加密分别调用move_encrypted_block和move_data_page对数据进行迁移。

/*
 * gc_data_segment - migrate the still-valid data blocks of one segment.
 * @sbi:     f2fs superblock info
 * @sum:     summary entries of the segment, one per block
 * @gc_list: list/radix tree of inodes pinned for this GC run
 * @segno:   segment being collected
 * @gc_type: FG_GC or BG_GC
 *
 * The segment is walked five times:
 *   phase 0 - read ahead the NAT block of each entry's dnode,
 *   phase 1 - read ahead the dnode page,
 *   phase 2 - for blocks is_alive() accepts, read ahead the inode page,
 *   phase 3 - get the inode, read ahead the data page (skipped for
 *             encrypted regular files) and add the inode to @gc_list,
 *   phase 4 - move the block through move_encrypted_block() or
 *             move_data_page().
 *
 * A background run returns immediately, in any phase, once free sections
 * run short.
 */
static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
	struct super_block *sb = sbi->sb;
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		struct page *data_page;
		struct inode *inode;
		struct node_info dni; 
		unsigned int ofs_in_node, nofs;
		block_t start_bidx;
		nid_t nid = le32_to_cpu(entry->nid);

		/* background GC backs off under space pressure */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return;
		/* blocks that are no longer valid need no migration */
		if (check_valid_map(sbi, segno, off) == 0)
			continue;
		if (phase == 0) {
			ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true);
			continue;
		}

		if (phase == 1) {
			ra_node_page(sbi, nid);
			continue;
		}
		/* from phase 2 on: skip blocks the dnode no longer points at */
		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
			continue;

		if (phase == 2) {
			ra_node_page(sbi, dni.ino);
			continue;
		}

		ofs_in_node = le16_to_cpu(entry->ofs_in_node);
		if (phase == 3) {
			inode = f2fs_iget(sb, dni.ino);
			if (IS_ERR(inode) || is_bad_inode(inode))
				continue;
			/* encrypted regular files: no data-page readahead here */
			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
				add_gc_inode(gc_list, inode);
				continue;
			}

			start_bidx = start_bidx_of_node(nofs, inode);
			data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true);
			if (IS_ERR(data_page)) {
				/* readahead failed: release the inode, do not track it */
				iput(inode);
				continue;
			}
			f2fs_put_page(data_page, 0);
			add_gc_inode(gc_list, inode);
			continue;
		}
		/* phase 4: only inodes that phase 3 added to gc_list are migrated */
		inode = find_gc_inode(gc_list, dni.ino);
		if (inode) {
			struct f2fs_inode_info *fi = F2FS_I(inode);
			bool locked = false;

			/*
			 * For regular files take both dio_rwsem locks without
			 * blocking; if either is contended, skip this block.
			 */
			if (S_ISREG(inode->i_mode)) {
				if (!down_write_trylock(&fi->dio_rwsem[READ]))
					continue;
				if (!down_write_trylock(&fi->dio_rwsem[WRITE])) {
					up_write(&fi->dio_rwsem[READ]);
					continue;
				}
				locked = true;
			}

			/* index passed to the move helpers: node start index + in-node offset */
			start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node;
			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
				move_encrypted_block(inode, start_bidx);
			else
				move_data_page(inode, start_bidx, gc_type);
			if (locked) {
				up_write(&fi->dio_rwsem[WRITE]);
				up_write(&fi->dio_rwsem[READ]);
			}

			stat_inc_data_blk_count(sbi, 1, gc_type);
		}
	}

	if (++phase < 5)
		goto next_step;
}

is_alive：这个函数主要确认一个data block是不是有效的。首先根据summary得到对应的dnode，然后对比summary中的version和node_info中的version，然后再对比block的地址和dnode中记录的地址，只有两者都是一致的才能说明这个data block是有效的。

/*
 * is_alive - check that a data block is still referenced by its dnode.
 * @sbi:     f2fs superblock info
 * @sum:     summary entry describing the block (owning nid, slot, version)
 * @dni:     out; filled with the node info of the owning dnode
 * @blkaddr: address of the block under test
 * @nofs:    out; node offset of the dnode, written only when the version
 *           check passes
 *
 * Return: true only if the dnode page can be read, the summary version
 * equals the node-info version, and the dnode slot still holds @blkaddr.
 */
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
	nid_t owner = le32_to_cpu(sum->nid);
	unsigned int slot = le16_to_cpu(sum->ofs_in_node);
	struct page *dnode_page;
	block_t recorded_addr;

	dnode_page = get_node_page(sbi, owner);
	if (IS_ERR(dnode_page))
		return false;

	get_node_info(sbi, owner, dni);

	/* a version mismatch means the summary entry is stale */
	if (sum->version != dni->version) {
		f2fs_put_page(dnode_page, 1);
		return false;
	}

	*nofs = ofs_of_node(dnode_page);
	recorded_addr = datablock_addr(dnode_page, slot);
	f2fs_put_page(dnode_page, 1);

	/* alive only if the dnode still points at this very block */
	return recorded_addr == blkaddr;
}

move_data_page：完成真正的迁移。首先判断是否为BG_GC，如果是的话就不着急回收，将对应的data block设置dirty并置为cold就行了。否则就着急回收。所以将对应的data block设置dirty并置为cold，然后调用写函数do_write_data_page进行写回操作。

/*
 * move_data_page - relocate one data block on behalf of GC.
 * @inode:   owner of the block
 * @bidx:    block index within the file
 * @gc_type: BG_GC only marks the page dirty and cold so that it is written
 *           later; any other type writes it out immediately through
 *           do_write_data_page().
 *
 * The page is unlocked and released before returning.
 */
static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
{
	struct page *page = get_lock_data_page(inode, bidx, true);

	if (IS_ERR(page))
		return;

	if (gc_type != BG_GC) {
		struct f2fs_io_info fio = {
			.sbi = F2FS_I_SB(inode),
			.type = DATA,
			.op = REQ_OP_WRITE,
			.op_flags = WRITE_SYNC,
			.page = page,
			.encrypted_page = NULL,
		};
		bool was_dirty = PageDirty(page);
		int err;

		for (;;) {
			set_page_dirty(page);
			f2fs_wait_on_page_writeback(page, DATA, true);
			if (clear_page_dirty_for_io(page))
				inode_dec_dirty_pages(inode);

			set_cold_data(page);

			err = do_write_data_page(&fio);
			/*
			 * Retry only on -ENOMEM, and only for a page that was
			 * already dirty on entry.
			 */
			if (err != -ENOMEM || !was_dirty)
				break;
			congestion_wait(BLK_RW_ASYNC, HZ/50);
		}

		clear_cold_data(page);
	} else if (!PageWriteback(page)) {
		/* background: leave the actual write for later */
		set_page_dirty(page);
		set_cold_data(page);
	}

	f2fs_put_page(page, 1);
}

猜你喜欢