DPDK系列之二十五DPDK中的内存池

一、内存池

内存池，那真是分析的太多了，几乎这玩意儿是所有的框架必备。哪有计算机框架不用内存的，哪有内存分配就直接扎楞楞的就new,malloc的。先不说是不是看上去让人觉得太LOW，关键是内存确实是很难管理，应用场景的不同，内存池的管理也要有调整。你看有单一大小的，这算是入门版；有多个大小动态适应的，这算高级版；还有动态生成和静态分组管理的，这算是专家版。
总之，就是要让内存分配和管理回收之间达到一个效率与资源的平衡，这在不同场景下可能就有所不同，侧重点有所不一样，所以一切以实际应用为原则。

二、DPDK中的内存池

做为固定大小内存分配的一种管理方式，内存池mempool其实是预应对内存快速的分配和回收的。但是一般情况下，内存的大小是无法精确确定的，所以内存池往往的结果是浪费一小部分内存，这也是一种妥协。在DPDK的内存池中，由三个部分来实现：
1、内存池的节点对象。这些对象存储在全局队列中，可通过唯一标识来访问，当然它只是一个指针结构并不是真正的内存区。
2、内存的实际存储区。它在rte_memzone 中分配出来的连续内存中，用来存储相关的内存池对象。
3、ring无锁队列。无锁队列意味着多线程中是安全的，它可以用来管理mempool的对象。
整个内存池通过环形无锁队列映射进行内存池对象的存取，同时为了兼顾多核冲突而引入了local_cache对象缓冲区，尽量减少多核访问环形队列的并发处理。

三、数据结构和源码

在前面已经把内存池rte_mempool这个数据结构简单分析了一下，这里只给出相关的代码片段：

struct rte_mempool {
    
    
	char name[RTE_MEMZONE_NAMESIZE];
	RTE_STD_C11
	union {
    
    
		void * pool_data;         
		uint64_t pool_id;         
	};
	void * pool_config;              
.......
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG

	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
#endif
}  __rte_cache_aligned;

1、创建

/* create the mempool */
struct rte_mempool *
rte_mempool_create(const char * name, unsigned n, unsigned elt_size,
	unsigned cache_size, unsigned private_data_size,
	rte_mempool_ctor_t * mp_init, void * mp_init_arg,
	rte_mempool_obj_cb_t * obj_init, void * obj_init_arg,
	int socket_id, unsigned flags)
{
    
    
	int ret;
	struct rte_mempool * mp;

	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
		private_data_size, socket_id, flags);
	if (mp == NULL)
		return NULL;

	/*
	 * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to
	 * set the correct index into the table of ops structs.
	 */
	if ((flags & MEMPOOL_F_SP_PUT) && (flags & MEMPOOL_F_SC_GET))
		ret = rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL);
	else if (flags & MEMPOOL_F_SP_PUT)
		ret = rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL);
	else if (flags & MEMPOOL_F_SC_GET)
		ret = rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL);
	else
		ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);

	if (ret)
		goto fail;

	/* call the mempool priv initializer */
	if (mp_init)
		mp_init(mp, mp_init_arg);

	if (rte_mempool_populate_default(mp) < 0)
		goto fail;

	/* call the object initializers * /
	if (obj_init)
		rte_mempool_obj_iter(mp, obj_init, obj_init_arg);

	return mp;

 fail:
	rte_mempool_free(mp);
	return NULL;
}

它调用了rte_mempool_create_empty（）这个函数：

/*
 * Free a cache. It's the responsibility of the user to make sure that any
 * remaining objects in the cache are flushed to the corresponding
 * mempool.
 */
void
rte_mempool_cache_free(struct rte_mempool_cache *cache)
{
    
    
	rte_free(cache);
}

/* create an empty mempool */
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
	unsigned cache_size, unsigned private_data_size,
	int socket_id, unsigned flags)
{
    
    
	char mz_name[RTE_MEMZONE_NAMESIZE];
	struct rte_mempool_list *mempool_list;
	struct rte_mempool *mp = NULL;
	struct rte_tailq_entry *te = NULL;
	const struct rte_memzone *mz = NULL;
	size_t mempool_size;
	unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
	struct rte_mempool_objsz objsz;
	unsigned lcore_id;
	int ret;

	/* compilation-time checks */
	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool) &
			  RTE_CACHE_LINE_MASK) != 0);
	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
			  RTE_CACHE_LINE_MASK) != 0);
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
			  RTE_CACHE_LINE_MASK) != 0);
	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
			  RTE_CACHE_LINE_MASK) != 0);
#endif

	mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list);

	/* asked for zero items */
	if (n == 0) {
    
    
		rte_errno = EINVAL;
		return NULL;
	}

	/* asked cache too big */
	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
    
    
		rte_errno = EINVAL;
		return NULL;
	}

	/* "no cache align" imply "no spread" */
	if (flags & MEMPOOL_F_NO_CACHE_ALIGN)
		flags |= MEMPOOL_F_NO_SPREAD;

	/* calculate mempool object sizes. */
	if (!rte_mempool_calc_obj_size(elt_size, flags, &objsz)) {
    
    
		rte_errno = EINVAL;
		return NULL;
	}

	rte_mcfg_mempool_write_lock();

	/*
	 * reserve a memory zone for this mempool: private data is
	 * cache-aligned
	 */
	private_data_size = (private_data_size +
			     RTE_MEMPOOL_ALIGN_MASK) & (~RTE_MEMPOOL_ALIGN_MASK);


	/* try to allocate tailq entry */
	te = rte_zmalloc("MEMPOOL_TAILQ_ENTRY", sizeof(*te), 0);
	if (te == NULL) {
    
    
		RTE_LOG(ERR, MEMPOOL, "Cannot allocate tailq entry!\n");
		goto exit_unlock;
	}

	mempool_size = MEMPOOL_HEADER_SIZE(mp, cache_size);
	mempool_size += private_data_size;
	mempool_size = RTE_ALIGN_CEIL(mempool_size, RTE_MEMPOOL_ALIGN);

	ret = snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, name);
	if (ret < 0 || ret >= (int)sizeof(mz_name)) {
    
    
		rte_errno = ENAMETOOLONG;
		goto exit_unlock;
	}

	mz = rte_memzone_reserve(mz_name, mempool_size, socket_id, mz_flags);
	if (mz == NULL)
		goto exit_unlock;

	/* init the mempool structure */
	mp = mz->addr;
	memset(mp, 0, MEMPOOL_HEADER_SIZE(mp, cache_size));
	ret = strlcpy(mp->name, name, sizeof(mp->name));
	if (ret < 0 || ret >= (int)sizeof(mp->name)) {
    
    
		rte_errno = ENAMETOOLONG;
		goto exit_unlock;
	}
	mp->mz = mz;
	mp->size = n;
	mp->flags = flags;
	mp->socket_id = socket_id;
	mp->elt_size = objsz.elt_size;
	mp->header_size = objsz.header_size;
	mp->trailer_size = objsz.trailer_size;
	/* Size of default caches, zero means disabled. */
	mp->cache_size = cache_size;
	mp->private_data_size = private_data_size;
	STAILQ_INIT(&mp->elt_list);
	STAILQ_INIT(&mp->mem_list);

	/*
	 * local_cache pointer is set even if cache_size is zero.
	 * The local_cache points to just past the elt_pa[] array.
	 */
	mp->local_cache = (struct rte_mempool_cache *)
		RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));

	/* Init all default caches. */
	if (cache_size != 0) {
    
    
		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
			mempool_cache_init(&mp->local_cache[lcore_id],
					   cache_size);
	}

	te->data = mp;

	rte_mcfg_tailq_write_lock();
	TAILQ_INSERT_TAIL(mempool_list, te, next);
	rte_mcfg_tailq_write_unlock();
	rte_mcfg_mempool_write_unlock();

	return mp;

exit_unlock:
	rte_mcfg_mempool_write_unlock();
	rte_free(te);
	rte_mempool_free(mp);
	return NULL;
}

在前面说过，在内存池的数据结构定义中有三个重要的部分 rte_mempool ， rte_mempool_cache 和mempool private，它们都在上面的函数中创建。生成的对象都挂在rte_tailq_elem类型的静态变量rte_mempool_tailq中。
然后通过对三个数据结构体进行计算得到mempool的头大小并得到所有核的Cache的大小。
真正的内存，也就是实际的内存创建是在rte_mempool_populate_default这个函数中创建的。

/* Default function to populate the mempool: allocate memory in memzones,
 * and populate them. Return the number of objects added, or a negative
 * value on error.
 */
int
rte_mempool_populate_default(struct rte_mempool *mp)
{
    
    
	unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
	char mz_name[RTE_MEMZONE_NAMESIZE];
	const struct rte_memzone *mz;
	ssize_t mem_size;
	size_t align, pg_sz, pg_shift = 0;
	rte_iova_t iova;
	unsigned mz_id, n;
	int ret;
	bool need_iova_contig_obj;
	size_t max_alloc_size = SIZE_MAX;

	ret = mempool_ops_alloc_once(mp);
	if (ret != 0)
		return ret;

	/* mempool must not be populated */
	if (mp->nb_mem_chunks != 0)
		return -EEXIST;

	/*
	 * the following section calculates page shift and page size values.
	 *
	 * these values impact the result of calc_mem_size operation, which
	 * returns the amount of memory that should be allocated to store the
	 * desired number of objects. when not zero, it allocates more memory
	 * for the padding between objects, to ensure that an object does not
	 * cross a page boundary. in other words, page size/shift are to be set
	 * to zero if mempool elements won't care about page boundaries.
	 * there are several considerations for page size and page shift here.
	 *
	 * if we don't need our mempools to have physically contiguous objects,
	 * then just set page shift and page size to 0, because the user has
	 * indicated that there's no need to care about anything.
	 *
	 * if we do need contiguous objects (if a mempool driver has its
	 * own calc_size() method returning min_chunk_size = mem_size),
	 * there is also an option to reserve the entire mempool memory
	 * as one contiguous block of memory.
	 *
	 * if we require contiguous objects, but not necessarily the entire
	 * mempool reserved space to be contiguous, pg_sz will be != 0,
	 * and the default ops->populate() will take care of not placing
	 * objects across pages.
	 *
	 * if our IO addresses are physical, we may get memory from bigger
	 * pages, or we might get memory from smaller pages, and how much of it
	 * we require depends on whether we want bigger or smaller pages.
	 * However, requesting each and every memory size is too much work, so
	 * what we'll do instead is walk through the page sizes available, pick
	 * the smallest one and set up page shift to match that one. We will be
	 * wasting some space this way, but it's much nicer than looping around
	 * trying to reserve each and every page size.
	 *
	 * If we fail to get enough contiguous memory, then we'll go and
	 * reserve space in smaller chunks.
	 */

	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
	ret = rte_mempool_get_page_size(mp, &pg_sz);
	if (ret < 0)
		return ret;

	if (pg_sz != 0)
		pg_shift = rte_bsf32(pg_sz);

	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
    
    
		size_t min_chunk_size;

		mem_size = rte_mempool_ops_calc_mem_size(
			mp, n, pg_shift, &min_chunk_size, &align);

		if (mem_size < 0) {
    
    
			ret = mem_size;
			goto fail;
		}

		ret = snprintf(mz_name, sizeof(mz_name),
			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
    
    
			ret = -ENAMETOOLONG;
			goto fail;
		}

		/* if we're trying to reserve contiguous memory, add appropriate
		 * memzone flag.
		 */
		if (min_chunk_size == (size_t)mem_size)
			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;

		/* Allocate a memzone, retrying with a smaller area on ENOMEM */
		do {
    
    
			mz = rte_memzone_reserve_aligned(mz_name,
				RTE_MIN((size_t)mem_size, max_alloc_size),
				mp->socket_id, mz_flags, align);

			if (mz != NULL || rte_errno != ENOMEM)
				break;

			max_alloc_size = RTE_MIN(max_alloc_size,
						(size_t)mem_size) / 2;
		} while (mz == NULL && max_alloc_size >= min_chunk_size);

		if (mz == NULL) {
    
    
			ret = -rte_errno;
			goto fail;
		}

		if (need_iova_contig_obj)
			iova = mz->iova;
		else
			iova = RTE_BAD_IOVA;

		if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
			ret = rte_mempool_populate_iova(mp, mz->addr,
				iova, mz->len,
				rte_mempool_memchunk_mz_free,
				(void *)(uintptr_t)mz);
		else
			ret = rte_mempool_populate_virt(mp, mz->addr,
				mz->len, pg_sz,
				rte_mempool_memchunk_mz_free,
				(void *)(uintptr_t)mz);
		if (ret < 0) {
    
    
			rte_memzone_free(mz);
			goto fail;
		}
	}

	return mp->size;

 fail:
	rte_mempool_free_memchunks(mp);
	return ret;
}

其中通过下面的函数来创建Ring：

static int
mempool_ops_alloc_once(struct rte_mempool *mp)
{
    
    
	int ret;

	/* create the internal ring if not already done */
	if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
    
    
		ret = rte_mempool_ops_alloc(mp);
		if (ret != 0)
			return ret;
		mp->flags |= MEMPOOL_F_POOL_CREATED;
	}
	return 0;
}
/* wrapper to allocate an external mempool's private (pool) data. */
int
rte_mempool_ops_alloc(struct rte_mempool *mp)
{
    
    
	struct rte_mempool_ops *ops;

	ops = rte_mempool_get_ops(mp->ops_index);
	return ops->alloc(mp);
}

分配就得看rte_mempool_ops中的这个alloc在哪里初始化了：

/*
 * The following 4 declarations of mempool ops structs address
 * the need for the backward compatible mempool handlers for
 * single/multi producers and single/multi consumers as dictated by the
 * flags provided to the rte_mempool_create function
 */
static const struct rte_mempool_ops ops_mp_mc = {
    
    
	.name = "ring_mp_mc",
	.alloc = common_ring_alloc,
	.free = common_ring_free,
	.enqueue = common_ring_mp_enqueue,
	.dequeue = common_ring_mc_dequeue,
	.get_count = common_ring_get_count,
};
MEMPOOL_REGISTER_OPS(ops_mp_mc);
MEMPOOL_REGISTER_OPS(ops_sp_sc);
MEMPOOL_REGISTER_OPS(ops_mp_sc);
MEMPOOL_REGISTER_OPS(ops_sp_mc);

这样基本注册宏的调用就可以明白流程了。这里面嵌套的很深，有兴趣可以逐一跟进。整体上就是rte_ring_create_elem函数创建后插入到全局的rte_ring_tailq，然后再通过rte_mempool_ops_populate函数调用mempool_add_elem函数将申请的实际内存插入到链表中。
2、使用
使用是分配Alloc：

//下面是m_buf的调用
static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
{
    
    
	struct rte_mbuf *m;

	if (rte_mempool_get(mp, (void **)&m) < 0)
		return NULL;
	MBUF_RAW_ALLOC_CHECK(m);
	return m;
}
//下面是mempool的接口
static __rte_always_inline int
rte_mempool_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned int n)
{
    
    
	struct rte_mempool_cache *cache;
	cache = rte_mempool_default_cache(mp, rte_lcore_id());
	return rte_mempool_generic_get(mp, obj_table, n, cache);
}

static __rte_always_inline int
rte_mempool_get(struct rte_mempool *mp, void **obj_p)
{
    
    
	return rte_mempool_get_bulk(mp, obj_p, 1);
}

它最终会调用rte_mempool_get_bulk函数来获得内存对象。先从本地的Cache中获取，不够才会从rte_ring中获取mbuf并存储在本地的local_cache中。

3、回收
回收是Free：

static __rte_always_inline void
rte_mbuf_raw_free(struct rte_mbuf *m)
{
    
    
	RTE_ASSERT(RTE_MBUF_DIRECT(m));
	RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
	RTE_ASSERT(m->next == NULL);
	RTE_ASSERT(m->nb_segs == 1);
	__rte_mbuf_sanity_check(m, 0);
	rte_mempool_put(m->pool, m);
}

static __rte_always_inline void
rte_mempool_put(struct rte_mempool *mp, void *obj)
{
    
    
	rte_mempool_put_bulk(mp, &obj, 1);
}
static __rte_always_inline void
rte_mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
		     unsigned int n)
{
    
    
	struct rte_mempool_cache *cache;
	cache = rte_mempool_default_cache(mp, rte_lcore_id());
	rte_mempool_generic_put(mp, obj_table, n, cache);
}

一路下去就找到结果了。

四、总结

在可预见的将来，计算机的内存管理仍然是一件很重要的事。所以学习一些内存管理的算法和相关的技巧是十分必要的，不可忽视的是，内存池的使用，是用空间换了时间，用浪费一部分内存换取了内存碎片的降低。编程人的眼中，永远是平衡两个字，这个平衡不是简单的平均，而从整体考虑的平衡。在数据层不平衡，在应用反向不平衡，整体就平衡了。好好理解和思考就会明白，内存做为基础的开发应用资源，是多么的重要。