概述
对于硬盘的访问,如果IO所涉及的数据量太大、或者跨页等原因,有可能需要将访问IO拆分成多个小IO来访问。本文根据SPDK开源代码中example\nvme\hello_world
示例进行研究
Request结构
在SPDK中将对硬盘的访问IO首先包装成一个Request请求,如果此IO需要拆分,则会将拆分后的IO记录到未拆分时创建的这个Request的children
字段(队列),下面是Request结构,只显示几个重要的以及跟拆分IO相关的字段
/*
 * Per-I/O request object in the SPDK NVMe driver. A large I/O that must be
 * split keeps the split pieces on the parent's `children` list; fields elided
 * in this excerpt are marked with "......".
 */
struct nvme_request {
struct spdk_nvme_cmd cmd; // SQE: the NVMe submission queue entry built for this request
......
/**
 * Number of children requests still outstanding for this
 * request which was split into multiple child requests.
 */
uint16_t num_children;
......
struct spdk_nvme_qpair *qpair; // I/O queue pair this request is submitted on
......
struct spdk_nvme_cpl cpl; // CQE: completion queue entry (result) for this request
/**
 * The following members should not be reordered with members
 * above. These members are only needed when splitting
 * requests which is done rarely, and the driver is careful
 * to not touch the following fields until a split operation is
 * needed, to avoid touching an extra cacheline.
 */
/**
 * Points to the outstanding child requests for a parent request.
 * Only valid if a request was split into multiple children
 * requests, and is not initialized for non-split requests.
 */
TAILQ_HEAD(, nvme_request) children;
/**
 * Linked-list pointers for a child request in its parent's list.
 */
TAILQ_ENTRY(nvme_request) child_tailq;
/**
 * Points to a parent request if part of a split request,
 * NULL otherwise.
 */
struct nvme_request *parent;
/**
 * Completion status for a parent request. Initialized to all 0's
 * (SUCCESS) before child requests are submitted. If a child
 * request completes with error, the error status is copied here,
 * to ensure that the parent request is also completed with error
 * status once all child requests are completed.
 */
struct spdk_nvme_cpl parent_status;
/**
 * The user_cb_fn and user_cb_arg fields are used for holding the original
 * callback data when using nvme_allocate_request_user_copy.
 */
spdk_nvme_cmd_cb user_cb_fn;
void *user_cb_arg;
void *user_buffer;
};
IO拆分
接口调用关系
下图是一个IO的request的创建以及拆分动作的调用过程。
<img src="D:\总结\md\spdk_IO_split\image-20230321144008359.png" alt="image-20230321144008359" style="zoom:50%;" />
实现
拆分过程最主要的就是一个while
循环,将一个大的IO拆分成多个能一次处理的小IO
吐槽:接口的参数不是一般的多。。。
/*
 * Split a large I/O (described by `req`) into child requests of at most
 * sectors_per_max_io LBAs each, aligned so no child crosses a
 * (sector_mask + 1)-sized boundary. Each child is linked onto `req`.
 * Returns `req` on success, NULL if allocating a child failed (*rc is set
 * by _nvme_add_child_request in that case).
 */
static struct nvme_request *
_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns,
			   struct spdk_nvme_qpair *qpair,
			   const struct nvme_payload *payload,
			   uint32_t payload_offset, uint32_t md_offset,
			   uint64_t lba, uint32_t lba_count,
			   spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
			   uint32_t io_flags, struct nvme_request *req,
			   uint32_t sectors_per_max_io, uint32_t sector_mask,
			   uint16_t apptag_mask, uint16_t apptag, int *rc)
{
	uint32_t sector_size = _nvme_get_host_buffer_sector_size(ns, io_flags);
	struct nvme_request *child_req;
	uint32_t chunk;

	for (uint32_t remaining = lba_count; remaining > 0; remaining -= chunk) {
		/* Largest chunk that neither crosses the next boundary nor
		 * exceeds what is left to transfer. */
		chunk = spdk_min(remaining,
				 (uint32_t)(sectors_per_max_io - (lba & sector_mask)));

		child_req = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
						    lba, chunk, cb_fn, cb_arg, opc,
						    io_flags, apptag_mask, apptag, req, true, rc);
		if (child_req == NULL) {
			return NULL;
		}

		/* Advance LBA, data offset and metadata offset past this chunk. */
		lba += chunk;
		payload_offset += chunk * sector_size;
		md_offset += chunk * ns->md_size;
	}

	return req;
}
而对于每一个小的IO,都会调用接口_nvme_ns_cmd_rw()
创建成一个request,然后调用nvme_request_add_child()
将新的request放到最开始的大IO对应的Request的一个子request队列中,如下
/*
 * Link `child` onto `parent`'s children list and redirect the child's
 * completion callback to nvme_cb_complete_child, which updates the parent
 * when each child finishes. (Lines elided in this excerpt: "......".)
 */
static inline void
nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child)
{
......
parent->num_children++;
TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); // link onto the parent's children queue
child->parent = parent;
child->cb_fn = nvme_cb_complete_child;
child->cb_arg = child;
}
从上面代码中可以看出,每一个子request完成后的回调接口都是指向nvme_cb_complete_child
,这个接口做了一些资源清理的工作,如下:
static inline void
nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl)
{
struct nvme_request *child = child_arg;
struct nvme_request *parent = child->parent;
nvme_request_remove_child(parent, child);
if (spdk_nvme_cpl_is_error(cpl)) {
memcpy(&parent->parent_status, cpl, sizeof(*cpl));
}
if (parent->num_children == 0) {
nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair,
parent, &parent->parent_status);
nvme_free_request(parent);
}
}
IO执行
前面将IO拆分后组成多个小的request放到父request的一个队列中。对父request进行submit,在处理时会判断父request中是否存在子request,如果有则会循环将子request进行submit,所有子request完成之后直接退出不会再处理父request(相当于父request只是一个容器),代码如下:
/*
 * Submit a request to a queue pair. For a split (parent) request, only the
 * children are submitted; the parent acts as a container and is completed
 * by nvme_cb_complete_child when its last child finishes.
 * NOTE: this excerpt elides code ("......"); the locals child_req, tmp,
 * child_req_failed and rc are declared in the elided portion.
 */
static inline int
_nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
......
if (req->num_children) {
/*
 * This is a split (parent) request. Submit all of the children but not the parent
 * request itself, since the parent is the original unsplit request.
 */
TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
if (spdk_likely(!child_req_failed)) {
rc = nvme_qpair_submit_request(qpair, child_req);
if (spdk_unlikely(rc != 0)) {
child_req_failed = true;
}
} else { /* free remaining child_reqs since one child_req fails */
nvme_request_remove_child(req, child_req);
nvme_request_free_children(child_req);
nvme_free_request(child_req);
}
}
if (spdk_unlikely(child_req_failed)) {
/* part of children requests have been submitted,
 * return success since we must wait for those children to complete,
 * but set the parent request to failure.
 */
if (req->num_children) {
req->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
return 0;
}
goto error;
}
return rc;
}
......
}
学习地址:Dpdk/网络协议栈/vpp/OvS/DDos/NFV/虚拟化/高性能专家(免费订阅,永久学习)
【文章福利】需要更多DPDK/SPDK学习资料加群793599096(资料包括C/C++,Linux,golang技术,内核,Nginx,ZeroMQ,MySQL,Redis,fastdfs,MongoDB,ZK,CDN,P2P,K8S,Docker,TCP/IP,协程,DPDK,大厂面试题 等)可以自行添加学习交流群点击这里噢~
IO拆分分析
SPDK中IO拆分条件
SPDK中调用IO拆分的点如下代码:
/*
 * Build (and, when necessary, split) a read/write request. Three split
 * paths exist: (1) the I/O crosses a controller stripe boundary,
 * (2) lba_count exceeds the per-I/O sector limit, (3) an SGL payload needs
 * SGL- or PRP-specific splitting depending on controller support.
 * NOTE: this excerpt elides code ("......"); req, sectors_per_stripe and
 * sectors_per_max_io are set up in the elided portion.
 */
static inline struct nvme_request *
_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl, int *rc)
{
......
/*
 * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping.
 * If this controller defines a stripe boundary and this I/O spans a stripe
 * boundary, split the request into multiple requests and submit each
 * separately to hardware.
 */
if (sectors_per_stripe > 0 &&
(((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) {
return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
cb_fn,
cb_arg, opc,
io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag, rc);
} else if (lba_count > sectors_per_max_io) {
/* I/O larger than the controller's max transfer size: split on that limit. */
return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
cb_fn,
cb_arg, opc,
io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag, rc);
} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) {
/* SGL payload: pick splitting strategy by controller SGL support. */
if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset,
lba, lba_count, cb_fn, cb_arg, opc, io_flags,
req, apptag_mask, apptag, rc);
} else {
return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset,
lba, lba_count, cb_fn, cb_arg, opc, io_flags,
req, apptag_mask, apptag, rc);
}
}
/* No split needed: fill in the command for the single request. */
_nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
return req;
}
代码中只有三个拆分IO的分支:
- 磁盘有设置stripe,并且此次IO跨stripe边界了
- IO中LBA数量超过磁盘的一个IO支持最大的sector数量
- 这个很奇怪,看最外层条件是设置使用SGL方式,但是内部条件又分成了SGL和PRP两种方式。。。
构造验证IO拆分
SPDK的hello_world示例中,可以针对第二个拆分条件进行构造,hello_world.c
文件做如下修改
-
申请buffer时申请2M的空间,如下
sequence.buf = spdk_zmalloc(0x200000, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
-
写IO时入参
lba_count
改为4096rc = spdk_nvme_ns_cmd_write(ns_entry->ns, ns_entry->qpair, sequence.buf, 0, /* LBA start */ 4096, /* number of LBAs */ write_complete, &sequence, 0);
经过调试,如上构造满足SPDK的IO拆分的第二个条件lba_count > sectors_per_max_io
在前面IO执行章节的_nvme_qpair_submit_request
接口中增加打印children的数量,如下
SPDK_ERRLOG("----------num of children = %d------------\n\n", req->num_children);
if (req->num_children) {
......
}
打印子request的个数如下
[2023-03-23 09:02:41.777381] nvme_qpair.c: 946:_nvme_qpair_submit_request: *ERROR*:----------num of children = 2------------