This chapter provides code examples using the RDMA Verbs
8.1. Automatic Path Migration (APM)
/*
* Compile Command:
* gcc apm.c -o apm -libverbs -lrdmacm
*
* Description:
* This example demonstrates Automatic Path Migration (APM). The basic flow is
* as follows:
* 1. Create connection between client and server
* 2. Set the alternate path details on each side of the connection
* 3. Perform send operations back and forth between client and server
* 4. Cause the path to be migrated (manually or automatically)
* 5. Complete sends using the alternate path
*
* There are two ways to cause the path to be migrated.
* 1. Use the ibv_modify_qp verb to set path_mig_state = IBV_MIG_MIGRATED
* 2. Assuming there are two ports on at least one side of the connection, and
* each port has a path to the other host, pull out the cable of the original
* port and watch it migrate to the other port.
*
* Running the Example:
* This example requires a specific IB network configuration to properly
* demonstrate APM. Two hosts are required, one for the client and one for the
* server. At least one of these two hosts must have a IB card with two ports.
* Both of these ports should be connected to the same subnet and each have a
* route to the other host through an IB switch.
* The executable can operate as either the client or server application. Start
* the server side first on one host then start the client on the other host. With default parameters, the
* client and server will exchange 100 sends over 100 seconds. During that time,
* manually unplug the cable connected to the original port of the two port
* host, and watch the path get migrated to the other port. It may take up to
* a minute for the path to migrated. To see the path get migrated by software,
* use the -m option on the client side.
*
* Server:
* ./apm -s
*
* Client (-a is IP of remote interface):
* ./apm -a 192.168.1.12
*
*/
#include <errno.h>
#include <getopt.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <rdma/rdma_verbs.h>
#define VERB_ERR(verb, ret) \
fprintf(stderr, "%s returned %d errno %d\n", verb, ret, errno)
/* Default parameter values */
#define DEFAULT_PORT "51216"
#define DEFAULT_MSG_COUNT 100
#define DEFAULT_MSG_LENGTH 1000000
#define DEFAULT_MSEC_DELAY 500
/* Resources used in the example */
struct context
{
/* User parameters */
int server;
char *server_name;
char *server_port;
int msg_count;
int msg_length;
int msec_delay;
uint8_t alt_srcport;
uint16_t alt_dlid;
uint16_t my_alt_dlid;
int migrate_after;
/* Resources */
struct rdma_cm_id *id;
struct rdma_cm_id *listen_id;
struct ibv_mr *send_mr;
struct ibv_mr *recv_mr;
char *send_buf;
char *recv_buf;
pthread_t async_event_thread;
};
/*
* Function: async_event_thread
*
* Input:
* arg The context object
*
* Output:
* none
*
* Returns:
* NULL
*
* Description:
* Reads any Asynchronous events that occur during the sending of data
* and prints out the details of the event. Specifically migration
* related events.
*/
static void *async_event_thread(void *arg)
{
struct ibv_async_event event;
int ret;
struct context *ctx = (struct context *) arg;
while (1) {
ret = ibv_get_async_event(ctx->id->verbs, &event);
if (ret) {
VERB_ERR("ibv_get_async_event", ret);
break;
}
switch (event.event_type) {
case IBV_EVENT_PATH_MIG:
printf("QP path migrated\n");
break;
case IBV_EVENT_PATH_MIG_ERR:
printf("QP path migration error\n");
break;
default:
printf("Async Event %d\n", event.event_type);
break;
}
ibv_ack_async_event(&event);
}
return NULL;
}
/*
* Function: get_alt_dlid_from_private_data
*
* Input:
* event The RDMA event containing private data
*
* Output:
* dlid The DLID that was sent in the private data
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Takes the private data sent from the remote side and returns the
* destination LID that was contained in the private data
*/
int get_alt_dlid_from_private_data(struct rdma_cm_event *event, uint16_t *dlid)
{
if (event->param.conn.private_data_len < 4) {
printf("unexpected private data len: %d",
event->param.conn.private_data_len);
return -1;
}
*dlid = ntohs(*((uint16_t *) event->param.conn.private_data));
return 0;
}
/*
* Function: get_alt_port_details
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* First, query the device to determine if path migration is supported.
* Next, queries all the ports on the device to determine if there is
* different port than the current one to use as an alternate port. If so,
* copy the port number and dlid to the context so they can be used when
* the alternate path is loaded.
*
* Note:
* This function assumes that if another port is found in the active state,
* that the port is connected to the same subnet as the initial port and
* that there is a route to the other hosts alternate port.
*/
int get_alt_port_details(struct context *ctx)
{
int ret, i;
struct ibv_qp_attr qp_attr;
struct ibv_qp_init_attr qp_init_attr;
struct ibv_device_attr dev_attr;
/* This example assumes the alternate port we want to use is on the same
* HCA. Ports from other HCAs can be used as alternate paths as well. Get
* a list of devices using ibv_get_device_list or rdma_get_devices.*/
ret = ibv_query_device(ctx->id->verbs, &dev_attr);
if (ret) {
VERB_ERR("ibv_query_device", ret);
return ret;
}
/* Verify the APM is supported by the HCA */
if (!(dev_attr.device_cap_flags | IBV_DEVICE_AUTO_PATH_MIG)) {
printf("device does not support auto path migration!\n");
return -1;
}
/* Query the QP to determine which port we are bound to */
ret = ibv_query_qp(ctx->id->qp, &qp_attr, 0, &qp_init_attr);
if (ret) {
VERB_ERR("ibv_query_qp", ret);
return ret;
}
for (i = 1; i <= dev_attr.phys_port_cnt; i++) {
/* Query all ports until we find one in the active state that is
* not the port we are currently connected to. */
struct ibv_port_attr port_attr;
ret = ibv_query_port(ctx->id->verbs, i, &port_attr);
if (ret) {
VERB_ERR("ibv_query_device", ret);
return ret;
}
if (port_attr.state == IBV_PORT_ACTIVE) {
ctx->my_alt_dlid = port_attr.lid;
ctx->alt_srcport = i;
if (qp_attr.port_num != i)
break;
}
}
return 0;
}
/*
* Function: load_alt_path
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Uses ibv_modify_qp to load the alternate path information and set the
* path migration state to rearm.
*/
int load_alt_path(struct context *ctx)
{
int ret;
struct ibv_qp_attr qp_attr;
struct ibv_qp_init_attr qp_init_attr;
/* query to get the current attributes of the qp */
ret = ibv_query_qp(ctx->id->qp, &qp_attr, 0, &qp_init_attr);
if (ret) {
VERB_ERR("ibv_query_qp", ret);
return ret;
}
/* initialize the alternate path attributes with the current path
* attributes */
memcpy(&qp_attr.alt_ah_attr, &qp_attr.ah_attr, sizeof (struct ibv_ah_attr));
/* set the alt path attributes to some basic values */
qp_attr.alt_pkey_index = qp_attr.pkey_index;
qp_attr.alt_timeout = qp_attr.timeout;
qp_attr.path_mig_state = IBV_MIG_REARM;
/* if an alternate path was supplied, set the source port and the dlid */
if (ctx->alt_srcport)
qp_attr.alt_port_num = ctx->alt_srcport;
else
qp_attr.alt_port_num = qp_attr.port_num;
if (ctx->alt_dlid)
qp_attr.alt_ah_attr.dlid = ctx->alt_dlid;
printf("loading alt path - local port: %d, dlid: %d\n",
qp_attr.alt_port_num, qp_attr.alt_ah_attr.dlid);
ret = ibv_modify_qp(ctx->id->qp, &qp_attr,
IBV_QP_ALT_PATH | IBV_QP_PATH_MIG_STATE);
if (ret) {
VERB_ERR("ibv_modify_qp", ret);
return ret;
}
}
/*
* Function: reg_mem
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Registers memory regions to use for our data transfer
*/
int reg_mem(struct context *ctx)
{
ctx->send_buf = (char *) malloc(ctx->msg_length);
memset(ctx->send_buf, 0x12, ctx->msg_length);
ctx->recv_buf = (char *) malloc(ctx->msg_length);
memset(ctx->recv_buf, 0x00, ctx->msg_length);
ctx->send_mr = rdma_reg_msgs(ctx->id, ctx->send_buf, ctx->msg_length);
if (!ctx->send_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
ctx->recv_mr = rdma_reg_msgs(ctx->id, ctx->recv_buf, ctx->msg_length);
if (!ctx->recv_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
return 0;
}
/*
* Function: getaddrinfo_and_create_ep
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Gets the address information and creates our endpoint
*/
int getaddrinfo_and_create_ep(struct context *ctx)
{
int ret;
struct rdma_addrinfo *rai, hints;
struct ibv_qp_init_attr qp_init_attr;
memset(&hints, 0, sizeof (hints));
hints.ai_port_space = RDMA_PS_TCP;
if (ctx->server == 1)
hints.ai_flags = RAI_PASSIVE; /* this makes it a server */
printf("rdma_getaddrinfo\n");
ret = rdma_getaddrinfo(ctx->server_name, ctx->server_port, &hints, &rai);
if (ret) {
VERB_ERR("rdma_getaddrinfo", ret);
return ret;
}
memset(&qp_init_attr, 0, sizeof (qp_init_attr));
qp_init_attr.cap.max_send_wr = 1;
qp_init_attr.cap.max_recv_wr = 1;
qp_init_attr.cap.max_send_sge = 1;
qp_init_attr.cap.max_recv_sge = 1;
printf("rdma_create_ep\n");
ret = rdma_create_ep(&ctx->id, rai, NULL, &qp_init_attr);
if (ret) {
VERB_ERR("rdma_create_ep", ret);
return ret;
}
rdma_freeaddrinfo(rai);
return 0;
}
/*
* Function: get_connect_request
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Wait for a connect request from the client
*/
int get_connect_request(struct context *ctx)
{
int ret;
printf("rdma_listen\n");
ret = rdma_listen(ctx->id, 4);
if (ret) {
VERB_ERR("rdma_listen", ret);
return ret;
}
ctx->listen_id = ctx->id;
printf("rdma_get_request\n");
ret = rdma_get_request(ctx->listen_id, &ctx->id);
if (ret) {
VERB_ERR("rdma_get_request", ret);
return ret;
}
if (ctx->id->event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
printf("unexpected event: %s",
rdma_event_str(ctx->id->event->event));
return ret;
}
/* If the alternate path info was not set on the command line, get
* it from the private data */
if (ctx->alt_dlid == 0 && ctx->alt_srcport == 0) {
ret = get_alt_dlid_from_private_data(ctx->id->event, &ctx->alt_dlid);
if (ret) {
return ret;
}
}
return 0;
}
/*
* Function: establish_connection
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Create the connection. For the client, call rdma_connect. For the
* server, the connect request was already received, so just do
* rdma_accept to complete the connection.
*/
int establish_connection(struct context *ctx)
{
int ret;
uint16_t private_data;
struct rdma_conn_param conn_param;
/* post a receive to catch the first send */
ret = rdma_post_recv(ctx->id, NULL, ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
/* send the dlid for the alternate port in the private data */
private_data = htons(ctx->my_alt_dlid);
memset(&conn_param, 0, sizeof (conn_param));
conn_param.private_data_len = sizeof (int);
conn_param.private_data = &private_data;
conn_param.responder_resources = 2;
conn_param.initiator_depth = 2;
conn_param.retry_count = 5;
conn_param.rnr_retry_count = 5;
if (ctx->server) {
printf("rdma_accept\n");
ret = rdma_accept(ctx->id, &conn_param);
if (ret) {
VERB_ERR("rdma_accept", ret);
return ret;
}
}
else {
printf("rdma_connect\n");
ret = rdma_connect(ctx->id, &conn_param);
if (ret) {
VERB_ERR("rdma_connect", ret);
return ret;
}
if (ctx->id->event->event != RDMA_CM_EVENT_ESTABLISHED) {
printf("unexpected event: %s",
rdma_event_str(ctx->id->event->event));
return -1;
}
/* If the alternate path info was not set on the command line, get
* it from the private data */
if (ctx->alt_dlid == 0 && ctx->alt_srcport == 0) {
ret = get_alt_dlid_from_private_data(ctx->id->event,
&ctx->alt_dlid);
if (ret)
return ret;
}
}
return 0;
}
/*
* Function: send_msg
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Performs an Send and gets the completion
*
*/
int send_msg(struct context *ctx)
{
int ret;
struct ibv_wc wc;
ret = rdma_post_send(ctx->id, NULL, ctx->send_buf, ctx->msg_length,
ctx->send_mr, IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_send_recv", ret);
return ret;
}
ret = rdma_get_send_comp(ctx->id, &wc);
if (ret < 0) {
VERB_ERR("rdma_get_send_comp", ret);
return ret;
}
return 0;
}
/*
* Function: recv_msg
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Waits for a receive completion and posts a new receive buffer
*/
int recv_msg(struct context *ctx)
{
int ret;
struct ibv_wc wc;
ret = rdma_get_recv_comp(ctx->id, &wc);
if (ret < 0) {
VERB_ERR("rdma_get_recv_comp", ret);
return ret;
}
ret = rdma_post_recv(ctx->id, NULL, ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
return 0;
}
/*
* Function: main
*
* Input:
* argc The number of command line arguments
* argv The command line argument array
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
*
*/
int main(int argc, char** argv)
{
int ret, op, i, send_cnt, recv_cnt;
struct context ctx;
struct ibv_qp_attr qp_attr;
memset(&ctx, 0, sizeof (ctx));
memset(&qp_attr, 0, sizeof (qp_attr));
ctx.server = 0;
ctx.server_port = DEFAULT_PORT;
ctx.msg_count = DEFAULT_MSG_COUNT;
ctx.msg_length = DEFAULT_MSG_LENGTH;
ctx.msec_delay = DEFAULT_MSEC_DELAY;
ctx.alt_dlid = 0;
ctx.alt_srcport = 0;
ctx.migrate_after = -1;
while ((op = getopt(argc, argv, "sa:p:c:l:d:r:m:")) != -1) {
switch (op) {
case 's':
ctx.server = 1;
break;
case 'a':
ctx.server_name = optarg;
break;
case 'p':
ctx.server_port = optarg;
break;
case 'c':
ctx.msg_count = atoi(optarg);
break;
case 'l':
ctx.msg_length = atoi(optarg);
break;
case 'd':
ctx.alt_dlid = atoi(optarg);
break;
case 'r':
ctx.alt_srcport = atoi(optarg);
break;
case 'm':
ctx.migrate_after = atoi(optarg);
break;
case 'w':
ctx.msec_delay = atoi(optarg);
break;
default:
printf("usage: %s [-s or -a required]\n", argv[0]);
printf("\t[-s[erver mode]\n");
printf("\t[-a ip_address]\n");
printf("\t[-p port_number]\n");
printf("\t[-c msg_count]\n");
printf("\t[-l msg_length]\n");
printf("\t[-d alt_dlid] (requires -r)\n");
printf("\t[-r alt_srcport] (requires -d)\n");
printf("\t[-m num_iterations_then_migrate] (client only)\n");
printf("\t[-w msec_wait_between_sends]\n");
exit(1);
}
}
printf("mode: %s\n", (ctx.server) ? "server" : "client");
printf("address: %s\n", (!ctx.server_name) ? "NULL" : ctx.server_name);
printf("port: %s\n", ctx.server_port);
printf("count: %d\n", ctx.msg_count);
printf("length: %d\n", ctx.msg_length);
printf("alt_dlid: %d\n", ctx.alt_dlid);
printf("alt_port: %d\n", ctx.alt_srcport);
printf("mig_after: %d\n", ctx.migrate_after);
printf("msec_wait: %d\n", ctx.msec_delay);
printf("\n");
if (!ctx.server && !ctx.server_name) {
printf("server address must be specified for client mode\n");
exit(1);
}
/* both of these must be set or neither should be set */
if (!((ctx.alt_dlid > 0 && ctx.alt_srcport > 0) ||
(ctx.alt_dlid == 0 && ctx.alt_srcport == 0))) {
printf("-d and -r must be used together\n");
exit(1);
}
if (ctx.migrate_after > ctx.msg_count) {
printf("num_iterations_then_migrate must be less than msg_count\n");
exit(1);
}
ret = getaddrinfo_and_create_ep(&ctx);
if (ret)
goto out;
if (ctx.server) {
ret = get_connect_request(&ctx);
if (ret)
goto out;
}
/* only query for alternate port if information was not specified on the
* command line */
if (ctx.alt_dlid == 0 && ctx.alt_srcport == 0) {
ret = get_alt_port_details(&ctx);
if (ret)
goto out;
}
/* create a thread to handle async events */
pthread_create(&ctx.async_event_thread, NULL, async_event_thread, &ctx);
ret = reg_mem(&ctx);
if (ret)
goto out;
ret = establish_connection(&ctx);
/* load the alternate path after the connection was created. This can be
* done at connection time, but the connection must be created and
* established using all ib verbs */
ret = load_alt_path(&ctx);
if (ret)
goto out;
send_cnt = recv_cnt = 0;
for (i = 0; i < ctx.msg_count; i++) {
if (ctx.server) {
if (recv_msg(&ctx))
break;
printf("recv: %d\n", ++recv_cnt);
}
if (ctx.msec_delay > 0)
usleep(ctx.msec_delay * 1000);
if (send_msg(&ctx))
break;
printf("send: %d\n", ++send_cnt);
if (!ctx.server) {
if (recv_msg(&ctx))
break;
printf("recv: %d\n", ++recv_cnt);
}
/* migrate the path manually if desired after the specified number of
* sends */
if (!ctx.server && i == ctx.migrate_after) {
qp_attr.path_mig_state = IBV_MIG_MIGRATED;
ret = ibv_modify_qp(ctx.id->qp, &qp_attr, IBV_QP_PATH_MIG_STATE);
if (ret) {
VERB_ERR("ibv_modify_qp", ret);
goto out;
}
}
}
rdma_disconnect(ctx.id);
out:
if (ctx.send_mr)
rdma_dereg_mr(ctx.send_mr);
if (ctx.recv_mr)
rdma_dereg_mr(ctx.recv_mr);
if (ctx.id)
rdma_destroy_ep(ctx.id);
if (ctx.listen_id)
rdma_destroy_ep(ctx.listen_id);
if (ctx.send_buf)
free(ctx.send_buf);
if (ctx.recv_buf)
free(ctx.recv_buf);
return ret;
}
8.3. Shared Receive Queue (SRQ)
/*
* Compile Command:
* gcc srq.c -o srq -libverbs -lrdmacm
*
* Description:
* Both the client and server use an SRQ. A number of Queue Pairs (QPs) are
* created (ctx.qp_count) and each QP uses the SRQ. The connection between the
* client and server is established using the IP address details passed on the
* command line. After the connection is established, the client starts
* blasting sends to the server and stops when the maximum work requests
* (ctx.max_wr) have been sent. When the server has received all the sends, it
* performs a send to the client to tell it to continue. The process repeats
* until the number of requested number of sends (ctx.msg_count) have been
* performed.
*
* Running the Example:
* The executable can operate as either the client or server application. It
* can be demonstrated on a simple fabric of two nodes with the server
* application running on one node and the client application running on the
* other. Each node must be configured to support IPoIB and the IB interface
* (ex. ib0) must be assigned an IP Address. Finally, the fabric must be
* initialized using OpenSM.
*
* Server (-a is IP of local interface):
* ./srq -s -a 192.168.1.12
*
* Client (-a is IP of remote interface):
* ./srq -a 192.168.1.12
*
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <getopt.h>
#include <rdma/rdma_verbs.h>
#define VERB_ERR(verb, ret) \
fprintf(stderr, "%s returned %d errno %d\n", verb, ret, errno)
/* Default parameters values */
#define DEFAULT_PORT "51216"
#define DEFAULT_MSG_COUNT 100
#define DEFAULT_MSG_LENGTH 100000
#define DEFAULT_QP_COUNT 4
#define DEFAULT_MAX_WR 64
/* Resources used in the example */
struct context
{
/* User parameters */
int server;
char *server_name;
char *server_port;
int msg_count;
int msg_length;
int qp_count;
int max_wr;
/* Resources */
struct rdma_cm_id *srq_id;
struct rdma_cm_id *listen_id;
struct rdma_cm_id **conn_id;
struct ibv_mr *send_mr;
struct ibv_mr *recv_mr;
struct ibv_srq *srq;
struct ibv_cq *srq_cq;
struct ibv_comp_channel *srq_cq_channel;
char *send_buf;
char *recv_buf;
};
/*
* Function: init_resources
*
* Input:
* ctx The context object
* rai The RDMA address info for the connection
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* This function initializes resources that are common to both the client
* and server functionality.
* It creates our SRQ, registers memory regions, posts receive buffers
* and creates a single completion queue that will be used for the receive
* queue on each queue pair.
*/
int init_resources(struct context *ctx, struct rdma_addrinfo *rai)
{
int ret, i;
struct rdma_cm_id *id;
/* Create an ID used for creating/accessing our SRQ */
ret = rdma_create_id(NULL, &ctx->srq_id, NULL, RDMA_PS_TCP);
if (ret) {
VERB_ERR("rdma_create_id", ret);
return ret;
}
/* We need to bind the ID to a particular RDMA device
* This is done by resolving the address or binding to the address */
if (ctx->server == 0) {
ret = rdma_resolve_addr(ctx->srq_id, NULL, rai->ai_dst_addr, 1000);
if (ret) {
VERB_ERR("rdma_resolve_addr", ret);
return ret;
}
}
else {
ret = rdma_bind_addr(ctx->srq_id, rai->ai_src_addr);
if (ret) {
VERB_ERR("rdma_bind_addr", ret);
return ret;
}
}
/* Create the memory regions being used in this example */
ctx->recv_mr = rdma_reg_msgs(ctx->srq_id, ctx->recv_buf, ctx->msg_length);
if (!ctx->recv_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
ctx->send_mr = rdma_reg_msgs(ctx->srq_id, ctx->send_buf, ctx->msg_length);
if (!ctx->send_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
/* Create our shared receive queue */
struct ibv_srq_init_attr srq_attr;
memset(&srq_attr, 0, sizeof (srq_attr));
srq_attr.attr.max_wr = ctx->max_wr;
srq_attr.attr.max_sge = 1;
ret = rdma_create_srq(ctx->srq_id, NULL, &srq_attr);
if (ret) {
VERB_ERR("rdma_create_srq", ret);
return -1;
}
/* Save the SRQ in our context so we can assign it to other QPs later */
ctx->srq = ctx->srq_id->srq;
/* Post our receive buffers on the SRQ */
for (i = 0; i < ctx->max_wr; i++) {
ret = rdma_post_recv(ctx->srq_id, NULL, ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
}
/* Create a completion channel to use with the SRQ CQ */
ctx->srq_cq_channel = ibv_create_comp_channel(ctx->srq_id->verbs);
if (!ctx->srq_cq_channel) {
VERB_ERR("ibv_create_comp_channel", -1);
return -1;
}
/* Create a CQ to use for all connections (QPs) that use the SRQ */
ctx->srq_cq = ibv_create_cq(ctx->srq_id->verbs, ctx->max_wr, NULL,
ctx->srq_cq_channel, 0);
if (!ctx->srq_cq) {
VERB_ERR("ibv_create_cq", -1);
return -1;
}
/* Make sure that we get notified on the first completion */
ret = ibv_req_notify_cq(ctx->srq_cq, 0);
if (ret) {
VERB_ERR("ibv_req_notify_cq", ret);
return ret;
}
return 0;
}
/*
* Function: destroy_resources
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* This function cleans up resources used by the application
*/
void destroy_resources(struct context *ctx)
{
int i;
if (ctx->conn_id) {
for (i = 0; i < ctx->qp_count; i++) {
if (ctx->conn_id[i]) {
if (ctx->conn_id[i]->qp &&
ctx->conn_id[i]->qp->state == IBV_QPS_RTS) {
rdma_disconnect(ctx->conn_id[i]);
}
rdma_destroy_qp(ctx->conn_id[i]);
rdma_destroy_id(ctx->conn_id[i]);
}
}
free(ctx->conn_id);
}
if (ctx->recv_mr)
rdma_dereg_mr(ctx->recv_mr);
if (ctx->send_mr)
rdma_dereg_mr(ctx->send_mr);
if (ctx->recv_buf)
free(ctx->recv_buf);
if (ctx->send_buf)
free(ctx->send_buf);
if (ctx->srq_cq)
ibv_destroy_cq(ctx->srq_cq);
if (ctx->srq_cq_channel)
ibv_destroy_comp_channel(ctx->srq_cq_channel);
if (ctx->srq_id) {
rdma_destroy_srq(ctx->srq_id);
rdma_destroy_id(ctx->srq_id);
}
}
/*
* Function: await_completion
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Waits for a completion on the SRQ CQ
*
*/
int await_completion(struct context *ctx)
{
int ret;
struct ibv_cq *ev_cq;
void *ev_ctx;
/* Wait for a CQ event to arrive on the channel */
ret = ibv_get_cq_event(ctx->srq_cq_channel, &ev_cq, &ev_ctx);
if (ret) {
VERB_ERR("ibv_get_cq_event", ret);
return ret;
}
ibv_ack_cq_events(ev_cq, 1);
/* Reload the event notification */
ret = ibv_req_notify_cq(ctx->srq_cq, 0);
if (ret) {
VERB_ERR("ibv_req_notify_cq", ret);
return ret;
}
return 0;
}
/*
* Function: run_server
*
* Input:
* ctx The context object
* rai The RDMA address info for the connection
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Executes the server side of the example
*/
int run_server(struct context *ctx, struct rdma_addrinfo *rai)
{
int ret, i;
uint64_t send_count = 0;
uint64_t recv_count = 0;
struct ibv_wc wc;
struct ibv_qp_init_attr qp_attr;
ret = init_resources(ctx, rai);
if (ret) {
printf("init_resources returned %d\n", ret);
return ret;
}
/* Use the srq_id as the listen_id since it is already setup */
ctx->listen_id = ctx->srq_id;
ret = rdma_listen(ctx->listen_id, 4);
if (ret) {
VERB_ERR("rdma_listen", ret);
return ret;
}
printf("waiting for connection from client...\n");
for (i = 0; i < ctx->qp_count; i++) {
ret = rdma_get_request(ctx->listen_id, &ctx->conn_id[i]);
if (ret) {
VERB_ERR("rdma_get_request", ret);
return ret;
}
/* Create the queue pair */
memset(&qp_attr, 0, sizeof (qp_attr));
qp_attr.qp_context = ctx;
qp_attr.qp_type = IBV_QPT_RC;
qp_attr.cap.max_send_wr = ctx->max_wr;
qp_attr.cap.max_recv_wr = ctx->max_wr;
qp_attr.cap.max_send_sge = 1;
qp_attr.cap.max_recv_sge = 1;
qp_attr.cap.max_inline_data = 0;
qp_attr.recv_cq = ctx->srq_cq;
qp_attr.srq = ctx->srq;
qp_attr.sq_sig_all = 0;
ret = rdma_create_qp(ctx->conn_id[i], NULL, &qp_attr);
if (ret) {
VERB_ERR("rdma_create_qp", ret);
return ret;
}
/* Set the new connection to use our SRQ */
ctx->conn_id[i]->srq = ctx->srq;
ret = rdma_accept(ctx->conn_id[i], NULL);
if (ret) {
VERB_ERR("rdma_accept", ret);
return ret;
}
}
while (recv_count < ctx->msg_count) {
i = 0;
while (i < ctx->max_wr && recv_count < ctx->msg_count) {
int ne;
ret = await_completion(ctx);
if (ret) {
printf("await_completion %d\n", ret);
return ret;
}
do {
ne = ibv_poll_cq(ctx->srq_cq, 1, &wc);
if (ne < 0) {
VERB_ERR("ibv_poll_cq", ne);
return ne;
}
else if (ne == 0)
break;
if (wc.status != IBV_WC_SUCCESS) {
printf("work completion status %s\n",
ibv_wc_status_str(wc.status));
return -1;
}
recv_count++;
printf("recv count: %d, qp_num: %d\n", recv_count, wc.qp_num);
ret = rdma_post_recv(ctx->srq_id, (void *) wc.wr_id,
ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
i++;
}
while (ne);
}
ret = rdma_post_send(ctx->conn_id[0], NULL, ctx->send_buf,
ctx->msg_length, ctx->send_mr, IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_post_send", ret);
return ret;
}
ret = rdma_post_send(ctx->conn_id[0], NULL, ctx->send_buf,
ctx->msg_length, ctx->send_mr, IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_post_send", ret);
return ret;
}
ret = rdma_get_send_comp(ctx->conn_id[0], &wc);
if (ret <= 0) {
VERB_ERR("rdma_get_send_comp", ret);
return -1;
}
send_count++;
printf("send count: %d\n", send_count);
}
return 0;
}
/*
* Function: run_client
*
* Input:
* ctx The context object
* rai The RDMA address info for the connection
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Executes the client side of the example
*/
int run_client(struct context *ctx, struct rdma_addrinfo *rai)
{
int ret, i, ne;
uint64_t send_count = 0;
uint64_t recv_count = 0;
struct ibv_wc wc;
struct ibv_qp_init_attr attr;
ret = init_resources(ctx, rai);
if (ret) {
printf("init_resources returned %d\n", ret);
return ret;
}
for (i = 0; i < ctx->qp_count; i++) {
memset(&attr, 0, sizeof (attr));
attr.qp_context = ctx;
attr.cap.max_send_wr = ctx->max_wr;
attr.cap.max_recv_wr = ctx->max_wr;
attr.cap.max_send_sge = 1;
attr.cap.max_recv_sge = 1;
attr.cap.max_inline_data = 0;
attr.recv_cq = ctx->srq_cq;
attr.srq = ctx->srq;
attr.sq_sig_all = 0;
ret = rdma_create_ep(&ctx->conn_id[i], rai, NULL, &attr);
if (ret) {
VERB_ERR("rdma_create_ep", ret);
return ret;
}
ret = rdma_connect(ctx->conn_id[i], NULL);
if (ret) {
VERB_ERR("rdma_connect", ret);
return ret;
}
}
while (send_count < ctx->msg_count) {
for (i = 0; i < ctx->max_wr && send_count < ctx->msg_count; i++) {
/* perform our send to the server */
ret = rdma_post_send(ctx->conn_id[i % ctx->qp_count], NULL,
ctx->send_buf, ctx->msg_length, ctx->send_mr,
IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_post_send", ret);
return ret;
}
ret = rdma_get_send_comp(ctx->conn_id[i % ctx->qp_count], &wc);
if (ret <= 0) {
VERB_ERR("rdma_get_send_comp", ret);
return ret;
}
send_count++;
printf("send count: %d, qp_num: %d\n", send_count, wc.qp_num);
}
/* wait for a recv indicating that all buffers were processed */
ret = await_completion(ctx);
if (ret) {
VERB_ERR("await_completion", ret);
return ret;
}
do {
ne = ibv_poll_cq(ctx->srq_cq, 1, &wc);
if (ne < 0) {
VERB_ERR("ibv_poll_cq", ne);
return ne;
}
else if (ne == 0)
break;
if (wc.status != IBV_WC_SUCCESS) {
printf("work completion status %s\n",
ibv_wc_status_str(wc.status));
return -1;
}
recv_count++;
printf("recv count: %d\n", recv_count);
ret = rdma_post_recv(ctx->srq_id, (void *) wc.wr_id,
ctx->recv_buf, ctx->msg_length, ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
}
while (ne);
}
return ret;
}
/*
* Function: main
*
* Input:
* argc The number of arguments
* argv Command line arguments
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Main program to demonstrate SRQ functionality.
* Both the client and server use an SRQ. ctx.qp_count number of QPs are
* created and each one of them uses the SRQ. After the connection, the
* client starts blasting sends to the server upto ctx.max_wr. When the
* server has received all the sends, it performs a send to the client to
* tell it that it can continue. Process repeats until ctx.msg_count
* sends have been performed.
*/
int main(int argc, char** argv)
{
int ret, op;
struct context ctx;
struct rdma_addrinfo *rai, hints;
memset(&ctx, 0, sizeof (ctx));
memset(&hints, 0, sizeof (hints));
ctx.server = 0;
ctx.server_port = DEFAULT_PORT;
ctx.msg_count = DEFAULT_MSG_COUNT;
ctx.msg_length = DEFAULT_MSG_LENGTH;
ctx.qp_count = DEFAULT_QP_COUNT;
ctx.max_wr = DEFAULT_MAX_WR;
/* Read options from command line */
while ((op = getopt(argc, argv, "sa:p:c:l:q:w:")) != -1) {
switch (op) {
case 's':
ctx.server = 1;
break;
case 'a':
ctx.server_name = optarg;
break;
case 'p':
ctx.server_port = optarg;
break;
case 'c':
ctx.msg_count = atoi(optarg);
break;
case 'l':
ctx.msg_length = atoi(optarg);
break;
case 'q':
ctx.qp_count = atoi(optarg);
break;
case 'w':
ctx.max_wr = atoi(optarg);
break;
default:
printf("usage: %s -a server_address\n", argv[0]);
printf("\t[-s server mode]\n");
printf("\t[-p port_number]\n");
printf("\t[-c msg_count]\n");
printf("\t[-l msg_length]\n");
printf("\t[-q qp_count]\n");
printf("\t[-w max_wr]\n");
exit(1);
}
}
if (ctx.server_name == NULL) {
printf("server address required (use -a)!\n");
exit(1);
}
hints.ai_port_space = RDMA_PS_TCP;
if (ctx.server == 1)
hints.ai_flags = RAI_PASSIVE; /* this makes it a server */
ret = rdma_getaddrinfo(ctx.server_name, ctx.server_port, &hints, &rai);
if (ret) {
VERB_ERR("rdma_getaddrinfo", ret);
exit(1);
}
/* allocate memory for our QPs and send/recv buffers */
ctx.conn_id = (struct rdma_cm_id **) calloc(ctx.qp_count,
sizeof (struct rdma_cm_id *));
memset(ctx.conn_id, 0, sizeof (ctx.conn_id));
ctx.send_buf = (char *) malloc(ctx.msg_length);
memset(ctx.send_buf, 0, ctx.msg_length);
ctx.recv_buf = (char *) malloc(ctx.msg_length);
memset(ctx.recv_buf, 0, ctx.msg_length);
if (ctx.server)
ret = run_server(&ctx, rai);
else
ret = run_client(&ctx, rai);
destroy_resources(&ctx);
free(rai);
return ret;
}