This chapter provides code examples using the RDMA Verbs
8.1. Automatic Path Migration (APM)
/*
* Compile Command:
* gcc apm.c -o apm -libverbs -lrdmacm
*
* Description:
* This example demonstrates Automatic Path Migration (APM). The basic flow is
* as follows:
* 1. Create connection between client and server
* 2. Set the alternate path details on each side of the connection
* 3. Perform send operations back and forth between client and server
* 4. Cause the path to be migrated (manually or automatically)
* 5. Complete sends using the alternate path
*
* There are two ways to cause the path to be migrated.
* 1. Use the ibv_modify_qp verb to set path_mig_state = IBV_MIG_MIGRATED
* 2. Assuming there are two ports on at least one side of the connection, and
* each port has a path to the other host, pull out the cable of the original
* port and watch it migrate to the other port.
*
* Running the Example:
* This example requires a specific IB network configuration to properly
* demonstrate APM. Two hosts are required, one for the client and one for the
* server. At least one of these two hosts must have a IB card with two ports.
* Both of these ports should be connected to the same subnet and each have a
* route to the other host through an IB switch.
* The executable can operate as either the client or server application. Start
* the server side first on one host then start the client on the other host. With default parameters, the
* client and server will exchange 100 sends over 100 seconds. During that time,
* manually unplug the cable connected to the original port of the two port
* host, and watch the path get migrated to the other port. It may take up to
* a minute for the path to migrated. To see the path get migrated by software,
* use the -m option on the client side.
*
* Server:
* ./apm -s
*
* Client (-a is IP of remote interface):
* ./apm -a 192.168.1.12
*
*/
#include <errno.h>
#include <getopt.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <rdma/rdma_verbs.h>
#define VERB_ERR(verb, ret) \
fprintf(stderr, "%s returned %d errno %d\n", verb, ret, errno)
/* Default parameter values */
#define DEFAULT_PORT "51216"
#define DEFAULT_MSG_COUNT 100
#define DEFAULT_MSG_LENGTH 1000000
#define DEFAULT_MSEC_DELAY 500
/* Resources used in the example */
struct context
{
/* User parameters */
int server;
char *server_name;
char *server_port;
int msg_count;
int msg_length;
int msec_delay;
uint8_t alt_srcport;
uint16_t alt_dlid;
uint16_t my_alt_dlid;
int migrate_after;
/* Resources */
struct rdma_cm_id *id;
struct rdma_cm_id *listen_id;
struct ibv_mr *send_mr;
struct ibv_mr *recv_mr;
char *send_buf;
char *recv_buf;
pthread_t async_event_thread;
};
/*
* Function: async_event_thread
*
* Input:
* arg The context object
*
* Output:
* none
*
* Returns:
* NULL
*
* Description:
* Reads any Asynchronous events that occur during the sending of data
* and prints out the details of the event. Specifically migration
* related events.
*/
static void *async_event_thread(void *arg)
{
struct ibv_async_event event;
int ret;
struct context *ctx = (struct context *) arg;
while (1) {
ret = ibv_get_async_event(ctx->id->verbs, &event);
if (ret) {
VERB_ERR("ibv_get_async_event", ret);
break;
}
switch (event.event_type) {
case IBV_EVENT_PATH_MIG:
printf("QP path migrated\n");
break;
case IBV_EVENT_PATH_MIG_ERR:
printf("QP path migration error\n");
break;
default:
printf("Async Event %d\n", event.event_type);
break;
}
ibv_ack_async_event(&event);
}
return NULL;
}
/*
* Function: get_alt_dlid_from_private_data
*
* Input:
* event The RDMA event containing private data
*
* Output:
* dlid The DLID that was sent in the private data
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Takes the private data sent from the remote side and returns the
* destination LID that was contained in the private data
*/
int get_alt_dlid_from_private_data(struct rdma_cm_event *event, uint16_t *dlid)
{
if (event->param.conn.private_data_len < 4) {
printf("unexpected private data len: %d",
event->param.conn.private_data_len);
return -1;
}
*dlid = ntohs(*((uint16_t *) event->param.conn.private_data));
return 0;
}
/*
* Function: get_alt_port_details
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* First, query the device to determine if path migration is supported.
* Next, queries all the ports on the device to determine if there is
* different port than the current one to use as an alternate port. If so,
* copy the port number and dlid to the context so they can be used when
* the alternate path is loaded.
*
* Note:
* This function assumes that if another port is found in the active state,
* that the port is connected to the same subnet as the initial port and
* that there is a route to the other hosts alternate port.
*/
int get_alt_port_details(struct context *ctx)
{
int ret, i;
struct ibv_qp_attr qp_attr;
struct ibv_qp_init_attr qp_init_attr;
struct ibv_device_attr dev_attr;
/* This example assumes the alternate port we want to use is on the same
* HCA. Ports from other HCAs can be used as alternate paths as well. Get
* a list of devices using ibv_get_device_list or rdma_get_devices.*/
ret = ibv_query_device(ctx->id->verbs, &dev_attr);
if (ret) {
VERB_ERR("ibv_query_device", ret);
return ret;
}
/* Verify the APM is supported by the HCA */
if (!(dev_attr.device_cap_flags | IBV_DEVICE_AUTO_PATH_MIG)) {
printf("device does not support auto path migration!\n");
return -1;
}
/* Query the QP to determine which port we are bound to */
ret = ibv_query_qp(ctx->id->qp, &qp_attr, 0, &qp_init_attr);
if (ret) {
VERB_ERR("ibv_query_qp", ret);
return ret;
}
for (i = 1; i <= dev_attr.phys_port_cnt; i++) {
/* Query all ports until we find one in the active state that is
* not the port we are currently connected to. */
struct ibv_port_attr port_attr;
ret = ibv_query_port(ctx->id->verbs, i, &port_attr);
if (ret) {
VERB_ERR("ibv_query_device", ret);
return ret;
}
if (port_attr.state == IBV_PORT_ACTIVE) {
ctx->my_alt_dlid = port_attr.lid;
ctx->alt_srcport = i;
if (qp_attr.port_num != i)
break;
}
}
return 0;
}
/*
* Function: load_alt_path
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Uses ibv_modify_qp to load the alternate path information and set the
* path migration state to rearm.
*/
int load_alt_path(struct context *ctx)
{
int ret;
struct ibv_qp_attr qp_attr;
struct ibv_qp_init_attr qp_init_attr;
/* query to get the current attributes of the qp */
ret = ibv_query_qp(ctx->id->qp, &qp_attr, 0, &qp_init_attr);
if (ret) {
VERB_ERR("ibv_query_qp", ret);
return ret;
}
/* initialize the alternate path attributes with the current path
* attributes */
memcpy(&qp_attr.alt_ah_attr, &qp_attr.ah_attr, sizeof (struct ibv_ah_attr));
/* set the alt path attributes to some basic values */
qp_attr.alt_pkey_index = qp_attr.pkey_index;
qp_attr.alt_timeout = qp_attr.timeout;
qp_attr.path_mig_state = IBV_MIG_REARM;
/* if an alternate path was supplied, set the source port and the dlid */
if (ctx->alt_srcport)
qp_attr.alt_port_num = ctx->alt_srcport;
else
qp_attr.alt_port_num = qp_attr.port_num;
if (ctx->alt_dlid)
qp_attr.alt_ah_attr.dlid = ctx->alt_dlid;
printf("loading alt path - local port: %d, dlid: %d\n",
qp_attr.alt_port_num, qp_attr.alt_ah_attr.dlid);
ret = ibv_modify_qp(ctx->id->qp, &qp_attr,
IBV_QP_ALT_PATH | IBV_QP_PATH_MIG_STATE);
if (ret) {
VERB_ERR("ibv_modify_qp", ret);
return ret;
}
}
/*
* Function: reg_mem
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Registers memory regions to use for our data transfer
*/
int reg_mem(struct context *ctx)
{
ctx->send_buf = (char *) malloc(ctx->msg_length);
memset(ctx->send_buf, 0x12, ctx->msg_length);
ctx->recv_buf = (char *) malloc(ctx->msg_length);
memset(ctx->recv_buf, 0x00, ctx->msg_length);
ctx->send_mr = rdma_reg_msgs(ctx->id, ctx->send_buf, ctx->msg_length);
if (!ctx->send_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
ctx->recv_mr = rdma_reg_msgs(ctx->id, ctx->recv_buf, ctx->msg_length);
if (!ctx->recv_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
return 0;
}
/*
* Function: getaddrinfo_and_create_ep
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Gets the address information and creates our endpoint
*/
int getaddrinfo_and_create_ep(struct context *ctx)
{
int ret;
struct rdma_addrinfo *rai, hints;
struct ibv_qp_init_attr qp_init_attr;
memset(&hints, 0, sizeof (hints));
hints.ai_port_space = RDMA_PS_TCP;
if (ctx->server == 1)
hints.ai_flags = RAI_PASSIVE; /* this makes it a server */
printf("rdma_getaddrinfo\n");
ret = rdma_getaddrinfo(ctx->server_name, ctx->server_port, &hints, &rai);
if (ret) {
VERB_ERR("rdma_getaddrinfo", ret);
return ret;
}
memset(&qp_init_attr, 0, sizeof (qp_init_attr));
qp_init_attr.cap.max_send_wr = 1;
qp_init_attr.cap.max_recv_wr = 1;
qp_init_attr.cap.max_send_sge = 1;
qp_init_attr.cap.max_recv_sge = 1;
printf("rdma_create_ep\n");
ret = rdma_create_ep(&ctx->id, rai, NULL, &qp_init_attr);
if (ret) {
VERB_ERR("rdma_create_ep", ret);
return ret;
}
rdma_freeaddrinfo(rai);
return 0;
}
/*
* Function: get_connect_request
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Wait for a connect request from the client
*/
int get_connect_request(struct context *ctx)
{
int ret;
printf("rdma_listen\n");
ret = rdma_listen(ctx->id, 4);
if (ret) {
VERB_ERR("rdma_listen", ret);
return ret;
}
ctx->listen_id = ctx->id;
printf("rdma_get_request\n");
ret = rdma_get_request(ctx->listen_id, &ctx->id);
if (ret) {
VERB_ERR("rdma_get_request", ret);
return ret;
}
if (ctx->id->event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
printf("unexpected event: %s",
rdma_event_str(ctx->id->event->event));
return ret;
}
/* If the alternate path info was not set on the command line, get
* it from the private data */
if (ctx->alt_dlid == 0 && ctx->alt_srcport == 0) {
ret = get_alt_dlid_from_private_data(ctx->id->event, &ctx->alt_dlid);
if (ret) {
return ret;
}
}
return 0;
}
/*
* Function: establish_connection
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Create the connection. For the client, call rdma_connect. For the
* server, the connect request was already received, so just do
* rdma_accept to complete the connection.
*/
int establish_connection(struct context *ctx)
{
int ret;
uint16_t private_data;
struct rdma_conn_param conn_param;
/* post a receive to catch the first send */
ret = rdma_post_recv(ctx->id, NULL, ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
/* send the dlid for the alternate port in the private data */
private_data = htons(ctx->my_alt_dlid);
memset(&conn_param, 0, sizeof (conn_param));
conn_param.private_data_len = sizeof (int);
conn_param.private_data = &private_data;
conn_param.responder_resources = 2;
conn_param.initiator_depth = 2;
conn_param.retry_count = 5;
conn_param.rnr_retry_count = 5;
if (ctx->server) {
printf("rdma_accept\n");
ret = rdma_accept(ctx->id, &conn_param);
if (ret) {
VERB_ERR("rdma_accept", ret);
return ret;
}
}
else {
printf("rdma_connect\n");
ret = rdma_connect(ctx->id, &conn_param);
if (ret) {
VERB_ERR("rdma_connect", ret);
return ret;
}
if (ctx->id->event->event != RDMA_CM_EVENT_ESTABLISHED) {
printf("unexpected event: %s",
rdma_event_str(ctx->id->event->event));
return -1;
}
/* If the alternate path info was not set on the command line, get
* it from the private data */
if (ctx->alt_dlid == 0 && ctx->alt_srcport == 0) {
ret = get_alt_dlid_from_private_data(ctx->id->event,
&ctx->alt_dlid);
if (ret)
return ret;
}
}
return 0;
}
/*
* Function: send_msg
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Performs an Send and gets the completion
*
*/
int send_msg(struct context *ctx)
{
int ret;
struct ibv_wc wc;
ret = rdma_post_send(ctx->id, NULL, ctx->send_buf, ctx->msg_length,
ctx->send_mr, IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_send_recv", ret);
return ret;
}
ret = rdma_get_send_comp(ctx->id, &wc);
if (ret < 0) {
VERB_ERR("rdma_get_send_comp", ret);
return ret;
}
return 0;
}
/*
* Function: recv_msg
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Waits for a receive completion and posts a new receive buffer
*/
int recv_msg(struct context *ctx)
{
int ret;
struct ibv_wc wc;
ret = rdma_get_recv_comp(ctx->id, &wc);
if (ret < 0) {
VERB_ERR("rdma_get_recv_comp", ret);
return ret;
}
ret = rdma_post_recv(ctx->id, NULL, ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
return 0;
}
/*
* Function: main
*
* Input:
* argc The number of command line arguments
* argv The command line argument array
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
*
*/
int main(int argc, char** argv)
{
int ret, op, i, send_cnt, recv_cnt;
struct context ctx;
struct ibv_qp_attr qp_attr;
memset(&ctx, 0, sizeof (ctx));
memset(&qp_attr, 0, sizeof (qp_attr));
ctx.server = 0;
ctx.server_port = DEFAULT_PORT;
ctx.msg_count = DEFAULT_MSG_COUNT;
ctx.msg_length = DEFAULT_MSG_LENGTH;
ctx.msec_delay = DEFAULT_MSEC_DELAY;
ctx.alt_dlid = 0;
ctx.alt_srcport = 0;
ctx.migrate_after = -1;
while ((op = getopt(argc, argv, "sa:p:c:l:d:r:m:")) != -1) {
switch (op) {
case 's':
ctx.server = 1;
break;
case 'a':
ctx.server_name = optarg;
break;
case 'p':
ctx.server_port = optarg;
break;
case 'c':
ctx.msg_count = atoi(optarg);
break;
case 'l':
ctx.msg_length = atoi(optarg);
break;
case 'd':
ctx.alt_dlid = atoi(optarg);
break;
case 'r':
ctx.alt_srcport = atoi(optarg);
break;
case 'm':
ctx.migrate_after = atoi(optarg);
break;
case 'w':
ctx.msec_delay = atoi(optarg);
break;
default:
printf("usage: %s [-s or -a required]\n", argv[0]);
printf("\t[-s[erver mode]\n");
printf("\t[-a ip_address]\n");
printf("\t[-p port_number]\n");
printf("\t[-c msg_count]\n");
printf("\t[-l msg_length]\n");
printf("\t[-d alt_dlid] (requires -r)\n");
printf("\t[-r alt_srcport] (requires -d)\n");
printf("\t[-m num_iterations_then_migrate] (client only)\n");
printf("\t[-w msec_wait_between_sends]\n");
exit(1);
}
}
printf("mode: %s\n", (ctx.server) ? "server" : "client");
printf("address: %s\n", (!ctx.server_name) ? "NULL" : ctx.server_name);
printf("port: %s\n", ctx.server_port);
printf("count: %d\n", ctx.msg_count);
printf("length: %d\n", ctx.msg_length);
printf("alt_dlid: %d\n", ctx.alt_dlid);
printf("alt_port: %d\n", ctx.alt_srcport);
printf("mig_after: %d\n", ctx.migrate_after);
printf("msec_wait: %d\n", ctx.msec_delay);
printf("\n");
if (!ctx.server && !ctx.server_name) {
printf("server address must be specified for client mode\n");
exit(1);
}
/* both of these must be set or neither should be set */
if (!((ctx.alt_dlid > 0 && ctx.alt_srcport > 0) ||
(ctx.alt_dlid == 0 && ctx.alt_srcport == 0))) {
printf("-d and -r must be used together\n");
exit(1);
}
if (ctx.migrate_after > ctx.msg_count) {
printf("num_iterations_then_migrate must be less than msg_count\n");
exit(1);
}
ret = getaddrinfo_and_create_ep(&ctx);
if (ret)
goto out;
if (ctx.server) {
ret = get_connect_request(&ctx);
if (ret)
goto out;
}
/* only query for alternate port if information was not specified on the
* command line */
if (ctx.alt_dlid == 0 && ctx.alt_srcport == 0) {
ret = get_alt_port_details(&ctx);
if (ret)
goto out;
}
/* create a thread to handle async events */
pthread_create(&ctx.async_event_thread, NULL, async_event_thread, &ctx);
ret = reg_mem(&ctx);
if (ret)
goto out;
ret = establish_connection(&ctx);
/* load the alternate path after the connection was created. This can be
* done at connection time, but the connection must be created and
* established using all ib verbs */
ret = load_alt_path(&ctx);
if (ret)
goto out;
send_cnt = recv_cnt = 0;
for (i = 0; i < ctx.msg_count; i++) {
if (ctx.server) {
if (recv_msg(&ctx))
break;
printf("recv: %d\n", ++recv_cnt);
}
if (ctx.msec_delay > 0)
usleep(ctx.msec_delay * 1000);
if (send_msg(&ctx))
break;
printf("send: %d\n", ++send_cnt);
if (!ctx.server) {
if (recv_msg(&ctx))
break;
printf("recv: %d\n", ++recv_cnt);
}
/* migrate the path manually if desired after the specified number of
* sends */
if (!ctx.server && i == ctx.migrate_after) {
qp_attr.path_mig_state = IBV_MIG_MIGRATED;
ret = ibv_modify_qp(ctx.id->qp, &qp_attr, IBV_QP_PATH_MIG_STATE);
if (ret) {
VERB_ERR("ibv_modify_qp", ret);
goto out;
}
}
}
rdma_disconnect(ctx.id);
out:
if (ctx.send_mr)
rdma_dereg_mr(ctx.send_mr);
if (ctx.recv_mr)
rdma_dereg_mr(ctx.recv_mr);
if (ctx.id)
rdma_destroy_ep(ctx.id);
if (ctx.listen_id)
rdma_destroy_ep(ctx.listen_id);
if (ctx.send_buf)
free(ctx.send_buf);
if (ctx.recv_buf)
free(ctx.recv_buf);
return ret;
}
8.3. Shared Receive Queue (SRQ)
/*
* Compile Command:
* gcc srq.c -o srq -libverbs -lrdmacm
*
* Description:
* Both the client and server use an SRQ. A number of Queue Pairs (QPs) are
* created (ctx.qp_count) and each QP uses the SRQ. The connection between the
* client and server is established using the IP address details passed on the
* command line. After the connection is established, the client starts
* blasting sends to the server and stops when the maximum work requests
* (ctx.max_wr) have been sent. When the server has received all the sends, it
* performs a send to the client to tell it to continue. The process repeats
* until the number of requested number of sends (ctx.msg_count) have been
* performed.
*
* Running the Example:
* The executable can operate as either the client or server application. It
* can be demonstrated on a simple fabric of two nodes with the server
* application running on one node and the client application running on the
* other. Each node must be configured to support IPoIB and the IB interface
* (ex. ib0) must be assigned an IP Address. Finally, the fabric must be
* initialized using OpenSM.
*
* Server (-a is IP of local interface):
* ./srq -s -a 192.168.1.12
*
* Client (-a is IP of remote interface):
* ./srq -a 192.168.1.12
*
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <getopt.h>
#include <rdma/rdma_verbs.h>
#define VERB_ERR(verb, ret) \
fprintf(stderr, "%s returned %d errno %d\n", verb, ret, errno)
/* Default parameters values */
#define DEFAULT_PORT "51216"
#define DEFAULT_MSG_COUNT 100
#define DEFAULT_MSG_LENGTH 100000
#define DEFAULT_QP_COUNT 4
#define DEFAULT_MAX_WR 64
/* Resources used in the example */
struct context
{
/* User parameters */
int server;
char *server_name;
char *server_port;
int msg_count;
int msg_length;
int qp_count;
int max_wr;
/* Resources */
struct rdma_cm_id *srq_id;
struct rdma_cm_id *listen_id;
struct rdma_cm_id **conn_id;
struct ibv_mr *send_mr;
struct ibv_mr *recv_mr;
struct ibv_srq *srq;
struct ibv_cq *srq_cq;
struct ibv_comp_channel *srq_cq_channel;
char *send_buf;
char *recv_buf;
};
/*
* Function: init_resources
*
* Input:
* ctx The context object
* rai The RDMA address info for the connection
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* This function initializes resources that are common to both the client
* and server functionality.
* It creates our SRQ, registers memory regions, posts receive buffers
* and creates a single completion queue that will be used for the receive
* queue on each queue pair.
*/
int init_resources(struct context *ctx, struct rdma_addrinfo *rai)
{
int ret, i;
struct rdma_cm_id *id;
/* Create an ID used for creating/accessing our SRQ */
ret = rdma_create_id(NULL, &ctx->srq_id, NULL, RDMA_PS_TCP);
if (ret) {
VERB_ERR("rdma_create_id", ret);
return ret;
}
/* We need to bind the ID to a particular RDMA device
* This is done by resolving the address or binding to the address */
if (ctx->server == 0) {
ret = rdma_resolve_addr(ctx->srq_id, NULL, rai->ai_dst_addr, 1000);
if (ret) {
VERB_ERR("rdma_resolve_addr", ret);
return ret;
}
}
else {
ret = rdma_bind_addr(ctx->srq_id, rai->ai_src_addr);
if (ret) {
VERB_ERR("rdma_bind_addr", ret);
return ret;
}
}
/* Create the memory regions being used in this example */
ctx->recv_mr = rdma_reg_msgs(ctx->srq_id, ctx->recv_buf, ctx->msg_length);
if (!ctx->recv_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
ctx->send_mr = rdma_reg_msgs(ctx->srq_id, ctx->send_buf, ctx->msg_length);
if (!ctx->send_mr) {
VERB_ERR("rdma_reg_msgs", -1);
return -1;
}
/* Create our shared receive queue */
struct ibv_srq_init_attr srq_attr;
memset(&srq_attr, 0, sizeof (srq_attr));
srq_attr.attr.max_wr = ctx->max_wr;
srq_attr.attr.max_sge = 1;
ret = rdma_create_srq(ctx->srq_id, NULL, &srq_attr);
if (ret) {
VERB_ERR("rdma_create_srq", ret);
return -1;
}
/* Save the SRQ in our context so we can assign it to other QPs later */
ctx->srq = ctx->srq_id->srq;
/* Post our receive buffers on the SRQ */
for (i = 0; i < ctx->max_wr; i++) {
ret = rdma_post_recv(ctx->srq_id, NULL, ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
}
/* Create a completion channel to use with the SRQ CQ */
ctx->srq_cq_channel = ibv_create_comp_channel(ctx->srq_id->verbs);
if (!ctx->srq_cq_channel) {
VERB_ERR("ibv_create_comp_channel", -1);
return -1;
}
/* Create a CQ to use for all connections (QPs) that use the SRQ */
ctx->srq_cq = ibv_create_cq(ctx->srq_id->verbs, ctx->max_wr, NULL,
ctx->srq_cq_channel, 0);
if (!ctx->srq_cq) {
VERB_ERR("ibv_create_cq", -1);
return -1;
}
/* Make sure that we get notified on the first completion */
ret = ibv_req_notify_cq(ctx->srq_cq, 0);
if (ret) {
VERB_ERR("ibv_req_notify_cq", ret);
return ret;
}
return 0;
}
/*
* Function: destroy_resources
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* This function cleans up resources used by the application
*/
void destroy_resources(struct context *ctx)
{
int i;
if (ctx->conn_id) {
for (i = 0; i < ctx->qp_count; i++) {
if (ctx->conn_id[i]) {
if (ctx->conn_id[i]->qp &&
ctx->conn_id[i]->qp->state == IBV_QPS_RTS) {
rdma_disconnect(ctx->conn_id[i]);
}
rdma_destroy_qp(ctx->conn_id[i]);
rdma_destroy_id(ctx->conn_id[i]);
}
}
free(ctx->conn_id);
}
if (ctx->recv_mr)
rdma_dereg_mr(ctx->recv_mr);
if (ctx->send_mr)
rdma_dereg_mr(ctx->send_mr);
if (ctx->recv_buf)
free(ctx->recv_buf);
if (ctx->send_buf)
free(ctx->send_buf);
if (ctx->srq_cq)
ibv_destroy_cq(ctx->srq_cq);
if (ctx->srq_cq_channel)
ibv_destroy_comp_channel(ctx->srq_cq_channel);
if (ctx->srq_id) {
rdma_destroy_srq(ctx->srq_id);
rdma_destroy_id(ctx->srq_id);
}
}
/*
* Function: await_completion
*
* Input:
* ctx The context object
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Waits for a completion on the SRQ CQ
*
*/
int await_completion(struct context *ctx)
{
int ret;
struct ibv_cq *ev_cq;
void *ev_ctx;
/* Wait for a CQ event to arrive on the channel */
ret = ibv_get_cq_event(ctx->srq_cq_channel, &ev_cq, &ev_ctx);
if (ret) {
VERB_ERR("ibv_get_cq_event", ret);
return ret;
}
ibv_ack_cq_events(ev_cq, 1);
/* Reload the event notification */
ret = ibv_req_notify_cq(ctx->srq_cq, 0);
if (ret) {
VERB_ERR("ibv_req_notify_cq", ret);
return ret;
}
return 0;
}
/*
* Function: run_server
*
* Input:
* ctx The context object
* rai The RDMA address info for the connection
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Executes the server side of the example
*/
int run_server(struct context *ctx, struct rdma_addrinfo *rai)
{
int ret, i;
uint64_t send_count = 0;
uint64_t recv_count = 0;
struct ibv_wc wc;
struct ibv_qp_init_attr qp_attr;
ret = init_resources(ctx, rai);
if (ret) {
printf("init_resources returned %d\n", ret);
return ret;
}
/* Use the srq_id as the listen_id since it is already setup */
ctx->listen_id = ctx->srq_id;
ret = rdma_listen(ctx->listen_id, 4);
if (ret) {
VERB_ERR("rdma_listen", ret);
return ret;
}
printf("waiting for connection from client...\n");
for (i = 0; i < ctx->qp_count; i++) {
ret = rdma_get_request(ctx->listen_id, &ctx->conn_id[i]);
if (ret) {
VERB_ERR("rdma_get_request", ret);
return ret;
}
/* Create the queue pair */
memset(&qp_attr, 0, sizeof (qp_attr));
qp_attr.qp_context = ctx;
qp_attr.qp_type = IBV_QPT_RC;
qp_attr.cap.max_send_wr = ctx->max_wr;
qp_attr.cap.max_recv_wr = ctx->max_wr;
qp_attr.cap.max_send_sge = 1;
qp_attr.cap.max_recv_sge = 1;
qp_attr.cap.max_inline_data = 0;
qp_attr.recv_cq = ctx->srq_cq;
qp_attr.srq = ctx->srq;
qp_attr.sq_sig_all = 0;
ret = rdma_create_qp(ctx->conn_id[i], NULL, &qp_attr);
if (ret) {
VERB_ERR("rdma_create_qp", ret);
return ret;
}
/* Set the new connection to use our SRQ */
ctx->conn_id[i]->srq = ctx->srq;
ret = rdma_accept(ctx->conn_id[i], NULL);
if (ret) {
VERB_ERR("rdma_accept", ret);
return ret;
}
}
while (recv_count < ctx->msg_count) {
i = 0;
while (i < ctx->max_wr && recv_count < ctx->msg_count) {
int ne;
ret = await_completion(ctx);
if (ret) {
printf("await_completion %d\n", ret);
return ret;
}
do {
ne = ibv_poll_cq(ctx->srq_cq, 1, &wc);
if (ne < 0) {
VERB_ERR("ibv_poll_cq", ne);
return ne;
}
else if (ne == 0)
break;
if (wc.status != IBV_WC_SUCCESS) {
printf("work completion status %s\n",
ibv_wc_status_str(wc.status));
return -1;
}
recv_count++;
printf("recv count: %d, qp_num: %d\n", recv_count, wc.qp_num);
ret = rdma_post_recv(ctx->srq_id, (void *) wc.wr_id,
ctx->recv_buf, ctx->msg_length,
ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
i++;
}
while (ne);
}
ret = rdma_post_send(ctx->conn_id[0], NULL, ctx->send_buf,
ctx->msg_length, ctx->send_mr, IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_post_send", ret);
return ret;
}
ret = rdma_post_send(ctx->conn_id[0], NULL, ctx->send_buf,
ctx->msg_length, ctx->send_mr, IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_post_send", ret);
return ret;
}
ret = rdma_get_send_comp(ctx->conn_id[0], &wc);
if (ret <= 0) {
VERB_ERR("rdma_get_send_comp", ret);
return -1;
}
send_count++;
printf("send count: %d\n", send_count);
}
return 0;
}
/*
* Function: run_client
*
* Input:
* ctx The context object
* rai The RDMA address info for the connection
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Executes the client side of the example
*/
int run_client(struct context *ctx, struct rdma_addrinfo *rai)
{
int ret, i, ne;
uint64_t send_count = 0;
uint64_t recv_count = 0;
struct ibv_wc wc;
struct ibv_qp_init_attr attr;
ret = init_resources(ctx, rai);
if (ret) {
printf("init_resources returned %d\n", ret);
return ret;
}
for (i = 0; i < ctx->qp_count; i++) {
memset(&attr, 0, sizeof (attr));
attr.qp_context = ctx;
attr.cap.max_send_wr = ctx->max_wr;
attr.cap.max_recv_wr = ctx->max_wr;
attr.cap.max_send_sge = 1;
attr.cap.max_recv_sge = 1;
attr.cap.max_inline_data = 0;
attr.recv_cq = ctx->srq_cq;
attr.srq = ctx->srq;
attr.sq_sig_all = 0;
ret = rdma_create_ep(&ctx->conn_id[i], rai, NULL, &attr);
if (ret) {
VERB_ERR("rdma_create_ep", ret);
return ret;
}
ret = rdma_connect(ctx->conn_id[i], NULL);
if (ret) {
VERB_ERR("rdma_connect", ret);
return ret;
}
}
while (send_count < ctx->msg_count) {
for (i = 0; i < ctx->max_wr && send_count < ctx->msg_count; i++) {
/* perform our send to the server */
ret = rdma_post_send(ctx->conn_id[i % ctx->qp_count], NULL,
ctx->send_buf, ctx->msg_length, ctx->send_mr,
IBV_SEND_SIGNALED);
if (ret) {
VERB_ERR("rdma_post_send", ret);
return ret;
}
ret = rdma_get_send_comp(ctx->conn_id[i % ctx->qp_count], &wc);
if (ret <= 0) {
VERB_ERR("rdma_get_send_comp", ret);
return ret;
}
send_count++;
printf("send count: %d, qp_num: %d\n", send_count, wc.qp_num);
}
/* wait for a recv indicating that all buffers were processed */
ret = await_completion(ctx);
if (ret) {
VERB_ERR("await_completion", ret);
return ret;
}
do {
ne = ibv_poll_cq(ctx->srq_cq, 1, &wc);
if (ne < 0) {
VERB_ERR("ibv_poll_cq", ne);
return ne;
}
else if (ne == 0)
break;
if (wc.status != IBV_WC_SUCCESS) {
printf("work completion status %s\n",
ibv_wc_status_str(wc.status));
return -1;
}
recv_count++;
printf("recv count: %d\n", recv_count);
ret = rdma_post_recv(ctx->srq_id, (void *) wc.wr_id,
ctx->recv_buf, ctx->msg_length, ctx->recv_mr);
if (ret) {
VERB_ERR("rdma_post_recv", ret);
return ret;
}
}
while (ne);
}
return ret;
}
/*
* Function: main
*
* Input:
* argc The number of arguments
* argv Command line arguments
*
* Output:
* none
*
* Returns:
* 0 on success, non-zero on failure
*
* Description:
* Main program to demonstrate SRQ functionality.
* Both the client and server use an SRQ. ctx.qp_count number of QPs are
* created and each one of them uses the SRQ. After the connection, the
* client starts blasting sends to the server upto ctx.max_wr. When the
* server has received all the sends, it performs a send to the client to
* tell it that it can continue. Process repeats until ctx.msg_count
* sends have been performed.
*/
int main(int argc, char** argv)
{
int ret, op;
struct context ctx;
struct rdma_addrinfo *rai, hints;
memset(&ctx, 0, sizeof (ctx));
memset(&hints, 0, sizeof (hints));
ctx.server = 0;
ctx.server_port = DEFAULT_PORT;
ctx.msg_count = DEFAULT_MSG_COUNT;
ctx.msg_length = DEFAULT_MSG_LENGTH;
ctx.qp_count = DEFAULT_QP_COUNT;
ctx.max_wr = DEFAULT_MAX_WR;
/* Read options from command line */
while ((op = getopt(argc, argv, "sa:p:c:l:q:w:")) != -1) {
switch (op) {
case 's':
ctx.server = 1;
break;
case 'a':
ctx.server_name = optarg;
break;
case 'p':
ctx.server_port = optarg;
break;
case 'c':
ctx.msg_count = atoi(optarg);
break;
case 'l':
ctx.msg_length = atoi(optarg);
break;
case 'q':
ctx.qp_count = atoi(optarg);
break;
case 'w':
ctx.max_wr = atoi(optarg);
break;
default:
printf("usage: %s -a server_address\n", argv[0]);
printf("\t[-s server mode]\n");
printf("\t[-p port_number]\n");
printf("\t[-c msg_count]\n");
printf("\t[-l msg_length]\n");
printf("\t[-q qp_count]\n");
printf("\t[-w max_wr]\n");
exit(1);
}
}
if (ctx.server_name == NULL) {
printf("server address required (use -a)!\n");
exit(1);
}
hints.ai_port_space = RDMA_PS_TCP;
if (ctx.server == 1)
hints.ai_flags = RAI_PASSIVE; /* this makes it a server */
ret = rdma_getaddrinfo(ctx.server_name, ctx.server_port, &hints, &rai);
if (ret) {
VERB_ERR("rdma_getaddrinfo", ret);
exit(1);
}
/* allocate memory for our QPs and send/recv buffers */
ctx.conn_id = (struct rdma_cm_id **) calloc(ctx.qp_count,
sizeof (struct rdma_cm_id *));
memset(ctx.conn_id, 0, sizeof (ctx.conn_id));
ctx.send_buf = (char *) malloc(ctx.msg_length);
memset(ctx.send_buf, 0, ctx.msg_length);
ctx.recv_buf = (char *) malloc(ctx.msg_length);
memset(ctx.recv_buf, 0, ctx.msg_length);
if (ctx.server)
ret = run_server(&ctx, rai);
else
ret = run_client(&ctx, rai);
destroy_resources(&ctx);
free(rai);
return ret;
}