Zero‐Copy API

        #include <onload/extensions_zc.h>

1.1.Zero‐Copy Data Buffers

        

        To avoid the copy, data is passed to and from the application in special buffers described by a struct onload_zc_iovec. A message or datagram can consist of multiple iovecs using a struct onload_zc_msg. A single call to send may involve 
multiple messages using an array of struct onload_zc_mmsg.
/* A zc_iovec describes a single buffer.
 *
 * Entries are filled in by onload_zc_alloc_buffers(); the application then
 * writes its payload at iov_base and sets iov_len before sending. */
struct onload_zc_iovec {
  void* iov_base;                /* Address within buffer */
  size_t iov_len;                /* Length of data; the examples check the
                                  * payload size against this value after
                                  * allocation, then overwrite it with the
                                  * payload length */
  onload_zc_handle buf;          /* (opaque) buffer handle; this is what is
                                  * passed to onload_zc_release_buffers() */
  unsigned iov_flags;            /* Not currently used */
};
/* A msg describes array of iovecs that make up datagram.
 *
 * The number of entries in iov is carried in msghdr.msg_iovlen (that is the
 * field the examples set alongside iov). */
struct onload_zc_msg {
  struct onload_zc_iovec* iov;   /* Array of buffers */
  struct msghdr msghdr;          /* Message metadata */
};
/* An mmsg describes a message, the socket, and its result.
 *
 * onload_zc_send() takes an array of these so that several messages, each
 * possibly for a different socket, can be submitted in one call. */
struct onload_zc_mmsg {
  struct onload_zc_msg msg;      /* Message */
  int rc;                        /* Result of send operation: bytes sent for
                                  * this message, or an error (the examples
                                  * treat rc < 0 as an error) */
  int fd;                        /* socket to send on */
};

1.2.Zero‐Copy TCP Send Overview

        Figure 31 illustrates the difference between the normal TCP transmit method and 

the zero‐copy method.
        When using standard POSIX socket calls, the application first creates the payload 
data in an application allocated buffer before calling the  send()  function. Onload 
will copy the data to an Onload packet buffer in memory and post a descriptor to this 
buffer in the network adapter TX descriptor ring.
        Using the zero‐copy TCP transmit API the application calls the 
onload_zc_alloc_buffers()  function to request buffers from Onload. A pointer 
to a packet buffer is returned in response. The application places the data to send 
directly into this buffer and then calls  onload_zc_send()  to indicate to Onload that 
data is available to send.
        Onload will post a descriptor for the packet buffer in the network adapter TX 
descriptor ring and ring the TX doorbell. The network adapter fetches the data for 
transmission.

         

        The socket used to allocate zero‐copy buffers must be in the same stack as 
the socket used to send the buffers. When using TCP loopback, Onload can move a 
socket from one stack to another. Users must ensure that they  ALWAYS USE 
BUFFERS FROM THE CORRECT STACK .
        Zero‐copy TCP transmit is implemented within the Onload Extensions API.

2.3.Zero‐Copy TCP Send

        The zero‐copy send API supports the sending of multiple messages to different 

sockets in a single call. Data buffers must be allocated in advance and for best 
efficiency these should be allocated in blocks and off the critical path. The user 
should avoid simply moving the copy from Onload into the application, but where 
this is unavoidable, it should also be done off the critical path.
/* Send mlen messages, each possibly on a different socket, in one call.
 *
 * msgs  - array of messages; each entry names its own socket in msgs[i].fd
 *         and receives its own result in msgs[i].rc.
 * mlen  - number of entries in msgs.
 * flags - send flags (the examples pass 0).
 *
 * The return value identifies how many of the msgs[] rc fields have been
 * set; the examples treat a return value <= 0 as "no message processed". */
int onload_zc_send(struct onload_zc_mmsg* msgs, int mlen, int flags);

/* Request iovecs_len zero-copy buffers from Onload.
 *
 * fd         - socket identifying the stack to allocate from; it must be in
 *              the same stack as the socket later used to send the buffers.
 * iovecs     - array that receives the buffer descriptions.
 * iovecs_len - number of entries in iovecs.
 * flags      - buffer type (the examples use ONLOAD_ZC_BUFFER_HDR_TCP).
 *
 * The examples treat a return value of 0 as success.  Best done in blocks
 * and off the critical path. */
int onload_zc_alloc_buffers(int fd,
    struct onload_zc_iovec* iovecs,
    int iovecs_len,
    onload_zc_buffer_type_flags flags);
/* Release buffers that are still owned by the application, e.g. because the
 * message they belonged to was not sent.
 *
 * fd       - socket identifying the stack (the examples pass the same fd
 *            that was used to allocate the buffers).
 * bufs     - array of buffer handles (the buf member of onload_zc_iovec).
 * bufs_len - number of handles in bufs.
 *
 * NOTE(review): the meaning of the return value is not shown in this
 * document - confirm against the header before relying on it. */
int onload_zc_release_buffers(int fd,
    onload_zc_handle* bufs,
    int bufs_len);
        The onload_zc_send() function return value identifies how many of the 
onload_zc_mmsg array’s rc fields are set. Each onload_zc_mmsg.rc returns how 
many bytes were sent for that message, or a negative error code. Refer to the table below.

         

        Buffers sent with the ONLOAD_MSG_WARM feature enabled are not 
actually sent; ownership of these buffers remains with the user, who is 
responsible for freeing them.

2.4.Zero‐Copy Send ‐ Single Message, Single Buffer

        

/* Zero-copy send of one message carried in one buffer.
 *
 * This is a fragment: fd, rc, my_data and my_data_len are expected to be
 * declared by the enclosing function. */
struct onload_zc_iovec iovec;
struct onload_zc_mmsg mmsg;

/* Ask Onload for one packet buffer; a return of 0 means success. */
rc = onload_zc_alloc_buffers(fd, &iovec, 1, ONLOAD_ZC_BUFFER_HDR_TCP);
assert(rc == 0);

/* Right after allocation iov_len is the room available in the buffer, so
 * make sure the payload fits before copying it in. */
assert(my_data_len <= iovec.iov_len);
memcpy(iovec.iov_base, my_data, my_data_len);
iovec.iov_len = my_data_len;   /* now the length of data to send */

mmsg.fd = fd;
mmsg.msg.iov = &iovec;
mmsg.msg.msghdr.msg_iovlen = 1;

rc = onload_zc_send(&mmsg, 1, 0);
if( rc <= 0 ) {
  /* Probably application bug.  mmsg.rc was not set, so the message was not
   * sent and the buffer is still ours: hand it back before bailing out. */
  onload_zc_release_buffers(fd, &iovec.buf, 1);
  return rc;
} else {
  /* Only one message, so rc should be 1 */
  assert(rc == 1);
  /* rc == 1 so we can look at the first (only) mmsg.rc */
  if( mmsg.rc < 0 ) {
    /* Error sending message: buffer is still owned by the application */
    onload_zc_release_buffers(fd, &iovec.buf, 1);
  } else {
    /* Message sent, single msg, single iovec so
     * shouldn't worry about partial sends */
    assert(mmsg.rc == my_data_len);
  }
}
        
        The example above demonstrates error code handling. Note that it contains an example 
of bad practice: buffers are allocated and populated on the critical path.

2.5.Zero‐Copy Send ‐ Multiple Message, Multiple Buffers

        

/* Zero-copy send of N_MSGS messages, each made of N_BUFFERS buffers.
 *
 * This is a fragment: fd, rc, i and j are expected to be declared by the
 * enclosing function. */
#define N_BUFFERS 2
#define N_MSGS 2
struct onload_zc_iovec iovec[N_MSGS][N_BUFFERS];
struct onload_zc_mmsg mmsg[N_MSGS];

for( i = 0; i < N_MSGS; ++i ) {
  rc = onload_zc_alloc_buffers(fd, iovec[i], N_BUFFERS, ONLOAD_ZC_BUFFER_HDR_TCP);
  assert(rc == 0);
  /* TODO store data in iovec[i][j].iov_base,
   * set iovec[i][j].iov_len */
  mmsg[i].fd = fd; /* Could be different for each message */
  mmsg[i].msg.iov = iovec[i];
  mmsg[i].msg.msghdr.msg_iovlen = N_BUFFERS;
}

rc = onload_zc_send(mmsg, N_MSGS, 0);
if( rc <= 0 ) {
  /* Probably application bug.  No mmsg[i].rc was set, so nothing was sent
   * and every buffer is still owned by the application: release them all. */
  for( i = 0; i < N_MSGS; ++i )
    for( j = 0; j < N_BUFFERS; ++j )
      onload_zc_release_buffers(fd, &iovec[i][j].buf, 1);
  return rc;
}

for( i = 0; i < N_MSGS; ++i ) {
  if( i >= rc ) {
    /* mmsg[i].rc is not set, this message was not sent */
    for( j = 0; j < N_BUFFERS; ++j )
      onload_zc_release_buffers(fd, &iovec[i][j].buf, 1);
    continue;
  }

  /* mmsg[i].rc is set and we can use it */
  if( mmsg[i].rc < 0 ) {
    /* error sending this message - release buffers */
    for( j = 0; j < N_BUFFERS; ++j )
      onload_zc_release_buffers(fd, &iovec[i][j].buf, 1);
  } else {
    /* Total number of bytes queued for this message */
    size_t msg_len = 0;
    for( j = 0; j < N_BUFFERS; ++j )
      msg_len += iovec[i][j].iov_len;

    if( (size_t) mmsg[i].rc < msg_len ) {
      /* partial success */
      /* TODO use mmsg[i].rc to determine which buffers in
       * iovec[i] array are sent and which are still
       * owned by application */
    } else {
      /* Whole message sent, buffers now owned by Onload */
    }
  }
}

     

        The example above demonstrates error code handling and contains some examples 
of bad practice where buffers are allocated and populated on the critical path.

2.6.Zero‐Copy Send ‐ Full Example

static struct onload_zc_iovec iovec[NUM_ZC_BUFFERS];
static ssize_t do_send_zc(int fd, const void* buf, size_t len, int flags)
{
  int bytes_done, rc, i, bufs_needed;
  struct onload_zc_mmsg mmsg;
  mmsg.fd = fd;
  mmsg.msg.iov = iovec;
  bytes_done = 0;
  mmsg.msg.msghdr.msg_iovlen = 0;
  while( bytes_done < len ) {
    if( iovec[mmsg.msg.msghdr.msg_iovlen].iov_len > (len ‐ bytes_done))
      iovec[mmsg.msg.msghdr.msg_iovlen].iov_len = (len ‐ bytes_done);
    memcpy(iovec[i].iov_base, buf+bytes_done, iov_len);
    bytes_done += iovec[mmsg.msg.msghdr.msg_iovlen].iov_len;
    ++mmsg.msg.msghdr.msg_iovlen;
  }

  rc = onload_zc_send(&mmsg, 1, 0);
  if( rc != 1 /* Number of messages we sent */ ) {
    printf("onload_zc_send failed to process msg, %d\n", rc);
    return ‐1;
  } else {
    if( mmsg.rc < 0 )
      printf("onload_zc_send message error %d\n", mmsg.rc);
    else {
      /* Iterate over the iovecs; any that were sent we must
replenish. */
      i = 0; bufs_needed= 0;
      while( i < mmsg.msg.msghdr.msg_iovlen ) {
        if( bytes_done == mmsg.rc ) {
          printf(onload_zc_send did not send iovec %d\n", i);
          /* In other buffer allocation schemes we would have to
release
           * these buffers, but seems pointless as we guarantee at the
           * end of this function to have iovec array full, so do
nothing. */
        } else {
          /* Buffer sent, now owned by Onload, so replenish iovec
array */
          ++bufs needed;
          bytes_done += iovec[i].iov_len;
        }
        ++i;
      }
      if( bufs_needed ) /* replenish the iovec array */
        rc = onload_zc_alloc_buffers(fd, iovec, bufs_needed, ONLOAD_ZC_BUFFER_HDR_TCP);
    }
  }
  /* Set a return code that looks similar enough to send(). NB. we're
   * not setting (and neither does onload_zc_send()) errno */
  if( mmsg.rc < 0 ) return ‐1;
  else return bytes_done;
}

猜你喜欢

转载自blog.csdn.net/x13262608581/article/details/125400826