实现基于XDP/eBPF的快速路由转发功能

周末用eBPF实现了学习型网桥的XDP快速转发路径之后，再来用eBPF实现一个快速路由转发。同样很有意思。

关于eBPF和XDP的前置基础知识，我在前面实现网桥转发路径前已经概览过了，所以本文不再赘述。

当我们面对各种网络技术的时候，路由和交换技术是最基本的。路由和交换有什么区别，为什么不能像实现交换机那样实现IP路由，很有必要说一说。

简单点来说，路由和交换的实现技术有以下的区别：

交换：精确匹配交换表项的二层转发过程。
路由：模糊通配路由表项的三层转发过程。

二层网络链路的scope是局域的，三层IP网络的scope则是全世界IP网络，如果我们需要用一张表来装二层或者三层的转发表项，那么很显然，二层网络的转发表项的数量在这张表的可承受范围之内，而三层网络由于总表项数巨大则必然需要某种通配技术，这就是IP路由的最长前缀匹配技术。

落实到实现上，二层转发表项精确匹配的特征很容易使其被固化于硬件中用于快速交换，而三层路由表项的通配查找则必然需要牵扯到高性能查找算法，这些很难用硬件固化，所以一般的高性能路由器均采用了数据面和控制面相分离的方案来实现。

以下的说法本质上都差不多：

一次路由，多次交换。
Cisco CEF。
路由器的快速转发由线卡实现；线卡未命中表项的报文则交给CPU以软件方式执行路由查找，并最终将查找结果作为表项注入线卡。
…

在XDP技术出现之前，我就用Netfilter的nf_conntrack实现过类似的想法，尝试过各种所谓的 “协议栈短路” 方案或者说trick，但都显得远远不够酷。现在有了XDP，又有了eBPF，是时候玩一把了。

我们先来看生成eBPF字节码并注入XDP的C代码：

// xdp_rtcache_kern.c
#include <uapi/linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>

#include "bpf_helpers.h"

// Minimal forwarding-cache entry: the egress interface plus the two MAC
// addresses written into the Ethernet header before the frame is redirected.
struct rt_item {
	int ifindex; // index of the interface the packet is sent out of
	char eth_source[ETH_ALEN]; // source MAC of the re-encapsulated frame
	char eth_dest[ETH_ALEN]; // destination MAC of the re-encapsulated frame
};

// Route-forwarding cache. The key is sizeof(int) bytes (the XDP program looks
// entries up by IPv4 destination address); the value is a struct rt_item.
struct bpf_map_def SEC("maps") rtcache_map = {
	.type = BPF_MAP_TYPE_LRU_HASH, // LRU policy, so stale entries age out automatically
	.key_size = sizeof(int),
	.value_size = sizeof(struct rt_item),
	.max_entries = 100,
};

// 递减TTL还是要的
static __always_inline int ip_decrease_ttl(struct iphdr *iph)
{
	u32 check = (__force u32)iph->check;

	check += (__force u32)htons(0x0100);
	iph->check = (__force __sum16)(check + (check >= 0xFFFF));
	return --iph->ttl;
}

// The XDP program itself: an IPv4 forwarder with an exact-match route cache.
//
// For each IPv4 frame it first looks the destination address up in
// rtcache_map; on a hit the frame is rewritten and redirected immediately.
// On a miss it asks the kernel FIB via bpf_fib_lookup(), caches the result
// and then forwards the same way.
//
// Returns:
//   XDP_DROP - the frame is too short to hold an Ethernet or IPv4 header
//   XDP_PASS - not IPv4, TTL is 0 or 1, or the FIB lookup did not succeed;
//              the packet continues into the normal network stack
//   otherwise the value returned by bpf_redirect() for the egress interface
SEC("xdp_rtcache")
int xdp_rtcache_prog(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct bpf_fib_lookup ifib;
	struct ethhdr *eth = data;
	struct iphdr *iph;
	struct rt_item *pitem = NULL;
	unsigned int daddr = 0;
	u16 h_proto;
	u64 nh_off;
	// Author's note: still unclear how to make "%s" work in an eBPF program.
	char fast_info[] = "Fast path to [%d]\n";
	char slow_info[] = "Slow path to [%d]\n";

	nh_off = sizeof(*eth);
	if (data + nh_off > data_end) {
		return XDP_DROP;
	}

	h_proto = eth->h_proto;
	if (h_proto != htons(ETH_P_IP)) {
		return XDP_PASS;
	}

	iph = data + nh_off;

	// Bounds check required before any IPv4 header field is read.
	if ((void *)(iph + 1) > data_end) {
		return XDP_DROP;
	}

	// Never forward a packet whose TTL is 0 or 1: decrementing it would put
	// a zero (or wrapped-around) TTL on the wire. Passing it up lets the
	// kernel stack handle the expiry, e.g. by sending ICMP Time Exceeded.
	if (iph->ttl <= 1) {
		return XDP_PASS;
	}

	daddr = iph->daddr;

	pitem = bpf_map_lookup_elem(&rtcache_map, &daddr);
	// Exact-match lookup in the forwarding cache first. On a hit we forward
	// right away and skip the slower longest-prefix-match lookup.
	// (The author notes this step could be offloaded to hardware.)
	if (pitem) {
		ip_decrease_ttl(iph);
		memcpy(eth->h_dest, pitem->eth_dest, ETH_ALEN);
		memcpy(eth->h_source, pitem->eth_source, ETH_ALEN);
		bpf_trace_printk(fast_info, sizeof(fast_info), pitem->ifindex);
		return bpf_redirect(pitem->ifindex, 0);
	}

	// Cache miss: fall back to a longest-prefix-match lookup.
	// The lookup parameters are only needed here, so zero them here rather
	// than on the fast path.
	__builtin_memset(&ifib, 0, sizeof(ifib));
	ifib.family = AF_INET;
	ifib.tos = iph->tos;
	ifib.l4_protocol = iph->protocol;
	ifib.tot_len = ntohs(iph->tot_len);
	ifib.ipv4_src = iph->saddr;
	ifib.ipv4_dst = iph->daddr;
	ifib.ifindex = ctx->ingress_ifindex;

	// Route lookup through the eBPF helper. Even this "slow" path does not
	// enter the regular protocol stack.
	if (bpf_fib_lookup(ctx, &ifib, sizeof(ifib), 0) == 0) {
		struct rt_item nitem;

		memset(&nitem, 0, sizeof(nitem));
		memcpy(&nitem.eth_dest, ifib.dmac, ETH_ALEN);
		memcpy(&nitem.eth_source, ifib.smac, ETH_ALEN);
		nitem.ifindex = ifib.ifindex;
		// Insert the new entry so later packets take the fast path.
		bpf_map_update_elem(&rtcache_map, &daddr, &nitem, BPF_ANY);
		ip_decrease_ttl(iph);
		memcpy(eth->h_dest, ifib.dmac, ETH_ALEN);
		memcpy(eth->h_source, ifib.smac, ETH_ALEN);
		bpf_trace_printk(slow_info, sizeof(slow_info), ifib.ifindex);
		return bpf_redirect(ifib.ifindex, 0);
	}

	return XDP_PASS;
}

// License string placed in the "license" section of the object file.
// NOTE(review): presumably required to be GPL-compatible for the helpers used
// above (e.g. bpf_trace_printk) — confirm against the helper documentation.
char _license[] SEC("license") = "GPL";

这段代码看起来是不是已经很闭环了？这是不是意味着我们只需要将其编译并加载到对应的网卡（调用API或使用iproute2的ip link命令均可完成）就可以了呢？并不是这样。

事实上，我们需要考虑更细节的情况。

如果路由改变了呢？如果ARP表项改变了呢？…

所以说，我们必须侦听这种事件。简单起见，我们只要监测到这种事件发生了，就清空转发表。以下是对应的用户控制C代码：

// xdp_rtcache_user.c

#include <errno.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/bpf.h>
#include <linux/rtnetlink.h>
#include <bpf/bpf.h>
#include "bpf_util.h"

// sock / sock_arp: netlink sockets used for route and neighbour notifications.
// flags: passed to bpf_set_link_xdp_fd() on every attach and detach.
int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
// Interface indexes of the two interfaces named on the command line.
static int *ifindex_list;
// Receive buffer for netlink messages.
char buf[1024];
// File descriptor of the "rtcache_map" map in the loaded object.
static int rtcache_map_fd;

// Signal handler: calls bpf_set_link_xdp_fd() with fd -1 on both entries of
// ifindex_list (the detach request), then terminates the process.
static void int_exit(int sig)
{
	int *ifidx = ifindex_list;
	int *const end = ifindex_list + 2;

	while (ifidx != end) {
		bpf_set_link_xdp_fd(*ifidx, -1, flags);
		ifidx++;
	}
	exit(0);
}

// Loads "<argv[0]>_kern.o", attaches its XDP program to the two interfaces
// named on the command line, then waits for rtnetlink route/neighbour
// notifications and empties rtcache_map whenever one arrives.
//
// Usage: <prog> <ifname1> <ifname2>
//
// Returns 1 on any setup or polling failure (detaching whatever this process
// attached); otherwise it runs until a signal handler exits the process.
int main(int ac, char **argv)
{
	struct bpf_prog_load_attr prog_load_attr = {
		.prog_type	= BPF_PROG_TYPE_XDP,
	};
	struct bpf_object *obj;
	char filename[256];
	int prog_fd;
	int attached = 0;
	int i;
	struct pollfd fds[2];
	struct sockaddr_nl la, lr;
	struct nlmsghdr *nh;

	if (ac < 3) {
		fprintf(stderr, "usage: %s <ifname1> <ifname2>\n",
			ac > 0 ? argv[0] : "xdp_rtcache");
		return 1;
	}

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	prog_load_attr.file = filename;

	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
		return 1;

	rtcache_map_fd = bpf_object__find_map_fd_by_name(obj, "rtcache_map");
	if (rtcache_map_fd < 0) {
		fprintf(stderr, "map rtcache_map not found in %s\n", filename);
		return 1;
	}

	ifindex_list = calloc(2, sizeof(*ifindex_list));
	if (!ifindex_list) {
		perror("calloc");
		return 1;
	}
	for (i = 0; i < 2; i++) {
		ifindex_list[i] = if_nametoindex(argv[i + 1]);
		if (!ifindex_list[i]) {
			fprintf(stderr, "unknown interface: %s\n", argv[i + 1]);
			return 1;
		}
	}

	for (i = 0; i < 2; i++) {
		if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd, flags) < 0) {
			fprintf(stderr, "failed to attach XDP program to %s\n",
				argv[i + 1]);
			goto detach;
		}
		attached++;
	}
	signal(SIGINT, int_exit);
	signal(SIGTERM, int_exit);

	// Socket for route change notifications.
	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock < 0) {
		perror("socket");
		goto detach;
	}
	memset(&lr, 0, sizeof(lr));
	lr.nl_family = AF_NETLINK;
	lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
	if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
		perror("bind");
		goto detach;
	}

	// Socket for neighbour (ARP) change notifications.
	sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock_arp < 0) {
		perror("socket");
		goto detach;
	}
	memset(&la, 0, sizeof(la));
	la.nl_family = AF_NETLINK;
	la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY;
	if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) {
		perror("bind");
		goto detach;
	}

	fds[0].fd = sock;
	fds[0].events = POLLIN;
	fds[1].fd = sock_arp;
	fds[1].events = POLLIN;

	for (;;) {
		int need_flush = 0;

		// Block until either socket has something to read; one poll()
		// over both descriptors avoids spinning on short timeouts.
		if (poll(fds, 2, -1) < 0) {
			if (errno == EINTR)
				continue;
			perror("poll");
			goto detach;
		}

		for (i = 0; i < 2; i++) {
			int len;

			if (!(fds[i].revents & (POLLIN | POLLERR)))
				continue;

			len = (int)recv(fds[i].fd, buf, sizeof(buf), 0);
			if (len < 0) {
				// A receive error may mean notifications were
				// lost, so flush to be safe.
				need_flush = 1;
				continue;
			}

			// A datagram can carry several netlink messages;
			// inspect every one of them.
			for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
			     nh = NLMSG_NEXT(nh, len)) {
				switch (nh->nlmsg_type) {
				case RTM_NEWNEIGH:
				case RTM_DELNEIGH:
				case RTM_NEWROUTE:
				case RTM_DELROUTE:
					need_flush = 1;
					break;
				default:
					break;
				}
			}
		}

		if (need_flush) {
			__u32 key;

			// Simplest possible invalidation: drop every cached
			// entry. A NULL start key always yields the first
			// remaining key, so the loop ends when the map is empty.
			while (bpf_map_get_next_key(rtcache_map_fd, NULL, &key) == 0) {
				if (bpf_map_delete_elem(rtcache_map_fd, &key) != 0 &&
				    errno != ENOENT)
					break;
			}
		}
	}

detach:
	// Only undo the attachments this process actually made.
	while (attached-- > 0)
		bpf_set_link_xdp_fd(ifindex_list[attached], -1, flags);
	return 1;
}

接下来让我们编译它们。

我依然建议使用内核源码树的samples/bpf编译目录，将上述两个文件放在该目录下，修改Makefile文件，加入以下的内容：

always += xdp_rtcache_kern.o
hostprogs-y += xdp_rtcache
xdp_rtcache-objs := xdp_rtcache_user.o

最后make之。

好了，是时候演示一下了。

首先，我们在不启用该eBPF的情况下，确认路由的畅通，在路由器出入网卡抓包显示数据包有来有回。保持ping的同时，加载eBPF字节码：

root@zhaoya-VirtualBox:# ./xdp_rtcache enp0s9 enp0s10

另起一个终端，观察调试输出：

root@zhaoya-VirtualBox:# cat /sys/kernel/debug/tracing/trace_pipe
          <idle>-0     [000] ..s. 26971.653936: 0: Slow path to [4]
     ksoftirqd/3-30    [003] ..s. 26971.654749: 0: Slow path to [5]
          <idle>-0     [000] .Ns. 26972.655742: 0: Fast path to [4]
          <idle>-0     [003] .Ns. 26972.657759: 0: Fast path to [5]
          ...

OK，可以看到前面两个Slow报文是从双向ICMP报文学习而来的，此后就都走Fast路径了。此时在路由器上用tcpdump抓包，nothing will be shown：报文在XDP层就已经被直接重定向到出口网卡，不再经过内核协议栈，因此tcpdump看不到它们。

浙江温州皮鞋湿，下雨进水不会胖。

dog250 博客专家

发布了1545 篇原创文章 · 获赞 4728 · 访问量 1055万+

他的留言板关注

实现基于XDP/eBPF的快速路由转发功能

猜你喜欢