k8s CNI 之 aws vpc-cni 插件源码解析

前言

在k8s网络模型(CNI)中,aws vpc cni采用的是大二层underlay的网络方案,pod分配到的是vpc ip,与节点主机同属于一层网络,相对于flannel等overlay方案更加简单直接,这样做带来的优势是:

    1. 实现集群内外ip直接互通

       在k8s的落地过程中,很多企业往往都有强烈的需求实现k8s集群内外网络的直连互通,从而实现平缓迁移的目标

    2. 减少了overlay方案中数据包的封包和拆包的网络损耗,能够提升性能

原理

主要实现逻辑:

Worker节点启动的时候挂载多个虚拟网卡ENI(Elastic Network Interface)

  • 每个ENI都绑定了一个主IP(Primary ip)和多个 Secondary ip

  • ipamd(Local IP Address Manager)运行在每个worker 节点上,将所有ENI的所有secondary ip 加入到本地ip地址池中

  • 当cni接收到创建pod事件请求时,就会通过grpc请求ipamd拿到ip并设置pod网络栈;反之,当接收到删除pod请求时就会通知ipamd释放ip并同时删除pod网络栈

    image.png

CNI

遵守k8s CNI网络模型的接口规范,主要实现了cmdAdd和cmdDel接口,分别处理pod网络的创建和销毁事件

  • cmdAdd

代码路径: cmd/routed-eni-cni-plugin/cni.go

// cmdAdd is the CNI entry point for pod network creation. It constructs the
// default type, gRPC, RPC and driver wrappers and passes them, together with
// args, to add.
func cmdAdd(args *skel.CmdArgs) error {
	cniTypes := typeswrapper.New()
	grpcClient := grpcwrapper.New()
	rpcClient := rpcwrapper.New()
	driverClient := driver.New()

	return add(args, cniTypes, grpcClient, rpcClient, driverClient)
}

// add implements the pod network creation flow. As shown in this excerpt it
// loads the network configuration from args.StdinData, parses the Kubernetes
// arguments, asks ipamd over gRPC (AddNetwork) for an IPv4 address, passes
// that address to driverClient.SetupNS, and finally prints a CNI result that
// contains it. Lines consisting of "..." mark code omitted from this listing.
func add(args *skel.CmdArgs, cniTypes typeswrapper.CNITYPES, grpcClient grpcwrapper.GRPC,
	rpcClient rpcwrapper.RPC, driverClient driver.NetworkAPIs) error {

	conf, log, err := LoadNetConf(args.StdinData)
    ...
	// Parse the Kubernetes-specific CNI arguments.
    var k8sArgs K8sArgs
	if err := cniTypes.LoadArgs(args.Args, &k8sArgs); err != nil {
		log.Errorf("Failed to load k8s config from arg: %v", err)
		return errors.Wrap(err, "add cmd: failed to load k8s config from arg")
	}
    ...
	// Open a gRPC connection to the ipamd server.
	conn, err := grpcClient.Dial(ipamdAddress, grpc.WithInsecure())
	...
	c := rpcClient.NewCNIBackendClient(conn)
    
	// Call ipamd's AddNetwork RPC to obtain an IP address for the pod.
	r, err := c.AddNetwork(context.Background(),
		&pb.AddNetworkRequest{
			ClientVersion:              version,
			K8S_POD_NAME:               string(k8sArgs.K8S_POD_NAME),
			K8S_POD_NAMESPACE:          string(k8sArgs.K8S_POD_NAMESPACE),
			K8S_POD_INFRA_CONTAINER_ID: string(k8sArgs.K8S_POD_INFRA_CONTAINER_ID),
			Netns:                      args.Netns,
			ContainerID:                args.ContainerID,
			NetworkName:                conf.Name,
			IfName:                     args.IfName,
		})
    ...
	// Wrap the returned IPv4 address with an all-ones (255.255.255.255) mask.
	addr := &net.IPNet{
		IP:   net.ParseIP(r.IPv4Addr),
		Mask: net.IPv4Mask(255, 255, 255, 255),
	}
    ...
		// With the IP in hand, call the driver module to configure the
		// pod's network namespace.
		err = driverClient.SetupNS(hostVethName, args.IfName, args.Netns, addr, int(r.DeviceNumber), r.VPCcidrs, r.UseExternalSNAT, mtu, log)
	}
    ...
	// Report the assigned address back in the CNI result.
	ips := []*current.IPConfig{
		{
			Version: "4",
			Address: *addr,
		},
	}

	result := &current.Result{
		IPs: ips,
	}

	return cniTypes.PrintResult(result, conf.CNIVersion)
}
复制代码

总结:cni通过grpc请求ipamd服务获取ip,拿到ip后调用driver模块设置pod的网络环境

  • cmdDel

      释放pod ip并清理pod的网络环境(注意:下面贴出的代码与上文cmdAdd/add的代码完全相同,应为误贴;按本文的描述,删除流程应是通知ipamd释放ip,再调用driver模块的TeardownNS清理pod网络栈)

// cmdAdd is the CNI entry point for pod network creation. It constructs the
// default type, gRPC, RPC and driver wrappers and passes them, together with
// args, to add.
func cmdAdd(args *skel.CmdArgs) error {
	cniTypes := typeswrapper.New()
	grpcClient := grpcwrapper.New()
	rpcClient := rpcwrapper.New()
	driverClient := driver.New()

	return add(args, cniTypes, grpcClient, rpcClient, driverClient)
}

// add implements the pod network creation flow. As shown in this excerpt it
// loads the network configuration from args.StdinData, parses the Kubernetes
// arguments, asks ipamd over gRPC (AddNetwork) for an IPv4 address, passes
// that address to driverClient.SetupNS, and finally prints a CNI result that
// contains it. Lines consisting of "..." mark code omitted from this listing.
//
// NOTE(review): this listing sits under the cmdDel heading but is the add
// path (it calls AddNetwork and SetupNS); the delete path is not shown here.
func add(args *skel.CmdArgs, cniTypes typeswrapper.CNITYPES, grpcClient grpcwrapper.GRPC,
	rpcClient rpcwrapper.RPC, driverClient driver.NetworkAPIs) error {

	conf, log, err := LoadNetConf(args.StdinData)
    ...
    // Parse the Kubernetes-specific CNI arguments.
    var k8sArgs K8sArgs
	if err := cniTypes.LoadArgs(args.Args, &k8sArgs); err != nil {
		log.Errorf("Failed to load k8s config from arg: %v", err)
		return errors.Wrap(err, "add cmd: failed to load k8s config from arg")
	}
    ...
	// Open a gRPC connection to the ipamd server.
	conn, err := grpcClient.Dial(ipamdAddress, grpc.WithInsecure())
	...
	c := rpcClient.NewCNIBackendClient(conn)
    
	// Call ipamd's AddNetwork RPC to obtain an IP address for the pod.
	r, err := c.AddNetwork(context.Background(),
		&pb.AddNetworkRequest{
			ClientVersion:              version,
			K8S_POD_NAME:               string(k8sArgs.K8S_POD_NAME),
			K8S_POD_NAMESPACE:          string(k8sArgs.K8S_POD_NAMESPACE),
			K8S_POD_INFRA_CONTAINER_ID: string(k8sArgs.K8S_POD_INFRA_CONTAINER_ID),
			Netns:                      args.Netns,
			ContainerID:                args.ContainerID,
			NetworkName:                conf.Name,
			IfName:                     args.IfName,
		})
    ...
	// Wrap the returned IPv4 address with an all-ones (255.255.255.255) mask.
	addr := &net.IPNet{
		IP:   net.ParseIP(r.IPv4Addr),
		Mask: net.IPv4Mask(255, 255, 255, 255),
	}
    ...
		// With the IP in hand, call the driver module to configure the
		// pod's network namespace.
		err = driverClient.SetupNS(hostVethName, args.IfName, args.Netns, addr, int(r.DeviceNumber), r.VPCcidrs, r.UseExternalSNAT, mtu, log)
	}
    ...
	// Report the assigned address back in the CNI result.
	ips := []*current.IPConfig{
		{
			Version: "4",
			Address: *addr,
		},
	}

	result := &current.Result{
		IPs: ips,
	}

	return cniTypes.PrintResult(result, conf.CNIVersion)
}
复制代码

driver


该模块主要提供创建和销毁pod网络栈的工具,driver模块的主函数是SetupNS和TeardownNS

代码路径: cmd/routed-eni-cni-plugin/driver.go

代码逻辑:

image.png

  • SetupNS

  该函数主要功能是配置pod网络栈,包括准备pod的网络环境和策略路由的配置

  在 aws-cni 网络模型中,节点上的每一个ENI都会生成相应的路由表来转发from-pod的流量;通过策略路由方式,让to-pod的流量优先走主路由表,而对于from-pod的流量则走ENI对应的路由表,所以在配置pod网络环境中有配置策略路由的过程

// SetupNS emits a debug log of the veth names, namespace path, device number
// and MTU, then forwards every argument to setupNS together with the netlink,
// namespace and procsys wrappers stored on the receiver.
func (os *linuxNetwork) SetupNS(hostVethName string, contVethName string, netnsPath string, addr *net.IPNet, deviceNumber int, vpcCIDRs []string, useExternalSNAT bool, mtu int, log logger.Logger) error {
	log.Debugf("SetupNS: hostVethName=%s, contVethName=%s, netnsPath=%s, deviceNumber=%d, mtu=%d", hostVethName, contVethName, netnsPath, deviceNumber, mtu)

	netLink, netNS, procSys := os.netLink, os.ns, os.procSys
	err := setupNS(hostVethName, contVethName, netnsPath, addr, deviceNumber, vpcCIDRs, useExternalSNAT,
		netLink, netNS, mtu, log, procSys)
	return err
}


// setupNS configures networking for a pod. As shown in this excerpt it calls
// setupVeth to create the veth pair, installs a /32 route to the pod on the
// host-side veth, adds the to-pod policy rule against mainRouteTable, and,
// when deviceNumber > 0, adds a from-pod policy rule against table
// deviceNumber+1. Lines consisting of "..." mark code omitted from this
// listing.
func setupNS(hostVethName string, contVethName string, netnsPath string, addr *net.IPNet, deviceNumber int, vpcCIDRs []string, useExternalSNAT bool,
	netLink netlinkwrapper.NetLink, ns nswrapper.NS, mtu int, log logger.Logger, procSys procsyswrapper.ProcSys) error {

	// Call setupVeth to set up the pod's network environment.
	hostVeth, err := setupVeth(hostVethName, contVethName, netnsPath, addr, netLink, ns, mtu, procSys, log)
    ...
	// Same IP as addr, with an explicit /32 mask.
	addrHostAddr := &net.IPNet{
		IP:   addr.IP,
		Mask: net.CIDRMask(32, 32)}

	// Route to the pod in the node's main route table, equivalent to:
	//   ip route add $ip dev veth-1
	route := netlink.Route{
		LinkIndex: hostVeth.Attrs().Index,
		Scope:     netlink.SCOPE_LINK,
		Dst:       addrHostAddr}
   
	// The netlink wrapper covers what the "ip link", "ip route" and
	// "ip rule" commands do.
	if err := netLink.RouteReplace(&route); err != nil {
		return errors.Wrapf(err, "setupNS: unable to add or replace route entry for %s", route.Dst.IP.String())
	}
    
	// Add the to-pod policy rule (as "ip rule" would), e.g.:
	//   512: from all to 10.0.97.30 lookup main
	err = addContainerRule(netLink, true, addr, mainRouteTable)
       ...
    
	// deviceNumber identifies the ENI; 0 means the primary ENI.
	// For a non-primary ENI, also add the policy rule for traffic leaving
	// the pod, e.g.:
	//   1536: from 10.0.97.30 lookup eni-1
	if deviceNumber > 0 {
		tableNumber := deviceNumber + 1
		err = addContainerRule(netLink, false, addr, tableNumber)
        ...
	}
	return nil
}
复制代码

最终实现的效果:

# ip rule list
0:	from all lookup local 
512:	from all to 10.0.97.30 lookup main <---------- to Pod's traffic
1025:	not from all to 10.0.0.0/16 lookup main 
1536:	from 10.0.97.30 lookup eni-1 <-------------- from Pod's traffic
复制代码
  • createVethPairContext

 createVethPairContext 结构体包含了创建vethpair所需参数,run 方法其实是setupVeth函数的具体实现,包含了创建vethpair、启用vethpair、配置pod网关、路由等步骤

// newCreateVethPairContext builds the context used to create a pod's veth
// pair: it records the two interface names, the pod address and the MTU, and
// attaches freshly constructed netlink and ip wrappers.
func newCreateVethPairContext(contVethName string, hostVethName string, addr *net.IPNet, mtu int) *createVethPairContext {
	vethCtx := new(createVethPairContext)

	vethCtx.contVethName = contVethName
	vethCtx.hostVethName = hostVethName
	vethCtx.addr = addr
	vethCtx.netLink = netlinkwrapper.NewNetLink()
	vethCtx.ip = ipwrapper.NewIP()
	vethCtx.mtu = mtu

	return vethCtx
}

// run creates and configures the pod's veth pair. In order, as shown in this
// excerpt: create the pair, bring both ends up, add a link-scoped route to
// gwNet and a default route via gwNet.IP on the container end, assign the pod
// address, add a permanent neighbour entry for the gateway, and move the host
// end into hostNS. Lines consisting of "..." mark code omitted from this
// listing.
// NOTE(review): presumably this executes inside the pod's network namespace,
// since the host end is moved out to hostNS at the end — confirm with caller.
func (createVethContext *createVethPairContext) run(hostNS ns.NetNS) error {
	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{
			Name:  createVethContext.contVethName,
			Flags: net.FlagUp,
			MTU:   createVethContext.mtu,
		},
		PeerName: createVethContext.hostVethName,
	}
    
	// Equivalent of "ip link add": create the veth pair for the pod.
	if err := createVethContext.netLink.LinkAdd(veth); err != nil {
		return err
	}

	hostVeth, err := createVethContext.netLink.LinkByName(createVethContext.hostVethName)
	...
	// Equivalent of "ip link set $link up": enable the host end of the pair.
	if err = createVethContext.netLink.LinkSetUp(hostVeth); err != nil {
		return errors.Wrapf(err, "setup NS network: failed to set link %q up", createVethContext.hostVethName)
	}

	contVeth, err := createVethContext.netLink.LinkByName(createVethContext.contVethName)
	if err != nil {
		return errors.Wrapf(err, "setup NS network: failed to find link %q", createVethContext.contVethName)
	}

	// Enable the pod end of the veth pair.
	if err = createVethContext.netLink.LinkSetUp(contVeth); err != nil {
		return errors.Wrapf(err, "setup NS network: failed to set link %q up", createVethContext.contVethName)
	}

	// Add a link-scoped route to the gateway network gwNet on the pod end
	// (the original note gives the gateway as 169.254.1.1).
	if err = createVethContext.netLink.RouteReplace(&netlink.Route{
		LinkIndex: contVeth.Attrs().Index,
		Scope:     netlink.SCOPE_LINK,
		Dst:       gwNet}); err != nil {
		return errors.Wrap(err, "setup NS network: failed to add default gateway")
	}

	// Add the default route; effect per the original note:
	//   default via 169.254.1.1 dev eth0
	if err = createVethContext.ip.AddDefaultRoute(gwNet.IP, contVeth); err != nil {
		return errors.Wrap(err, "setup NS network: failed to add default route")
	}
    
	// Assign the pod IP to the pod-side interface, as in
	// "ip addr add $ip dev eth0".
	if err = createVethContext.netLink.AddrAdd(contVeth, &netlink.Addr{IPNet: createVethContext.addr}); err != nil {
		return errors.Wrapf(err, "setup NS network: failed to add IP addr to %q", createVethContext.contVethName)
	}

	// Static (permanent) ARP entry mapping the gateway IP to the host-side
	// veth's hardware address.
	neigh := &netlink.Neigh{
		LinkIndex:    contVeth.Attrs().Index,
		State:        netlink.NUD_PERMANENT,
		IP:           gwNet.IP,
		HardwareAddr: hostVeth.Attrs().HardwareAddr,
	}

	if err = createVethContext.netLink.NeighAdd(neigh); err != nil {
		return errors.Wrap(err, "setup NS network: failed to add static ARP")
	}
    
	// Move the host end of the veth pair into the host network namespace.
	if err = createVethContext.netLink.LinkSetNsFd(hostVeth, int(hostNS.Fd())); err != nil {
		return errors.Wrap(err, "setup NS network: failed to move veth to host netns")
	}
	return nil
}
复制代码
  • TeardownNS

     清理pod网络环境

// TeardownNS writes a debug log with the pod address and device number, then
// hands the work to tearDownNS using the receiver's netlink wrapper.
func (os *linuxNetwork) TeardownNS(addr *net.IPNet, deviceNumber int, log logger.Logger) error {
	log.Debugf("TeardownNS: addr %s, deviceNumber %d", addr.String(), deviceNumber)

	netLink := os.netLink
	err := tearDownNS(addr, deviceNumber, netLink, log)
	return err
}

// tearDownNS removes the policy-routing state for a pod address. As shown in
// this excerpt it deletes the to-pod rule and, when deviceNumber > 0, the
// rules whose source is addr. Lines consisting of "..." mark code omitted from
// this listing (including whatever consumes addrHostAddr).
func tearDownNS(addr *net.IPNet, deviceNumber int, netLink netlinkwrapper.NetLink, log logger.Logger) error {
   ...
	// Delete the to-pod policy rule (equivalent of "ip rule del").
	toContainerRule := netLink.NewRule()
	toContainerRule.Dst = addr
	toContainerRule.Priority = toContainerRulePriority
	err := netLink.RuleDel(toContainerRule)
     ...
	// deviceNumber > 0 means a non-primary ENI; in that case also delete the
	// from-pod policy rule(s).
	if deviceNumber > 0 {
		err := deleteRuleListBySrc(*addr)
      ...
	}
	// Same IP as addr, with an explicit /32 mask.
	addrHostAddr := &net.IPNet{
		IP:   addr.IP,
		Mask: net.CIDRMask(32, 32)}
         ...
	return nil
}
复制代码

IPAMD

本地ip地址池管理进程,以daemonset的方式运行在每个worker节点上,维护着节点上所有可用ip地址;那么,问题来了,ip地址池中的数据是从哪里来的呢?

其实,aws ec2中有一个 ec2metadata 的概念,保存着关于该实例的元数据信息,包括绑定到ec2的所有ENI,以及ENI上的所有ip,并提供接口获取:

curl  http://169.254.169.254/latest/meta-data/network/interfaces/macs/

curl  http://169.254.169.254/latest/meta-data/network/interfaces/macs/0a:da:9d:51:47:28/local-ipv4s

ipamd在初始化的过程中将ENI/IP信息保存在dataStore中,以上过程是在nodeInit中实现的

nodeInit

// nodeInit populates ipamd's state at start-up. As shown in this excerpt it
// fetches ENI metadata through c.awsClient.DescribeAllENIs, drops unmanaged
// ENIs, and calls setupENI for each remaining ENI inside a retry loop. Lines
// consisting of "..." mark code omitted from this listing, including the
// enclosing loop over enis and the retry/back-off handling.
func (c *IPAMContext) nodeInit() error {
        ...
        // Fetch data for all ENIs (per the original note, from the EC2
        // instance metadata API).
	metadataResult, err := c.awsClient.DescribeAllENIs()
	...
	enis := c.filterUnmanagedENIs(metadataResult.ENIMetadata)
         ....
		// Register the ENI; keep retrying until setupENI succeeds.
		retry := 0
		for {
			retry++
			if err = c.setupENI(eni.ENIID, eni, isTrunkENI, isEFAENI); err == nil {
				log.Infof("ENI %s set up.", eni.ENIID)
				break
			}
                 ...
	return nil
}
复制代码
  • setupENI

   setupENI的主要任务是完成dataStore数据初始化,包括:

  • 将ENI 添加到 datastore中
  • 对于非primary ENI,调用SetupENINetwork配置该ENI的主机侧网络
  • 将ENI 的所有secondary IP(以及IPv4前缀)添加到datastore中
// setupENI registers one ENI with ipamd. As shown in this excerpt it adds the
// ENI to the datastore, records the ENI's primary IPv4 address in
// c.primaryIP, calls SetupENINetwork for any ENI other than the primary one,
// and finally adds the ENI's IPv4 addresses and prefixes to the datastore.
// Lines consisting of "..." mark code omitted from this listing.
func (c *IPAMContext) setupENI(eni string, eniMetadata awsutils.ENIMetadata, isTrunkENI, isEFAENI bool) error {
	primaryENI := c.awsClient.GetPrimaryENI()
    
	// The third argument tells the datastore whether this is the primary ENI.
	err := c.dataStore.AddENI(eni, eniMetadata.DeviceNumber, eni == primaryENI, isTrunkENI, isEFAENI)
	...
	c.primaryIP[eni] = eniMetadata.PrimaryIPv4Address()

	// Only non-primary ENIs get SetupENINetwork.
	if eni != primaryENI {
		err = c.networkClient.SetupENINetwork(c.primaryIP[eni], eniMetadata.MAC, eniMetadata.DeviceNumber, eniMetadata.SubnetIPv4CIDR)
        ...
	}
    ...
	c.addENIsecondaryIPsToDataStore(eniMetadata.IPv4Addresses, eni)
	c.addENIprefixesToDataStore(eniMetadata.IPv4Prefixes, eni)

	return nil
}
复制代码

dataStore

dataStore 是一个通过结构体构造的本地DB,维护着本地节点ENI信息,以及ENI上绑定的所有ip,每条ip信息都以ipamkey作为主键;当ip被分配,则会以(network name, CNI_CONTAINERID, CNI_IFNAME)作为主键值;反之,ip没有被分配,ipamkey会被设置为空值

代码路径 /pkg/ipamd/datastore/data_store.go

// DataStore is the node-local store of ENIs and the IP addresses bound to
// them.
type DataStore struct {
	// NOTE(review): total, assigned and allocatedPrefix look like counters of
	// known IPs, assigned IPs and allocated prefixes — confirm against the
	// methods that update them.
	total                    int 
	assigned                 int  
	allocatedPrefix          int
	// eniPool holds the ENIs known to this node (see ENIPool).
	eniPool                  ENIPool 
	// lock is taken by AssignPodIPv4Address before it touches the pool.
	lock                     sync.Mutex
	log                      logger.Logger
	CheckpointMigrationPhase int 
	backingStore             Checkpointer
	cri                      cri.APIs
	// isPDEnabled selects which CIDRs AssignPodIPv4Address draws from: prefix
	// CIDRs when true, non-prefix CIDRs when false.
	isPDEnabled              bool
}

// ENI is the datastore's record for a single network interface.
type ENI struct {
	ID         string
	createTime time.Time
	// IsPrimary, IsTrunk and IsEFA are the flags passed to AddENI by setupENI.
	IsPrimary bool
	IsTrunk bool
	IsEFA bool
	// DeviceNumber is returned alongside the address when an IP from this ENI
	// is assigned or released.
	DeviceNumber int
	// AvailableIPv4Cidrs holds the CIDRs on this ENI from which pod IPs are
	// drawn. NOTE(review): the map key is presumably the CIDR string — confirm.
	AvailableIPv4Cidrs map[string]*CidrInfo
}

// AddressInfo describes one IP address tracked by the datastore.
type AddressInfo struct {
	// IPAMKey identifies the owner of the address; per the accompanying text
	// it is set when the IP is assigned and empty when it is not.
	IPAMKey        IPAMKey
	Address        string
	// UnassignedTime is set to the current time when the address is released
	// by UnassignPodIPv4Address.
	UnassignedTime time.Time
}

// CidrInfo describes one CIDR attached to an ENI and the addresses tracked
// inside it.
type CidrInfo struct {
	Cidr net.IPNet    // e.g. 192.168.1.1/24 (example from the original note)
	// IPv4Addresses is indexed by the IP address string.
	IPv4Addresses map[string]*AddressInfo
	// IsPrefix is compared with DataStore.isPDEnabled when picking a CIDR to
	// allocate from.
	IsPrefix bool
}

type ENIPool map[string]*ENI   // ENI ID -> *ENI
复制代码

datastore包含两个主要的方法 AssignPodIPv4Address和UnassignPodIPv4Address,cni(经由ipamd)本质上是通过这两个方法来分别获取ip和释放ip

  • AssignPodIPv4Address
// AssignPodIPv4Address assigns a free IPv4 address to the pod identified by
// ipamKey and returns the address together with the device number of the ENI
// it came from. Lines consisting of "..." mark code omitted from this listing.
func (ds *DataStore) AssignPodIPv4Address(ipamKey IPAMKey) (ipv4address string, deviceNumber int, err error) {
	// Hold the datastore mutex for the whole operation.
	ds.lock.Lock()
	defer ds.lock.Unlock()
      ...
      // Walk every ENI in the pool, and every CIDR on it, looking for a free IP.
      for _, eni := range ds.eniPool {
		for _, availableCidr := range eni.AvailableIPv4Cidrs {
			var addr *AddressInfo
			var strPrivateIPv4 string
			var err error

			// Only consider CIDRs whose IsPrefix flag matches ds.isPDEnabled.
			if (ds.isPDEnabled && availableCidr.IsPrefix) || (!ds.isPDEnabled && !availableCidr.IsPrefix) {
				strPrivateIPv4, err = ds.getFreeIPv4AddrfromCidr(availableCidr)
				if err != nil {
					ds.log.Debugf("Unable to get IP address from CIDR: %v", err)
					// Nothing free here; try the next CIDR.
					continue
				}
				...

			addr = availableCidr.IPv4Addresses[strPrivateIPv4]
		        ...
			availableCidr.IPv4Addresses[strPrivateIPv4] = addr
			// Mark the address as assigned by recording its ipamKey.
			ds.assignPodIPv4AddressUnsafe(ipamKey, eni, addr)
                         ...
			return addr.Address, eni.DeviceNumber, nil
		}
	}
    ...
}
复制代码
  • UnassignPodIPv4Address
// UnassignPodIPv4Address releases the IP address held by the pod identified
// by ipamKey and returns the owning ENI, the released address and that ENI's
// device number. Lines consisting of "..." mark code omitted from this
// listing.
func (ds *DataStore) UnassignPodIPv4Address(ipamKey IPAMKey) (e *ENI, ip string, deviceNumber int, err error) {

    ...
	// Look up the pod's address in the ENI pool by its ipamKey.
	eni, availableCidr, addr := ds.eniPool.FindAddressForSandbox(ipamKey)
    ...
	// Mark the IP as unassigned; per the accompanying text this clears the
	// ipamKey stored on the address.
	ds.unassignPodIPv4AddressUnsafe(addr)
	...
	// Record the release time as now.
	addr.UnassignedTime = time.Now()
    ...
	return eni, addr.Address, eni.DeviceNumber, nil
}
复制代码

猜你喜欢

转载自juejin.im/post/7016692775496187911
cni