Linux kernel protocol stack: TCP congestion control algorithm support

Table of Contents

1 Data structure tcp_congestion_ops

2 Initialize tcp_cong_list

2.1 Resident memory congestion control algorithm

3 Basic operation

3.1 Algorithm registration/deregistration tcp_register_congestion_control

3.2 socket specified congestion control algorithm tcp_set_congestion_control

3.3 Set the default algorithm tcp_set_default_congestion_control


The Linux kernel supports the coexistence of multiple congestion control algorithms, and supports the use of different congestion control algorithms for different TCP flows.

1 Data structure tcp_congestion_ops

Each congestion control algorithm must provide a struct tcp_congestion_ops instance and register it with the system. The system links all registered congestion control algorithms into one list through the struct list_head member (a kernel list_head list is doubly linked, not singly linked).

/*
 * Interface for adding new TCP congestion control handlers
 */
/* Size in bytes of the name[] buffer in struct tcp_congestion_ops. */
#define TCP_CA_NAME_MAX	16
/* NOTE(review): presumably an upper bound on the number of algorithm names handled at once -- confirm against the users of this macro. */
#define TCP_CA_MAX	128
/* TCP_CA_NAME_MAX * TCP_CA_MAX bytes; presumably a buffer large enough for TCP_CA_MAX names -- TODO confirm. */
#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)

/*
 * Bits for tcp_congestion_ops.flags.
 *
 * TCP_CONG_NON_RESTRICTED: when set, tcp_set_congestion_control() lets a
 * socket select the algorithm without the CAP_NET_ADMIN capability.
 * TCP_CONG_RTT_STAMP: NOTE(review): meaning is not shown in this file;
 * the name suggests RTT timestamping -- confirm.
 */
#define TCP_CONG_NON_RESTRICTED 0x1
#define TCP_CONG_RTT_STAMP	0x2

/*
 * Operations table describing one TCP congestion control algorithm.
 * An instance is handed to tcp_register_congestion_control(), which
 * rejects it unless both ssthresh() and cong_avoid() are provided.
 */
struct tcp_congestion_ops {
	// Links this algorithm into the list of registered algorithms (tcp_cong_list)
	struct list_head	list;
	// Only two flags are currently defined:
	// TCP_CONG_NON_RESTRICTED: if this flag is NOT set, selecting the algorithm
	//   for a socket requires the CAP_NET_ADMIN capability
	// TCP_CONG_RTT_STAMP: NOTE(review): left undocumented by the original author -- confirm meaning
	unsigned long flags;

	/* initialize private data (optional) */
	void (*init)(struct sock *sk);
	/* cleanup private data  (optional) */
	void (*release)(struct sock *sk);

	/* return slow start threshold (required) */
	u32 (*ssthresh)(struct sock *sk);
	/* lower bound for congestion window (optional) */
	u32 (*min_cwnd)(const struct sock *sk);
	/* do new cwnd calculation (required) */
	void (*cong_avoid)(struct sock *sk, u32 ack, u32 in_flight);
	/* call before changing ca_state (optional) */
	void (*set_state)(struct sock *sk, u8 new_state);
	/* call when cwnd event occurs (optional) */
	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
	/* new value of cwnd after loss (optional) */
	u32  (*undo_cwnd)(struct sock *sk);
	/* hook for packet ack accounting (optional) */
	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
	/* get info for inet_diag (optional) */
	void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb);

	// Name of the algorithm; this is the string passed to tcp_ca_find() for lookups
	char 		name[TCP_CA_NAME_MAX];
	// Module providing the algorithm; a reference is taken with try_module_get()
	// when a socket selects it and dropped with module_put() on cleanup
	struct module 	*owner;
};

2 Initialize tcp_cong_list

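All registered congestion control algorithms are kept on one global list, together with a spinlock that is taken when the list is modified. Because the list is built on struct list_head, it is a doubly linked list rather than a singly linked one. The list and its lock are defined as follows: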

/* Taken around modifications of tcp_cong_list (registration, changing the default). */
static DEFINE_SPINLOCK(tcp_cong_list_lock);
/* Head of the list of registered congestion control algorithms. */
static LIST_HEAD(tcp_cong_list);

2.1 Resident memory congestion control algorithm

When the system starts, tcp_init() registers Reno (actually New Reno) as the default congestion control algorithm. This default can be changed in other ways, but in any case New Reno is always compiled into the kernel image, whereas other algorithms may be built as modules and loaded dynamically when needed.

void __init tcp_init(void)
{
...
	/* Register the tcp_reno congestion control algorithm while TCP is being initialized. */
	tcp_register_congestion_control(&tcp_reno);
...
}

3 Basic operation

3.1 Algorithm registration/deregistration tcp_register_congestion_control

As shown in tcp_init() above, a congestion control algorithm is registered with the kernel through the interface tcp_register_congestion_control().

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 *
 * @ca: the congestion control algorithm to register. It must provide
 *      the ssthresh() and cong_avoid() callbacks.
 *
 * Returns 0 on success, -EINVAL if a required callback is missing, or
 * -EEXIST if tcp_ca_find() already finds an entry for ca->name.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret = 0;

	/* The ssthresh() and cong_avoid() callbacks are mandatory. */
	if (!ca->ssthresh || !ca->cong_avoid) {
		printk(KERN_ERR "TCP %s does not implement required ops\n",
		       ca->name);
		return -EINVAL;
	}

	spin_lock(&tcp_cong_list_lock);
	/* Registration fails if the algorithm is already registered. */
	if (tcp_ca_find(ca->name)) {
		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
		ret = -EEXIST;
	} else {
		/* Append the new algorithm to the tail of the algorithm list. */
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		printk(KERN_INFO "TCP %s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

Similarly, deregistration is done through tcp_unregister_congestion_control(), which is not listed here.

3.2 socket specified congestion control algorithm tcp_set_congestion_control

The congestion control algorithm of an individual socket is set through tcp_set_congestion_control(). A user-space program selects the algorithm for a socket with the TCP_CONGESTION socket option, which ultimately calls this function.

/*
 * Change congestion control for socket
 *
 * @sk:   socket whose congestion control algorithm is to be replaced
 * @name: name of the requested algorithm
 *
 * Returns 0 on success (including when the requested algorithm is the
 * one the socket already uses), -ENOENT if the algorithm cannot be
 * found, -EPERM if it is restricted and the caller lacks CAP_NET_ADMIN,
 * or -EBUSY if no module reference could be obtained.
 */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *wanted;
	int rc = 0;

	rcu_read_lock();

	/* Look the requested algorithm up among the registered ones. */
	wanted = tcp_ca_find(name);

	/* Nothing to do when the socket already uses this algorithm. */
	if (icsk->icsk_ca_ops == wanted)
		goto unlock;

#ifdef CONFIG_KMOD
	/*
	 * Not registered: try to autoload a module named "tcp_<name>" and
	 * look again. The RCU read-side section is left around the
	 * request_module() call and re-entered afterwards.
	 */
	if (!wanted && capable(CAP_SYS_MODULE)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		wanted = tcp_ca_find(name);
	}
#endif

	/* Still not found: give up. */
	if (!wanted) {
		rc = -ENOENT;
		goto unlock;
	}

	/* A restricted algorithm may only be chosen with CAP_NET_ADMIN. */
	if (!(wanted->flags & TCP_CONG_NON_RESTRICTED) &&
	    !capable(CAP_NET_ADMIN)) {
		rc = -EPERM;
		goto unlock;
	}

	/* Take a reference on the module that provides the algorithm. */
	if (!try_module_get(wanted->owner)) {
		rc = -EBUSY;
		goto unlock;
	}

	/* Tear down the previous algorithm, then install the new one. */
	tcp_cleanup_congestion_control(sk);
	icsk->icsk_ca_ops = wanted;

	/* Run the optional init() hook unless the socket is in TCP_CLOSE. */
	if (sk->sk_state != TCP_CLOSE && wanted->init)
		wanted->init(sk);

unlock:
	rcu_read_unlock();
	return rc;
}

/*
 * Manage refcounts on socket close.
 *
 * Calls the current algorithm's optional release() hook for @sk and then
 * drops the module reference held on the algorithm's owner.
 */
void tcp_cleanup_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	void (*release_hook)(struct sock *sk) = icsk->icsk_ca_ops->release;

	/* release() is optional; only call it when the algorithm provides one. */
	if (release_hook)
		release_hook(sk);

	/* Drop the reference on the module that owns the algorithm. */
	module_put(icsk->icsk_ca_ops->owner);
}

3.3 Set the default algorithm tcp_set_default_congestion_control

You can call tcp_set_default_congestion_control() to set the default congestion control algorithm for the system.

/*
 * Used by sysctl to change default congestion control
 *
 * @name: name of the algorithm that should become the default
 *
 * Returns 0 on success or -ENOENT if no such algorithm is available.
 */
int tcp_set_default_congestion_control(const char *name)
{
	struct tcp_congestion_ops *algo;
	int rc = 0;

	spin_lock(&tcp_cong_list_lock);

	/* Same lookup-then-autoload sequence as tcp_set_congestion_control(). */
	algo = tcp_ca_find(name);
#ifdef CONFIG_KMOD
	if (!algo && capable(CAP_SYS_MODULE)) {
		/* The list lock is released around request_module() and retaken. */
		spin_unlock(&tcp_cong_list_lock);

		request_module("tcp_%s", name);

		spin_lock(&tcp_cong_list_lock);
		algo = tcp_ca_find(name);
	}
#endif

	if (!algo) {
		rc = -ENOENT;
		goto unlock;
	}

	/* default is always allowed: lift the usage restriction */
	algo->flags |= TCP_CONG_NON_RESTRICTED;
	/* The first entry of the list is the default algorithm. */
	list_move(&algo->list, &tcp_cong_list);

unlock:
	spin_unlock(&tcp_cong_list_lock);
	return rc;
}

/*
 * Set default value from kernel configuration at bootup
 *
 * The build-time option CONFIG_DEFAULT_TCP_CONG names the algorithm that
 * is made the system default. Returns whatever
 * tcp_set_default_congestion_control() returns for that name.
 */
static int __init tcp_congestion_default(void)
{
	const char *configured = CONFIG_DEFAULT_TCP_CONG;

	return tcp_set_default_congestion_control(configured);
}
late_initcall(tcp_congestion_default);

Note: there are a few other common operations, but they are relatively simple and are not covered here.

Guess you like

Origin blog.csdn.net/wangquan1992/article/details/109076072