从前面的数据收发过程中也看到了，在收发流程中有很多检查设备状态的操作。这篇笔记来完整地看下在设备注册成功后，到底有哪些状态，它们是如何控制收发流程的。

1. net_device的state字段

dev->state字段描述了设备和设备队列的状态，当前定义有如下值：

/*
 * Bit numbers used with the atomic bitops on net_device->state.
 *
 * These flag bits are private to the generic network queueing
 * layer, they may not be explicitly referenced by any other
 * code.
 */
enum netdev_state_t
{
	/* Set by netif_stop_queue(); tested by netif_queue_stopped(). */
	__LINK_STATE_XOFF=0,
	/* Set in dev_open(), cleared in dev_close(); tested by netif_running(). */
	__LINK_STATE_START,
	/* Cleared by netif_device_detach(), set again by netif_device_attach(). */
	__LINK_STATE_PRESENT,
	/* Set by __netif_schedule() when the device is put on an output_queue. */
	__LINK_STATE_SCHED,
	/* Set by netif_carrier_off(), cleared by netif_carrier_on(). */
	__LINK_STATE_NOCARRIER,
	/* Set by linkwatch_fire_event() while a linkwatch event is queued. */
	__LINK_STATE_LINKWATCH_PENDING,
	/* Not referenced by the transmit/receive code discussed in this note. */
	__LINK_STATE_DORMANT,
	/* Set by qdisc_run() and cleared when __qdisc_run() finishes. */
	__LINK_STATE_QDISC_RUNNING,
};

1.1 __LINK_STATE_PRESENT

表示设备是否存在。完成注册过程的设备的该标志位置位。此外，在电源管理过程中，如果设备被挂起会清除该标志位使得设备暂时不可用，并且在恢复时重新置位使设备恢复，具体看下面的“设备挂起”与“设备恢复”。

可以使用netif_device_present()检测是否设置了该标志位：

/**
 *	netif_device_present - report whether the device is marked present
 *	@dev: network device
 *
 *	Returns the result of testing the __LINK_STATE_PRESENT bit in
 *	@dev->state: non-zero while the bit is set, zero once it has been
 *	cleared (device removed or detached).
 */
static inline int netif_device_present(struct net_device *dev)
{
	const int present = test_bit(__LINK_STATE_PRESENT, &dev->state);

	return present;
}

1.2 __LINK_STATE_START

表示设备是否已经被打开，处于该状态的设备是就绪状态，可以正常收发数据。在dev_open()的时候设置该标志位，在dev_close()清除该标志位。

可以使用netif_running()检测是否设置了该标志位：

/**
 *	netif_running - report whether the device has been brought up
 *	@dev: network device
 *
 *	Returns the result of testing the __LINK_STATE_START bit in
 *	@dev->state: non-zero while the bit is set, zero otherwise.
 */
static inline int netif_running(const struct net_device *dev)
{
	const int started = test_bit(__LINK_STATE_START, &dev->state);

	return started;
}

1.3 __LINK_STATE_XOFF

表示设备的发送队列是否可用：该标志位置位表示发送队列已被关闭，清除表示发送队列可用。该标志位的设置与清除一般由驱动程序负责，因为只有驱动程序才知道硬件当前是否能够发送数据。驱动程序可能会因为设备没有可用缓存、内存不足等因素而临时将该标志位置位（调用netif_stop_queue()），进而阻止流量控制机制继续向设备发送数据；待资源恢复后再将其清除。

可以使用如下四个接口操作和检查该标志位:

/**
 *	netif_start_queue - allow transmit
 *	@dev: network device
 *
 *	Allow upper layers to call the device hard_start_xmit routine.
 *
 *	Only clears the __LINK_STATE_XOFF bit in @dev->state; unlike
 *	netif_wake_queue() it does not schedule the device.
 */
static inline void netif_start_queue(struct net_device *dev)
{
	clear_bit(__LINK_STATE_XOFF, &dev->state);
}

/**
 *	netif_wake_queue - reopen the transmit queue and reschedule the device
 *	@dev: network device
 *
 *	Clears __LINK_STATE_XOFF in @dev->state so that upper layers may call
 *	the device hard_start_xmit routine again. If the bit was actually set,
 *	the device is also passed to __netif_schedule(). With
 *	CONFIG_NETPOLL_TRAP enabled and netpoll_trap() returning true, the bit
 *	is only cleared and no scheduling takes place.
 */
static inline void netif_wake_queue(struct net_device *dev)
{
#ifdef CONFIG_NETPOLL_TRAP
	if (netpoll_trap()) {
		clear_bit(__LINK_STATE_XOFF, &dev->state);
		return;
	}
#endif
	/* The queue was not stopped, so there is nothing to restart. */
	if (!test_and_clear_bit(__LINK_STATE_XOFF, &dev->state))
		return;

	/*
	 * Unlike netif_start_queue(), also schedule the device so that
	 * transmission can resume.
	 */
	__netif_schedule(dev);
}

/**
 *	netif_stop_queue - stop transmitted packets
 *	@dev: network device
 *
 *	Stop upper layers calling the device hard_start_xmit routine.
 *	Used for flow control when transmit resources are unavailable.
 *
 *	Implemented by setting the __LINK_STATE_XOFF bit in @dev->state.
 */
static inline void netif_stop_queue(struct net_device *dev)
{
	set_bit(__LINK_STATE_XOFF, &dev->state);
}

/**
 *	netif_queue_stopped - report whether the transmit queue is stopped
 *	@dev: network device
 *
 *	Returns the result of testing the __LINK_STATE_XOFF bit in
 *	@dev->state: non-zero while the transmit queue is flow-blocked,
 *	zero otherwise.
 */
static inline int netif_queue_stopped(const struct net_device *dev)
{
	const int stopped = test_bit(__LINK_STATE_XOFF, &dev->state);

	return stopped;
}

1.4 __LINK_STATE_SCHED

表示设备是否已经在发送轮询队列中，即CPU的收发队列softnet_data.output_queue中，一个设备同时只能在一个CPU的发送轮询队列中，在流量控制发送过程qdisc_run()过程中会检查和设置该标志位。

/*
 * netif_schedule - schedule @dev for transmission unless its queue is stopped.
 *
 * Does nothing while __LINK_STATE_XOFF is set in dev->state; otherwise
 * hands the device to __netif_schedule().
 */
static inline void netif_schedule(struct net_device *dev)
{
	if (test_bit(__LINK_STATE_XOFF, &dev->state))
		return;

	__netif_schedule(dev);
}

/*
 * __netif_schedule - put @dev on the current CPU's output_queue.
 *
 * If __LINK_STATE_SCHED was already set the device is left untouched.
 * Otherwise the bit is set, the device is pushed onto the head of this
 * CPU's softnet_data.output_queue with local interrupts disabled, and
 * NET_TX_SOFTIRQ is raised.
 */
void __netif_schedule(struct net_device *dev)
{
	struct softnet_data *queue;
	unsigned long irq_flags;

	/* Atomically claim the SCHED bit; bail out if it was already set. */
	if (test_and_set_bit(__LINK_STATE_SCHED, &dev->state))
		return;

	local_irq_save(irq_flags);
	/* Link the device in at the head of this CPU's transmit poll list. */
	queue = &__get_cpu_var(softnet_data);
	dev->next_sched = queue->output_queue;
	queue->output_queue = dev;
	/* Kick the transmit softirq. */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(irq_flags);
}
EXPORT_SYMBOL(__netif_schedule);

这里需要解释下接收过程为什么没有这样一个类似的标志位。其实是有的，只是该标志位不在dev->state中，而是在napi_struct.state中，具体可参考笔记《设备接口层之数据包接收》。

1.5 __LINK_STATE_QDISC_RUNNING

表示流量控制过程是否正在运行，即是否正在qdisc_run()函数执行过程中。相关代码如下：

/*
 * qdisc_run - enter __qdisc_run() for @dev if nobody else is running it.
 *
 * __qdisc_run() is invoked only when both conditions hold:
 *   1. the transmit queue is open (__LINK_STATE_XOFF is not set);
 *   2. __LINK_STATE_QDISC_RUNNING was not already set; it is set here
 *      atomically as part of the check.
 */
static inline void qdisc_run(struct net_device *dev)
{
	if (netif_queue_stopped(dev))
		return;

	/* Someone else already owns the QDISC_RUNNING bit. */
	if (test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		return;

	__qdisc_run(dev);
}

/*
 * __qdisc_run - excerpt only; the body is elided ("...") in this note.
 *
 * The part shown demonstrates that the __LINK_STATE_QDISC_RUNNING bit,
 * set by qdisc_run() before calling in, is cleared on the way out.
 */
void __qdisc_run(struct net_device *dev)
{
...
	/* Clear __LINK_STATE_QDISC_RUNNING when leaving this function. */
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

1.6 __LINK_STATE_NOCARRIER

当驱动感知到设备的载波变化时，要使用netif_carrier_on/off()通知内核，使得内核可以在这些情况下合理的关闭设备的收发能力，具体见下文。

可以使用netif_carrier_ok()检查是否设置了该标志位:

/**
 *	netif_carrier_ok - report whether carrier is present
 *	@dev: network device
 *
 *	Returns 1 when the __LINK_STATE_NOCARRIER bit in @dev->state is
 *	clear, and 0 when it is set.
 */
static inline int netif_carrier_ok(const struct net_device *dev)
{
	if (test_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return 0;

	return 1;
}

1.7 __LINK_STATE_LINKWATCH_PENDING

如果设备已经产生了LINKWATCH事件并且正在调度过程中，设置该标志位。这是为了防止同一个设备同时被多次调度。

1.8 __LINK_STATE_DORMANT

此标志位在当前的收发流程中还没有见到怎么使用，暂不关注。

2. 打开设备dev_open()

设备被注册后还无法收发数据，这时网络设备的状态为DOWN，必须UP后才能收发数据。在用户空间，可以通过ifconfig {dev} up来使能设备；对应到内核，该过程由dev_open()完成。

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Moves a device from the down state to the up state. The optional
 *	validate_addr and open driver hooks are invoked first; on success
 *	the receive mode is programmed, the device is activated, IFF_UP is
 *	set and a %NETDEV_UP message is sent to the netdev notifier chain.
 *
 *	Calling this function on an interface that is already up is a nop
 *	returning 0. Returns -ENODEV when the device is not present, or the
 *	non-zero value returned by a failing driver hook.
 */
int dev_open(struct net_device *dev)
{
	int err = 0;

	/* Already up: report success without doing anything. */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 * A device without __LINK_STATE_PRESENT (e.g. detached for
	 * suspend) cannot be opened.
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	/* Mark the device as started before calling into the driver. */
	set_bit(__LINK_STATE_START, &dev->state);

	/* Both driver hooks are optional; open runs only if validation passed. */
	if (dev->validate_addr)
		err = dev->validate_addr(dev);
	if (err == 0 && dev->open)
		err = dev->open(dev);

	if (err) {
		/* A hook failed: undo the START bit and report the error. */
		clear_bit(__LINK_STATE_START, &dev->state);
		return err;
	}

	dev->flags |= IFF_UP;

	/* Program the device's receive mode / multicast lists. */
	dev_set_rx_mode(dev);

	/*
	 *	Wakeup transmit queue engine
	 */
	dev_activate(dev);

	/* Tell the rest of the stack that the device is now up. */
	call_netdevice_notifiers(NETDEV_UP, dev);

	return err;
}

3. 设备的关闭dev_close()

有打开就有关闭，用户空间通过ifconfig {dev} down可以关闭设备，到了内核，对应的实现为dev_close()。

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 *
 *	Calling it on a device that is not up is a nop. Always returns 0.
 *	May sleep.
 */
int dev_close(struct net_device *dev)
{
	might_sleep();

	/* Already down: report success without doing anything. */
	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare to death, when device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	/* From here on netif_running() reports the device as not started. */
	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of it's
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	/* Shut down the queueing discipline so transmission stops. */
	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 *
	 *	The stop hook is optional, just like the open hook in
	 *	dev_open(), so it must not be called when it is NULL.
	 */
	if (dev->stop)
		dev->stop(dev);

	/* Mark the interface as down. */
	dev->flags &= ~IFF_UP;

	/*
	 * Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}

4. 设备的挂起

当电源管理模块通知设备系统即将进入休眠态时，驱动程序必须要执行休眠准备工作，在该过程中，驱动需要调用netif_device_detach()告诉框架设备即将休眠。

在挂起时，需要干两件事：

清除__LINK_STATE_PRESENT标志位，使得设备暂时不可用；
如果设备已经打开(调用过dev_open()，可以正常收发数据了)，还需要关闭设备的发送队列。

/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Clears __LINK_STATE_PRESENT so the device is treated as no longer
 * available. If the bit was previously set and the device is running,
 * its transmit queue is stopped as well.
 */
void netif_device_detach(struct net_device *dev)
{
	/* PRESENT was already clear: the device is detached already. */
	if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
		return;

	/* Only an opened device (START set) has a queue worth stopping. */
	if (netif_running(dev))
		netif_stop_queue(dev);
}

5. 设备的恢复

同上，设备退出休眠态时，驱动程序必须要执行恢复工作，在该过程中，驱动需要调用netif_device_attach()告诉框架设备即将恢复。

恢复设备时，干两件事：

重新设置__LINK_STATE_PRESENT标志位；
如果设备已经打开，那么开启发送队列并且重新调度设备使其可以发送

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Sets __LINK_STATE_PRESENT again. If the bit was previously clear and
 * the device is running, the transmit queue is woken and the transmit
 * watchdog is started.
 */
void netif_device_attach(struct net_device *dev)
{
	/* PRESENT was already set: the device was never detached. */
	if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
		return;

	/* A device that is not open (START clear) needs no restart. */
	if (!netif_running(dev))
		return;

	/* Reopen the transmit queue and reschedule the device. */
	netif_wake_queue(dev);
	/* Start the transmit watchdog. */
	__netdev_watchdog_up(dev);
}

6. 连接状态侦测

驱动程序可以感知到网络设备是否处于可用状态的变化，比如网线是否掉了等事件，这些事件可以简单的划分为可用和不可用两种状态。当这两个事件发生时，驱动程序应该将这种状态传递给网络设备接口层，这是通过netif_carrier_on()和netif_carrier_off()来实现的。

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 *	Called when the device has detected carrier. Clears
 *	__LINK_STATE_NOCARRIER; if the bit was previously set, a linkwatch
 *	event is fired and, for a running device, the transmit watchdog is
 *	started.
 */
void netif_carrier_on(struct net_device *dev)
{
	/* Carrier was already on: no state change to report. */
	if (!test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	/* Raise a link state change event. */
	linkwatch_fire_event(dev);

	/* Start the watchdog only if the device has been opened. */
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 *	Called when the device has detected loss of carrier. Sets
 *	__LINK_STATE_NOCARRIER; if the bit was previously clear, a linkwatch
 *	event is fired.
 */
void netif_carrier_off(struct net_device *dev)
{
	/* Carrier was already off: no state change to report. */
	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	/* Raise a link state change event. */
	linkwatch_fire_event(dev);
}

6.1 链路变化状态事件

6.1.1 事件的产生

如上，链路状态变化的事件最先由驱动程序感知到，并最后调用linkwatch_fire_event()触发事件的产生。

/*
 * linkwatch_urgent_event - decide whether a link event on @dev is urgent.
 *
 * Returns 1 only when all of the following hold, otherwise 0:
 *   - the device is running,
 *   - carrier is present,
 *   - dev->qdisc differs from dev->qdisc_sleeping.
 */
static int linkwatch_urgent_event(struct net_device *dev)
{
	if (!netif_running(dev))
		return 0;

	if (!netif_carrier_ok(dev))
		return 0;

	return dev->qdisc != dev->qdisc_sleeping;
}

/*
 * linkwatch_fire_event - record a link state change for @dev and schedule
 * its processing.
 *
 * A device has at most one linkwatch event outstanding at a time, tracked
 * by __LINK_STATE_LINKWATCH_PENDING. The first event takes a reference on
 * the device and queues it; a repeated event only reschedules the work
 * when it is urgent.
 */
void linkwatch_fire_event(struct net_device *dev)
{
	int urgent = linkwatch_urgent_event(dev);

	if (test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
		/*
		 * An event for this device is already pending. Scheduling
		 * it again is only worthwhile for an urgent event.
		 */
		if (!urgent)
			return;
	} else {
		/*
		 * First pending event for this device: hold a reference
		 * and add the device to the linkwatch event list.
		 */
		dev_hold(dev);
		linkwatch_add_event(dev);
	}

	/* Reached for an urgent event or for a newly queued one. */
	linkwatch_schedule_work(urgent);
}

6.1.2 事件列表

系统中会有多个网络设备，这些设备都有可能会有LINKWATCH事件产生，这些来自不同设备的LINKWATCH需要组织起来。内核是使用一个简单的链表组织的。

/*
 * Head of the list of devices with a pending linkwatch event; entries are
 * chained through net_device->link_watch_next (see __linkwatch_run_queue()).
 */
static struct net_device *lweventlist;
/* Taken by __linkwatch_run_queue() while it detaches lweventlist. */
static DEFINE_SPINLOCK(lweventlist_lock);

6.1.3 事件的调度

一旦内核的事件队列不为空，那么就应该触发调度，使得这些事件能够被及时的处理，这是通过linkwatch_schedule_work()完成的。这里涉及到延迟任务的执行原理，不是我们关注的重点，我们直接看处理事件的函数实现：

/*
 * __linkwatch_run_queue - process the pending linkwatch events.
 * @urgent_only: when non-zero, handle only devices for which
 *               linkwatch_urgent_event() is true and put the others back
 *               on the event list.
 *
 * Detaches the whole event list under lweventlist_lock and then walks the
 * private copy. For each handled device the PENDING bit is cleared, the
 * device is activated or deactivated according to its carrier state (only
 * if it is IFF_UP), and the reference held for the pending event is dropped.
 */
static void __linkwatch_run_queue(int urgent_only)
{
	struct net_device *next;

	/*
	 * Limit the number of linkwatch events to one
	 * per second so that a runaway driver does not
	 * cause a storm of messages on the netlink
	 * socket.  This limit does not apply to up events
	 * while the device qdisc is down.
	 */
	/* For a non-urgent run, the next run is due no sooner than one second from now. */
	if (!urgent_only)
		linkwatch_nextevent = jiffies + HZ;
	/* Limit wrap-around effect on delay. */
	else if (time_after(linkwatch_nextevent, jiffies + HZ))
		linkwatch_nextevent = jiffies;

	/* Processing is under way, so clear the LW_URGENT flag. */
	clear_bit(LW_URGENT, &linkwatch_flags);

	/* Take over the whole pending list under the lock; walk it unlocked below. */
	spin_lock_irq(&lweventlist_lock);
	next = lweventlist;
	lweventlist = NULL;
	spin_unlock_irq(&lweventlist_lock);

	while (next) {
		struct net_device *dev = next;

		next = dev->link_watch_next;

		/* In urgent-only mode, re-queue non-urgent devices for a later run. */
		if (urgent_only && !linkwatch_urgent_event(dev)) {
			linkwatch_add_event(dev);
			continue;
		}

		/*
		 * Make sure the above read is complete since it can be
		 * rewritten as soon as we clear the bit below.
		 */
		smp_mb__before_clear_bit();

		/* We are about to handle this device,
		 * so new events can be accepted
		 */
		/* Clear the LINKWATCH_PENDING bit. */
		clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

		/* NOTE(review): rfc2863_policy() is defined elsewhere; its effect is not visible here. */
		rfc2863_policy(dev);
		if (dev->flags & IFF_UP) {
			/* Device is up: activate it when carrier is ok, deactivate it otherwise. */
			if (netif_carrier_ok(dev)) {
				WARN_ON(dev->qdisc_sleeping == &noop_qdisc);
				dev_activate(dev);
			} else
				dev_deactivate(dev);
			/* Report the state change; per the original note this reaches user space via rtnetlink. */
			netdev_state_change(dev);
		}

		/* Drop the device reference (taken by dev_hold() in linkwatch_fire_event()). */
		dev_put(dev);
	}
	/* Devices were re-queued or new events arrived: schedule another run. */
	if (lweventlist)
		linkwatch_schedule_work(0);
}

注意：__linkwatch_run_queue()中涉及到了延迟任务执行机制的一些内容，可以忽略，重点把握对dev_activate()和dev_deactivate()的调用。

设备接口层之设备状态管理