Chapter 19 The System Architecture and Drivers of Linux Power Management: PM QoS (Power Management Quality of Service)

19.7 PM QoS (Power Management Quality of Service)

The Linux kernel's PM QoS framework offers a set of interfaces to the kernel and to applications through which callers can state their performance expectations. Requests fall into two classes: system-wide requests, expressed through parameters such as cpu_dma_latency, network_latency, and network_throughput; and per-device requests, which an individual device issues according to its own performance needs.
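
The per-device flavor is handled by the dev_pm_qos_* helpers in drivers/base/power/qos.c, which this section does not list. As a minimal sketch, assuming the dev_pm_qos_add_request()/dev_pm_qos_remove_request() API of this kernel era (mydev_probe(), mydev_remove(), and the value 100 are hypothetical names and numbers for illustration), a driver could bound its device's acceptable resume latency like so:

#include <linux/device.h>
#include <linux/pm_qos.h>

static struct dev_pm_qos_request mydev_qos_req;

static int mydev_probe(struct device *dev)
{
        /* Ask the PM core to keep this device's resume latency within
         * 100us; the core aggregates this with any other requests made
         * against the same device. */
        return dev_pm_qos_add_request(dev, &mydev_qos_req,
                                      DEV_PM_QOS_RESUME_LATENCY, 100);
}

static void mydev_remove(struct device *dev)
{
        /* Withdraw the constraint when the driver is unbound */
        dev_pm_qos_remove_request(&mydev_qos_req);
}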

In kernel space, a PM QoS request is registered with the pm_qos_add_request() function:

linux/pm_qos.h

void pm_qos_add_request(struct pm_qos_request *req, int pm_qos_class, s32 value);

kernel/power/qos.c

/**
 * pm_qos_add_request - inserts new qos request into the list
 * @req: pointer to a preallocated handle
 * @pm_qos_class: identifies which list of qos request to use
 * @value: defines the qos request
 *
 * This function inserts a new entry in the pm_qos_class list of requested qos
 * performance characteristics.  It recomputes the aggregate QoS expectations
 * for the pm_qos_class of parameters and initializes the pm_qos_request
 * handle.  Caller needs to save this handle for later use in updates and
 * removal.
 */
void pm_qos_add_request(struct pm_qos_request *req,
                        int pm_qos_class, s32 value)
{
        if (!req) /*guard against callers passing in null */
                return;

        if (pm_qos_request_active(req)) {
                WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
                return;
        }

        switch (req->type) {
        case PM_QOS_REQ_AFFINE_CORES:
                if (cpumask_empty(&req->cpus_affine)) {
                        req->type = PM_QOS_REQ_ALL_CORES;
                        cpumask_setall(&req->cpus_affine);
                        WARN(1, KERN_ERR "Affine cores not set for request with affinity flag\n");
                }
                break;
#ifdef CONFIG_SMP
        case PM_QOS_REQ_AFFINE_IRQ:
                if (irq_can_set_affinity(req->irq)) {
                        struct irq_desc *desc = irq_to_desc(req->irq);
                        struct cpumask *mask = desc->irq_data.affinity;

                        /* Get the current affinity */
                        cpumask_copy(&req->cpus_affine, mask);
                        req->irq_notify.irq = req->irq;
                        req->irq_notify.notify = pm_qos_irq_notify;
                        req->irq_notify.release = pm_qos_irq_release;

                } else {
                        req->type = PM_QOS_REQ_ALL_CORES;
                        cpumask_setall(&req->cpus_affine);
                        WARN(1, KERN_ERR "IRQ-%d not set for request with affinity flag\n",
                                        req->irq);
                }
                break;
#endif
        default:
                WARN(1, KERN_ERR "Unknown request type %d\n", req->type);
                /* fall through */
        case PM_QOS_REQ_ALL_CORES:
                cpumask_setall(&req->cpus_affine);
                break;
        }

        req->pm_qos_class = pm_qos_class;
        INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
        trace_pm_qos_add_request(pm_qos_class, value);
        pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
                             req, PM_QOS_ADD_REQ, value);

#ifdef CONFIG_SMP
        if (req->type == PM_QOS_REQ_AFFINE_IRQ &&
                        irq_can_set_affinity(req->irq)) {
                int ret = 0;

                ret = irq_set_affinity_notifier(req->irq,
                                        &req->irq_notify);
                if (ret) {
                        WARN(1, "IRQ affinity notify set failed\n");
                        req->type = PM_QOS_REQ_ALL_CORES;
                        cpumask_setall(&req->cpus_affine);
                        pm_qos_update_target(
                                pm_qos_array[pm_qos_class]->constraints,
                                req, PM_QOS_UPDATE_REQ, value);
                }
        }
#endif
}

EXPORT_SYMBOL_GPL(pm_qos_add_request);
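
Note that the listing above comes from a vendor kernel: the type, irq, and cpus_affine fields and the PM_QOS_REQ_* request types are not in the mainline pm_qos of this era, which applies every request to all cores. A minimal sketch against that vendor API (the function name and the value 20 are hypothetical) ties the latency request to whichever cores service a device's interrupt:

#include <linux/pm_qos.h>

static struct pm_qos_request mydev_irq_req;

static void mydev_request_latency(int irq)
{
        /* Constrain only the CPUs that handle this IRQ;
         * pm_qos_add_request() copies the IRQ's current affinity and
         * registers an affinity notifier, as shown in the listing above. */
        mydev_irq_req.type = PM_QOS_REQ_AFFINE_IRQ;
        mydev_irq_req.irq = irq;
        pm_qos_add_request(&mydev_irq_req, PM_QOS_CPU_DMA_LATENCY, 20);
}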

An existing PM QoS request is updated with pm_qos_update_request() or, for a bounded period of time, with pm_qos_update_request_timeout():

linux/pm_qos.h

void pm_qos_update_request(struct pm_qos_request *req, s32 new_value);

void pm_qos_update_request_timeout(struct pm_qos_request *req,
                                   s32 new_value, unsigned long timeout_us);

kernel/power/qos.c

/**
 * pm_qos_update_request - modifies an existing qos request
 * @req : handle to list element holding a pm_qos request to use
 * @value: defines the qos request
 *
 * Updates an existing qos request for the pm_qos_class of parameters along
 * with updating the target pm_qos_class value.
 *
 * Attempts are made to make this code callable on hot code paths.
 */
void pm_qos_update_request(struct pm_qos_request *req,
                           s32 new_value)
{
        if (!req) /*guard against callers passing in null */
                return;

        if (!pm_qos_request_active(req)) {
                WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
                return;
        }

        cancel_delayed_work_sync(&req->work);
        __pm_qos_update_request(req, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);


/**
 * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
 * @req : handle to list element holding a pm_qos request to use
 * @new_value: defines the temporal qos request
 * @timeout_us: the effective duration of this qos request in usecs.
 *
 * After timeout_us, this qos request is cancelled automatically.
 */
void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
                                   unsigned long timeout_us)
{
        if (!req)
                return;
        if (WARN(!pm_qos_request_active(req),
                 "%s called for unknown object.", __func__))
                return;

        cancel_delayed_work_sync(&req->work);
        trace_pm_qos_update_request_timeout(req->pm_qos_class,
                                            new_value, timeout_us);
        if (new_value != req->node.prio)
                pm_qos_update_target(
                        pm_qos_array[req->pm_qos_class]->constraints,
                        req, PM_QOS_UPDATE_REQ, new_value);

        schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
}
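
pm_qos_update_request_timeout() suits bursty workloads: the constraint tightens for a bounded window and then lapses on its own, the queued delayed work standing in for a matching update call. A minimal sketch (mydev_req is assumed to have been registered earlier with pm_qos_add_request(); the 34us and 10ms figures are arbitrary):

#include <linux/pm_qos.h>
#include <linux/time.h>

static struct pm_qos_request mydev_req;

/* Called on the first packet of a traffic burst, for example */
static void mydev_on_burst(void)
{
        /* Cap CPU wakeup latency at 34us for the next 10ms; when the
         * timeout fires, the delayed work resets the request to
         * PM_QOS_DEFAULT_VALUE automatically. */
        pm_qos_update_request_timeout(&mydev_req, 34, 10 * USEC_PER_MSEC);
}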

A registered PM QoS request is removed with pm_qos_remove_request():

linux/pm_qos.h

void pm_qos_remove_request(struct pm_qos_request *req);

kernel/power/qos.c

/**
 * pm_qos_remove_request - modifies an existing qos request
 * @req: handle to request list element
 *
 * Will remove pm qos request from the list of constraints and
 * recompute the current target value for the pm_qos_class.  Call this
 * on slow code paths.
 */
void pm_qos_remove_request(struct pm_qos_request *req)
{
        if (!req) /*guard against callers passing in null */
                return;
                /* silent return to keep pcm code cleaner */

        if (!pm_qos_request_active(req)) {
                WARN(1, "pm_qos_remove_request() called for unknown object\n");
                return;
        }

        cancel_delayed_work_sync(&req->work);

#ifdef CONFIG_SMP
        if (req->type == PM_QOS_REQ_AFFINE_IRQ) {
                int ret = 0;
                /* Get the current affinity */
                ret = irq_set_affinity_notifier(req->irq, NULL);
                if (ret)
                        WARN(1, "IRQ affinity notify set failed\n");
        }
#endif

        trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
        pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
                             req, PM_QOS_REMOVE_REQ,
                             PM_QOS_DEFAULT_VALUE);
        memset(req, 0, sizeof(*req));
}
EXPORT_SYMBOL_GPL(pm_qos_remove_request);
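
On the consumer side, the current aggregate target for a class can be read back with pm_qos_request(), which the governors in Listings 19.11 and 19.12 below use, and a subsystem can ask to be called back whenever the target changes. A minimal sketch, assuming the pm_qos_add_notifier() API from the same kernel/power/qos.c (the mydev_* names are hypothetical):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/pm_qos.h>

static int mydev_qos_notify(struct notifier_block *nb,
                            unsigned long new_target, void *unused)
{
        /* new_target is the recomputed aggregate for the class */
        pr_info("cpu_dma_latency target is now %lu us\n", new_target);
        return NOTIFY_OK;
}

static struct notifier_block mydev_qos_nb = {
        .notifier_call = mydev_qos_notify,
};

static int __init mydev_qos_init(void)
{
        return pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, &mydev_qos_nb);
}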

For example, in the camera driver drivers/media/platform/via-camera.c, once streaming starts the following code keeps the CPU out of C3-level deep idle:

static int viacam_streamon(struct file *filp, void *priv, enum v4l2_buf_type t)
{
        struct via_camera *cam = priv;
        int ret = 0;

        if (t != V4L2_BUF_TYPE_VIDEO_CAPTURE)
                return -EINVAL;

        mutex_lock(&cam->lock); /* a mutex may block while held, suiting a large critical section */
        if (cam->opstate != S_IDLE) {
                ret = -EBUSY;
                goto out;
        }
        /*
         * Enforce the V4l2 "only one owner gets to read data" rule.
         */
        if (cam->owner && cam->owner != filp) {
                ret = -EBUSY;
                goto out;
        }
        cam->owner = filp;
        /*
         * Configure things if need be.
         */
        if (test_bit(CF_CONFIG_NEEDED, &cam->flags)) {
                ret = viacam_configure_sensor(cam);
                if (ret)
                        goto out;
                ret = viacam_config_controller(cam);
                if (ret)
                        goto out;
        }
        /*
         * If the CPU goes into C3, the DMA transfer gets corrupted and
         * users start filing unsightly bug reports.  Put in a "latency"
         * requirement which will keep the CPU out of the deeper sleep
         * states.
         */
        pm_qos_add_request(&cam->qos_request, PM_QOS_CPU_DMA_LATENCY, 50); /* keep the CPU out of C3-level deep idle */
        /*
         * Fire things up.
         */
        INIT_LIST_HEAD(&cam->buffer_queue);
        ret = videobuf_streamon(&cam->vb_queue);
        if (!ret)
                viacam_start_engine(cam);
out:
        mutex_unlock(&cam->lock);
        return ret;
}

This works because the CPUIdle subsystem picks a C state according to the outstanding PM_QOS_CPU_DMA_LATENCY requests. ladder_select_state() in drivers/cpuidle/governors/ladder.c, for instance, compares a target C state's exit_latency against the QoS requirement, as shown in Listing 19.11.

Listing 19.11 QoS handling in the CPUIdle LADDER governor

/**
 * ladder_select_state - selects the next state to enter
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int ladder_select_state(struct cpuidle_driver *drv,
                                struct cpuidle_device *dev)
{
        struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
        struct ladder_device_state *last_state;
        int last_residency, last_idx = ldev->last_state_idx;
        int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);

        /* Special case when user has set very strict latency requirement */
        if (unlikely(latency_req == 0)) {
                ladder_do_selection(ldev, last_idx, 0);
                return 0;
        }

        last_state = &ldev->states[last_idx];

        if (drv->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID)
                last_residency = cpuidle_get_last_residency(dev) -
                                 drv->states[last_idx].exit_latency;
        else
                last_residency = last_state->threshold.promotion_time + 1;

        /* consider promotion */
        if (last_idx < drv->state_count - 1 &&
            !drv->states[last_idx + 1].disabled &&
            !dev->states_usage[last_idx + 1].disable &&
            last_residency > last_state->threshold.promotion_time &&
            drv->states[last_idx + 1].exit_latency <= latency_req) {
                last_state->stats.promotion_count++;
                last_state->stats.demotion_count = 0;
                if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) {
                        ladder_do_selection(ldev, last_idx, last_idx + 1);
                        return last_idx + 1;
                }
        }

        /* consider demotion */
        if (last_idx > CPUIDLE_DRIVER_STATE_START &&
            (drv->states[last_idx].disabled ||
            dev->states_usage[last_idx].disable ||
            drv->states[last_idx].exit_latency > latency_req)) {
                int i;

                for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) {
                        if (drv->states[i].exit_latency <= latency_req)
                                break;
                }
                ladder_do_selection(ldev, last_idx, i);
                return i;
        }

        if (last_idx > CPUIDLE_DRIVER_STATE_START &&
            last_residency < last_state->threshold.demotion_time) {
                last_state->stats.demotion_count++;
                last_state->stats.promotion_count = 0;
                if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) {
                        ladder_do_selection(ldev, last_idx, last_idx - 1);
                        return last_idx - 1;
                }
        }

        /* otherwise remain at the current state */
        return last_idx;
}

When deciding whether to enter a deeper C state, LADDER requires that the state's exit_latency not exceed the latency obtained from pm_qos_request(PM_QOS_CPU_DMA_LATENCY). With the via-camera request of 50 in effect, for example, the governor will not promote to any state whose exit_latency is above 50 µs, and it demotes out of a state that violates the limit.

The same logic appears in the MENU governor, drivers/cpuidle/governors/menu.c, shown in Listing 19.12. MENU goes one step further: it clamps latency_req to interactivity_req, the predicted idle duration divided by a performance multiplier, so a short expected idle period tightens the effective latency limit even below what QoS alone would allow.

Listing 19.12 QoS handling in the CPUIdle MENU governor

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct menu_device *data = this_cpu_ptr(&menu_devices);
        int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
        int i;
        unsigned int interactivity_req;
        unsigned long nr_iowaiters, cpu_load;

        if (data->needs_update) {
                menu_update(drv, dev);
                data->needs_update = 0;
        }

        data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;

        /* Special case when user has set very strict latency requirement */
        if (unlikely(latency_req == 0))
                return 0;

        /* determine the expected residency time, round up */
        data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
        get_iowait_load(&nr_iowaiters, &cpu_load);
        data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
        /*
         * Force the result of multiplication to be 64 bits even if both
         * operands are 32 bits.
         * Make sure to round up for half microseconds.
         */
        data->predicted_us = div_round64((uint64_t)data->next_timer_us *
                                         data->correction_factor[data->bucket],
                                         RESOLUTION * DECAY);

        get_typical_interval(data);

        /*
         * Performance multiplier defines a minimum predicted idle
         * duration / latency ratio. Adjust the latency limit if
         * necessary.
         */
        interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
        if (latency_req > interactivity_req)
                latency_req = interactivity_req;

        /*
         * We want to default to C1 (hlt), not to busy polling
         * unless the timer is happening really really soon.
         */
        if (data->next_timer_us > 5 &&
            !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
                dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
                data->last_state_idx = CPUIDLE_DRIVER_STATE_START;

        /*
         * Find the idle state with the lowest power while satisfying
         * our constraints.
         */
        for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
                struct cpuidle_state *s = &drv->states[i];
                struct cpuidle_state_usage *su = &dev->states_usage[i];

                if (s->disabled || su->disable)
                        continue;
                if (s->target_residency > data->predicted_us)
                        continue;
                if (s->exit_latency > latency_req)
                        continue;

                data->last_state_idx = i;
        }

        return data->last_state_idx;
}

Returning to drivers/media/platform/via-camera.c: when streaming stops, the driver cancels its PM_QOS_CPU_DMA_LATENCY requirement with the following code, letting the governors above select deep C states again:

static int viacam_streamoff(struct file *filp, void *priv, enum v4l2_buf_type t)
{
        struct via_camera *cam = priv;
        int ret;

        if (t != V4L2_BUF_TYPE_VIDEO_CAPTURE)
                return -EINVAL;
        mutex_lock(&cam->lock);
        if (cam->opstate != S_RUNNING) {
                ret = -EINVAL;
                goto out;
        }
        pm_qos_remove_request(&cam->qos_request);
        viacam_stop_engine(cam);
        /*
         * Videobuf will recycle all of the outstanding buffers, but
         * we should be sure we don't retain any references to
         * any of them.
         */
        ret = videobuf_streamoff(&cam->vb_queue);
        INIT_LIST_HEAD(&cam->buffer_queue);
out:
        mutex_unlock(&cam->lock);
        return ret;
}

Note:

User-space applications issue QoS performance requests by writing values to the /dev/cpu_dma_latency, /dev/network_latency, and /dev/network_throughput device nodes.
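
A minimal user-space sketch of that interface (the 20us figure is arbitrary): write a binary s32 latency value, in microseconds, to the node and keep the file descriptor open for as long as the constraint should hold; the kernel drops the request automatically when the descriptor is closed.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int32_t latency_us = 20;        /* requested worst-case latency */
        int fd = open("/dev/cpu_dma_latency", O_RDWR);

        if (fd < 0) {
                perror("open /dev/cpu_dma_latency");
                return 1;
        }
        if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
                perror("write");
                close(fd);
                return 1;
        }
        /* ... latency-sensitive work runs here ... */
        pause();        /* request is dropped when fd is closed or we exit */
        return 0;
}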


