Linux Kernel Engineering — Section 2.09: A Detailed Walkthrough of load_balance()

Kernel version: linux-2.6.34

Knowledge Preparation:
http://www.ibm.com/developerworks/cn/linux/l-cn-schldom/index.html

Function:
Checks whether this_cpu is balanced within its scheduling domain, and attempts to move tasks from the busiest runqueue onto it if an imbalance is found.

Data Structure:
/*
 * Idle state of a cpu as seen by the load balancer.
 * CPU_NEWLY_IDLE marks a cpu that has just run out of runnable tasks;
 * CPU_MAX_IDLE_TYPES is not a real state — it only sizes the
 * per-idle-type schedstat arrays (lb_count[], lb_failed[], ...).
 */
enum cpu_idle_type {
    CPU_IDLE,
    CPU_NOT_IDLE,
    CPU_NEWLY_IDLE,
    CPU_MAX_IDLE_TYPES
};

/* Fixed-size bitmap with one bit per possible cpu (NR_CPUS bits). */
typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;

/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);


/*
 * load_balance - check this_cpu to ensure it is balanced within sched
 * domain @sd, and attempt to pull tasks from the busiest runqueue in
 * the domain if an imbalance exists.
 *
 * @this_cpu: the cpu performing the balancing (tasks are pulled onto it)
 * @this_rq:  runqueue of @this_cpu
 * @sd:       the sched_domain whose balance is being checked
 * @idle:     idle state of @this_cpu (CPU_IDLE / CPU_NOT_IDLE / CPU_NEWLY_IDLE)
 * @balance:  out-parameter; cleared by find_busiest_group() when this_cpu
 *            is not the appropriate cpu to balance at this domain level
 *
 * Returns the number of tasks moved (ld_moved), 0 when balanced, or -1
 * in the SMT power-savings case to tell the caller not to consider this
 * cpu idle at higher domain levels.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;


/* Per-cpu scratch mask holding the candidate cpus for this balance pass. */

    struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);


/*
 * Start from every active cpu (cpu_active_mask is the global mask declared
 * in cpumask.h); cpus whose tasks are all pinned get cleared on retry below.
 */

    cpumask_copy(cpus, cpu_active_mask);

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */

    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        sd_idle = 1;


/* Schedstat bookkeeping: ++sd->lb_count[idle]. */
    schedstat_inc(sd, lb_count[idle]);

redo:

/* Refresh group shares (updates sd's last_update timestamp). */
    update_shares(sd);


/*
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates (into &imbalance) the amount of weighted load which
 * should be moved to restore balance.
 *
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *        be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
 * @sd_idle: The idleness of sd
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *    is the appropriate cpu to perform load balancing at this_level.
 *
 * Returns:    - the busiest group if imbalance exists.
 *        - If no imbalance and user has opted for power-savings balance,
 *           return the least loaded group whose CPUs can be
 *           put to idle by rebalancing its tasks onto our group.
 */

    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,cpus, balance);

    /* Some other cpu is responsible for balancing at this level. */
    if (*balance == 0)
        goto out_balanced;

    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }


/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */

    busiest = find_busiest_queue(group, idle, imbalance, cpus);
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    /* find_busiest_* must never pick our own runqueue as the source. */
    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */

        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);

/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */

        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                 imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * Tasks landed on this_cpu but we may be running on another
         * cpu on its behalf — make this_cpu reschedule to notice them.
         */

        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            /* Drop the fully-pinned cpu and retry with the rest. */
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }


/* move_tasks() moved nothing: record the failure, maybe go active. */

    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (need_active_balance(sd, sd_idle, idle)) {
            raw_spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */

            if (!cpumask_test_cpu(this_cpu,
                     &busiest->curr->cpus_allowed)) {
                raw_spin_unlock_irqrestore(&busiest->lock,
                             flags);
                all_pinned = 1;
                goto out_one_pinned;
            }


/*
 * move_tasks() pulls; active balancing pushes instead. Mark the busiest
 * runqueue as actively balancing towards push_cpu (= this_cpu). The
 * !active_balance test ensures only one cpu kicks it at a time.
 */

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);


/*
 * Each runqueue has a dedicated migration thread that performs the
 * actual push; wake it up (outside busiest->lock).
 */
            if (active_balance)
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */

            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */

        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    /*
     * SMT power-savings: return -1 so the caller does not treat this
     * cpu as idle at higher domain levels.
     */
    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;
out:
    /* Shares changed if load actually moved; propagate the update. */
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}


Related posts

转载自blog.csdn.net/zjy900507/article/details/80667974