X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=kernel%2Fsched.c;h=53608a59d6e3c0fd3d0b18dbf919beb9b0125397;hb=d0ebd9c0e71d20ea8c2b4a071d2a2b4878ef07d6;hp=6d7bf55ec33d8f0f80e7b6a00dc720f65ee18de1;hpb=74732646431a1bb7e23e6b564127a8881cfef900;p=powerpc.git diff --git a/kernel/sched.c b/kernel/sched.c index 6d7bf55ec3..53608a59d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1232,7 +1232,7 @@ nextgroup: } /* - * find_idlest_queue - find the idlest runqueue among the cpus in group. + * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag) while (sd) { cpumask_t span; struct sched_group *group; - int new_cpu; - int weight; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } span = sd->span; group = find_idlest_group(sd, t, cpu); - if (!group) - goto nextlevel; + if (!group) { + sd = sd->child; + continue; + } new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) - goto nextlevel; + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } - /* Now try balancing at a lower domain level */ + /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; -nextlevel: sd = NULL; weight = cpus_weight(span); for_each_domain(cpu, tmp) { @@ -2533,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2630,7 +2644,7 @@ redo: } if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return nr_moved; @@ -2646,7 +2660,7 @@ out_one_pinned: sd->balance_interval *= 2; if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return 0; } @@ -2668,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int sd_idle = 0; cpumask_t cpus = CPU_MASK_ALL; - if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2709,7 +2730,8 @@ redo: if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; } else sd->nr_balance_failed = 0; @@ -2719,7 +2741,7 @@ redo: out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; sd->nr_balance_failed = 0; @@ -5392,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd) if (sd->flags & (SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC)) { + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5426,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC); + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); } if (~cflags & pflags) return 0; @@ -5448,12 +5474,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) struct sched_domain *parent = tmp->parent; if (!parent) break; - if (sd_parent_degenerate(tmp, parent)) + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } } - if (sd && sd_degenerate(sd)) + if (sd && sd_degenerate(sd)) { sd = sd->parent; + if (sd) + sd->child = NULL; + } sched_domain_debug(sd, cpu); @@ -6226,6 +6258,58 @@ static void free_sched_groups(const cpumask_t *cpu_map) } #endif +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != first_cpu(sd->groups->cpumask)) + return; + + child = sd->child; + + /* + * For perf policy, if the groups in child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain groups cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. + */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sd->groups->cpu_power = SCHED_LOAD_SCALE; + return; + } + + sd->groups->cpu_power = 0; + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sd->groups->cpu_power += group->cpu_power; + group = group->next; + } while (group != child->groups); +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -6233,6 +6317,7 @@ static void free_sched_groups(const cpumask_t *cpu_map) static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -6264,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { if (!sched_group_allnodes) { sched_group_allnodes - = kmalloc(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL); + = kmalloc_node(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL, + cpu_to_node(i)); if (!sched_group_allnodes) { printk(KERN_WARNING "Can not alloc allnodes sched group\n"); @@ -6288,6 +6374,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + if (p) + p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); #endif @@ -6297,6 +6385,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; + if (p) + p->child = sd; sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC @@ -6307,6 +6397,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + p->child = sd; sd->groups = &sched_group_core[group]; #endif @@ -6318,6 +6409,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + p->child = sd; sd->groups = &sched_group_cpus[group]; #endif } @@ -6436,72 +6528,20 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; sd = &per_cpu(cpu_domains, i); - sd->groups->cpu_power = SCHED_LOAD_SCALE; + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - int power; - struct sched_domain *sd; sd = &per_cpu(core_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) - * SCHED_LOAD_SCALE / 10; - sd->groups->cpu_power = power; + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_MC sd = &per_cpu(phys_domains, i); - if (i != first_cpu(sd->groups->cpumask)) - continue; - - sd->groups->cpu_power = 0; - if (sched_mc_power_savings || sched_smt_power_savings) { - int j; - - for_each_cpu_mask(j, sd->groups->cpumask) { - struct sched_domain *sd1; - sd1 = &per_cpu(core_domains, j); - /* - * for each core we will add once - * to the group in physical domain - */ - if (j != first_cpu(sd1->groups->cpumask)) - continue; - - if (sched_smt_power_savings) - sd->groups->cpu_power += sd1->groups->cpu_power; - else - sd->groups->cpu_power += SCHED_LOAD_SCALE; - } - } else - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; -#else - int power; - sd = &per_cpu(phys_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; -#endif + init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA