Merge git://git.infradead.org/~dhowells/irq-2.6

[powerpc.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index a234fbe..53608a5 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,7 +49,7 @@
  #include <linux/seq_file.h>
  #include <linux/syscalls.h>
  #include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
  #include <linux/kprobes.h>
  #include <linux/delayacct.h>
  #include <asm/tlb.h>
@@ -238,6 +238,7 @@ struct rq {
         /* For active balancing */
         int active_balance;
         int push_cpu;
+       int cpu;                /* cpu of this runqueue */
  
         struct task_struct *migration_thread;
         struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
  
  static DEFINE_PER_CPU(struct rq, runqueues);
  
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       return rq->cpu;
+#else
+       return 0;
+#endif
+}
+
  /*
   * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
   * See detach_destroy_domains: synchronize_sched for details.
@@ -1222,7 +1232,7 @@ nextgroup:
  }
  
  /*
- * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
  find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -1276,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag)
         while (sd) {
                 cpumask_t span;
                 struct sched_group *group;
-               int new_cpu;
-               int weight;
+               int new_cpu, weight;
+
+               if (!(sd->flags & flag)) {
+                       sd = sd->child;
+                       continue;
+               }
  
                 span = sd->span;
                 group = find_idlest_group(sd, t, cpu);
-               if (!group)
-                       goto nextlevel;
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
  
                 new_cpu = find_idlest_cpu(group, t, cpu);
-               if (new_cpu == -1 || new_cpu == cpu)
-                       goto nextlevel;
+               if (new_cpu == -1 || new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
+               }
  
-               /* Now try balancing at a lower domain level */
+               /* Now try balancing at a lower domain level of new_cpu */
                 cpu = new_cpu;
-nextlevel:
                 sd = NULL;
                 weight = cpus_weight(span);
                 for_each_domain(cpu, tmp) {
@@ -1745,27 +1763,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
         __releases(rq->lock)
  {
         struct mm_struct *mm = rq->prev_mm;
-       unsigned long prev_task_flags;
+       long prev_state;
  
         rq->prev_mm = NULL;
  
         /*
          * A task struct has one reference for the use as "current".
-        * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
-        * calls schedule one last time. The schedule call will never return,
-        * and the scheduled task must drop that reference.
-        * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+        * schedule one last time. The schedule call will never return, and
+        * the scheduled task must drop that reference.
+        * The test for TASK_DEAD must occur while the runqueue locks are
          * still held, otherwise prev could be scheduled on another cpu, die
          * there before we look at prev->state, and then the reference would
          * be dropped twice.
          *              Manfred Spraul <manfred@colorfullife.com>
          */
-       prev_task_flags = prev->flags;
+       prev_state = prev->state;
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
         if (mm)
                 mmdrop(mm);
-       if (unlikely(prev_task_flags & PF_DEAD)) {
+       if (unlikely(prev_state == TASK_DEAD)) {
                 /*
                  * Remove function-return probe instances associated with this
                  * task and put them back on the free list.
@@ -2211,7 +2229,8 @@ out:
   */
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+                  unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+                  cpumask_t *cpus)
  {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2267,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 sum_weighted_load = sum_nr_running = avg_load = 0;
  
                 for_each_cpu_mask(i, group->cpumask) {
-                       struct rq *rq = cpu_rq(i);
+                       struct rq *rq;
+
+                       if (!cpu_isset(i, *cpus))
+                               continue;
+
+                       rq = cpu_rq(i);
  
                         if (*sd_idle && !idle_cpu(i))
                                 *sd_idle = 0;
@@ -2466,13 +2490,17 @@ ret:
   */
  static struct rq *
  find_busiest_queue(struct sched_group *group, enum idle_type idle,
-                  unsigned long imbalance)
+                  unsigned long imbalance, cpumask_t *cpus)
  {
         struct rq *busiest = NULL, *rq;
         unsigned long max_load = 0;
         int i;
  
         for_each_cpu_mask(i, group->cpumask) {
+
+               if (!cpu_isset(i, *cpus))
+                       continue;
+
                 rq = cpu_rq(i);
  
                 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,20 +2539,29 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         struct sched_group *group;
         unsigned long imbalance;
         struct rq *busiest;
+       cpumask_t cpus = CPU_MASK_ALL;
  
+       /*
+        * When power savings policy is enabled for the parent domain, idle
+        * sibling can pick up load irrespective of busy siblings. In this case,
+        * let the state of idle sibling percolate up as IDLE, instead of
+        * portraying it as NOT_IDLE.
+        */
         if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-           !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[idle]);
  
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+                                                       &cpus);
         if (!group) {
                 schedstat_inc(sd, lb_nobusyg[idle]);
                 goto out_balanced;
         }
  
-       busiest = find_busiest_queue(group, idle, imbalance);
+       busiest = find_busiest_queue(group, idle, imbalance, &cpus);
         if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[idle]);
                 goto out_balanced;
@@ -2549,8 +2586,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 double_rq_unlock(this_rq, busiest);
  
                 /* All tasks on this runqueue were pinned by CPU affinity */
-               if (unlikely(all_pinned))
+               if (unlikely(all_pinned)) {
+                       cpu_clear(cpu_of(busiest), cpus);
+                       if (!cpus_empty(cpus))
+                               goto redo;
                         goto out_balanced;
+               }
         }
  
         if (!nr_moved) {
@@ -2603,7 +2644,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         }
  
         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-           !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         return nr_moved;
  
@@ -2619,7 +2660,7 @@ out_one_pinned:
                 sd->balance_interval *= 2;
  
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                       !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         return 0;
  }
@@ -2639,18 +2680,29 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
         unsigned long imbalance;
         int nr_moved = 0;
         int sd_idle = 0;
+       cpumask_t cpus = CPU_MASK_ALL;
  
-       if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+       /*
+        * When power savings policy is enabled for the parent domain, idle
+        * sibling can pick up load irrespective of busy siblings. In this case,
+        * let the state of idle sibling percolate up as IDLE, instead of
+        * portraying it as NOT_IDLE.
+        */
+       if (sd->flags & SD_SHARE_CPUPOWER &&
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+                               &sd_idle, &cpus);
         if (!group) {
                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                 goto out_balanced;
         }
  
-       busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
+       busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+                               &cpus);
         if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                 goto out_balanced;
@@ -2668,11 +2720,18 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
                                         minus_1_or_zero(busiest->nr_running),
                                         imbalance, sd, NEWLY_IDLE, NULL);
                 spin_unlock(&busiest->lock);
+
+               if (!nr_moved) {
+                       cpu_clear(cpu_of(busiest), cpus);
+                       if (!cpus_empty(cpus))
+                               goto redo;
+               }
         }
  
         if (!nr_moved) {
                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+                   !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                         return -1;
         } else
                 sd->nr_balance_failed = 0;
@@ -2682,7 +2741,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
  out_balanced:
         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                                       !sched_smt_power_savings)
+           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         sd->nr_balance_failed = 0;
  
@@ -3311,9 +3370,6 @@ need_resched_nonpreemptible:
  
         spin_lock_irq(&rq->lock);
  
-       if (unlikely(prev->flags & PF_DEAD))
-               prev->state = EXIT_DEAD;
-
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                 switch_count = &prev->nvcsw;
@@ -4043,6 +4099,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
   * @p: the task in question.
   * @policy: new policy.
   * @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
   */
  int sched_setscheduler(struct task_struct *p, int policy,
                        struct sched_param *param)
@@ -4070,28 +4128,32 @@ recheck:
             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
-       if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
-                                       != (param->sched_priority == 0))
+       if (is_rt_policy(policy) != (param->sched_priority != 0))
                 return -EINVAL;
  
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
         if (!capable(CAP_SYS_NICE)) {
-               /*
-                * can't change policy, except between SCHED_NORMAL
-                * and SCHED_BATCH:
-                */
-               if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
-                       (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-                               !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
-                       return -EPERM;
-               /* can't increase priority */
-               if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
-                   param->sched_priority > p->rt_priority &&
-                   param->sched_priority >
-                               p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
-                       return -EPERM;
+               if (is_rt_policy(policy)) {
+                       unsigned long rlim_rtprio;
+                       unsigned long flags;
+
+                       if (!lock_task_sighand(p, &flags))
+                               return -ESRCH;
+                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+                       unlock_task_sighand(p, &flags);
+
+                       /* can't set/change the rt policy */
+                       if (policy != p->policy && !rlim_rtprio)
+                               return -EPERM;
+
+                       /* can't increase priority */
+                       if (param->sched_priority > p->rt_priority &&
+                           param->sched_priority > rlim_rtprio)
+                               return -EPERM;
+               }
+
                 /* can't change other user's priorities */
                 if ((current->euid != p->euid) &&
                     (current->euid != p->uid))
@@ -4156,14 +4218,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
                 return -EINVAL;
         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
                 return -EFAULT;
-       read_lock_irq(&tasklist_lock);
+
+       rcu_read_lock();
+       retval = -ESRCH;
         p = find_process_by_pid(pid);
-       if (!p) {
-               read_unlock_irq(&tasklist_lock);
-               return -ESRCH;
-       }
-       retval = sched_setscheduler(p, policy, &lparam);
-       read_unlock_irq(&tasklist_lock);
+       if (p != NULL)
+               retval = sched_setscheduler(p, policy, &lparam);
+       rcu_read_unlock();
  
         return retval;
  }
@@ -4345,7 +4406,10 @@ EXPORT_SYMBOL(cpu_present_map);
  
  #ifndef CONFIG_SMP
  cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
  cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
  #endif
  
  long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4775,7 +4839,7 @@ void show_state(void)
   * NOTE: this function does not set the idle thread's NEED_RESCHED
   * flag, to make booting more robust.
   */
-void __devinit init_idle(struct task_struct *idle, int cpu)
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
@@ -5114,7 +5178,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
         BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
  
         /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->flags & PF_DEAD);
+       BUG_ON(p->state == TASK_DEAD);
  
         get_task_struct(p);
  
@@ -5235,9 +5299,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
  int __init migration_init(void)
  {
         void *cpu = (void *)(long)smp_processor_id();
+       int err;
  
         /* Start one for the boot CPU: */
-       migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+       BUG_ON(err == NOTIFY_BAD);
         migration_call(&migration_notifier, CPU_ONLINE, cpu);
         register_cpu_notifier(&migration_notifier);
  
@@ -5348,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
         if (sd->flags & (SD_LOAD_BALANCE |
                          SD_BALANCE_NEWIDLE |
                          SD_BALANCE_FORK |
-                        SD_BALANCE_EXEC)) {
+                        SD_BALANCE_EXEC |
+                        SD_SHARE_CPUPOWER |
+                        SD_SHARE_PKG_RESOURCES)) {
                 if (sd->groups != sd->groups->next)
                         return 0;
         }
@@ -5382,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                 pflags &= ~(SD_LOAD_BALANCE |
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
-                               SD_BALANCE_EXEC);
+                               SD_BALANCE_EXEC |
+                               SD_SHARE_CPUPOWER |
+                               SD_SHARE_PKG_RESOURCES);
         }
         if (~cflags & pflags)
                 return 0;
@@ -5404,12 +5474,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
                 struct sched_domain *parent = tmp->parent;
                 if (!parent)
                         break;
-               if (sd_parent_degenerate(tmp, parent))
+               if (sd_parent_degenerate(tmp, parent)) {
                         tmp->parent = parent->parent;
+                       if (parent->parent)
+                               parent->parent->child = tmp;
+               }
         }
  
-       if (sd && sd_degenerate(sd))
+       if (sd && sd_degenerate(sd)) {
                 sd = sd->parent;
+               if (sd)
+                       sd->child = NULL;
+       }
  
         sched_domain_debug(sd, cpu);
  
@@ -5417,7 +5493,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
  }
  
  /* cpus with isolated domains */
-static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
  
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
@@ -5445,15 +5521,17 @@ __setup ("isolcpus=", isolated_cpu_setup);
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
-static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
-                                   int (*group_fn)(int cpu))
+static void
+init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+                       const cpumask_t *cpu_map,
+                       int (*group_fn)(int cpu, const cpumask_t *cpu_map))
  {
         struct sched_group *first = NULL, *last = NULL;
         cpumask_t covered = CPU_MASK_NONE;
         int i;
  
         for_each_cpu_mask(i, span) {
-               int group = group_fn(i);
+               int group = group_fn(i, cpu_map);
                 struct sched_group *sg = &groups[group];
                 int j;
  
@@ -5464,7 +5542,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
                 sg->cpu_power = 0;
  
                 for_each_cpu_mask(j, span) {
-                       if (group_fn(j) != group)
+                       if (group_fn(j, cpu_map) != group)
                                 continue;
  
                         cpu_set(j, covered);
@@ -5931,13 +6009,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
  #endif
                 );
         if (system_state == SYSTEM_BOOTING) {
-               printk("migration_cost=");
-               for (distance = 0; distance <= max_distance; distance++) {
-                       if (distance)
-                               printk(",");
-                       printk("%ld", (long)migration_cost[distance] / 1000);
+               if (num_online_cpus() > 1) {
+                       printk("migration_cost=");
+                       for (distance = 0; distance <= max_distance; distance++) {
+                               if (distance)
+                                       printk(",");
+                               printk("%ld", (long)migration_cost[distance] / 1000);
+                       }
+                       printk("\n");
                 }
-               printk("\n");
         }
         j1 = jiffies;
         if (migration_debug)
@@ -6040,7 +6120,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
  static struct sched_group sched_group_cpus[NR_CPUS];
  
-static int cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
  {
         return cpu;
  }
@@ -6051,31 +6131,36 @@ static int cpu_to_cpu_group(int cpu)
   */
  #ifdef CONFIG_SCHED_MC
  static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group *sched_group_core_bycpu[NR_CPUS];
+static struct sched_group sched_group_core[NR_CPUS];
  #endif
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
  {
-       return first_cpu(cpu_sibling_map[cpu]);
+       cpumask_t mask = cpu_sibling_map[cpu];
+       cpus_and(mask, mask, *cpu_map);
+       return first_cpu(mask);
  }
  #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
  {
         return cpu;
  }
  #endif
  
  static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
  
-static int cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
  {
  #ifdef CONFIG_SCHED_MC
         cpumask_t mask = cpu_coregroup_map(cpu);
+       cpus_and(mask, mask, *cpu_map);
         return first_cpu(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       return first_cpu(cpu_sibling_map[cpu]);
+       cpumask_t mask = cpu_sibling_map[cpu];
+       cpus_and(mask, mask, *cpu_map);
+       return first_cpu(mask);
  #else
         return cpu;
  #endif
@@ -6093,7 +6178,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
  static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
  static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
  
-static int cpu_to_allnodes_group(int cpu)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
  {
         return cpu_to_node(cpu);
  }
@@ -6125,12 +6210,11 @@ next_sg:
  }
  #endif
  
+#ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
  static void free_sched_groups(const cpumask_t *cpu_map)
  {
-       int cpu;
-#ifdef CONFIG_NUMA
-       int i;
+       int cpu, i;
  
         for_each_cpu_mask(cpu, *cpu_map) {
                 struct sched_group *sched_group_allnodes
@@ -6167,19 +6251,63 @@ next_sg:
                 kfree(sched_group_nodes);
                 sched_group_nodes_bycpu[cpu] = NULL;
         }
+}
+#else
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+}
  #endif
-       for_each_cpu_mask(cpu, *cpu_map) {
-               if (sched_group_phys_bycpu[cpu]) {
-                       kfree(sched_group_phys_bycpu[cpu]);
-                       sched_group_phys_bycpu[cpu] = NULL;
-               }
-#ifdef CONFIG_SCHED_MC
-               if (sched_group_core_bycpu[cpu]) {
-                       kfree(sched_group_core_bycpu[cpu]);
-                       sched_group_core_bycpu[cpu] = NULL;
-               }
-#endif
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pick up more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+       struct sched_domain *child;
+       struct sched_group *group;
+
+       WARN_ON(!sd || !sd->groups);
+
+       if (cpu != first_cpu(sd->groups->cpumask))
+               return;
+
+       child = sd->child;
+
+       /*
+        * For perf policy, if the groups in child domain share resources
+        * (for example cores sharing some portions of the cache hierarchy
+        * or SMT), then set this domain groups cpu_power such that each group
+        * can handle only one task, when there are other idle groups in the
+        * same sched domain.
+        */
+       if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+                      (child->flags &
+                       (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               return;
         }
+
+       sd->groups->cpu_power = 0;
+
+       /*
+        * add cpu_power of each child group to this group's cpu_power
+        */
+       group = child->groups;
+       do {
+               sd->groups->cpu_power += group->cpu_power;
+               group = group->next;
+       } while (group != child->groups);
  }
  
  /*
@@ -6189,10 +6317,7 @@ next_sg:
  static int build_sched_domains(const cpumask_t *cpu_map)
  {
         int i;
-       struct sched_group *sched_group_phys = NULL;
-#ifdef CONFIG_SCHED_MC
-       struct sched_group *sched_group_core = NULL;
-#endif
+       struct sched_domain *sd;
  #ifdef CONFIG_NUMA
         struct sched_group **sched_group_nodes = NULL;
         struct sched_group *sched_group_allnodes = NULL;
@@ -6224,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                                 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
                         if (!sched_group_allnodes) {
                                 sched_group_allnodes
-                                       = kmalloc(sizeof(struct sched_group)
-                                                       * MAX_NUMNODES,
-                                                 GFP_KERNEL);
+                                       = kmalloc_node(sizeof(struct sched_group)
+                                                       * MAX_NUMNODES,
+                                                 GFP_KERNEL,
+                                                 cpu_to_node(i));
                                 if (!sched_group_allnodes) {
                                         printk(KERN_WARNING
                                         "Can not alloc allnodes sched group\n");
@@ -6238,7 +6364,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                         sd = &per_cpu(allnodes_domains, i);
                         *sd = SD_ALLNODES_INIT;
                         sd->span = *cpu_map;
-                       group = cpu_to_allnodes_group(i);
+                       group = cpu_to_allnodes_group(i, cpu_map);
                         sd->groups = &sched_group_allnodes[group];
                         p = sd;
                 } else
@@ -6248,60 +6374,42 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                 *sd = SD_NODE_INIT;
                 sd->span = sched_domain_node_span(cpu_to_node(i));
                 sd->parent = p;
+               if (p)
+                       p->child = sd;
                 cpus_and(sd->span, sd->span, *cpu_map);
  #endif
  
-               if (!sched_group_phys) {
-                       sched_group_phys
-                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
-                                         GFP_KERNEL);
-                       if (!sched_group_phys) {
-                               printk (KERN_WARNING "Can not alloc phys sched"
-                                                    "group\n");
-                               goto error;
-                       }
-                       sched_group_phys_bycpu[i] = sched_group_phys;
-               }
-
                 p = sd;
                 sd = &per_cpu(phys_domains, i);
-               group = cpu_to_phys_group(i);
+               group = cpu_to_phys_group(i, cpu_map);
                 *sd = SD_CPU_INIT;
                 sd->span = nodemask;
                 sd->parent = p;
+               if (p)
+                       p->child = sd;
                 sd->groups = &sched_group_phys[group];
  
  #ifdef CONFIG_SCHED_MC
-               if (!sched_group_core) {
-                       sched_group_core
-                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
-                                         GFP_KERNEL);
-                       if (!sched_group_core) {
-                               printk (KERN_WARNING "Can not alloc core sched"
-                                                    "group\n");
-                               goto error;
-                       }
-                       sched_group_core_bycpu[i] = sched_group_core;
-               }
-
                 p = sd;
                 sd = &per_cpu(core_domains, i);
-               group = cpu_to_core_group(i);
+               group = cpu_to_core_group(i, cpu_map);
                 *sd = SD_MC_INIT;
                 sd->span = cpu_coregroup_map(i);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
+               p->child = sd;
                 sd->groups = &sched_group_core[group];
  #endif
  
  #ifdef CONFIG_SCHED_SMT
                 p = sd;
                 sd = &per_cpu(cpu_domains, i);
-               group = cpu_to_cpu_group(i);
+               group = cpu_to_cpu_group(i, cpu_map);
                 *sd = SD_SIBLING_INIT;
                 sd->span = cpu_sibling_map[i];
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
+               p->child = sd;
                 sd->groups = &sched_group_cpus[group];
  #endif
         }
@@ -6315,7 +6423,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                         continue;
  
                 init_sched_build_groups(sched_group_cpus, this_sibling_map,
-                                               &cpu_to_cpu_group);
+                                       cpu_map, &cpu_to_cpu_group);
         }
  #endif
  
@@ -6327,7 +6435,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                 if (i != first_cpu(this_core_map))
                         continue;
                 init_sched_build_groups(sched_group_core, this_core_map,
-                                       &cpu_to_core_group);
+                                       cpu_map, &cpu_to_core_group);
         }
  #endif
  
@@ -6341,14 +6449,14 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                         continue;
  
                 init_sched_build_groups(sched_group_phys, nodemask,
-                                               &cpu_to_phys_group);
+                                       cpu_map, &cpu_to_phys_group);
         }
  
  #ifdef CONFIG_NUMA
         /* Set up node groups */
         if (sched_group_allnodes)
                 init_sched_build_groups(sched_group_allnodes, *cpu_map,
-                                       &cpu_to_allnodes_group);
+                                       cpu_map, &cpu_to_allnodes_group);
  
         for (i = 0; i < MAX_NUMNODES; i++) {
                 /* Set up node groups */
@@ -6420,72 +6528,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
         /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
         for_each_cpu_mask(i, *cpu_map) {
-               struct sched_domain *sd;
                 sd = &per_cpu(cpu_domains, i);
-               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               init_sched_groups_power(i, sd);
         }
  #endif
  #ifdef CONFIG_SCHED_MC
         for_each_cpu_mask(i, *cpu_map) {
-               int power;
-               struct sched_domain *sd;
                 sd = &per_cpu(core_domains, i);
-               if (sched_smt_power_savings)
-                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-               else
-                       power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-                                           * SCHED_LOAD_SCALE / 10;
-               sd->groups->cpu_power = power;
+               init_sched_groups_power(i, sd);
         }
  #endif
  
         for_each_cpu_mask(i, *cpu_map) {
-               struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-               sd = &per_cpu(phys_domains, i);
-               if (i != first_cpu(sd->groups->cpumask))
-                       continue;
-
-               sd->groups->cpu_power = 0;
-               if (sched_mc_power_savings || sched_smt_power_savings) {
-                       int j;
-
-                       for_each_cpu_mask(j, sd->groups->cpumask) {
-                               struct sched_domain *sd1;
-                               sd1 = &per_cpu(core_domains, j);
-                               /*
-                                * for each core we will add once
-                                * to the group in physical domain
-                                */
-                               if (j != first_cpu(sd1->groups->cpumask))
-                                       continue;
-
-                               if (sched_smt_power_savings)
-                                       sd->groups->cpu_power += sd1->groups->cpu_power;
-                               else
-                                       sd->groups->cpu_power += SCHED_LOAD_SCALE;
-                       }
-               } else
-                       /*
-                        * This has to be < 2 * SCHED_LOAD_SCALE
-                        * Lets keep it SCHED_LOAD_SCALE, so that
-                        * while calculating NUMA group's cpu_power
-                        * we can simply do
-                        *  numa_group->cpu_power += phys_group->cpu_power;
-                        *
-                        * See "only add power once for each physical pkg"
-                        * comment below
-                        */
-                       sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-               int power;
                 sd = &per_cpu(phys_domains, i);
-               if (sched_smt_power_savings)
-                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-               else
-                       power = SCHED_LOAD_SCALE;
-               sd->groups->cpu_power = power;
-#endif
+               init_sched_groups_power(i, sd);
         }
  
  #ifdef CONFIG_NUMA
@@ -6493,7 +6549,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                 init_numa_sched_groups_power(sched_group_nodes[i]);
  
         if (sched_group_allnodes) {
-               int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+               int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
                 struct sched_group *sg = &sched_group_allnodes[group];
  
                 init_numa_sched_groups_power(sg);
@@ -6519,9 +6575,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
  
         return 0;
  
+#ifdef CONFIG_NUMA
  error:
         free_sched_groups(cpu_map);
         return -ENOMEM;
+#endif
  }
  /*
   * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
@@ -6703,11 +6761,20 @@ static int update_sched_domains(struct notifier_block *nfb,
  
  void __init sched_init_smp(void)
  {
+       cpumask_t non_isolated_cpus;
+
         lock_cpu_hotplug();
         arch_init_sched_domains(&cpu_online_map);
+       cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+       if (cpus_empty(non_isolated_cpus))
+               cpu_set(smp_processor_id(), non_isolated_cpus);
         unlock_cpu_hotplug();
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
+
+       /* Move init over to a non-isolated CPU */
+       if (set_cpus_allowed(current, non_isolated_cpus) < 0)
+               BUG();
  }
  #else
  void __init sched_init_smp(void)
@@ -6747,6 +6814,7 @@ void __init sched_init(void)
                         rq->cpu_load[j] = 0;
                 rq->active_balance = 0;
                 rq->push_cpu = 0;
+               rq->cpu = i;
                 rq->migration_thread = NULL;
                 INIT_LIST_HEAD(&rq->migration_queue);
  #endif