[POWERPC] Assorted janitorial EEH cleanups
[powerpc.git] / kernel / sched.c
index 5530ed2..799d23b 100644 (file)
@@ -52,8 +52,9 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
 
+#include <asm/tlb.h>
 #include <asm/unistd.h>
 
 /*
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
                return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+       return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+       sg->__cpu_power += val;
+       sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
 /*
  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
  * to time slice values: [800ms ... 100ms ... 5ms]
@@ -223,6 +245,10 @@ struct rq {
        unsigned long raw_weighted_load;
 #ifdef CONFIG_SMP
        unsigned long cpu_load[3];
+       unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+       unsigned char in_nohz_recently;
+#endif
 #endif
        unsigned long long nr_switches;
 
@@ -278,7 +304,8 @@ struct rq {
        struct lock_class_key rq_lock_key;
 };
 
-static DEFINE_PER_CPU(struct rq, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 static inline int cpu_of(struct rq *rq)
 {
@@ -1049,6 +1076,17 @@ static void resched_task(struct task_struct *p)
        if (!tsk_is_polling(p))
                smp_send_reschedule(cpu);
 }
+
+static void resched_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       if (!spin_trylock_irqsave(&rq->lock, flags))
+               return;
+       resched_task(cpu_curr(cpu));
+       spin_unlock_irqrestore(&rq->lock, flags);
+}
 #else
 static inline void resched_task(struct task_struct *p)
 {
@@ -1241,7 +1279,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                }
 
                /* Adjust by relative CPU power of the group */
-               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               avg_load = sg_div_cpu_power(group,
+                               avg_load * SCHED_LOAD_SCALE);
 
                if (local_group) {
                        this_load = avg_load;
@@ -1368,7 +1407,16 @@ static int wake_idle(int cpu, struct task_struct *p)
        struct sched_domain *sd;
        int i;
 
-       if (idle_cpu(cpu))
+       /*
+        * If it is idle, then it is the best cpu to run this task.
+        *
+        * This cpu is also the best, if it has more than one task already.
+        * Siblings must be also busy(in most cases) as they didn't already
+        * pickup the extra load from this cpu and hence we need not check
+        * sibling runqueue info. This will avoid the checks and cache miss
+        * penalities associated with that.
+        */
+       if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
                return cpu;
 
        for_each_domain(cpu, sd) {
@@ -2352,12 +2400,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                }
 
                total_load += avg_load;
-               total_pwr += group->cpu_power;
+               total_pwr += group->__cpu_power;
 
                /* Adjust by relative CPU power of the group */
-               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               avg_load = sg_div_cpu_power(group,
+                               avg_load * SCHED_LOAD_SCALE);
 
-               group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+               group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
                if (local_group) {
                        this_load = avg_load;
@@ -2468,8 +2517,8 @@ group_next:
        max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * busiest->cpu_power,
-                               (avg_load - this_load) * this->cpu_power)
+       *imbalance = min(max_pull * busiest->__cpu_power,
+                               (avg_load - this_load) * this->__cpu_power)
                        / SCHED_LOAD_SCALE;
 
        /*
@@ -2503,28 +2552,29 @@ small_imbalance:
                 * moving them.
                 */
 
-               pwr_now += busiest->cpu_power *
-                       min(busiest_load_per_task, max_load);
-               pwr_now += this->cpu_power *
-                       min(this_load_per_task, this_load);
+               pwr_now += busiest->__cpu_power *
+                               min(busiest_load_per_task, max_load);
+               pwr_now += this->__cpu_power *
+                               min(this_load_per_task, this_load);
                pwr_now /= SCHED_LOAD_SCALE;
 
                /* Amount of load we'd subtract */
-               tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-                       busiest->cpu_power;
+               tmp = sg_div_cpu_power(busiest,
+                               busiest_load_per_task * SCHED_LOAD_SCALE);
                if (max_load > tmp)
-                       pwr_move += busiest->cpu_power *
+                       pwr_move += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load - tmp);
 
                /* Amount of load we'd add */
-               if (max_load * busiest->cpu_power <
+               if (max_load * busiest->__cpu_power <
                                busiest_load_per_task * SCHED_LOAD_SCALE)
-                       tmp = max_load * busiest->cpu_power / this->cpu_power;
+                       tmp = sg_div_cpu_power(this,
+                                       max_load * busiest->__cpu_power);
                else
-                       tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-                               this->cpu_power;
-               pwr_move += this->cpu_power *
-                       min(this_load_per_task, this_load + tmp);
+                       tmp = sg_div_cpu_power(this,
+                               busiest_load_per_task * SCHED_LOAD_SCALE);
+               pwr_move += this->__cpu_power *
+                               min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;
 
                /* Move if we gain throughput */
@@ -2657,6 +2707,12 @@ redo:
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
 
+               /*
+                * some other cpu did the load balance for us.
+                */
+               if (nr_moved && this_cpu != smp_processor_id())
+                       resched_cpu(this_cpu);
+
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
                        cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2983,98 @@ static void update_load(struct rq *this_rq)
        }
 }
 
+#ifdef CONFIG_NO_HZ
+static struct {
+       atomic_t load_balancer;
+       cpumask_t  cpu_mask;
+} nohz ____cacheline_aligned = {
+       .load_balancer = ATOMIC_INIT(-1),
+       .cpu_mask = CPU_MASK_NONE,
+};
+
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
  *
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask..
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+       int cpu = smp_processor_id();
+
+       if (stop_tick) {
+               cpu_set(cpu, nohz.cpu_mask);
+               cpu_rq(cpu)->in_nohz_recently = 1;
+
+               /*
+                * If we are going offline and still the leader, give up!
+                */
+               if (cpu_is_offline(cpu) &&
+                   atomic_read(&nohz.load_balancer) == cpu) {
+                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                               BUG();
+                       return 0;
+               }
+
+               /* time for ilb owner also to sleep */
+               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+                       if (atomic_read(&nohz.load_balancer) == cpu)
+                               atomic_set(&nohz.load_balancer, -1);
+                       return 0;
+               }
+
+               if (atomic_read(&nohz.load_balancer) == -1) {
+                       /* make me the ilb owner */
+                       if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+                               return 1;
+               } else if (atomic_read(&nohz.load_balancer) == cpu)
+                       return 1;
+       } else {
+               if (!cpu_isset(cpu, nohz.cpu_mask))
+                       return 0;
+
+               cpu_clear(cpu, nohz.cpu_mask);
+
+               if (atomic_read(&nohz.load_balancer) == cpu)
+                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                               BUG();
+       }
+       return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
 {
-       int this_cpu = smp_processor_id(), balance = 1;
-       struct rq *this_rq = cpu_rq(this_cpu);
+       int balance = 1;
+       struct rq *rq = cpu_rq(cpu);
        unsigned long interval;
        struct sched_domain *sd;
-       /*
-        * We are idle if there are no processes running. This
-        * is valid even if we are the idle process (SMT).
-        */
-       enum idle_type idle = !this_rq->nr_running ?
-                               SCHED_IDLE : NOT_IDLE;
-       /* Earliest time when we have to call run_rebalance_domains again */
+       /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
 
-       for_each_domain(this_cpu, sd) {
+       for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
@@ -2971,7 +3093,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                }
 
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -2995,7 +3117,114 @@ out:
                if (!balance)
                        break;
        }
-       this_rq->next_balance = next_balance;
+       rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+       int local_cpu = smp_processor_id();
+       struct rq *local_rq = cpu_rq(local_cpu);
+       enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+       rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+       /*
+        * If this cpu is the owner for idle load balancing, then do the
+        * balancing on behalf of the other idle cpus whose ticks are
+        * stopped.
+        */
+       if (local_rq->idle_at_tick &&
+           atomic_read(&nohz.load_balancer) == local_cpu) {
+               cpumask_t cpus = nohz.cpu_mask;
+               struct rq *rq;
+               int balance_cpu;
+
+               cpu_clear(local_cpu, cpus);
+               for_each_cpu_mask(balance_cpu, cpus) {
+                       /*
+                        * If this cpu gets work to do, stop the load balancing
+                        * work being done for other cpus. Next load
+                        * balancing owner will pick it up.
+                        */
+                       if (need_resched())
+                               break;
+
+                       rebalance_domains(balance_cpu, SCHED_IDLE);
+
+                       rq = cpu_rq(balance_cpu);
+                       if (time_after(local_rq->next_balance, rq->next_balance))
+                               local_rq->next_balance = rq->next_balance;
+               }
+       }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+       /*
+        * If we were in the nohz mode recently and busy at the current
+        * scheduler tick, then check if we need to nominate new idle
+        * load balancer.
+        */
+       if (rq->in_nohz_recently && !rq->idle_at_tick) {
+               rq->in_nohz_recently = 0;
+
+               if (atomic_read(&nohz.load_balancer) == cpu) {
+                       cpu_clear(cpu, nohz.cpu_mask);
+                       atomic_set(&nohz.load_balancer, -1);
+               }
+
+               if (atomic_read(&nohz.load_balancer) == -1) {
+                       /*
+                        * simple selection for now: Nominate the
+                        * first cpu in the nohz list to be the next
+                        * ilb owner.
+                        *
+                        * TBD: Traverse the sched domains and nominate
+                        * the nearest cpu in the nohz.cpu_mask.
+                        */
+                       int ilb = first_cpu(nohz.cpu_mask);
+
+                       if (ilb != NR_CPUS)
+                               resched_cpu(ilb);
+               }
+       }
+
+       /*
+        * If this cpu is idle and doing idle load balancing for all the
+        * cpus with ticks stopped, is it time for that to stop?
+        */
+       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               resched_cpu(cpu);
+               return;
+       }
+
+       /*
+        * If this cpu is idle and the idle load balancing is done by
+        * someone else, then no need raise the SCHED_SOFTIRQ
+        */
+       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+           cpu_isset(cpu, nohz.cpu_mask))
+               return;
+#endif
+       if (time_after_eq(jiffies, rq->next_balance))
+               raise_softirq(SCHED_SOFTIRQ);
 }
 #else
 /*
@@ -3218,16 +3447,17 @@ void scheduler_tick(void)
        unsigned long long now = sched_clock();
        struct task_struct *p = current;
        int cpu = smp_processor_id();
+       int idle_at_tick = idle_cpu(cpu);
        struct rq *rq = cpu_rq(cpu);
 
        update_cpu_clock(p, rq, now);
 
-       if (p != rq->idle)
+       if (!idle_at_tick)
                task_running_tick(rq, p);
 #ifdef CONFIG_SMP
        update_load(rq);
-       if (time_after_eq(jiffies, rq->next_balance))
-               raise_softirq(SCHED_SOFTIRQ);
+       rq->idle_at_tick = idle_at_tick;
+       trigger_load_balance(cpu);
 #endif
 }
 
@@ -4291,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
        struct task_struct *p;
        int retval;
 
-       lock_cpu_hotplug();
+       mutex_lock(&sched_hotcpu_mutex);
        read_lock(&tasklist_lock);
 
        p = find_process_by_pid(pid);
        if (!p) {
                read_unlock(&tasklist_lock);
-               unlock_cpu_hotplug();
+               mutex_unlock(&sched_hotcpu_mutex);
                return -ESRCH;
        }
 
@@ -4324,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
 out_unlock:
        put_task_struct(p);
-       unlock_cpu_hotplug();
+       mutex_unlock(&sched_hotcpu_mutex);
        return retval;
 }
 
@@ -4381,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
        struct task_struct *p;
        int retval;
 
-       lock_cpu_hotplug();
+       mutex_lock(&sched_hotcpu_mutex);
        read_lock(&tasklist_lock);
 
        retval = -ESRCH;
@@ -4397,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 
 out_unlock:
        read_unlock(&tasklist_lock);
-       unlock_cpu_hotplug();
+       mutex_unlock(&sched_hotcpu_mutex);
        if (retval)
                return retval;
 
@@ -5159,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        struct rq *rq;
 
        switch (action) {
+       case CPU_LOCK_ACQUIRE:
+               mutex_lock(&sched_hotcpu_mutex);
+               break;
+
        case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
                p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
                if (IS_ERR(p))
                        return NOTIFY_BAD;
@@ -5173,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 
        case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
                /* Strictly unneccessary, as first user will wake it. */
                wake_up_process(cpu_rq(cpu)->migration_thread);
                break;
 
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
                if (!cpu_rq(cpu)->migration_thread)
                        break;
                /* Unbind it from offline cpu so it can run.  Fall thru. */
@@ -5189,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 
        case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
                migrate_live_tasks(cpu);
                rq = cpu_rq(cpu);
                kthread_stop(rq->migration_thread);
@@ -5204,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                BUG_ON(rq->nr_running != 0);
 
                /* No need to migrate the tasks: it was best-effort if
-                * they didn't do lock_cpu_hotplug().  Just wake up
+                * they didn't take sched_hotcpu_mutex.  Just wake up
                 * the requestors. */
                spin_lock_irq(&rq->lock);
                while (!list_empty(&rq->migration_queue)) {
@@ -5218,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                spin_unlock_irq(&rq->lock);
                break;
 #endif
+       case CPU_LOCK_RELEASE:
+               mutex_unlock(&sched_hotcpu_mutex);
+               break;
        }
        return NOTIFY_OK;
 }
@@ -5306,7 +5547,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                                break;
                        }
 
-                       if (!group->cpu_power) {
+                       if (!group->__cpu_power) {
                                printk("\n");
                                printk(KERN_ERR "ERROR: domain->cpu_power not "
                                                "set\n");
@@ -5483,7 +5724,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
                        continue;
 
                sg->cpumask = CPU_MASK_NONE;
-               sg->cpu_power = 0;
+               sg->__cpu_power = 0;
 
                for_each_cpu_mask(j, span) {
                        if (group_fn(j, cpu_map, NULL) != group)
@@ -6172,7 +6413,7 @@ next_sg:
                        continue;
                }
 
-               sg->cpu_power += sd->groups->cpu_power;
+               sg_inc_cpu_power(sg, sd->groups->__cpu_power);
        }
        sg = sg->next;
        if (sg != group_head)
@@ -6247,6 +6488,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
        child = sd->child;
 
+       sd->groups->__cpu_power = 0;
+
        /*
         * For perf policy, if the groups in child domain share resources
         * (for example cores sharing some portions of the cache hierarchy
@@ -6257,18 +6500,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
                       (child->flags &
                        (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
                return;
        }
 
-       sd->groups->cpu_power = 0;
-
        /*
         * add cpu_power of each child group to this groups cpu_power
         */
        group = child->groups;
        do {
-               sd->groups->cpu_power += group->cpu_power;
+               sg_inc_cpu_power(sd->groups, group->__cpu_power);
                group = group->next;
        } while (group != child->groups);
 }
@@ -6428,7 +6669,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                }
-               sg->cpu_power = 0;
+               sg->__cpu_power = 0;
                sg->cpumask = nodemask;
                sg->next = sg;
                cpus_or(covered, covered, nodemask);
@@ -6456,7 +6697,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                                "Can not alloc domain group for node %d\n", j);
                                goto error;
                        }
-                       sg->cpu_power = 0;
+                       sg->__cpu_power = 0;
                        sg->cpumask = tmp;
                        sg->next = prev->next;
                        cpus_or(covered, covered, tmp);
@@ -6593,10 +6834,10 @@ int arch_reinit_sched_domains(void)
 {
        int err;
 
-       lock_cpu_hotplug();
+       mutex_lock(&sched_hotcpu_mutex);
        detach_destroy_domains(&cpu_online_map);
        err = arch_init_sched_domains(&cpu_online_map);
-       unlock_cpu_hotplug();
+       mutex_unlock(&sched_hotcpu_mutex);
 
        return err;
 }
@@ -6675,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
 {
        switch (action) {
        case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
        case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
                detach_destroy_domains(&cpu_online_map);
                return NOTIFY_OK;
 
        case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
                /*
                 * Fall through and re-initialise the domains.
                 */
@@ -6701,12 +6948,12 @@ void __init sched_init_smp(void)
 {
        cpumask_t non_isolated_cpus;
 
-       lock_cpu_hotplug();
+       mutex_lock(&sched_hotcpu_mutex);
        arch_init_sched_domains(&cpu_online_map);
        cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
        if (cpus_empty(non_isolated_cpus))
                cpu_set(smp_processor_id(), non_isolated_cpus);
-       unlock_cpu_hotplug();
+       mutex_unlock(&sched_hotcpu_mutex);
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);