Merge Christoph's freeze cleanup patch

author Linus Torvalds <torvalds@ppc970.osdl.org>

Sun, 26 Jun 2005 00:16:53 +0000 (17:16 -0700)

committer Linus Torvalds <torvalds@ppc970.osdl.org>

Sun, 26 Jun 2005 00:16:53 +0000 (17:16 -0700)
author Linus Torvalds <torvalds@ppc970.osdl.org>
Sun, 26 Jun 2005 00:16:53 +0000 (17:16 -0700)
committer Linus Torvalds <torvalds@ppc970.osdl.org>
Sun, 26 Jun 2005 00:16:53 +0000 (17:16 -0700)
diff --combined Documentation/power/swsusp.txt

index 8363c51,4e1627c..7a6b789
--- 1/Documentation/power/swsusp.txt
--- 2/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@@ -164,11 -164,10 +164,11 @@@ place where the thread is safe to be fr
   should be held at that point and it must be safe to sleep there), and
   add:
   
-       try_to_freeze(PF_FREEZE);
- -            try_to_freeze();
++       try_to_freeze();
   
   If the thread is needed for writing the image to storage, you should
- -instead set the PF_NOFREEZE process flag when creating the thread.
+ +instead set the PF_NOFREEZE process flag when creating the thread (and
+ +be very carefull).
   
   
   Q: What is the difference between between "platform", "shutdown" and
@@@ -233,81 -232,3 +233,81 @@@ A: Try runnin
   cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null
   
   after resume. swapoff -a; swapon -a may also be usefull.
+ +
+ +Q: What happens to devices during swsusp? They seem to be resumed
+ +during system suspend?
+ +
+ +A: That's correct. We need to resume them if we want to write image to
+ +disk. Whole sequence goes like
+ +
+ +      Suspend part
+ +      ~~~~~~~~~~~~
+ +      running system, user asks for suspend-to-disk
+ +
+ +      user processes are stopped
+ +
+ +      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+ +                            with state snapshot
+ +
+ +      state snapshot: copy of whole used memory is taken with interrupts disabled
+ +
+ +      resume(): devices are woken up so that we can write image to swap
+ +
+ +      write image to swap
+ +
+ +      suspend(PMSG_SUSPEND): suspend devices so that we can power off
+ +
+ +      turn the power off
+ +
+ +      Resume part
+ +      ~~~~~~~~~~~
+ +      (is actually pretty similar)
+ +
+ +      running system, user asks for suspend-to-disk
+ +
+ +      user processes are stopped (in common case there are none, but with resume-from-initrd, noone knows)
+ +
+ +      read image from disk
+ +
+ +      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+ +                            with image restoration
+ +
+ +      image restoration: rewrite memory with image
+ +
+ +      resume(): devices are woken up so that system can continue
+ +
+ +      thaw all user processes
+ +
+ +Q: What is this 'Encrypt suspend image' for?
+ +
+ +A: First of all: it is not a replacement for dm-crypt encrypted swap.
+ +It cannot protect your computer while it is suspended. Instead it does
+ +protect from leaking sensitive data after resume from suspend.
+ +
+ +Think of the following: you suspend while an application is running
+ +that keeps sensitive data in memory. The application itself prevents
+ +the data from being swapped out. Suspend, however, must write these
+ +data to swap to be able to resume later on. Without suspend encryption
+ +your sensitive data are then stored in plaintext on disk.  This means
+ +that after resume your sensitive data are accessible to all
+ +applications having direct access to the swap device which was used
+ +for suspend. If you don't need swap after resume these data can remain
+ +on disk virtually forever. Thus it can happen that your system gets
+ +broken in weeks later and sensitive data which you thought were
+ +encrypted and protected are retrieved and stolen from the swap device.
+ +To prevent this situation you should use 'Encrypt suspend image'.
+ +
+ +During suspend a temporary key is created and this key is used to
+ +encrypt the data written to disk. When, during resume, the data was
+ +read back into memory the temporary key is destroyed which simply
+ +means that all data written to disk during suspend are then
+ +inaccessible so they can't be stolen later on.  The only thing that
+ +you must then take care of is that you call 'mkswap' for the swap
+ +partition used for suspend as early as possible during regular
+ +boot. This asserts that any temporary key from an oopsed suspend or
+ +from a failed or aborted resume is erased from the swap device.
+ +
+ +As a rule of thumb use encrypted swap to protect your data while your
+ +system is shut down or suspended. Additionally use the encrypted
+ +suspend image to prevent sensitive data from being stolen after
+ +resume.
diff --combined arch/i386/kernel/io_apic.c

index 060c4ca,2451a3a..35eb8e2
--- 1/arch/i386/kernel/io_apic.c
--- 2/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@@ -573,14 -573,12 +573,14 @@@ static int balanced_irq(void *unused
         for ( ; ; ) {
                 set_current_state(TASK_INTERRUPTIBLE);
                 time_remaining = schedule_timeout(time_remaining);
-               try_to_freeze(PF_FREEZE);
+               try_to_freeze();
                 if (time_after(jiffies,
                                 prev_balance_time+balanced_irq_interval)) {
+ +                      preempt_disable();
                         do_irq_balance();
                         prev_balance_time = jiffies;
                         time_remaining = balanced_irq_interval;
+ +                      preempt_enable();
                 }
         }
         return 0;
@@@ -632,8 -630,10 +632,8 @@@ static int __init balanced_irq_init(voi
                 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
   failed:
         for (i = 0; i < NR_CPUS; i++) {
- -              if(irq_cpu_data[i].irq_delta)
- -                      kfree(irq_cpu_data[i].irq_delta);
- -              if(irq_cpu_data[i].last_irq)
- -                      kfree(irq_cpu_data[i].last_irq);
+ +              kfree(irq_cpu_data[i].irq_delta);
+ +              kfree(irq_cpu_data[i].last_irq);
         }
         return 0;
   }
@@@ -1634,43 -1634,12 +1634,43 @@@ static void __init enable_IO_APIC(void
    */
   void disable_IO_APIC(void)
   {
+ +      int pin;
         /*
          * Clear the IO-APIC before rebooting:
          */
         clear_IO_APIC();
   
- -      disconnect_bsp_APIC();
+ +      /*
+ +       * If the i82559 is routed through an IOAPIC
+ +       * Put that IOAPIC in virtual wire mode
+ +       * so legacy interrups can be delivered.
+ +       */
+ +      pin = find_isa_irq_pin(0, mp_ExtINT);
+ +      if (pin != -1) {
+ +              struct IO_APIC_route_entry entry;
+ +              unsigned long flags;
+ +
+ +              memset(&entry, 0, sizeof(entry));
+ +              entry.mask            = 0; /* Enabled */
+ +              entry.trigger         = 0; /* Edge */
+ +              entry.irr             = 0;
+ +              entry.polarity        = 0; /* High */
+ +              entry.delivery_status = 0;
+ +              entry.dest_mode       = 0; /* Physical */
+ +              entry.delivery_mode   = 7; /* ExtInt */
+ +              entry.vector          = 0;
+ +              entry.dest.physical.physical_dest = 0;
+ +
+ +
+ +              /*
+ +               * Add it to the IO-APIC irq-routing table:
+ +               */
+ +              spin_lock_irqsave(&ioapic_lock, flags);
+ +              io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ +              io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ +              spin_unlock_irqrestore(&ioapic_lock, flags);
+ +      }
+ +      disconnect_bsp_APIC(pin != -1);
   }
   
   /*
diff --combined include/linux/sched.h

index 98c109e,e7fd09b..9530b19
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -368,11 -368,6 +368,11 @@@ struct signal_struct 
   #endif
   };
   
+ +/* Context switch must be unlocked if interrupts are to be enabled */
+ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +# define __ARCH_WANT_UNLOCKED_CTXSW
+ +#endif
+ +
   /*
    * Bits in flags field of signal_struct.
    */
@@@ -465,11 -460,10 +465,11 @@@ enum idle_typ
   #define SD_LOAD_BALANCE               1       /* Do load balancing on this domain. */
   #define SD_BALANCE_NEWIDLE    2       /* Balance when about to become idle */
   #define SD_BALANCE_EXEC               4       /* Balance on exec */
- -#define SD_WAKE_IDLE          8       /* Wake to idle CPU on task wakeup */
- -#define SD_WAKE_AFFINE                16      /* Wake task to waking CPU */
- -#define SD_WAKE_BALANCE               32      /* Perform balancing at task wakeup */
- -#define SD_SHARE_CPUPOWER     64      /* Domain members share cpu power */
+ +#define SD_BALANCE_FORK               8       /* Balance on fork, clone */
+ +#define SD_WAKE_IDLE          16      /* Wake to idle CPU on task wakeup */
+ +#define SD_WAKE_AFFINE                32      /* Wake task to waking CPU */
+ +#define SD_WAKE_BALANCE               64      /* Perform balancing at task wakeup */
+ +#define SD_SHARE_CPUPOWER     128     /* Domain members share cpu power */
   
   struct sched_group {
         struct sched_group *next;       /* Must be a circular list */
@@@ -494,11 -488,6 +494,11 @@@ struct sched_domain 
         unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
         unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
         unsigned int per_cpu_gain;      /* CPU % gained by adding domain cpus */
+ +      unsigned int busy_idx;
+ +      unsigned int idle_idx;
+ +      unsigned int newidle_idx;
+ +      unsigned int wake_idx;
+ +      unsigned int forkexec_idx;
         int flags;                      /* See SD_* */
   
         /* Runtime fields. */
@@@ -522,16 -511,10 +522,16 @@@
         unsigned long alb_failed;
         unsigned long alb_pushed;
   
- -      /* sched_balance_exec() stats */
- -      unsigned long sbe_attempts;
+ +      /* SD_BALANCE_EXEC stats */
+ +      unsigned long sbe_cnt;
+ +      unsigned long sbe_balanced;
         unsigned long sbe_pushed;
   
+ +      /* SD_BALANCE_FORK stats */
+ +      unsigned long sbf_cnt;
+ +      unsigned long sbf_balanced;
+ +      unsigned long sbf_pushed;
+ +
         /* try_to_wake_up() stats */
         unsigned long ttwu_wake_remote;
         unsigned long ttwu_move_affine;
@@@ -539,8 -522,6 +539,8 @@@
   #endif
   };
   
+ +extern void partition_sched_domains(cpumask_t *partition1,
+ +                                  cpumask_t *partition2);
   #ifdef ARCH_HAS_SCHED_DOMAIN
   /* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
   extern cpumask_t cpu_isolated_map;
@@@ -601,9 -582,6 +601,9 @@@ struct task_struct 
   
         int lock_depth;         /* BKL lock depth */
   
+ +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ +      int oncpu;
+ +#endif
         int prio, static_prio;
         struct list_head run_list;
         prio_array_t *array;
@@@ -726,6 -704,8 +726,6 @@@
         spinlock_t alloc_lock;
   /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
         spinlock_t proc_lock;
- -/* context-switch lock */
- -      spinlock_t switch_lock;
   
   /* journalling filesystem info */
         void *journal_info;
@@@ -932,7 -912,7 +932,7 @@@ extern void FASTCALL(wake_up_new_task(s
   #else
    static inline void kick_process(struct task_struct *tsk) { }
   #endif
- -extern void FASTCALL(sched_fork(task_t * p));
+ +extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
   extern void FASTCALL(sched_exit(task_t * p));
   
   extern int in_group_p(gid_t);
@@@ -1265,33 -1245,78 +1265,78 @@@ extern void normalize_rt_tasks(void)
   
   #endif
   
- /* try_to_freeze
-  *
-  * Checks whether we need to enter the refrigerator
-  * and returns 1 if we did so.
-  */
   #ifdef CONFIG_PM
- extern void refrigerator(unsigned long);
+ /*
+  * Check if a process has been frozen
+  */
+ static inline int frozen(struct task_struct *p)
+ {
+       return p->flags & PF_FROZEN;
+ }
+ 
+ /*
+  * Check if there is a request to freeze a process
+  */
+ static inline int freezing(struct task_struct *p)
+ {
+       return p->flags & PF_FREEZE;
+ }
+ 
+ /*
+  * Request that a process be frozen
+  * FIXME: SMP problem. We may not modify other process' flags!
+  */
+ static inline void freeze(struct task_struct *p)
+ {
+       p->flags |= PF_FREEZE;
+ }
+ 
+ /*
+  * Wake up a frozen process
+  */
+ static inline int thaw_process(struct task_struct *p)
+ {
+       if (frozen(p)) {
+               p->flags &= ~PF_FROZEN;
+               wake_up_process(p);
+               return 1;
+       }
+       return 0;
+ }
+ 
+ /*
+  * freezing is complete, mark process as frozen
+  */
+ static inline void frozen_process(struct task_struct *p)
+ {
+       p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
+ }
+ 
+ extern void refrigerator(void);
   extern int freeze_processes(void);
   extern void thaw_processes(void);
   
- static inline int try_to_freeze(unsigned long refrigerator_flags)
+ static inline int try_to_freeze(void)
   {
-       if (unlikely(current->flags & PF_FREEZE)) {
-               refrigerator(refrigerator_flags);
+       if (freezing(current)) {
+               refrigerator();
                 return 1;
         } else
                 return 0;
   }
   #else
- static inline void refrigerator(unsigned long flag) {}
+ static inline int frozen(struct task_struct *p) { return 0; }
+ static inline int freezing(struct task_struct *p) { return 0; }
+ static inline void freeze(struct task_struct *p) { BUG(); }
+ static inline int thaw_process(struct task_struct *p) { return 1; }
+ static inline void frozen_process(struct task_struct *p) { BUG(); }
+ 
+ static inline void refrigerator(void) {}
   static inline int freeze_processes(void) { BUG(); return 0; }
   static inline void thaw_processes(void) {}
   
- static inline int try_to_freeze(unsigned long refrigerator_flags)
- {
-       return 0;
- }
+ static inline int try_to_freeze(void) { return 0; }
+ 
   #endif /* CONFIG_PM */
   #endif /* __KERNEL__ */
   
diff --combined kernel/sched.c

index dee96b2,6fa9ea4..a07cff9
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -166,7 -166,7 +166,7 @@@
   #define SCALE_PRIO(x, prio) \
         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
   
- -static inline unsigned int task_timeslice(task_t *p)
+ +static unsigned int task_timeslice(task_t *p)
   {
         if (p->static_prio < NICE_TO_PRIO(0))
                 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@@ -206,7 -206,7 +206,7 @@@ struct runqueue 
          */
         unsigned long nr_running;
   #ifdef CONFIG_SMP
- -      unsigned long cpu_load;
+ +      unsigned long cpu_load[3];
   #endif
         unsigned long long nr_switches;
   
@@@ -260,86 -260,22 +260,86 @@@
   
   static DEFINE_PER_CPU(struct runqueue, runqueues);
   
+ +/*
+ + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ + * See detach_destroy_domains: synchronize_sched for details.
+ + *
+ + * The domain tree of any CPU may only be accessed from within
+ + * preempt-disabled sections.
+ + */
   #define for_each_domain(cpu, domain) \
- -      for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+ +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
   
   #define cpu_rq(cpu)           (&per_cpu(runqueues, (cpu)))
   #define this_rq()             (&__get_cpu_var(runqueues))
   #define task_rq(p)            cpu_rq(task_cpu(p))
   #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
   
- -/*
- - * Default context-switch locking:
- - */
   #ifndef prepare_arch_switch
- -# define prepare_arch_switch(rq, next)        do { } while (0)
- -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
- -# define task_running(rq, p)          ((rq)->curr == (p))
+ +# define prepare_arch_switch(next)    do { } while (0)
+ +#endif
+ +#ifndef finish_arch_switch
+ +# define finish_arch_switch(prev)     do { } while (0)
+ +#endif
+ +
+ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+ +static inline int task_running(runqueue_t *rq, task_t *p)
+ +{
+ +      return rq->curr == p;
+ +}
+ +
+ +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+ +{
+ +}
+ +
+ +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+ +{
+ +      spin_unlock_irq(&rq->lock);
+ +}
+ +
+ +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+ +static inline int task_running(runqueue_t *rq, task_t *p)
+ +{
+ +#ifdef CONFIG_SMP
+ +      return p->oncpu;
+ +#else
+ +      return rq->curr == p;
+ +#endif
+ +}
+ +
+ +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+ +{
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * We can optimise this out completely for !SMP, because the
+ +       * SMP rebalancing from interrupt is the only thing that cares
+ +       * here.
+ +       */
+ +      next->oncpu = 1;
+ +#endif
+ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +      spin_unlock_irq(&rq->lock);
+ +#else
+ +      spin_unlock(&rq->lock);
+ +#endif
+ +}
+ +
+ +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+ +{
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * After ->oncpu is cleared, the task can be moved to a different CPU.
+ +       * We must ensure this doesn't happen until the switch is completely
+ +       * finished.
+ +       */
+ +      smp_wmb();
+ +      prev->oncpu = 0;
   #endif
+ +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +      local_irq_enable();
+ +#endif
+ +}
+ +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
   
   /*
    * task_rq_lock - lock the runqueue a given task resides on and disable
@@@ -373,7 -309,7 +373,7 @@@ static inline void task_rq_unlock(runqu
    * bump this up when changing the output format or the meaning of an existing
    * format, so that tools can adapt (or abort)
    */
- -#define SCHEDSTAT_VERSION 11
+ +#define SCHEDSTAT_VERSION 12
   
   static int show_schedstat(struct seq_file *seq, void *v)
   {
@@@ -402,7 -338,6 +402,7 @@@
   
   #ifdef CONFIG_SMP
                 /* domain-specific stats */
+ +              preempt_disable();
                 for_each_domain(cpu, sd) {
                         enum idle_type itype;
                         char mask_str[NR_CPUS];
@@@ -421,13 -356,11 +421,13 @@@
                                     sd->lb_nobusyq[itype],
                                     sd->lb_nobusyg[itype]);
                         }
- -                      seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ +                      seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
                             sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
- -                          sd->sbe_pushed, sd->sbe_attempts,
+ +                          sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+ +                          sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
                             sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
                 }
+ +              preempt_enable();
   #endif
         }
         return 0;
@@@ -481,6 -414,22 +481,6 @@@ static inline runqueue_t *this_rq_lock(
         return rq;
   }
   
- -#ifdef CONFIG_SCHED_SMT
- -static int cpu_and_siblings_are_idle(int cpu)
- -{
- -      int sib;
- -      for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
- -              if (idle_cpu(sib))
- -                      continue;
- -              return 0;
- -      }
- -
- -      return 1;
- -}
- -#else
- -#define cpu_and_siblings_are_idle(A) idle_cpu(A)
- -#endif
- -
   #ifdef CONFIG_SCHEDSTATS
   /*
    * Called when a process is dequeued from the active array and given
@@@ -673,7 -622,7 +673,7 @@@ static inline void __activate_idle_task
         rq->nr_running++;
   }
   
- -static void recalc_task_prio(task_t *p, unsigned long long now)
+ +static int recalc_task_prio(task_t *p, unsigned long long now)
   {
         /* Caller must always ensure 'now >= p->timestamp' */
         unsigned long long __sleep_time = now - p->timestamp;
@@@ -732,7 -681,7 +732,7 @@@
                 }
         }
   
- -      p->prio = effective_prio(p);
+ +      return effective_prio(p);
   }
   
   /*
@@@ -755,7 -704,7 +755,7 @@@ static void activate_task(task_t *p, ru
         }
   #endif
   
- -      recalc_task_prio(p, now);
+ +      p->prio = recalc_task_prio(p, now);
   
         /*
          * This checks to make sure it's not an uninterruptible task
@@@ -833,12 -782,22 +833,12 @@@ inline int task_curr(const task_t *p
   }
   
   #ifdef CONFIG_SMP
- -enum request_type {
- -      REQ_MOVE_TASK,
- -      REQ_SET_DOMAIN,
- -};
- -
   typedef struct {
         struct list_head list;
- -      enum request_type type;
   
- -      /* For REQ_MOVE_TASK */
         task_t *task;
         int dest_cpu;
   
- -      /* For REQ_SET_DOMAIN */
- -      struct sched_domain *sd;
- -
         struct completion done;
   } migration_req_t;
   
@@@ -860,6 -819,7 +860,6 @@@ static int migrate_task(task_t *p, int 
         }
   
         init_completion(&req->done);
- -      req->type = REQ_MOVE_TASK;
         req->task = p;
         req->dest_cpu = dest_cpu;
         list_add(&req->list, &rq->migration_queue);
@@@ -926,154 -886,26 +926,154 @@@ void kick_process(task_t *p
    * We want to under-estimate the load of migration sources, to
    * balance conservatively.
    */
- -static inline unsigned long source_load(int cpu)
+ +static inline unsigned long source_load(int cpu, int type)
   {
         runqueue_t *rq = cpu_rq(cpu);
         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ +      if (type == 0)
+ +              return load_now;
   
- -      return min(rq->cpu_load, load_now);
+ +      return min(rq->cpu_load[type-1], load_now);
   }
   
   /*
    * Return a high guess at the load of a migration-target cpu
    */
- -static inline unsigned long target_load(int cpu)
+ +static inline unsigned long target_load(int cpu, int type)
   {
         runqueue_t *rq = cpu_rq(cpu);
         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ +      if (type == 0)
+ +              return load_now;
   
- -      return max(rq->cpu_load, load_now);
+ +      return max(rq->cpu_load[type-1], load_now);
   }
   
- -#endif
+ +/*
+ + * find_idlest_group finds and returns the least busy CPU group within the
+ + * domain.
+ + */
+ +static struct sched_group *
+ +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+ +{
+ +      struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ +      unsigned long min_load = ULONG_MAX, this_load = 0;
+ +      int load_idx = sd->forkexec_idx;
+ +      int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ +
+ +      do {
+ +              unsigned long load, avg_load;
+ +              int local_group;
+ +              int i;
+ +
+ +              local_group = cpu_isset(this_cpu, group->cpumask);
+ +              /* XXX: put a cpus allowed check */
+ +
+ +              /* Tally up the load of all CPUs in the group */
+ +              avg_load = 0;
+ +
+ +              for_each_cpu_mask(i, group->cpumask) {
+ +                      /* Bias balancing toward cpus of our domain */
+ +                      if (local_group)
+ +                              load = source_load(i, load_idx);
+ +                      else
+ +                              load = target_load(i, load_idx);
+ +
+ +                      avg_load += load;
+ +              }
+ +
+ +              /* Adjust by relative CPU power of the group */
+ +              avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ +
+ +              if (local_group) {
+ +                      this_load = avg_load;
+ +                      this = group;
+ +              } else if (avg_load < min_load) {
+ +                      min_load = avg_load;
+ +                      idlest = group;
+ +              }
+ +              group = group->next;
+ +      } while (group != sd->groups);
+ +
+ +      if (!idlest || 100*this_load < imbalance*min_load)
+ +              return NULL;
+ +      return idlest;
+ +}
+ +
+ +/*
+ + * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ + */
+ +static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+ +{
+ +      unsigned long load, min_load = ULONG_MAX;
+ +      int idlest = -1;
+ +      int i;
+ +
+ +      for_each_cpu_mask(i, group->cpumask) {
+ +              load = source_load(i, 0);
+ +
+ +              if (load < min_load || (load == min_load && i == this_cpu)) {
+ +                      min_load = load;
+ +                      idlest = i;
+ +              }
+ +      }
+ +
+ +      return idlest;
+ +}
+ +
+ +/*
+ + * sched_balance_self: balance the current task (running on cpu) in domains
+ + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ + * SD_BALANCE_EXEC.
+ + *
+ + * Balance, ie. select the least loaded group.
+ + *
+ + * Returns the target CPU number, or the same CPU if no balancing is needed.
+ + *
+ + * preempt must be disabled.
+ + */
+ +static int sched_balance_self(int cpu, int flag)
+ +{
+ +      struct task_struct *t = current;
+ +      struct sched_domain *tmp, *sd = NULL;
+ +
+ +      for_each_domain(cpu, tmp)
+ +              if (tmp->flags & flag)
+ +                      sd = tmp;
+ +
+ +      while (sd) {
+ +              cpumask_t span;
+ +              struct sched_group *group;
+ +              int new_cpu;
+ +              int weight;
+ +
+ +              span = sd->span;
+ +              group = find_idlest_group(sd, t, cpu);
+ +              if (!group)
+ +                      goto nextlevel;
+ +
+ +              new_cpu = find_idlest_cpu(group, cpu);
+ +              if (new_cpu == -1 || new_cpu == cpu)
+ +                      goto nextlevel;
+ +
+ +              /* Now try balancing at a lower domain level */
+ +              cpu = new_cpu;
+ +nextlevel:
+ +              sd = NULL;
+ +              weight = cpus_weight(span);
+ +              for_each_domain(cpu, tmp) {
+ +                      if (weight <= cpus_weight(tmp->span))
+ +                              break;
+ +                      if (tmp->flags & flag)
+ +                              sd = tmp;
+ +              }
+ +              /* while loop will break here if sd == NULL */
+ +      }
+ +
+ +      return cpu;
+ +}
+ +
+ +#endif /* CONFIG_SMP */
   
   /*
    * wake_idle() will wake a task on an idle cpu if task->cpu is
@@@ -1095,14 -927,14 +1095,14 @@@ static int wake_idle(int cpu, task_t *p
   
         for_each_domain(cpu, sd) {
                 if (sd->flags & SD_WAKE_IDLE) {
- -                      cpus_and(tmp, sd->span, cpu_online_map);
- -                      cpus_and(tmp, tmp, p->cpus_allowed);
+ +                      cpus_and(tmp, sd->span, p->cpus_allowed);
                         for_each_cpu_mask(i, tmp) {
                                 if (idle_cpu(i))
                                         return i;
                         }
                 }
- -              else break;
+ +              else
+ +                      break;
         }
         return cpu;
   }
@@@ -1135,7 -967,7 +1135,7 @@@ static int try_to_wake_up(task_t * p, u
         runqueue_t *rq;
   #ifdef CONFIG_SMP
         unsigned long load, this_load;
- -      struct sched_domain *sd;
+ +      struct sched_domain *sd, *this_sd = NULL;
         int new_cpu;
   #endif
   
@@@ -1154,69 -986,70 +1154,69 @@@
         if (unlikely(task_running(rq, p)))
                 goto out_activate;
   
- -#ifdef CONFIG_SCHEDSTATS
+ +      new_cpu = cpu;
+ +
         schedstat_inc(rq, ttwu_cnt);
         if (cpu == this_cpu) {
                 schedstat_inc(rq, ttwu_local);
- -      } else {
- -              for_each_domain(this_cpu, sd) {
- -                      if (cpu_isset(cpu, sd->span)) {
- -                              schedstat_inc(sd, ttwu_wake_remote);
- -                              break;
- -                      }
+ +              goto out_set_cpu;
+ +      }
+ +
+ +      for_each_domain(this_cpu, sd) {
+ +              if (cpu_isset(cpu, sd->span)) {
+ +                      schedstat_inc(sd, ttwu_wake_remote);
+ +                      this_sd = sd;
+ +                      break;
                 }
         }
- -#endif
   
- -      new_cpu = cpu;
- -      if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ +      if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                 goto out_set_cpu;
   
- -      load = source_load(cpu);
- -      this_load = target_load(this_cpu);
- -
         /*
- -       * If sync wakeup then subtract the (maximum possible) effect of
- -       * the currently running task from the load of the current CPU:
+ +       * Check for affine wakeup and passive balancing possibilities.
          */
- -      if (sync)
- -              this_load -= SCHED_LOAD_SCALE;
+ +      if (this_sd) {
+ +              int idx = this_sd->wake_idx;
+ +              unsigned int imbalance;
   
- -      /* Don't pull the task off an idle CPU to a busy one */
- -      if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
- -              goto out_set_cpu;
+ +              imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
   
- -      new_cpu = this_cpu; /* Wake to this CPU if we can */
+ +              load = source_load(cpu, idx);
+ +              this_load = target_load(this_cpu, idx);
   
- -      /*
- -       * Scan domains for affine wakeup and passive balancing
- -       * possibilities.
- -       */
- -      for_each_domain(this_cpu, sd) {
- -              unsigned int imbalance;
- -              /*
- -               * Start passive balancing when half the imbalance_pct
- -               * limit is reached.
- -               */
- -              imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+ +              new_cpu = this_cpu; /* Wake to this CPU if we can */
   
- -              if ((sd->flags & SD_WAKE_AFFINE) &&
- -                              !task_hot(p, rq->timestamp_last_tick, sd)) {
+ +              if (this_sd->flags & SD_WAKE_AFFINE) {
+ +                      unsigned long tl = this_load;
                         /*
- -                       * This domain has SD_WAKE_AFFINE and p is cache cold
- -                       * in this domain.
+ +                       * If sync wakeup then subtract the (maximum possible)
+ +                       * effect of the currently running task from the load
+ +                       * of the current CPU:
                          */
- -                      if (cpu_isset(cpu, sd->span)) {
- -                              schedstat_inc(sd, ttwu_move_affine);
+ +                      if (sync)
+ +                              tl -= SCHED_LOAD_SCALE;
+ +
+ +                      if ((tl <= load &&
+ +                              tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+ +                              100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ +                              /*
+ +                               * This domain has SD_WAKE_AFFINE and
+ +                               * p is cache cold in this domain, and
+ +                               * there is no bad imbalance.
+ +                               */
+ +                              schedstat_inc(this_sd, ttwu_move_affine);
                                 goto out_set_cpu;
                         }
- -              } else if ((sd->flags & SD_WAKE_BALANCE) &&
- -                              imbalance*this_load <= 100*load) {
- -                      /*
- -                       * This domain has SD_WAKE_BALANCE and there is
- -                       * an imbalance.
- -                       */
- -                      if (cpu_isset(cpu, sd->span)) {
- -                              schedstat_inc(sd, ttwu_move_balance);
+ +              }
+ +
+ +              /*
+ +               * Start passive balancing when half the imbalance_pct
+ +               * limit is reached.
+ +               */
+ +              if (this_sd->flags & SD_WAKE_BALANCE) {
+ +                      if (imbalance*this_load <= 100*load) {
+ +                              schedstat_inc(this_sd, ttwu_move_balance);
                                 goto out_set_cpu;
                         }
                 }
@@@ -1287,19 -1120,17 +1287,19 @@@ int fastcall wake_up_state(task_t *p, u
         return try_to_wake_up(p, state, 0);
   }
   
- -#ifdef CONFIG_SMP
- -static int find_idlest_cpu(struct task_struct *p, int this_cpu,
- -                         struct sched_domain *sd);
- -#endif
- -
   /*
    * Perform scheduler related setup for a newly forked process p.
    * p is forked by current.
    */
- -void fastcall sched_fork(task_t *p)
+ +void fastcall sched_fork(task_t *p, int clone_flags)
   {
+ +      int cpu = get_cpu();
+ +
+ +#ifdef CONFIG_SMP
+ +      cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+ +#endif
+ +      set_task_cpu(p, cpu);
+ +
         /*
          * We mark the process as running here, but have not actually
          * inserted it onto the runqueue yet. This guarantees that
@@@ -1309,14 -1140,17 +1309,14 @@@
         p->state = TASK_RUNNING;
         INIT_LIST_HEAD(&p->run_list);
         p->array = NULL;
- -      spin_lock_init(&p->switch_lock);
   #ifdef CONFIG_SCHEDSTATS
         memset(&p->sched_info, 0, sizeof(p->sched_info));
   #endif
+ +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ +      p->oncpu = 0;
+ +#endif
   #ifdef CONFIG_PREEMPT
- -      /*
- -       * During context-switch we hold precisely one spinlock, which
- -       * schedule_tail drops. (in the common case it's this_rq()->lock,
- -       * but it also can be p->switch_lock.) So we compensate with a count
- -       * of 1. Also, we want to start with kernel preemption disabled.
- -       */
+ +      /* Want to start with kernel preemption disabled. */
         p->thread_info->preempt_count = 1;
   #endif
         /*
@@@ -1340,10 -1174,12 +1340,10 @@@
                  * runqueue lock is not a problem.
                  */
                 current->time_slice = 1;
- -              preempt_disable();
                 scheduler_tick();
- -              local_irq_enable();
- -              preempt_enable();
- -      } else
- -              local_irq_enable();
+ +      }
+ +      local_irq_enable();
+ +      put_cpu();
   }
   
   /*
@@@ -1360,9 -1196,10 +1360,9 @@@ void fastcall wake_up_new_task(task_t 
         runqueue_t *rq, *this_rq;
   
         rq = task_rq_lock(p, &flags);
- -      cpu = task_cpu(p);
- -      this_cpu = smp_processor_id();
- -
         BUG_ON(p->state != TASK_RUNNING);
+ +      this_cpu = smp_processor_id();
+ +      cpu = task_cpu(p);
   
         /*
          * We decrease the sleep average of forking parents
@@@ -1458,41 -1295,23 +1458,41 @@@ void fastcall sched_exit(task_t * p
         task_rq_unlock(rq, &flags);
   }
   
+ +/**
+ + * prepare_task_switch - prepare to switch tasks
+ + * @rq: the runqueue preparing to switch
+ + * @next: the task we are going to switch to.
+ + *
+ + * This is called with the rq lock held and interrupts off. It must
+ + * be paired with a subsequent finish_task_switch after the context
+ + * switch.
+ + *
+ + * prepare_task_switch sets up locking and calls architecture specific
+ + * hooks.
+ + */
+ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+ +{
+ +      prepare_lock_switch(rq, next);
+ +      prepare_arch_switch(next);
+ +}
+ +
   /**
    * finish_task_switch - clean up after a task-switch
    * @prev: the thread we just switched away from.
    *
- - * We enter this with the runqueue still locked, and finish_arch_switch()
- - * will unlock it along with doing any other architecture-specific cleanup
- - * actions.
+ + * finish_task_switch must be called after the context switch, paired
+ + * with a prepare_task_switch call before the context switch.
+ + * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ + * and do any other architecture-specific cleanup actions.
    *
    * Note that we may have delayed dropping an mm in context_switch(). If
    * so, we finish that here outside of the runqueue lock.  (Doing it
    * with the lock held can cause deadlocks; see schedule() for
    * details.)
    */
- -static inline void finish_task_switch(task_t *prev)
+ +static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
         __releases(rq->lock)
   {
- -      runqueue_t *rq = this_rq();
         struct mm_struct *mm = rq->prev_mm;
         unsigned long prev_task_flags;
   
@@@ -1510,8 -1329,7 +1510,8 @@@
          *              Manfred Spraul <manfred@colorfullife.com>
          */
         prev_task_flags = prev->flags;
- -      finish_arch_switch(rq, prev);
+ +      finish_arch_switch(prev);
+ +      finish_lock_switch(rq, prev);
         if (mm)
                 mmdrop(mm);
         if (unlikely(prev_task_flags & PF_DEAD))
@@@ -1525,12 -1343,8 +1525,12 @@@
   asmlinkage void schedule_tail(task_t *prev)
         __releases(rq->lock)
   {
- -      finish_task_switch(prev);
- -
+ +      runqueue_t *rq = this_rq();
+ +      finish_task_switch(rq, prev);
+ +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+ +      /* In this case, finish_task_switch does not reenable preemption */
+ +      preempt_enable();
+ +#endif
         if (current->set_child_tid)
                 put_user(current->pid, current->set_child_tid);
   }
@@@ -1679,6 -1493,51 +1679,6 @@@ static void double_lock_balance(runqueu
         }
   }
   
- -/*
- - * find_idlest_cpu - find the least busy runqueue.
- - */
- -static int find_idlest_cpu(struct task_struct *p, int this_cpu,
- -                         struct sched_domain *sd)
- -{
- -      unsigned long load, min_load, this_load;
- -      int i, min_cpu;
- -      cpumask_t mask;
- -
- -      min_cpu = UINT_MAX;
- -      min_load = ULONG_MAX;
- -
- -      cpus_and(mask, sd->span, p->cpus_allowed);
- -
- -      for_each_cpu_mask(i, mask) {
- -              load = target_load(i);
- -
- -              if (load < min_load) {
- -                      min_cpu = i;
- -                      min_load = load;
- -
- -                      /* break out early on an idle CPU: */
- -                      if (!min_load)
- -                              break;
- -              }
- -      }
- -
- -      /* add +1 to account for the new task */
- -      this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
- -
- -      /*
- -       * Would with the addition of the new task to the
- -       * current CPU there be an imbalance between this
- -       * CPU and the idlest CPU?
- -       *
- -       * Use half of the balancing threshold - new-context is
- -       * a good opportunity to balance.
- -       */
- -      if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
- -              return min_cpu;
- -
- -      return this_cpu;
- -}
- -
   /*
    * If dest_cpu is allowed for this process, migrate the task to it.
    * This is accomplished by forcing the cpu_allowed mask to only
@@@ -1712,16 -1571,37 +1712,16 @@@ out
   }
   
   /*
- - * sched_exec(): find the highest-level, exec-balance-capable
- - * domain and try to migrate the task to the least loaded CPU.
- - *
- - * execve() is a valuable balancing opportunity, because at this point
- - * the task has the smallest effective memory and cache footprint.
+ + * sched_exec - execve() is a valuable balancing opportunity, because at
+ + * this point the task has the smallest effective memory and cache footprint.
    */
   void sched_exec(void)
   {
- -      struct sched_domain *tmp, *sd = NULL;
         int new_cpu, this_cpu = get_cpu();
- -
- -      /* Prefer the current CPU if there's only this task running */
- -      if (this_rq()->nr_running <= 1)
- -              goto out;
- -
- -      for_each_domain(this_cpu, tmp)
- -              if (tmp->flags & SD_BALANCE_EXEC)
- -                      sd = tmp;
- -
- -      if (sd) {
- -              schedstat_inc(sd, sbe_attempts);
- -              new_cpu = find_idlest_cpu(current, this_cpu, sd);
- -              if (new_cpu != this_cpu) {
- -                      schedstat_inc(sd, sbe_pushed);
- -                      put_cpu();
- -                      sched_migrate_task(current, new_cpu);
- -                      return;
- -              }
- -      }
- -out:
+ +      new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
         put_cpu();
+ +      if (new_cpu != this_cpu)
+ +              sched_migrate_task(current, new_cpu);
   }
   
   /*
@@@ -1752,7 -1632,7 +1752,7 @@@ void pull_task(runqueue_t *src_rq, prio
    */
   static inline
   int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- -                   struct sched_domain *sd, enum idle_type idle)
+ +           struct sched_domain *sd, enum idle_type idle, int *all_pinned)
   {
         /*
          * We do not migrate tasks that are:
@@@ -1760,24 -1640,23 +1760,24 @@@
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
          * 3) are cache-hot on their current CPU.
          */
- -      if (task_running(rq, p))
- -              return 0;
         if (!cpu_isset(this_cpu, p->cpus_allowed))
                 return 0;
+ +      *all_pinned = 0;
+ +
+ +      if (task_running(rq, p))
+ +              return 0;
   
         /*
          * Aggressive migration if:
- -       * 1) the [whole] cpu is idle, or
+ +       * 1) task is cache cold, or
          * 2) too many balance attempts have failed.
          */
   
- -      if (cpu_and_siblings_are_idle(this_cpu) || \
- -                      sd->nr_balance_failed > sd->cache_nice_tries)
+ +      if (sd->nr_balance_failed > sd->cache_nice_tries)
                 return 1;
   
         if (task_hot(p, rq->timestamp_last_tick, sd))
- -                      return 0;
+ +              return 0;
         return 1;
   }
   
@@@ -1790,18 -1669,16 +1790,18 @@@
    */
   static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
                       unsigned long max_nr_move, struct sched_domain *sd,
- -                    enum idle_type idle)
+ +                    enum idle_type idle, int *all_pinned)
   {
         prio_array_t *array, *dst_array;
         struct list_head *head, *curr;
- -      int idx, pulled = 0;
+ +      int idx, pulled = 0, pinned = 0;
         task_t *tmp;
   
- -      if (max_nr_move <= 0 || busiest->nr_running <= 1)
+ +      if (max_nr_move == 0)
                 goto out;
   
+ +      pinned = 1;
+ +
         /*
          * We first consider expired tasks. Those will likely not be
          * executed in the near future, and they are most likely to
@@@ -1840,7 -1717,7 +1840,7 @@@ skip_queue
   
         curr = curr->prev;
   
- -      if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ +      if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
                 if (curr != head)
                         goto skip_queue;
                 idx++;
@@@ -1869,9 -1746,6 +1869,9 @@@ out
          * inside pull_task().
          */
         schedstat_add(sd, lb_gained[idle], pulled);
+ +
+ +      if (all_pinned)
+ +              *all_pinned = pinned;
         return pulled;
   }
   
@@@ -1886,15 -1760,8 +1886,15 @@@ find_busiest_group(struct sched_domain 
   {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ +      int load_idx;
   
         max_load = this_load = total_load = total_pwr = 0;
+ +      if (idle == NOT_IDLE)
+ +              load_idx = sd->busy_idx;
+ +      else if (idle == NEWLY_IDLE)
+ +              load_idx = sd->newidle_idx;
+ +      else
+ +              load_idx = sd->idle_idx;
   
         do {
                 unsigned long load;
@@@ -1909,9 -1776,9 +1909,9 @@@
                 for_each_cpu_mask(i, group->cpumask) {
                         /* Bias balancing toward cpus of our domain */
                         if (local_group)
- -                              load = target_load(i);
+ +                              load = target_load(i, load_idx);
                         else
- -                              load = source_load(i);
+ +                              load = source_load(i, load_idx);
   
                         avg_load += load;
                 }
@@@ -1925,10 -1792,12 +1925,10 @@@
                 if (local_group) {
                         this_load = avg_load;
                         this = group;
- -                      goto nextgroup;
                 } else if (avg_load > max_load) {
                         max_load = avg_load;
                         busiest = group;
                 }
- -nextgroup:
                 group = group->next;
         } while (group != sd->groups);
   
@@@ -2001,9 -1870,15 +2001,9 @@@
   
         /* Get rid of the scaling factor, rounding down as we divide */
         *imbalance = *imbalance / SCHED_LOAD_SCALE;
- -
         return busiest;
   
   out_balanced:
- -      if (busiest && (idle == NEWLY_IDLE ||
- -                      (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
- -              *imbalance = 1;
- -              return busiest;
- -      }
   
         *imbalance = 0;
         return NULL;
@@@ -2019,7 -1894,7 +2019,7 @@@ static runqueue_t *find_busiest_queue(s
         int i;
   
         for_each_cpu_mask(i, group->cpumask) {
- -              load = source_load(i);
+ +              load = source_load(i, 0);
   
                 if (load > max_load) {
                         max_load = load;
@@@ -2030,12 -1905,6 +2030,12 @@@
         return busiest;
   }
   
+ +/*
+ + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ + * so long as it is large enough.
+ + */
+ +#define MAX_PINNED_INTERVAL   512
+ +
   /*
    * Check this_cpu to ensure it is balanced within domain. Attempt to move
    * tasks if there is an imbalance.
@@@ -2048,8 -1917,7 +2048,8 @@@ static int load_balance(int this_cpu, r
         struct sched_group *group;
         runqueue_t *busiest;
         unsigned long imbalance;
- -      int nr_moved;
+ +      int nr_moved, all_pinned = 0;
+ +      int active_balance = 0;
   
         spin_lock(&this_rq->lock);
         schedstat_inc(sd, lb_cnt[idle]);
@@@ -2066,7 -1934,15 +2066,7 @@@
                 goto out_balanced;
         }
   
- -      /*
- -       * This should be "impossible", but since load
- -       * balancing is inherently racy and statistical,
- -       * it could happen in theory.
- -       */
- -      if (unlikely(busiest == this_rq)) {
- -              WARN_ON(1);
- -              goto out_balanced;
- -      }
+ +      BUG_ON(busiest == this_rq);
   
         schedstat_add(sd, lb_imbalance[idle], imbalance);
   
@@@ -2080,15 -1956,9 +2080,15 @@@
                  */
                 double_lock_balance(this_rq, busiest);
                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
- -                                              imbalance, sd, idle);
+ +                                              imbalance, sd, idle,
+ +                                              &all_pinned);
                 spin_unlock(&busiest->lock);
+ +
+ +              /* All tasks on this runqueue were pinned by CPU affinity */
+ +              if (unlikely(all_pinned))
+ +                      goto out_balanced;
         }
+ +
         spin_unlock(&this_rq->lock);
   
         if (!nr_moved) {
@@@ -2096,38 -1966,36 +2096,38 @@@
                 sd->nr_balance_failed++;
   
                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- -                      int wake = 0;
   
                         spin_lock(&busiest->lock);
                         if (!busiest->active_balance) {
                                 busiest->active_balance = 1;
                                 busiest->push_cpu = this_cpu;
- -                              wake = 1;
+ +                              active_balance = 1;
                         }
                         spin_unlock(&busiest->lock);
- -                      if (wake)
+ +                      if (active_balance)
                                 wake_up_process(busiest->migration_thread);
   
                         /*
                          * We've kicked active balancing, reset the failure
                          * counter.
                          */
- -                      sd->nr_balance_failed = sd->cache_nice_tries;
+ +                      sd->nr_balance_failed = sd->cache_nice_tries+1;
                 }
- -
- -              /*
- -               * We were unbalanced, but unsuccessful in move_tasks(),
- -               * so bump the balance_interval to lessen the lock contention.
- -               */
- -              if (sd->balance_interval < sd->max_interval)
- -                      sd->balance_interval++;
- -      } else {
+ +      } else
                 sd->nr_balance_failed = 0;
   
+ +      if (likely(!active_balance)) {
                 /* We were unbalanced, so reset the balancing interval */
                 sd->balance_interval = sd->min_interval;
+ +      } else {
+ +              /*
+ +               * If we've begun active balancing, start to back off. This
+ +               * case may not be covered by the all_pinned logic if there
+ +               * is only 1 task on the busy runqueue (because we don't call
+ +               * move_tasks).
+ +               */
+ +              if (sd->balance_interval < sd->max_interval)
+ +                      sd->balance_interval *= 2;
         }
   
         return nr_moved;
@@@ -2137,10 -2005,8 +2137,10 @@@ out_balanced
   
         schedstat_inc(sd, lb_balanced[idle]);
   
+ +      sd->nr_balance_failed = 0;
         /* tune up the balancing interval */
- -      if (sd->balance_interval < sd->max_interval)
+ +      if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ +                      (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
   
         return 0;
@@@ -2164,36 -2030,31 +2164,36 @@@ static int load_balance_newidle(int thi
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
         if (!group) {
- -              schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
- -              goto out;
+ +              goto out_balanced;
         }
   
         busiest = find_busiest_queue(group);
- -      if (!busiest || busiest == this_rq) {
- -              schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ +      if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
- -              goto out;
+ +              goto out_balanced;
         }
   
+ +      BUG_ON(busiest == this_rq);
+ +
         /* Attempt to move tasks */
         double_lock_balance(this_rq, busiest);
   
         schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
         nr_moved = move_tasks(this_rq, this_cpu, busiest,
- -                                      imbalance, sd, NEWLY_IDLE);
+ +                                      imbalance, sd, NEWLY_IDLE, NULL);
         if (!nr_moved)
                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ +      else
+ +              sd->nr_balance_failed = 0;
   
         spin_unlock(&busiest->lock);
- -
- -out:
         return nr_moved;
+ +
+ +out_balanced:
+ +      schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ +      sd->nr_balance_failed = 0;
+ +      return 0;
   }
   
   /*
@@@ -2225,42 -2086,56 +2225,42 @@@ static inline void idle_balance(int thi
   static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
   {
         struct sched_domain *sd;
- -      struct sched_group *cpu_group;
         runqueue_t *target_rq;
- -      cpumask_t visited_cpus;
- -      int cpu;
+ +      int target_cpu = busiest_rq->push_cpu;
+ +
+ +      if (busiest_rq->nr_running <= 1)
+ +              /* no task to move */
+ +              return;
+ +
+ +      target_rq = cpu_rq(target_cpu);
   
         /*
- -       * Search for suitable CPUs to push tasks to in successively higher
- -       * domains with SD_LOAD_BALANCE set.
+ +       * This condition is "impossible", if it occurs
+ +       * we need to fix it.  Originally reported by
+ +       * Bjorn Helgaas on a 128-cpu setup.
          */
- -      visited_cpus = CPU_MASK_NONE;
- -      for_each_domain(busiest_cpu, sd) {
- -              if (!(sd->flags & SD_LOAD_BALANCE))
- -                      /* no more domains to search */
- -                      break;
+ +      BUG_ON(busiest_rq == target_rq);
   
- -              schedstat_inc(sd, alb_cnt);
+ +      /* move a task from busiest_rq to target_rq */
+ +      double_lock_balance(busiest_rq, target_rq);
   
- -              cpu_group = sd->groups;
- -              do {
- -                      for_each_cpu_mask(cpu, cpu_group->cpumask) {
- -                              if (busiest_rq->nr_running <= 1)
- -                                      /* no more tasks left to move */
- -                                      return;
- -                              if (cpu_isset(cpu, visited_cpus))
- -                                      continue;
- -                              cpu_set(cpu, visited_cpus);
- -                              if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
- -                                      continue;
- -
- -                              target_rq = cpu_rq(cpu);
- -                              /*
- -                               * This condition is "impossible", if it occurs
- -                               * we need to fix it.  Originally reported by
- -                               * Bjorn Helgaas on a 128-cpu setup.
- -                               */
- -                              BUG_ON(busiest_rq == target_rq);
- -
- -                              /* move a task from busiest_rq to target_rq */
- -                              double_lock_balance(busiest_rq, target_rq);
- -                              if (move_tasks(target_rq, cpu, busiest_rq,
- -                                              1, sd, SCHED_IDLE)) {
- -                                      schedstat_inc(sd, alb_pushed);
- -                              } else {
- -                                      schedstat_inc(sd, alb_failed);
- -                              }
- -                              spin_unlock(&target_rq->lock);
- -                      }
- -                      cpu_group = cpu_group->next;
- -              } while (cpu_group != sd->groups);
- -      }
+ +      /* Search for an sd spanning us and the target CPU. */
+ +      for_each_domain(target_cpu, sd)
+ +              if ((sd->flags & SD_LOAD_BALANCE) &&
+ +                      cpu_isset(busiest_cpu, sd->span))
+ +                              break;
+ +
+ +      if (unlikely(sd == NULL))
+ +              goto out;
+ +
+ +      schedstat_inc(sd, alb_cnt);
+ +
+ +      if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+ +              schedstat_inc(sd, alb_pushed);
+ +      else
+ +              schedstat_inc(sd, alb_failed);
+ +out:
+ +      spin_unlock(&target_rq->lock);
   }
   
   /*
@@@ -2281,23 -2156,18 +2281,23 @@@ static void rebalance_tick(int this_cpu
         unsigned long old_load, this_load;
         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
         struct sched_domain *sd;
+ +      int i;
   
- -      /* Update our load */
- -      old_load = this_rq->cpu_load;
         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
- -      /*
- -       * Round up the averaging division if load is increasing. This
- -       * prevents us from getting stuck on 9 if the load is 10, for
- -       * example.
- -       */
- -      if (this_load > old_load)
- -              old_load++;
- -      this_rq->cpu_load = (old_load + this_load) / 2;
+ +      /* Update our load */
+ +      for (i = 0; i < 3; i++) {
+ +              unsigned long new_load = this_load;
+ +              int scale = 1 << i;
+ +              old_load = this_rq->cpu_load[i];
+ +              /*
+ +               * Round up the averaging division if load is increasing. This
+ +               * prevents us from getting stuck on 9 if the load is 10, for
+ +               * example.
+ +               */
+ +              if (new_load > old_load)
+ +                      new_load += scale-1;
+ +              this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+ +      }
   
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
@@@ -2577,15 -2447,11 +2577,15 @@@ out
   #ifdef CONFIG_SCHED_SMT
   static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
   {
- -      struct sched_domain *sd = this_rq->sd;
+ +      struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
         int i;
   
- -      if (!(sd->flags & SD_SHARE_CPUPOWER))
+ +      for_each_domain(this_cpu, tmp)
+ +              if (tmp->flags & SD_SHARE_CPUPOWER)
+ +                      sd = tmp;
+ +
+ +      if (!sd)
                 return;
   
         /*
@@@ -2626,17 -2492,13 +2626,17 @@@
   
   static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
   {
- -      struct sched_domain *sd = this_rq->sd;
+ +      struct sched_domain *tmp, *sd = NULL;
         cpumask_t sibling_map;
         prio_array_t *array;
         int ret = 0, i;
         task_t *p;
   
- -      if (!(sd->flags & SD_SHARE_CPUPOWER))
+ +      for_each_domain(this_cpu, tmp)
+ +              if (tmp->flags & SD_SHARE_CPUPOWER)
+ +                      sd = tmp;
+ +
+ +      if (!sd)
                 return 0;
   
         /*
@@@ -2751,7 -2613,7 +2751,7 @@@ asmlinkage void __sched schedule(void
         struct list_head *queue;
         unsigned long long now;
         unsigned long run_time;
- -      int cpu, idx;
+ +      int cpu, idx, new_prio;
   
         /*
          * Test if we are atomic.  Since do_exit() needs to call into
@@@ -2873,14 -2735,9 +2873,14 @@@ go_idle
                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
   
                 array = next->array;
- -              dequeue_task(next, array);
- -              recalc_task_prio(next, next->timestamp + delta);
- -              enqueue_task(next, array);
+ +              new_prio = recalc_task_prio(next, next->timestamp + delta);
+ +
+ +              if (unlikely(next->prio != new_prio)) {
+ +                      dequeue_task(next, array);
+ +                      next->prio = new_prio;
+ +                      enqueue_task(next, array);
+ +              } else
+ +                      requeue_task(next, array);
         }
         next->activated = 0;
   switch_tasks:
@@@ -2904,15 -2761,11 +2904,15 @@@
                 rq->curr = next;
                 ++*switch_count;
   
- -              prepare_arch_switch(rq, next);
+ +              prepare_task_switch(rq, next);
                 prev = context_switch(rq, prev, next);
                 barrier();
- -
- -              finish_task_switch(prev);
+ +              /*
+ +               * this_rq must be evaluated again because prev may have moved
+ +               * CPUs since it called schedule(), thus the 'rq' on its stack
+ +               * frame will be invalid.
+ +               */
+ +              finish_task_switch(this_rq(), prev);
         } else
                 spin_unlock_irq(&rq->lock);
   
@@@ -3531,24 -3384,13 +3531,24 @@@ recheck
         if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
                 return -EINVAL;
   
- -      if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
- -          param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
- -          !capable(CAP_SYS_NICE))
- -              return -EPERM;
- -      if ((current->euid != p->euid) && (current->euid != p->uid) &&
- -          !capable(CAP_SYS_NICE))
- -              return -EPERM;
+ +      /*
+ +       * Allow unprivileged RT tasks to decrease priority:
+ +       */
+ +      if (!capable(CAP_SYS_NICE)) {
+ +              /* can't change policy */
+ +              if (policy != p->policy)
+ +                      return -EPERM;
+ +              /* can't increase priority */
+ +              if (policy != SCHED_NORMAL &&
+ +                  param->sched_priority > p->rt_priority &&
+ +                  param->sched_priority >
+ +                              p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+ +                      return -EPERM;
+ +              /* can't change other user's priorities */
+ +              if ((current->euid != p->euid) &&
+ +                  (current->euid != p->uid))
+ +                      return -EPERM;
+ +      }
   
         retval = security_task_setscheduler(p, policy, param);
         if (retval)
@@@ -4188,9 -4030,6 +4188,9 @@@ void __devinit init_idle(task_t *idle, 
   
         spin_lock_irqsave(&rq->lock, flags);
         rq->curr = rq->idle = idle;
+ +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ +      idle->oncpu = 1;
+ +#endif
         set_tsk_need_resched(idle);
         spin_unlock_irqrestore(&rq->lock, flags);
   
@@@ -4335,8 -4174,7 +4335,7 @@@ static int migration_thread(void * data
                 struct list_head *head;
                 migration_req_t *req;
   
-               if (current->flags & PF_FREEZE)
-                       refrigerator(PF_FREEZE);
+               try_to_freeze();
   
                 spin_lock_irq(&rq->lock);
   
@@@ -4361,9 -4199,17 +4360,9 @@@
                 req = list_entry(head->next, migration_req_t, list);
                 list_del_init(head->next);
   
- -              if (req->type == REQ_MOVE_TASK) {
- -                      spin_unlock(&rq->lock);
- -                      __migrate_task(req->task, cpu, req->dest_cpu);
- -                      local_irq_enable();
- -              } else if (req->type == REQ_SET_DOMAIN) {
- -                      rq->sd = req->sd;
- -                      spin_unlock_irq(&rq->lock);
- -              } else {
- -                      spin_unlock_irq(&rq->lock);
- -                      WARN_ON(1);
- -              }
+ +              spin_unlock(&rq->lock);
+ +              __migrate_task(req->task, cpu, req->dest_cpu);
+ +              local_irq_enable();
   
                 complete(&req->done);
         }
@@@ -4594,6 -4440,7 +4593,6 @@@ static int migration_call(struct notifi
                         migration_req_t *req;
                         req = list_entry(rq->migration_queue.next,
                                          migration_req_t, list);
- -                      BUG_ON(req->type != REQ_MOVE_TASK);
                         list_del_init(&req->list);
                         complete(&req->done);
                 }
@@@ -4624,17 -4471,12 +4623,17 @@@ int __init migration_init(void
   #endif
   
   #ifdef CONFIG_SMP
- -#define SCHED_DOMAIN_DEBUG
+ +#undef SCHED_DOMAIN_DEBUG
   #ifdef SCHED_DOMAIN_DEBUG
   static void sched_domain_debug(struct sched_domain *sd, int cpu)
   {
         int level = 0;
   
+ +      if (!sd) {
+ +              printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ +              return;
+ +      }
+ +
         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
   
         do {
@@@ -4717,81 -4559,37 +4716,81 @@@
   #define sched_domain_debug(sd, cpu) {}
   #endif
   
+ +static int sd_degenerate(struct sched_domain *sd)
+ +{
+ +      if (cpus_weight(sd->span) == 1)
+ +              return 1;
+ +
+ +      /* Following flags need at least 2 groups */
+ +      if (sd->flags & (SD_LOAD_BALANCE |
+ +                       SD_BALANCE_NEWIDLE |
+ +                       SD_BALANCE_FORK |
+ +                       SD_BALANCE_EXEC)) {
+ +              if (sd->groups != sd->groups->next)
+ +                      return 0;
+ +      }
+ +
+ +      /* Following flags don't use groups */
+ +      if (sd->flags & (SD_WAKE_IDLE |
+ +                       SD_WAKE_AFFINE |
+ +                       SD_WAKE_BALANCE))
+ +              return 0;
+ +
+ +      return 1;
+ +}
+ +
+ +static int sd_parent_degenerate(struct sched_domain *sd,
+ +                                              struct sched_domain *parent)
+ +{
+ +      unsigned long cflags = sd->flags, pflags = parent->flags;
+ +
+ +      if (sd_degenerate(parent))
+ +              return 1;
+ +
+ +      if (!cpus_equal(sd->span, parent->span))
+ +              return 0;
+ +
+ +      /* Does parent contain flags not in child? */
+ +      /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+ +      if (cflags & SD_WAKE_AFFINE)
+ +              pflags &= ~SD_WAKE_BALANCE;
+ +      /* Flags needing groups don't count if only 1 group in parent */
+ +      if (parent->groups == parent->groups->next) {
+ +              pflags &= ~(SD_LOAD_BALANCE |
+ +                              SD_BALANCE_NEWIDLE |
+ +                              SD_BALANCE_FORK |
+ +                              SD_BALANCE_EXEC);
+ +      }
+ +      if (~cflags & pflags)
+ +              return 0;
+ +
+ +      return 1;
+ +}
+ +
   /*
    * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
    * hold the hotplug lock.
    */
- -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+ +void cpu_attach_domain(struct sched_domain *sd, int cpu)
   {
- -      migration_req_t req;
- -      unsigned long flags;
         runqueue_t *rq = cpu_rq(cpu);
- -      int local = 1;
+ +      struct sched_domain *tmp;
   
- -      sched_domain_debug(sd, cpu);
- -
- -      spin_lock_irqsave(&rq->lock, flags);
- -
- -      if (cpu == smp_processor_id() || !cpu_online(cpu)) {
- -              rq->sd = sd;
- -      } else {
- -              init_completion(&req.done);
- -              req.type = REQ_SET_DOMAIN;
- -              req.sd = sd;
- -              list_add(&req.list, &rq->migration_queue);
- -              local = 0;
+ +      /* Remove the sched domains which do not contribute to scheduling. */
+ +      for (tmp = sd; tmp; tmp = tmp->parent) {
+ +              struct sched_domain *parent = tmp->parent;
+ +              if (!parent)
+ +                      break;
+ +              if (sd_parent_degenerate(tmp, parent))
+ +                      tmp->parent = parent->parent;
         }
   
- -      spin_unlock_irqrestore(&rq->lock, flags);
+ +      if (sd && sd_degenerate(sd))
+ +              sd = sd->parent;
   
- -      if (!local) {
- -              wake_up_process(rq->migration_thread);
- -              wait_for_completion(&req.done);
- -      }
+ +      sched_domain_debug(sd, cpu);
+ +
+ +      rcu_assign_pointer(rq->sd, sd);
   }
   
   /* cpus with isolated domains */
@@@ -4823,7 -4621,7 +4822,7 @@@ __setup ("isolcpus=", isolated_cpu_setu
    * covered by the given span, and will set each group's ->cpumask correctly,
    * and ->cpu_power to 0.
    */
- -void __devinit init_sched_build_groups(struct sched_group groups[],
+ +void init_sched_build_groups(struct sched_group groups[],
                         cpumask_t span, int (*group_fn)(int cpu))
   {
         struct sched_group *first = NULL, *last = NULL;
@@@ -4859,14 -4657,13 +4858,14 @@@
   
   
   #ifdef ARCH_HAS_SCHED_DOMAIN
- -extern void __devinit arch_init_sched_domains(void);
- -extern void __devinit arch_destroy_sched_domains(void);
+ +extern void build_sched_domains(const cpumask_t *cpu_map);
+ +extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+ +extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
   #else
   #ifdef CONFIG_SCHED_SMT
   static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
   static struct sched_group sched_group_cpus[NR_CPUS];
- -static int __devinit cpu_to_cpu_group(int cpu)
+ +static int cpu_to_cpu_group(int cpu)
   {
         return cpu;
   }
@@@ -4874,7 -4671,7 +4873,7 @@@
   
   static DEFINE_PER_CPU(struct sched_domain, phys_domains);
   static struct sched_group sched_group_phys[NR_CPUS];
- -static int __devinit cpu_to_phys_group(int cpu)
+ +static int cpu_to_phys_group(int cpu)
   {
   #ifdef CONFIG_SCHED_SMT
         return first_cpu(cpu_sibling_map[cpu]);
@@@ -4887,7 -4684,7 +4886,7 @@@
   
   static DEFINE_PER_CPU(struct sched_domain, node_domains);
   static struct sched_group sched_group_nodes[MAX_NUMNODES];
- -static int __devinit cpu_to_node_group(int cpu)
+ +static int cpu_to_node_group(int cpu)
   {
         return cpu_to_node(cpu);
   }
@@@ -4918,28 -4715,39 +4917,28 @@@ static void check_sibling_maps(void
   #endif
   
   /*
- - * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ + * Build sched domains for a given set of cpus and attach the sched domains
+ + * to the individual cpus
    */
- -static void __devinit arch_init_sched_domains(void)
+ +static void build_sched_domains(const cpumask_t *cpu_map)
   {
         int i;
- -      cpumask_t cpu_default_map;
- -
- -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- -      check_sibling_maps();
- -#endif
- -      /*
- -       * Setup mask for cpus without special case scheduling requirements.
- -       * For now this just excludes isolated cpus, but could be used to
- -       * exclude other special cases in the future.
- -       */
- -      cpus_complement(cpu_default_map, cpu_isolated_map);
- -      cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
   
         /*
- -       * Set up domains. Isolated domains just stay on the dummy domain.
+ +       * Set up domains for cpus specified by the cpu_map.
          */
- -      for_each_cpu_mask(i, cpu_default_map) {
+ +      for_each_cpu_mask(i, *cpu_map) {
                 int group;
                 struct sched_domain *sd = NULL, *p;
                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
   
- -              cpus_and(nodemask, nodemask, cpu_default_map);
+ +              cpus_and(nodemask, nodemask, *cpu_map);
   
   #ifdef CONFIG_NUMA
                 sd = &per_cpu(node_domains, i);
                 group = cpu_to_node_group(i);
                 *sd = SD_NODE_INIT;
- -              sd->span = cpu_default_map;
+ +              sd->span = *cpu_map;
                 sd->groups = &sched_group_nodes[group];
   #endif
   
@@@ -4957,7 -4765,7 +4956,7 @@@
                 group = cpu_to_cpu_group(i);
                 *sd = SD_SIBLING_INIT;
                 sd->span = cpu_sibling_map[i];
- -              cpus_and(sd->span, sd->span, cpu_default_map);
+ +              cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 sd->groups = &sched_group_cpus[group];
   #endif
@@@ -4967,7 -4775,7 +4966,7 @@@
         /* Set up CPU (sibling) groups */
         for_each_online_cpu(i) {
                 cpumask_t this_sibling_map = cpu_sibling_map[i];
- -              cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+ +              cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                 if (i != first_cpu(this_sibling_map))
                         continue;
   
@@@ -4980,7 -4788,7 +4979,7 @@@
         for (i = 0; i < MAX_NUMNODES; i++) {
                 cpumask_t nodemask = node_to_cpumask(i);
   
- -              cpus_and(nodemask, nodemask, cpu_default_map);
+ +              cpus_and(nodemask, nodemask, *cpu_map);
                 if (cpus_empty(nodemask))
                         continue;
   
@@@ -4990,12 -4798,12 +4989,12 @@@
   
   #ifdef CONFIG_NUMA
         /* Set up node groups */
- -      init_sched_build_groups(sched_group_nodes, cpu_default_map,
+ +      init_sched_build_groups(sched_group_nodes, *cpu_map,
                                         &cpu_to_node_group);
   #endif
   
         /* Calculate CPU power for physical packages and nodes */
- -      for_each_cpu_mask(i, cpu_default_map) {
+ +      for_each_cpu_mask(i, *cpu_map) {
                 int power;
                 struct sched_domain *sd;
   #ifdef CONFIG_SCHED_SMT
@@@ -5019,7 -4827,7 +5018,7 @@@
         }
   
         /* Attach the domains */
- -      for_each_online_cpu(i) {
+ +      for_each_cpu_mask(i, *cpu_map) {
                 struct sched_domain *sd;
   #ifdef CONFIG_SCHED_SMT
                 sd = &per_cpu(cpu_domains, i);
@@@ -5029,85 -4837,41 +5028,85 @@@
                 cpu_attach_domain(sd, i);
         }
   }
+ +/*
+ + * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ + */
+ +static void arch_init_sched_domains(cpumask_t *cpu_map)
+ +{
+ +      cpumask_t cpu_default_map;
   
- -#ifdef CONFIG_HOTPLUG_CPU
- -static void __devinit arch_destroy_sched_domains(void)
+ +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+ +      check_sibling_maps();
+ +#endif
+ +      /*
+ +       * Setup mask for cpus without special case scheduling requirements.
+ +       * For now this just excludes isolated cpus, but could be used to
+ +       * exclude other special cases in the future.
+ +       */
+ +      cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+ +
+ +      build_sched_domains(&cpu_default_map);
+ +}
+ +
+ +static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
   {
         /* Do nothing: everything is statically allocated. */
   }
- -#endif
   
   #endif /* ARCH_HAS_SCHED_DOMAIN */
   
   /*
- - * Initial dummy domain for early boot and for hotplug cpu. Being static,
- - * it is initialized to zero, so all balancing flags are cleared which is
- - * what we want.
+ + * Detach sched domains from a group of cpus specified in cpu_map
+ + * These cpus will now be attached to the NULL domain
    */
- -static struct sched_domain sched_domain_dummy;
+ +static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+ +{
+ +      int i;
+ +
+ +      for_each_cpu_mask(i, *cpu_map)
+ +              cpu_attach_domain(NULL, i);
+ +      synchronize_sched();
+ +      arch_destroy_sched_domains(cpu_map);
+ +}
+ +
+ +/*
+ + * Partition sched domains as specified by the cpumasks below.
+ + * This attaches all cpus from the cpumasks to the NULL domain,
+ + * waits for a RCU quiescent period, recalculates sched
+ + * domain information and then attaches them back to the
+ + * correct sched domains
+ + * Call with hotplug lock held
+ + */
+ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+ +{
+ +      cpumask_t change_map;
+ +
+ +      cpus_and(*partition1, *partition1, cpu_online_map);
+ +      cpus_and(*partition2, *partition2, cpu_online_map);
+ +      cpus_or(change_map, *partition1, *partition2);
+ +
+ +      /* Detach sched domains from all of the affected cpus */
+ +      detach_destroy_domains(&change_map);
+ +      if (!cpus_empty(*partition1))
+ +              build_sched_domains(partition1);
+ +      if (!cpus_empty(*partition2))
+ +              build_sched_domains(partition2);
+ +}
   
   #ifdef CONFIG_HOTPLUG_CPU
   /*
    * Force a reinitialization of the sched domains hierarchy.  The domains
    * and groups cannot be updated in place without racing with the balancing
- - * code, so we temporarily attach all running cpus to a "dummy" domain
+ + * code, so we temporarily attach all running cpus to the NULL domain
    * which will prevent rebalancing while the sched domains are recalculated.
    */
   static int update_sched_domains(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
   {
- -      int i;
- -
         switch (action) {
         case CPU_UP_PREPARE:
         case CPU_DOWN_PREPARE:
- -              for_each_online_cpu(i)
- -                      cpu_attach_domain(&sched_domain_dummy, i);
- -              arch_destroy_sched_domains();
+ +              detach_destroy_domains(&cpu_online_map);
                 return NOTIFY_OK;
   
         case CPU_UP_CANCELED:
@@@ -5123,7 -4887,7 +5122,7 @@@
         }
   
         /* The hotplug lock is already held by cpu_up/cpu_down */
- -      arch_init_sched_domains();
+ +      arch_init_sched_domains(&cpu_online_map);
   
         return NOTIFY_OK;
   }
@@@ -5132,7 -4896,7 +5131,7 @@@
   void __init sched_init_smp(void)
   {
         lock_cpu_hotplug();
- -      arch_init_sched_domains();
+ +      arch_init_sched_domains(&cpu_online_map);
         unlock_cpu_hotplug();
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
@@@ -5162,15 -4926,13 +5161,15 @@@ void __init sched_init(void
   
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
+ +              rq->nr_running = 0;
                 rq->active = rq->arrays;
                 rq->expired = rq->arrays + 1;
                 rq->best_expired_prio = MAX_PRIO;
   
   #ifdef CONFIG_SMP
- -              rq->sd = &sched_domain_dummy;
- -              rq->cpu_load = 0;
+ +              rq->sd = NULL;
+ +              for (j = 1; j < 3; j++)
+ +                      rq->cpu_load[j] = 0;
                 rq->active_balance = 0;
                 rq->push_cpu = 0;
                 rq->migration_thread = NULL;
author	Linus Torvalds <torvalds@ppc970.osdl.org>
	Sun, 26 Jun 2005 00:16:53 +0000 (17:16 -0700)
committer	Linus Torvalds <torvalds@ppc970.osdl.org>
	Sun, 26 Jun 2005 00:16:53 +0000 (17:16 -0700)
		1	2
Documentation/power/swsusp.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/i386/kernel/io_apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history