sched: dynticks idle load balancing
[powerpc.git] / kernel / sched.c
index ba053d8..7459928 100644 (file)
@@ -224,6 +224,9 @@ struct rq {
 #ifdef CONFIG_SMP
        unsigned long cpu_load[3];
        unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+       unsigned char in_nohz_recently;
+#endif
 #endif
        unsigned long long nr_switches;
 
@@ -1050,6 +1053,17 @@ static void resched_task(struct task_struct *p)
        if (!tsk_is_polling(p))
                smp_send_reschedule(cpu);
 }
+
+static void resched_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       if (!spin_trylock_irqsave(&rq->lock, flags))
+               return;         /* best effort: lock is busy, skip it */
+       resched_task(cpu_curr(cpu));    /* called with rq->lock held */
+       spin_unlock_irqrestore(&rq->lock, flags);
+}
 #else
 static inline void resched_task(struct task_struct *p)
 {
@@ -2658,6 +2672,12 @@ redo:
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
 
+               /*
+                * some other cpu did the load balance for us.
+                */
+               if (nr_moved && this_cpu != smp_processor_id())
+                       resched_cpu(this_cpu);
+
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
                        cpu_clear(cpu_of(busiest), cpus);
@@ -2928,27 +2948,98 @@ static void update_load(struct rq *this_rq)
        }
 }
 
+#ifdef CONFIG_NO_HZ
+static struct {
+       atomic_t load_balancer; /* cpu id of the ilb owner, or -1 if none */
+       cpumask_t  cpu_mask;    /* cpus marked tickless, incl. the ilb owner */
+} nohz ____cacheline_aligned = {
+       .load_balancer = ATOMIC_INIT(-1),
+       .cpu_mask = CPU_MASK_NONE,
+};
+
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives.
+ *
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask.
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. It remains the owner until it becomes busy or
+ * until all cpus in the system stop their ticks, at which point
+ * there is no need for an ilb owner.
  *
+ * When the ilb owner becomes busy, it nominates another owner during the
+ * next busy scheduler_tick().
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+       int cpu = smp_processor_id();
+       /* Returns 1 if this cpu is the ilb owner on exit, 0 otherwise. */
+       if (stop_tick) {
+               cpu_set(cpu, nohz.cpu_mask);
+               cpu_rq(cpu)->in_nohz_recently = 1;
+
+               /*
+                * If we are going offline and still the ilb owner, give up!
+                */
+               if (cpu_is_offline(cpu) &&
+                   atomic_read(&nohz.load_balancer) == cpu) {
+                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                               BUG();
+                       return 0;
+               }
+
+               /* all online cpus in nohz.cpu_mask: ilb owner sleeps too */
+               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+                       if (atomic_read(&nohz.load_balancer) == cpu)
+                               atomic_set(&nohz.load_balancer, -1);
+                       return 0;
+               }
+
+               if (atomic_read(&nohz.load_balancer) == -1) {
+                       /* no owner yet: try to make me the ilb owner */
+                       if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+                               return 1;       /* cmpxchg succeeded */
+               } else if (atomic_read(&nohz.load_balancer) == cpu)
+                       return 1;       /* already the ilb owner */
+       } else {        /* stop_tick == 0 */
+               if (!cpu_isset(cpu, nohz.cpu_mask))
+                       return 0;
+
+               cpu_clear(cpu, nohz.cpu_mask);
+
+               if (atomic_read(&nohz.load_balancer) == cpu)
+                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                               BUG();
+       }
+       return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
 {
-       int this_cpu = smp_processor_id(), balance = 1;
-       struct rq *this_rq = cpu_rq(this_cpu);
+       int balance = 1;
+       struct rq *rq = cpu_rq(cpu);
        unsigned long interval;
        struct sched_domain *sd;
-       enum idle_type idle = this_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
-       /* Earliest time when we have to call run_rebalance_domains again */
+       /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
 
-       for_each_domain(this_cpu, sd) {
+       for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
@@ -2967,7 +3058,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                }
 
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -2991,7 +3082,114 @@ out:
                if (!balance)
                        break;
        }
-       this_rq->next_balance = next_balance;
+       rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+       int local_cpu = smp_processor_id();
+       struct rq *local_rq = cpu_rq(local_cpu);
+       enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+       rebalance_domains(local_cpu, idle);     /* this cpu's domains first */
+
+#ifdef CONFIG_NO_HZ
+       /*
+        * If this cpu is the owner for idle load balancing, then do the
+        * balancing on behalf of the other idle cpus whose ticks are
+        * stopped.
+        */
+       if (local_rq->idle_at_tick &&
+           atomic_read(&nohz.load_balancer) == local_cpu) {
+               cpumask_t cpus = nohz.cpu_mask; /* private snapshot */
+               struct rq *rq;
+               int balance_cpu;
+
+               cpu_clear(local_cpu, cpus);     /* already balanced above */
+               for_each_cpu_mask(balance_cpu, cpus) {
+                       /*
+                        * If this cpu gets work to do, stop the load balancing
+                        * work being done for other cpus. Next load
+                        * balancing owner will pick it up.
+                        */
+                       if (need_resched())
+                               break;
+
+                       rebalance_domains(balance_cpu, SCHED_IDLE);
+                       /* remember the earliest next_balance */
+                       rq = cpu_rq(balance_cpu);
+                       if (time_after(local_rq->next_balance, rq->next_balance))
+                               local_rq->next_balance = rq->next_balance;
+               }
+       }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+       /*
+        * If we were in the nohz mode recently and busy at the current
+        * scheduler tick, then check if we need to nominate new idle
+        * load balancer.
+        */
+       if (rq->in_nohz_recently && !rq->idle_at_tick) {
+               rq->in_nohz_recently = 0;
+               /* busy now: give up ilb ownership if we hold it */
+               if (atomic_read(&nohz.load_balancer) == cpu) {
+                       cpu_clear(cpu, nohz.cpu_mask);
+                       atomic_set(&nohz.load_balancer, -1);
+               }
+
+               if (atomic_read(&nohz.load_balancer) == -1) {
+                       /*
+                        * simple selection for now: Nominate the
+                        * first cpu in the nohz list to be the next
+                        * ilb owner.
+                        *
+                        * TBD: Traverse the sched domains and nominate
+                        * the nearest cpu in the nohz.cpu_mask.
+                        */
+                       int ilb = first_cpu(nohz.cpu_mask);
+
+                       if (ilb != NR_CPUS)
+                               resched_cpu(ilb);
+               }
+       }
+
+       /*
+        * If this cpu is idle and doing idle load balancing for all the
+        * cpus with ticks stopped, is it time for that to stop?
+        */
+       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               resched_cpu(cpu);
+               return;         /* no SCHED_SOFTIRQ raised on this tick */
+       }
+
+       /*
+        * If this cpu is idle and the idle load balancing is done by
+        * someone else, then there is no need to raise the SCHED_SOFTIRQ.
+        */
+       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+           cpu_isset(cpu, nohz.cpu_mask))
+               return;
+#endif
+       if (time_after_eq(jiffies, rq->next_balance))
+               raise_softirq(SCHED_SOFTIRQ);
 }
 #else
 /*
@@ -3224,8 +3422,7 @@ void scheduler_tick(void)
 #ifdef CONFIG_SMP
        update_load(rq);
        rq->idle_at_tick = idle_at_tick;
-       if (time_after_eq(jiffies, rq->next_balance))
-               raise_softirq(SCHED_SOFTIRQ);
+       trigger_load_balance(cpu);
 #endif
 }