Merge branch 'l1tf-final' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

[linux] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 14c3fdd..b39fb59 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1312,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                  * of each group. Skip other nodes.
                  */
                 if (sched_numa_topology_type == NUMA_BACKPLANE &&
-                                       dist > maxdist)
+                                       dist >= maxdist)
                         continue;
  
                 /* Add up the faults from nearby nodes. */
@@ -1450,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
  
  /* Cached statistics for all CPUs within a node */
  struct numa_stats {
-       unsigned long nr_running;
         unsigned long load;
  
         /* Total compute capacity of CPUs on a node */
         unsigned long compute_capacity;
  
-       /* Approximate capacity in terms of runnable tasks on a node */
-       unsigned long task_capacity;
-       int has_free_capacity;
+       unsigned int nr_running;
  };
  
  /*
@@ -1485,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
          * the @ns structure is NULL'ed and task_numa_compare() will
          * not find this node attractive.
          *
-        * We'll either bail at !has_free_capacity, or we'll detect a huge
-        * imbalance and bail there.
+        * We'll detect a huge imbalance and bail there.
          */
         if (!cpus)
                 return;
@@ -1495,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
         smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
         capacity = cpus / smt; /* cores */
  
-       ns->task_capacity = min_t(unsigned, capacity,
+       capacity = min_t(unsigned, capacity,
                 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
-       ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
  }
  
  struct task_numa_env {
@@ -1546,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
         src_capacity = env->src_stats.compute_capacity;
         dst_capacity = env->dst_stats.compute_capacity;
  
-       /* We care about the slope of the imbalance, not the direction. */
-       if (dst_load < src_load)
-               swap(dst_load, src_load);
+       imb = abs(dst_load * src_capacity - src_load * dst_capacity);
  
-       /* Is the difference below the threshold? */
-       imb = dst_load * src_capacity * 100 -
-             src_load * dst_capacity * env->imbalance_pct;
-       if (imb <= 0)
-               return false;
-
-       /*
-        * The imbalance is above the allowed threshold.
-        * Compare it with the old imbalance.
-        */
         orig_src_load = env->src_stats.load;
         orig_dst_load = env->dst_stats.load;
  
-       if (orig_dst_load < orig_src_load)
-               swap(orig_dst_load, orig_src_load);
-
-       old_imb = orig_dst_load * src_capacity * 100 -
-                 orig_src_load * dst_capacity * env->imbalance_pct;
+       old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
  
         /* Would this change make things worse? */
         return (imb > old_imb);
@@ -1580,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
   * be exchanged with the source task
   */
  static void task_numa_compare(struct task_numa_env *env,
-                             long taskimp, long groupimp)
+                             long taskimp, long groupimp, bool maymove)
  {
-       struct rq *src_rq = cpu_rq(env->src_cpu);
         struct rq *dst_rq = cpu_rq(env->dst_cpu);
         struct task_struct *cur;
         long src_load, dst_load;
@@ -1603,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,
         if (cur == env->p)
                 goto unlock;
  
+       if (!cur) {
+               if (maymove || imp > env->best_imp)
+                       goto assign;
+               else
+                       goto unlock;
+       }
+
         /*
          * "imp" is the fault differential for the source task between the
          * source and destination node. Calculate the total differential for
          * the source task and potential destination task. The more negative
-        * the value is, the more rmeote accesses that would be expected to
+        * the value is, the more remote accesses that would be expected to
          * be incurred if the tasks were swapped.
          */
-       if (cur) {
-               /* Skip this swap candidate if cannot move to the source CPU: */
-               if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
-                       goto unlock;
+       /* Skip this swap candidate if cannot move to the source cpu */
+       if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+               goto unlock;
  
+       /*
+        * If dst and source tasks are in the same NUMA group, or not
+        * in any group then look only at task weights.
+        */
+       if (cur->numa_group == env->p->numa_group) {
+               imp = taskimp + task_weight(cur, env->src_nid, dist) -
+                     task_weight(cur, env->dst_nid, dist);
                 /*
-                * If dst and source tasks are in the same NUMA group, or not
-                * in any group then look only at task weights.
+                * Add some hysteresis to prevent swapping the
+                * tasks within a group over tiny differences.
                  */
-               if (cur->numa_group == env->p->numa_group) {
-                       imp = taskimp + task_weight(cur, env->src_nid, dist) -
-                             task_weight(cur, env->dst_nid, dist);
-                       /*
-                        * Add some hysteresis to prevent swapping the
-                        * tasks within a group over tiny differences.
-                        */
-                       if (cur->numa_group)
-                               imp -= imp/16;
-               } else {
-                       /*
-                        * Compare the group weights. If a task is all by
-                        * itself (not part of a group), use the task weight
-                        * instead.
-                        */
-                       if (cur->numa_group)
-                               imp += group_weight(cur, env->src_nid, dist) -
-                                      group_weight(cur, env->dst_nid, dist);
-                       else
-                               imp += task_weight(cur, env->src_nid, dist) -
-                                      task_weight(cur, env->dst_nid, dist);
-               }
+               if (cur->numa_group)
+                       imp -= imp / 16;
+       } else {
+               /*
+                * Compare the group weights. If a task is all by itself
+                * (not part of a group), use the task weight instead.
+                */
+               if (cur->numa_group && env->p->numa_group)
+                       imp += group_weight(cur, env->src_nid, dist) -
+                              group_weight(cur, env->dst_nid, dist);
+               else
+                       imp += task_weight(cur, env->src_nid, dist) -
+                              task_weight(cur, env->dst_nid, dist);
         }
  
-       if (imp <= env->best_imp && moveimp <= env->best_imp)
+       if (imp <= env->best_imp)
                 goto unlock;
  
-       if (!cur) {
-               /* Is there capacity at our destination? */
-               if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
-                   !env->dst_stats.has_free_capacity)
-                       goto unlock;
-
-               goto balance;
-       }
-
-       /* Balance doesn't matter much if we're running a task per CPU: */
-       if (imp > env->best_imp && src_rq->nr_running == 1 &&
-                       dst_rq->nr_running == 1)
+       if (maymove && moveimp > imp && moveimp > env->best_imp) {
+               imp = moveimp - 1;
+               cur = NULL;
                 goto assign;
+       }
  
         /*
          * In the overloaded case, try and keep the load balanced.
          */
-balance:
-       load = task_h_load(env->p);
+       load = task_h_load(env->p) - task_h_load(cur);
+       if (!load)
+               goto assign;
+
         dst_load = env->dst_stats.load + load;
         src_load = env->src_stats.load - load;
  
-       if (moveimp > imp && moveimp > env->best_imp) {
-               /*
-                * If the improvement from just moving env->p direction is
-                * better than swapping tasks around, check if a move is
-                * possible. Store a slightly smaller score than moveimp,
-                * so an actually idle CPU will win.
-                */
-               if (!load_too_imbalanced(src_load, dst_load, env)) {
-                       imp = moveimp - 1;
-                       cur = NULL;
-                       goto assign;
-               }
-       }
-
-       if (imp <= env->best_imp)
-               goto unlock;
-
-       if (cur) {
-               load = task_h_load(cur);
-               dst_load -= load;
-               src_load += load;
-       }
-
         if (load_too_imbalanced(src_load, dst_load, env))
                 goto unlock;
  
+assign:
         /*
          * One idle CPU per node is evaluated for a task numa move.
          * Call select_idle_sibling to maybe find a better one.
@@ -1709,7 +1663,6 @@ balance:
                 local_irq_enable();
         }
  
-assign:
         task_numa_assign(env, cur, imp);
  unlock:
         rcu_read_unlock();
@@ -1718,43 +1671,30 @@ unlock:
  static void task_numa_find_cpu(struct task_numa_env *env,
                                 long taskimp, long groupimp)
  {
+       long src_load, dst_load, load;
+       bool maymove = false;
         int cpu;
  
+       load = task_h_load(env->p);
+       dst_load = env->dst_stats.load + load;
+       src_load = env->src_stats.load - load;
+
+       /*
+        * If the improvement from just moving env->p direction is better
+        * than swapping tasks around, check if a move is possible.
+        */
+       maymove = !load_too_imbalanced(src_load, dst_load, env);
+
         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
                 /* Skip this CPU if the source task cannot migrate */
                 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
                         continue;
  
                 env->dst_cpu = cpu;
-               task_numa_compare(env, taskimp, groupimp);
+               task_numa_compare(env, taskimp, groupimp, maymove);
         }
  }
  
-/* Only move tasks to a NUMA node less busy than the current node. */
-static bool numa_has_capacity(struct task_numa_env *env)
-{
-       struct numa_stats *src = &env->src_stats;
-       struct numa_stats *dst = &env->dst_stats;
-
-       if (src->has_free_capacity && !dst->has_free_capacity)
-               return false;
-
-       /*
-        * Only consider a task move if the source has a higher load
-        * than the destination, corrected for CPU capacity on each node.
-        *
-        *      src->load                dst->load
-        * --------------------- vs ---------------------
-        * src->compute_capacity    dst->compute_capacity
-        */
-       if (src->load * dst->compute_capacity * env->imbalance_pct >
-
-           dst->load * src->compute_capacity * 100)
-               return true;
-
-       return false;
-}
-
  static int task_numa_migrate(struct task_struct *p)
  {
         struct task_numa_env env = {
@@ -1795,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
          * elsewhere, so there is no point in (re)trying.
          */
         if (unlikely(!sd)) {
-               p->numa_preferred_nid = task_node(p);
+               sched_setnuma(p, task_node(p));
                 return -EINVAL;
         }
  
@@ -1809,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)
         update_numa_stats(&env.dst_stats, env.dst_nid);
  
         /* Try to find a spot on the preferred nid. */
-       if (numa_has_capacity(&env))
-               task_numa_find_cpu(&env, taskimp, groupimp);
+       task_numa_find_cpu(&env, taskimp, groupimp);
  
         /*
          * Look at other nodes in these cases:
@@ -1840,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)
                         env.dist = dist;
                         env.dst_nid = nid;
                         update_numa_stats(&env.dst_stats, env.dst_nid);
-                       if (numa_has_capacity(&env))
-                               task_numa_find_cpu(&env, taskimp, groupimp);
+                       task_numa_find_cpu(&env, taskimp, groupimp);
                 }
         }
  
@@ -1854,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
          * trying for a better one later. Do not set the preferred node here.
          */
         if (p->numa_group) {
-               struct numa_group *ng = p->numa_group;
-
                 if (env.best_cpu == -1)
                         nid = env.src_nid;
                 else
-                       nid = env.dst_nid;
+                       nid = cpu_to_node(env.best_cpu);
  
-               if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
-                       sched_setnuma(p, env.dst_nid);
+               if (nid != p->numa_preferred_nid)
+                       sched_setnuma(p, nid);
         }
  
         /* No better CPU than the current one was found. */
@@ -1882,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
                 return ret;
         }
  
-       ret = migrate_swap(p, env.best_task);
+       ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+
         if (ret != 0)
                 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
         put_task_struct(env.best_task);
@@ -2142,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
  
  static void task_numa_placement(struct task_struct *p)
  {
-       int seq, nid, max_nid = -1, max_group_nid = -1;
-       unsigned long max_faults = 0, max_group_faults = 0;
+       int seq, nid, max_nid = -1;
+       unsigned long max_faults = 0;
         unsigned long fault_types[2] = { 0, 0 };
         unsigned long total_faults;
         u64 runtime, period;
@@ -2222,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
                         }
                 }
  
-               if (faults > max_faults) {
-                       max_faults = faults;
+               if (!p->numa_group) {
+                       if (faults > max_faults) {
+                               max_faults = faults;
+                               max_nid = nid;
+                       }
+               } else if (group_faults > max_faults) {
+                       max_faults = group_faults;
                         max_nid = nid;
                 }
-
-               if (group_faults > max_group_faults) {
-                       max_group_faults = group_faults;
-                       max_group_nid = nid;
-               }
         }
  
-       update_task_scan_period(p, fault_types[0], fault_types[1]);
-
         if (p->numa_group) {
                 numa_group_count_active_nodes(p->numa_group);
                 spin_unlock_irq(group_lock);
-               max_nid = preferred_group_nid(p, max_group_nid);
+               max_nid = preferred_group_nid(p, max_nid);
         }
  
         if (max_faults) {
                 /* Set the new preferred node */
                 if (max_nid != p->numa_preferred_nid)
                         sched_setnuma(p, max_nid);
-
-               if (task_node(p) != p->numa_preferred_nid)
-                       numa_migrate_preferred(p);
         }
+
+       update_task_scan_period(p, fault_types[0], fault_types[1]);
  }
  
  static inline int get_numa_group(struct numa_group *grp)
@@ -2448,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                                 numa_is_active_node(mem_node, ng))
                 local = 1;
  
-       task_numa_placement(p);
-
         /*
          * Retry task to preferred node migration periodically, in case it
          * case it previously failed, or the scheduler moved us.
          */
-       if (time_after(jiffies, p->numa_migrate_retry))
+       if (time_after(jiffies, p->numa_migrate_retry)) {
+               task_numa_placement(p);
                 numa_migrate_preferred(p);
+       }
  
         if (migrated)
                 p->numa_pages_migrated += pages;
@@ -5905,6 +5839,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
  }
  
  #ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
  
  static inline void set_idle_cores(int cpu, int val)
  {
@@ -6962,8 +6897,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
  static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
  {
         struct numa_group *numa_group = rcu_dereference(p->numa_group);
-       unsigned long src_faults, dst_faults;
-       int src_nid, dst_nid;
+       unsigned long src_weight, dst_weight;
+       int src_nid, dst_nid, dist;
  
         if (!static_branch_likely(&sched_numa_balancing))
                 return -1;
@@ -6990,18 +6925,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
                 return 0;
  
         /* Leaving a core idle is often worse than degrading locality. */
-       if (env->idle != CPU_NOT_IDLE)
+       if (env->idle == CPU_IDLE)
                 return -1;
  
+       dist = node_distance(src_nid, dst_nid);
         if (numa_group) {
-               src_faults = group_faults(p, src_nid);
-               dst_faults = group_faults(p, dst_nid);
+               src_weight = group_weight(p, src_nid, dist);
+               dst_weight = group_weight(p, dst_nid, dist);
         } else {
-               src_faults = task_faults(p, src_nid);
-               dst_faults = task_faults(p, dst_nid);
+               src_weight = task_weight(p, src_nid, dist);
+               dst_weight = task_weight(p, dst_nid, dist);
         }
  
-       return dst_faults < src_faults;
+       return dst_weight < src_weight;
  }
  
  #else