Merge branch 'l1tf-final' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2f0a0be..b39fb59 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
        return cfs_rq->rq;
 }
 
-/* An entity is a task if it doesn't "own" a runqueue */
-#define entity_is_task(se)     (!se->my_q)
-
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
        SCHED_WARN_ON(!entity_is_task(se));
@@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
        return container_of(cfs_rq, struct rq, cfs);
 }
 
-#define entity_is_task(se)     1
 
 #define for_each_sched_entity(se) \
                for (; se; se = NULL)
@@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
-
+#include "pelt.h"
 #include "sched-pelt.h"
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
@@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
  * To solve this problem, we also cap the util_avg of successive tasks to
  * only 1/2 of the left utilization budget:
  *
- *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
  *
- * where n denotes the nth task.
+ * where n denotes the nth task and cpu_scale the CPU capacity.
  *
- * For example, a simplest series from the beginning would be like:
+ * For example, for a CPU with a capacity of 1024, the simplest series from
+ * the beginning would look like:
  *
  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
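
For reference, the capping series shown above can be reproduced with a small user-space sketch (illustrative only; cpu_scale is assumed to be 1024, as arch_scale_cpu_capacity() would return on a symmetric system):

#include <stdio.h>

/*
 * Illustrative only: reproduce the 512, 256, 128, ... series from the
 * comment above. Each new task is capped to half of the utilization
 * budget left on the cfs_rq.
 */
int main(void)
{
	long cpu_scale = 1024;		/* assumed CPU capacity */
	long cfs_rq_util_avg = 0;	/* runqueue starts idle */
	int n;

	for (n = 1; n <= 7; n++) {
		long cap = (cpu_scale - cfs_rq_util_avg) / 2;

		cfs_rq_util_avg += cap;
		printf("task %d util_avg: %4ld   cfs_rq util_avg: %4ld\n",
		       n, cap, cfs_rq_util_avg);
	}
	return 0;
}
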
@@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
-       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+       long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+       long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
 
        if (cap > 0) {
                if (cfs_rq->avg.util_avg != 0) {
@@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * of each group. Skip other nodes.
                 */
                if (sched_numa_topology_type == NUMA_BACKPLANE &&
-                                       dist > maxdist)
+                                       dist >= maxdist)
                        continue;
 
                /* Add up the faults from nearby nodes. */
@@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
-       unsigned long nr_running;
        unsigned long load;
 
        /* Total compute capacity of CPUs on a node */
        unsigned long compute_capacity;
 
-       /* Approximate capacity in terms of runnable tasks on a node */
-       unsigned long task_capacity;
-       int has_free_capacity;
+       unsigned int nr_running;
 };
 
 /*
@@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
         * the @ns structure is NULL'ed and task_numa_compare() will
         * not find this node attractive.
         *
-        * We'll either bail at !has_free_capacity, or we'll detect a huge
-        * imbalance and bail there.
+        * We'll detect a huge imbalance and bail there.
         */
        if (!cpus)
                return;
@@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
        smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
        capacity = cpus / smt; /* cores */
 
-       ns->task_capacity = min_t(unsigned, capacity,
+       capacity = min_t(unsigned, capacity,
                DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
-       ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
 }
 
 struct task_numa_env {
@@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
        src_capacity = env->src_stats.compute_capacity;
        dst_capacity = env->dst_stats.compute_capacity;
 
-       /* We care about the slope of the imbalance, not the direction. */
-       if (dst_load < src_load)
-               swap(dst_load, src_load);
+       imb = abs(dst_load * src_capacity - src_load * dst_capacity);
 
-       /* Is the difference below the threshold? */
-       imb = dst_load * src_capacity * 100 -
-             src_load * dst_capacity * env->imbalance_pct;
-       if (imb <= 0)
-               return false;
-
-       /*
-        * The imbalance is above the allowed threshold.
-        * Compare it with the old imbalance.
-        */
        orig_src_load = env->src_stats.load;
        orig_dst_load = env->dst_stats.load;
 
-       if (orig_dst_load < orig_src_load)
-               swap(orig_dst_load, orig_src_load);
-
-       old_imb = orig_dst_load * src_capacity * 100 -
-                 orig_src_load * dst_capacity * env->imbalance_pct;
+       old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
 
        /* Would this change make things worse? */
        return (imb > old_imb);
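
The rewritten check compares load-per-capacity ratios by cross-multiplying, which avoids both the divisions and the old percentage threshold. A stand-alone sketch of the new criterion (names are illustrative):

#include <stdbool.h>
#include <stdlib.h>

/*
 * Sketch of the capacity-scaled imbalance check: a move/swap would make
 * things worse if |dst_load/dst_cap - src_load/src_cap| grows, which in
 * integer math (cross-multiplied) becomes the comparison below.
 */
static bool would_worsen_imbalance(long src_load, long dst_load,
				   long orig_src_load, long orig_dst_load,
				   long src_capacity, long dst_capacity)
{
	long imb = labs(dst_load * src_capacity - src_load * dst_capacity);
	long old_imb = labs(orig_dst_load * src_capacity -
			    orig_src_load * dst_capacity);

	return imb > old_imb;
}
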
@@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
  * be exchanged with the source task
  */
 static void task_numa_compare(struct task_numa_env *env,
-                             long taskimp, long groupimp)
+                             long taskimp, long groupimp, bool maymove)
 {
-       struct rq *src_rq = cpu_rq(env->src_cpu);
        struct rq *dst_rq = cpu_rq(env->dst_cpu);
        struct task_struct *cur;
        long src_load, dst_load;
@@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,
        if (cur == env->p)
                goto unlock;
 
+       if (!cur) {
+               if (maymove || imp > env->best_imp)
+                       goto assign;
+               else
+                       goto unlock;
+       }
+
        /*
         * "imp" is the fault differential for the source task between the
         * source and destination node. Calculate the total differential for
         * the source task and potential destination task. The more negative
-        * the value is, the more rmeote accesses that would be expected to
+        * the value is, the more remote accesses that would be expected to
         * be incurred if the tasks were swapped.
         */
-       if (cur) {
-               /* Skip this swap candidate if cannot move to the source CPU: */
-               if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
-                       goto unlock;
+       /* Skip this swap candidate if it cannot move to the source CPU */
+       if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+               goto unlock;
 
+       /*
+        * If dst and source tasks are in the same NUMA group, or not
+        * in any group then look only at task weights.
+        */
+       if (cur->numa_group == env->p->numa_group) {
+               imp = taskimp + task_weight(cur, env->src_nid, dist) -
+                     task_weight(cur, env->dst_nid, dist);
                /*
-                * If dst and source tasks are in the same NUMA group, or not
-                * in any group then look only at task weights.
+                * Add some hysteresis to prevent swapping the
+                * tasks within a group over tiny differences.
                 */
-               if (cur->numa_group == env->p->numa_group) {
-                       imp = taskimp + task_weight(cur, env->src_nid, dist) -
-                             task_weight(cur, env->dst_nid, dist);
-                       /*
-                        * Add some hysteresis to prevent swapping the
-                        * tasks within a group over tiny differences.
-                        */
-                       if (cur->numa_group)
-                               imp -= imp/16;
-               } else {
-                       /*
-                        * Compare the group weights. If a task is all by
-                        * itself (not part of a group), use the task weight
-                        * instead.
-                        */
-                       if (cur->numa_group)
-                               imp += group_weight(cur, env->src_nid, dist) -
-                                      group_weight(cur, env->dst_nid, dist);
-                       else
-                               imp += task_weight(cur, env->src_nid, dist) -
-                                      task_weight(cur, env->dst_nid, dist);
-               }
+               if (cur->numa_group)
+                       imp -= imp / 16;
+       } else {
+               /*
+                * Compare the group weights. If a task is all by itself
+                * (not part of a group), use the task weight instead.
+                */
+               if (cur->numa_group && env->p->numa_group)
+                       imp += group_weight(cur, env->src_nid, dist) -
+                              group_weight(cur, env->dst_nid, dist);
+               else
+                       imp += task_weight(cur, env->src_nid, dist) -
+                              task_weight(cur, env->dst_nid, dist);
        }
 
-       if (imp <= env->best_imp && moveimp <= env->best_imp)
+       if (imp <= env->best_imp)
                goto unlock;
 
-       if (!cur) {
-               /* Is there capacity at our destination? */
-               if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
-                   !env->dst_stats.has_free_capacity)
-                       goto unlock;
-
-               goto balance;
-       }
-
-       /* Balance doesn't matter much if we're running a task per CPU: */
-       if (imp > env->best_imp && src_rq->nr_running == 1 &&
-                       dst_rq->nr_running == 1)
+       if (maymove && moveimp > imp && moveimp > env->best_imp) {
+               imp = moveimp - 1;
+               cur = NULL;
                goto assign;
+       }
 
        /*
         * In the overloaded case, try and keep the load balanced.
         */
-balance:
-       load = task_h_load(env->p);
+       load = task_h_load(env->p) - task_h_load(cur);
+       if (!load)
+               goto assign;
+
        dst_load = env->dst_stats.load + load;
        src_load = env->src_stats.load - load;
 
-       if (moveimp > imp && moveimp > env->best_imp) {
-               /*
-                * If the improvement from just moving env->p direction is
-                * better than swapping tasks around, check if a move is
-                * possible. Store a slightly smaller score than moveimp,
-                * so an actually idle CPU will win.
-                */
-               if (!load_too_imbalanced(src_load, dst_load, env)) {
-                       imp = moveimp - 1;
-                       cur = NULL;
-                       goto assign;
-               }
-       }
-
-       if (imp <= env->best_imp)
-               goto unlock;
-
-       if (cur) {
-               load = task_h_load(cur);
-               dst_load -= load;
-               src_load += load;
-       }
-
        if (load_too_imbalanced(src_load, dst_load, env))
                goto unlock;
 
+assign:
        /*
         * One idle CPU per node is evaluated for a task numa move.
         * Call select_idle_sibling to maybe find a better one.
@@ -1711,7 +1663,6 @@ balance:
                local_irq_enable();
        }
 
-assign:
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
@@ -1720,43 +1671,30 @@ unlock:
 static void task_numa_find_cpu(struct task_numa_env *env,
                                long taskimp, long groupimp)
 {
+       long src_load, dst_load, load;
+       bool maymove = false;
        int cpu;
 
+       load = task_h_load(env->p);
+       dst_load = env->dst_stats.load + load;
+       src_load = env->src_stats.load - load;
+
+       /*
+        * If the improvement from just moving env->p to the destination node
+        * is better than swapping tasks around, check if a move is possible.
+        */
+       maymove = !load_too_imbalanced(src_load, dst_load, env);
+
        for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
                /* Skip this CPU if the source task cannot migrate */
                if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
                        continue;
 
                env->dst_cpu = cpu;
-               task_numa_compare(env, taskimp, groupimp);
+               task_numa_compare(env, taskimp, groupimp, maymove);
        }
 }
 
-/* Only move tasks to a NUMA node less busy than the current node. */
-static bool numa_has_capacity(struct task_numa_env *env)
-{
-       struct numa_stats *src = &env->src_stats;
-       struct numa_stats *dst = &env->dst_stats;
-
-       if (src->has_free_capacity && !dst->has_free_capacity)
-               return false;
-
-       /*
-        * Only consider a task move if the source has a higher load
-        * than the destination, corrected for CPU capacity on each node.
-        *
-        *      src->load                dst->load
-        * --------------------- vs ---------------------
-        * src->compute_capacity    dst->compute_capacity
-        */
-       if (src->load * dst->compute_capacity * env->imbalance_pct >
-
-           dst->load * src->compute_capacity * 100)
-               return true;
-
-       return false;
-}
-
 static int task_numa_migrate(struct task_struct *p)
 {
        struct task_numa_env env = {
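
The task_numa_find_cpu()/task_numa_compare() changes above restructure the decision into "move if the destination node stays balanced, otherwise score a swap". A condensed paraphrase (illustrative only; the real code additionally rejects swaps via load_too_imbalanced() and re-checks idle CPUs with select_idle_sibling()):

/* Paraphrase of the new per-CPU decision in task_numa_compare(). */
enum numa_action { NUMA_SKIP, NUMA_MOVE, NUMA_SWAP };

static enum numa_action numa_decide(long imp, long moveimp, long best_imp,
				    int dst_cpu_idle, int maymove)
{
	if (dst_cpu_idle)
		/* No task to swap with: move if balanced or clearly better. */
		return (maymove || imp > best_imp) ? NUMA_MOVE : NUMA_SKIP;

	if (imp <= best_imp)
		return NUMA_SKIP;

	/* A plain move that scores even better wins over the swap. */
	if (maymove && moveimp > imp && moveimp > best_imp)
		return NUMA_MOVE;

	return NUMA_SWAP;
}
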
@@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
         * elsewhere, so there is no point in (re)trying.
         */
        if (unlikely(!sd)) {
-               p->numa_preferred_nid = task_node(p);
+               sched_setnuma(p, task_node(p));
                return -EINVAL;
        }
 
@@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)
        update_numa_stats(&env.dst_stats, env.dst_nid);
 
        /* Try to find a spot on the preferred nid. */
-       if (numa_has_capacity(&env))
-               task_numa_find_cpu(&env, taskimp, groupimp);
+       task_numa_find_cpu(&env, taskimp, groupimp);
 
        /*
         * Look at other nodes in these cases:
@@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)
                        env.dist = dist;
                        env.dst_nid = nid;
                        update_numa_stats(&env.dst_stats, env.dst_nid);
-                       if (numa_has_capacity(&env))
-                               task_numa_find_cpu(&env, taskimp, groupimp);
+                       task_numa_find_cpu(&env, taskimp, groupimp);
                }
        }
 
@@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
         * trying for a better one later. Do not set the preferred node here.
         */
        if (p->numa_group) {
-               struct numa_group *ng = p->numa_group;
-
                if (env.best_cpu == -1)
                        nid = env.src_nid;
                else
-                       nid = env.dst_nid;
+                       nid = cpu_to_node(env.best_cpu);
 
-               if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
-                       sched_setnuma(p, env.dst_nid);
+               if (nid != p->numa_preferred_nid)
+                       sched_setnuma(p, nid);
        }
 
        /* No better CPU than the current one was found. */
@@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
                return ret;
        }
 
-       ret = migrate_swap(p, env.best_task);
+       ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+
        if (ret != 0)
                trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
        put_task_struct(env.best_task);
@@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
 
 static void task_numa_placement(struct task_struct *p)
 {
-       int seq, nid, max_nid = -1, max_group_nid = -1;
-       unsigned long max_faults = 0, max_group_faults = 0;
+       int seq, nid, max_nid = -1;
+       unsigned long max_faults = 0;
        unsigned long fault_types[2] = { 0, 0 };
        unsigned long total_faults;
        u64 runtime, period;
@@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
                        }
                }
 
-               if (faults > max_faults) {
-                       max_faults = faults;
+               if (!p->numa_group) {
+                       if (faults > max_faults) {
+                               max_faults = faults;
+                               max_nid = nid;
+                       }
+               } else if (group_faults > max_faults) {
+                       max_faults = group_faults;
                        max_nid = nid;
                }
-
-               if (group_faults > max_group_faults) {
-                       max_group_faults = group_faults;
-                       max_group_nid = nid;
-               }
        }
 
-       update_task_scan_period(p, fault_types[0], fault_types[1]);
-
        if (p->numa_group) {
                numa_group_count_active_nodes(p->numa_group);
                spin_unlock_irq(group_lock);
-               max_nid = preferred_group_nid(p, max_group_nid);
+               max_nid = preferred_group_nid(p, max_nid);
        }
 
        if (max_faults) {
                /* Set the new preferred node */
                if (max_nid != p->numa_preferred_nid)
                        sched_setnuma(p, max_nid);
-
-               if (task_node(p) != p->numa_preferred_nid)
-                       numa_migrate_preferred(p);
        }
+
+       update_task_scan_period(p, fault_types[0], fault_types[1]);
 }
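
The preferred-node selection above now keys off group faults whenever the task belongs to a numa_group. A stand-alone restatement (hypothetical fault arrays, illustrative only):

/*
 * Illustrative: choose the preferred node from per-task faults when the
 * task runs alone, and from its numa_group's faults when it is grouped.
 */
static int pick_preferred_nid(const unsigned long *task_faults,
			      const unsigned long *group_faults,
			      int nr_nodes, int in_group)
{
	unsigned long max_faults = 0;
	int nid, max_nid = -1;

	for (nid = 0; nid < nr_nodes; nid++) {
		unsigned long f = in_group ? group_faults[nid] : task_faults[nid];

		if (f > max_faults) {
			max_faults = f;
			max_nid = nid;
		}
	}
	return max_nid;
}
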
 
 static inline int get_numa_group(struct numa_group *grp)
@@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                                numa_is_active_node(mem_node, ng))
                local = 1;
 
-       task_numa_placement(p);
-
        /*
         * Retry task to preferred node migration periodically, in case it
         * previously failed, or the scheduler moved us.
         */
-       if (time_after(jiffies, p->numa_migrate_retry))
+       if (time_after(jiffies, p->numa_migrate_retry)) {
+               task_numa_placement(p);
                numa_migrate_preferred(p);
+       }
 
        if (migrated)
                p->numa_pages_migrated += pages;
@@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 } while (0)
 
 #ifdef CONFIG_SMP
-/*
- * XXX we want to get rid of these helpers and use the full load resolution.
- */
-static inline long se_weight(struct sched_entity *se)
-{
-       return scale_load_down(se->load.weight);
-}
-
-static inline long se_runnable(struct sched_entity *se)
-{
-       return scale_load_down(se->runnable_weight);
-}
-
 static inline void
 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 }
 
 #ifdef CONFIG_SMP
-/*
- * Approximate:
- *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
- */
-static u64 decay_load(u64 val, u64 n)
-{
-       unsigned int local_n;
-
-       if (unlikely(n > LOAD_AVG_PERIOD * 63))
-               return 0;
-
-       /* after bounds checking we can collapse to 32-bit */
-       local_n = n;
-
-       /*
-        * As y^PERIOD = 1/2, we can combine
-        *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
-        * With a look-up table which covers y^n (n<PERIOD)
-        *
-        * To achieve constant time decay_load.
-        */
-       if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
-               val >>= local_n / LOAD_AVG_PERIOD;
-               local_n %= LOAD_AVG_PERIOD;
-       }
-
-       val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
-       return val;
-}
-
-static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
-{
-       u32 c1, c2, c3 = d3; /* y^0 == 1 */
-
-       /*
-        * c1 = d1 y^p
-        */
-       c1 = decay_load((u64)d1, periods);
-
-       /*
-        *            p-1
-        * c2 = 1024 \Sum y^n
-        *            n=1
-        *
-        *              inf        inf
-        *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
-        *              n=0        n=p
-        */
-       c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
-
-       return c1 + c2 + c3;
-}
-
-/*
- * Accumulate the three separate parts of the sum; d1 the remainder
- * of the last (incomplete) period, d2 the span of full periods and d3
- * the remainder of the (incomplete) current period.
- *
- *           d1          d2           d3
- *           ^           ^            ^
- *           |           |            |
- *         |<->|<----------------->|<--->|
- * ... |---x---|------| ... |------|-----x (now)
- *
- *                           p-1
- * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
- *                           n=1
- *
- *    = u y^p +                                        (Step 1)
- *
- *                     p-1
- *      d1 y^p + 1024 \Sum y^n + d3 y^0                (Step 2)
- *                     n=1
- */
-static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
-              unsigned long load, unsigned long runnable, int running)
-{
-       unsigned long scale_freq, scale_cpu;
-       u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
-       u64 periods;
-
-       scale_freq = arch_scale_freq_capacity(cpu);
-       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-       delta += sa->period_contrib;
-       periods = delta / 1024; /* A period is 1024us (~1ms) */
-
-       /*
-        * Step 1: decay old *_sum if we crossed period boundaries.
-        */
-       if (periods) {
-               sa->load_sum = decay_load(sa->load_sum, periods);
-               sa->runnable_load_sum =
-                       decay_load(sa->runnable_load_sum, periods);
-               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
-
-               /*
-                * Step 2
-                */
-               delta %= 1024;
-               contrib = __accumulate_pelt_segments(periods,
-                               1024 - sa->period_contrib, delta);
-       }
-       sa->period_contrib = delta;
-
-       contrib = cap_scale(contrib, scale_freq);
-       if (load)
-               sa->load_sum += load * contrib;
-       if (runnable)
-               sa->runnable_load_sum += runnable * contrib;
-       if (running)
-               sa->util_sum += contrib * scale_cpu;
-
-       return periods;
-}
-
-/*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series.  To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
- *
- * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
- *      p0            p1           p2
- *     (now)       (~1ms ago)  (~2ms ago)
- *
- * Let u_i denote the fraction of p_i that the entity was runnable.
- *
- * We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
- *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
- *
- * We choose y based on the with of a reasonably scheduling period, fixing:
- *   y^32 = 0.5
- *
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
- *
- * When a period "rolls over" and we have new u_0`, multiplying the previous
- * sum again by y is sufficient to update:
- *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
- */
-static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
-                 unsigned long load, unsigned long runnable, int running)
-{
-       u64 delta;
-
-       delta = now - sa->last_update_time;
-       /*
-        * This should only happen when time goes backwards, which it
-        * unfortunately does during sched clock init when we swap over to TSC.
-        */
-       if ((s64)delta < 0) {
-               sa->last_update_time = now;
-               return 0;
-       }
-
-       /*
-        * Use 1024ns as the unit of measurement since it's a reasonable
-        * approximation of 1us and fast to compute.
-        */
-       delta >>= 10;
-       if (!delta)
-               return 0;
-
-       sa->last_update_time += delta << 10;
-
-       /*
-        * running is a subset of runnable (weight) so running can't be set if
-        * runnable is clear. But there are some corner cases where the current
-        * se has been already dequeued but cfs_rq->curr still points to it.
-        * This means that weight will be 0 but not running for a sched_entity
-        * but also for a cfs_rq if the latter becomes idle. As an example,
-        * this happens during idle_balance() which calls
-        * update_blocked_averages()
-        */
-       if (!load)
-               runnable = running = 0;
-
-       /*
-        * Now we know we crossed measurement unit boundaries. The *_avg
-        * accrues by two steps:
-        *
-        * Step 1: accumulate *_sum since last_update_time. If we haven't
-        * crossed period boundaries, finish.
-        */
-       if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
-               return 0;
-
-       return 1;
-}
-
-static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
-{
-       u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
-
-       /*
-        * Step 2: update *_avg.
-        */
-       sa->load_avg = div_u64(load * sa->load_sum, divider);
-       sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
-       sa->util_avg = sa->util_sum / divider;
-}
-
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
-static inline void cfs_se_util_change(struct sched_avg *avg)
-{
-       unsigned int enqueued;
-
-       if (!sched_feat(UTIL_EST))
-               return;
-
-       /* Avoid store if the flag has been already set */
-       enqueued = avg->util_est.enqueued;
-       if (!(enqueued & UTIL_AVG_UNCHANGED))
-               return;
-
-       /* Reset flag to report util_avg has been updated */
-       enqueued &= ~UTIL_AVG_UNCHANGED;
-       WRITE_ONCE(avg->util_est.enqueued, enqueued);
-}
-
-/*
- * sched_entity:
- *
- *   task:
- *     se_runnable() == se_weight()
- *
- *   group: [ see update_cfs_group() ]
- *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
- *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
- *
- *   load_sum := runnable_sum
- *   load_avg = se_weight(se) * runnable_avg
- *
- *   runnable_load_sum := runnable_sum
- *   runnable_load_avg = se_runnable(se) * runnable_avg
- *
- * XXX collapse load_sum and runnable_load_sum
- *
- * cfq_rs:
- *
- *   load_sum = \Sum se_weight(se) * se->avg.load_sum
- *   load_avg = \Sum se->avg.load_avg
- *
- *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
- *   runnable_load_avg = \Sum se->avg.runable_load_avg
- */
-
-static int
-__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
-{
-       if (entity_is_task(se))
-               se->runnable_weight = se->load.weight;
-
-       if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
-               ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
-               return 1;
-       }
-
-       return 0;
-}
-
-static int
-__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       if (entity_is_task(se))
-               se->runnable_weight = se->load.weight;
-
-       if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
-                               cfs_rq->curr == se)) {
-
-               ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
-               cfs_se_util_change(&se->avg);
-               return 1;
-       }
-
-       return 0;
-}
-
-static int
-__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
-{
-       if (___update_load_sum(now, cpu, &cfs_rq->avg,
-                               scale_load_down(cfs_rq->load.weight),
-                               scale_load_down(cfs_rq->runnable_weight),
-                               cfs_rq->curr != NULL)) {
-
-               ___update_load_avg(&cfs_rq->avg, 1, 1);
-               return 1;
-       }
-
-       return 0;
-}
-
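
All of the PELT machinery removed here is being moved out of fair.c (the new #include "pelt.h" earlier in this diff points at its new home). As a reminder of what it computes, a floating-point illustration of the y^32 = 0.5 decay; the kernel itself stays in fixed point via the runnable_avg_yN_inv[] table:

#include <math.h>
#include <stdio.h>

/*
 * Illustrative only: PELT weights the contribution of a period N ms old
 * by y^N, with y chosen so that y^32 == 0.5 (a ~32ms half-life).
 */
int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);
	int n;

	for (n = 0; n <= 128; n += 32)
		printf("weight of a %3dms old period: %.3f\n", n, pow(y, n));
	return 0;
}
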
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 
 #else /* CONFIG_SMP */
 
-static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
-{
-       return 0;
-}
-
 #define UPDATE_TG      0x0
 #define SKIP_AGE_LOAD  0x0
 #define DO_ATTACH      0x0
@@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
               throttled_hierarchy(dest_cfs_rq);
 }
 
-/* updated child weight may affect parent so we have to do this bottom up */
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
        struct rq *rq = data;
@@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 
                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
        }
-
-       sched_avg_update(this_rq);
 }
 
 /* Used instead of source_load when we know the type == 0 */
@@ -6237,6 +5839,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 }
 
 #ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 
 static inline void set_idle_cores(int cpu, int val)
 {
@@ -7294,8 +6897,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
        struct numa_group *numa_group = rcu_dereference(p->numa_group);
-       unsigned long src_faults, dst_faults;
-       int src_nid, dst_nid;
+       unsigned long src_weight, dst_weight;
+       int src_nid, dst_nid, dist;
 
        if (!static_branch_likely(&sched_numa_balancing))
                return -1;
@@ -7322,18 +6925,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
                return 0;
 
        /* Leaving a core idle is often worse than degrading locality. */
-       if (env->idle != CPU_NOT_IDLE)
+       if (env->idle == CPU_IDLE)
                return -1;
 
+       dist = node_distance(src_nid, dst_nid);
        if (numa_group) {
-               src_faults = group_faults(p, src_nid);
-               dst_faults = group_faults(p, dst_nid);
+               src_weight = group_weight(p, src_nid, dist);
+               dst_weight = group_weight(p, dst_nid, dist);
        } else {
-               src_faults = task_faults(p, src_nid);
-               dst_faults = task_faults(p, dst_nid);
+               src_weight = task_weight(p, src_nid, dist);
+               dst_weight = task_weight(p, dst_nid, dist);
        }
 
-       return dst_faults < src_faults;
+       return dst_weight < src_weight;
 }
 
 #else
@@ -7620,6 +7224,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
        return false;
 }
 
+static inline bool others_have_blocked(struct rq *rq)
+{
+       if (READ_ONCE(rq->avg_rt.util_avg))
+               return true;
+
+       if (READ_ONCE(rq->avg_dl.util_avg))
+               return true;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       if (READ_ONCE(rq->avg_irq.util_avg))
+               return true;
+#endif
+
+       return false;
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7679,6 +7299,12 @@ static void update_blocked_averages(int cpu)
                if (cfs_rq_has_blocked(cfs_rq))
                        done = false;
        }
+       update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+       update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+       update_irq_load_avg(rq, 0);
+       /* Don't need periodic decay once load/util_avg are null */
+       if (others_have_blocked(rq))
+               done = false;
 
 #ifdef CONFIG_NO_HZ_COMMON
        rq->last_blocked_load_update_tick = jiffies;
@@ -7744,9 +7370,12 @@ static inline void update_blocked_averages(int cpu)
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+       update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+       update_irq_load_avg(rq, 0);
 #ifdef CONFIG_NO_HZ_COMMON
        rq->last_blocked_load_update_tick = jiffies;
-       if (!cfs_rq_has_blocked(cfs_rq))
+       if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
                rq->has_blocked_load = 0;
 #endif
        rq_unlock_irqrestore(rq, &rf);
@@ -7856,39 +7485,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 static unsigned long scale_rt_capacity(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       u64 total, used, age_stamp, avg;
-       s64 delta;
+       unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+       unsigned long used, free;
+       unsigned long irq;
 
-       /*
-        * Since we're reading these variables without serialization make sure
-        * we read them once before doing sanity checks on them.
-        */
-       age_stamp = READ_ONCE(rq->age_stamp);
-       avg = READ_ONCE(rq->rt_avg);
-       delta = __rq_clock_broken(rq) - age_stamp;
+       irq = cpu_util_irq(rq);
 
-       if (unlikely(delta < 0))
-               delta = 0;
+       if (unlikely(irq >= max))
+               return 1;
 
-       total = sched_avg_period() + delta;
+       used = READ_ONCE(rq->avg_rt.util_avg);
+       used += READ_ONCE(rq->avg_dl.util_avg);
 
-       used = div_u64(avg, total);
+       if (unlikely(used >= max))
+               return 1;
 
-       if (likely(used < SCHED_CAPACITY_SCALE))
-               return SCHED_CAPACITY_SCALE - used;
+       free = max - used;
 
-       return 1;
+       return scale_irq_capacity(free, irq, max);
 }
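
Assuming scale_irq_capacity(free, irq, max) reduces to free * (max - irq) / max (its definition lives in kernel/sched/sched.h, not in this diff), the capacity left over for CFS by the new scale_rt_capacity() can be sketched as:

/*
 * Sketch of the new scale_rt_capacity(): subtract the rt and dl PELT
 * utilization from the CPU's original capacity, then scale the result
 * down by the fraction of time stolen by IRQs (assumption noted in the
 * lead-in above).
 */
static unsigned long cfs_capacity_left(unsigned long max,
				       unsigned long rt_util,
				       unsigned long dl_util,
				       unsigned long irq_util)
{
	unsigned long used = rt_util + dl_util;
	unsigned long free;

	if (irq_util >= max || used >= max)
		return 1;	/* keep a non-zero capacity, as the kernel does */

	free = max - used;
	return free * (max - irq_util) / max;
}
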
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-       unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
+       unsigned long capacity = scale_rt_capacity(cpu);
        struct sched_group *sdg = sd->groups;
 
-       cpu_rq(cpu)->cpu_capacity_orig = capacity;
-
-       capacity *= scale_rt_capacity(cpu);
-       capacity >>= SCHED_CAPACITY_SHIFT;
+       cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
 
        if (!capacity)
                capacity = 1;