kernel/sched_fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
  23 /*
  24  * Targeted preemption latency for CPU-bound tasks:
  25  * (default: 20ms, units: nanoseconds)
  26  *
  27  * NOTE: this latency value is not the same as the concept of
  28  * 'timeslice length' - timeslices in CFS are of variable length.
  29  * (to see the precise effective timeslice length of your workload,
  30  *  run vmstat and monitor the context-switches field)
  31  *
   32  * On SMP systems the value of this is multiplied by (1 + log2 of
   33  * the number of CPUs), i.e. the factor is 2x on 2-way systems,
   34  * 3x on 4-way systems, 4x on 8-way systems, 5x on 16-way systems,
   35  * etc.
  36  */
  37 const_debug unsigned int sysctl_sched_latency = 20000000ULL;
  38
  39 /*
  40  * After fork, child runs first. (default) If set to 0 then
  41  * parent will (try to) run first.
  42  */
  43 const_debug unsigned int sysctl_sched_child_runs_first = 1;
  44
  45 /*
   46  * Number of runnable tasks that fit into one sysctl_sched_latency
   47  * period; with more tasks the period is stretched. (default: 20)
  48  */
  49 const_debug unsigned int sysctl_sched_nr_latency = 20;
  50
  51 /*
  52  * sys_sched_yield() compat mode
  53  *
   54  * This option switches the aggressive yield implementation of the
  55  * old scheduler back on.
  56  */
  57 unsigned int __read_mostly sysctl_sched_compat_yield;
  58
  59 /*
  60  * SCHED_BATCH wake-up granularity.
  61  * (default: 25 msec, units: nanoseconds)
  62  *
  63  * This option delays the preemption effects of decoupled workloads
  64  * and reduces their over-scheduling. Synchronous workloads will still
  65  * have immediate wakeup/sleep latencies.
  66  */
  67 const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL;
  68
  69 /*
  70  * SCHED_OTHER wake-up granularity.
   71  * (default: 2 msec, units: nanoseconds)
  72  *
  73  * This option delays the preemption effects of decoupled workloads
  74  * and reduces their over-scheduling. Synchronous workloads will still
  75  * have immediate wakeup/sleep latencies.
  76  */
  77 const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL;
  78
  79 extern struct sched_class fair_sched_class;
  80
  81 /**************************************************************
  82  * CFS operations on generic schedulable entities:
  83  */
  84
  85 #ifdef CONFIG_FAIR_GROUP_SCHED
  86
  87 /* cpu runqueue to which this cfs_rq is attached */
  88 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  89 {
  90         return cfs_rq->rq;
  91 }
  92
  93 /* An entity is a task if it doesn't "own" a runqueue */
  94 #define entity_is_task(se)      (!se->my_q)
  95
  96 #else   /* CONFIG_FAIR_GROUP_SCHED */
  97
  98 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  99 {
 100         return container_of(cfs_rq, struct rq, cfs);
 101 }
 102
 103 #define entity_is_task(se)      1
 104
 105 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 106
 107 static inline struct task_struct *task_of(struct sched_entity *se)
 108 {
 109         return container_of(se, struct task_struct, se);
 110 }
 111
 112
 113 /**************************************************************
 114  * Scheduling class tree data structure manipulation methods:
 115  */
 116
 117 static inline u64
 118 max_vruntime(u64 min_vruntime, u64 vruntime)
 119 {
 120         s64 delta = (s64)(vruntime - min_vruntime);
 121         if (delta > 0)
 122                 min_vruntime = vruntime;
 123
 124         return min_vruntime;
 125 }
 126
 127 static inline u64
 128 min_vruntime(u64 min_vruntime, u64 vruntime)
 129 {
 130         s64 delta = (s64)(vruntime - min_vruntime);
 131         if (delta < 0)
 132                 min_vruntime = vruntime;
 133
 134         return min_vruntime;
 135 }
 136
 137 static inline s64
 138 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 139 {
 140         return se->vruntime - cfs_rq->min_vruntime;
 141 }
 142
 143 /*
 144  * Enqueue an entity into the rb-tree:
 145  */
 146 static void
 147 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 148 {
 149         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 150         struct rb_node *parent = NULL;
 151         struct sched_entity *entry;
 152         s64 key = entity_key(cfs_rq, se);
 153         int leftmost = 1;
 154
 155         /*
 156          * Find the right place in the rbtree:
 157          */
 158         while (*link) {
 159                 parent = *link;
 160                 entry = rb_entry(parent, struct sched_entity, run_node);
 161                 /*
 162                  * We dont care about collisions. Nodes with
 163                  * the same key stay together.
 164                  */
 165                 if (key < entity_key(cfs_rq, entry)) {
 166                         link = &parent->rb_left;
 167                 } else {
 168                         link = &parent->rb_right;
 169                         leftmost = 0;
 170                 }
 171         }
 172
 173         /*
 174          * Maintain a cache of leftmost tree entries (it is frequently
 175          * used):
 176          */
 177         if (leftmost)
 178                 cfs_rq->rb_leftmost = &se->run_node;
 179
 180         rb_link_node(&se->run_node, parent, link);
 181         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 182 }
 183
 184 static void
 185 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 186 {
 187         if (cfs_rq->rb_leftmost == &se->run_node)
 188                 cfs_rq->rb_leftmost = rb_next(&se->run_node);
 189
 190         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 191 }
 192
 193 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
 194 {
 195         return cfs_rq->rb_leftmost;
 196 }
 197
 198 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 199 {
 200         return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
 201 }
 202
 203 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 204 {
 205         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 206         struct sched_entity *se = NULL;
 207         struct rb_node *parent;
 208
 209         while (*link) {
 210                 parent = *link;
 211                 se = rb_entry(parent, struct sched_entity, run_node);
 212                 link = &parent->rb_right;
 213         }
 214
 215         return se;
 216 }
 217
 218 /**************************************************************
 219  * Scheduling class statistics methods:
 220  */
 221
 222 static u64 __sched_period(unsigned long nr_running)
 223 {
 224         u64 period = sysctl_sched_latency;
 225         unsigned long nr_latency = sysctl_sched_nr_latency;
 226
 227         if (unlikely(nr_running > nr_latency)) {
 228                 period *= nr_running;
 229                 do_div(period, nr_latency);
 230         }
 231
 232         return period;
 233 }
 234
 235 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 236 {
 237         u64 period = __sched_period(cfs_rq->nr_running);
 238
 239         period *= se->load.weight;
 240         do_div(period, cfs_rq->load.weight);
 241
 242         return period;
 243 }
 244
 245 static u64 __sched_vslice(unsigned long nr_running)
 246 {
 247         unsigned long period = sysctl_sched_latency;
 248         unsigned long nr_latency = sysctl_sched_nr_latency;
 249
 250         if (unlikely(nr_running > nr_latency))
 251                 nr_running = nr_latency;
 252
 253         period /= nr_running;
 254
 255         return (u64)period;
 256 }
 257
 258 /*
 259  * Update the current task's runtime statistics. Skip current tasks that
 260  * are not in our scheduling class.
 261  */
 262 static inline void
 263 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 264               unsigned long delta_exec)
 265 {
 266         unsigned long delta_exec_weighted;
 267         u64 vruntime;
 268
 269         schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 270
 271         curr->sum_exec_runtime += delta_exec;
 272         schedstat_add(cfs_rq, exec_clock, delta_exec);
 273         delta_exec_weighted = delta_exec;
 274         if (unlikely(curr->load.weight != NICE_0_LOAD)) {
 275                 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
 276                                                         &curr->load);
 277         }
 278         curr->vruntime += delta_exec_weighted;
 279
 280         /*
 281          * maintain cfs_rq->min_vruntime to be a monotonic increasing
 282          * value tracking the leftmost vruntime in the tree.
 283          */
 284         if (first_fair(cfs_rq)) {
 285                 vruntime = min_vruntime(curr->vruntime,
 286                                 __pick_next_entity(cfs_rq)->vruntime);
 287         } else
 288                 vruntime = curr->vruntime;
 289
 290         cfs_rq->min_vruntime =
 291                 max_vruntime(cfs_rq->min_vruntime, vruntime);
 292 }
 293
 294 static void update_curr(struct cfs_rq *cfs_rq)
 295 {
 296         struct sched_entity *curr = cfs_rq->curr;
 297         u64 now = rq_of(cfs_rq)->clock;
 298         unsigned long delta_exec;
 299
 300         if (unlikely(!curr))
 301                 return;
 302
 303         /*
 304          * Get the amount of time the current task was running
 305          * since the last time we changed load (this cannot
 306          * overflow on 32 bits):
 307          */
 308         delta_exec = (unsigned long)(now - curr->exec_start);
 309
 310         __update_curr(cfs_rq, curr, delta_exec);
 311         curr->exec_start = now;
 312 }
 313
 314 static inline void
 315 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 316 {
 317         schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
 318 }
 319
 320 static inline unsigned long
 321 calc_weighted(unsigned long delta, struct sched_entity *se)
 322 {
 323         unsigned long weight = se->load.weight;
 324
 325         if (unlikely(weight != NICE_0_LOAD))
 326                 return (u64)delta * se->load.weight >> NICE_0_SHIFT;
 327         else
 328                 return delta;
 329 }
 330
 331 /*
 332  * Task is being enqueued - update stats:
 333  */
 334 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 335 {
 336         /*
 337          * Are we enqueueing a waiting task? (for current tasks
 338          * a dequeue/enqueue event is a NOP)
 339          */
 340         if (se != cfs_rq->curr)
 341                 update_stats_wait_start(cfs_rq, se);
 342 }
 343
 344 static void
 345 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 346 {
 347         schedstat_set(se->wait_max, max(se->wait_max,
 348                         rq_of(cfs_rq)->clock - se->wait_start));
 349         schedstat_set(se->wait_start, 0);
 350 }
 351
 352 static inline void
 353 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 354 {
 355         update_curr(cfs_rq);
 356         /*
 357          * Mark the end of the wait period if dequeueing a
 358          * waiting task:
 359          */
 360         if (se != cfs_rq->curr)
 361                 update_stats_wait_end(cfs_rq, se);
 362 }
 363
 364 /*
 365  * We are picking a new current task - update its stats:
 366  */
 367 static inline void
 368 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 369 {
 370         /*
 371          * We are starting a new run period:
 372          */
 373         se->exec_start = rq_of(cfs_rq)->clock;
 374 }
 375
 376 /*
 377  * We are descheduling a task - update its stats:
 378  */
 379 static inline void
 380 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 381 {
 382         se->exec_start = 0;
 383 }
 384
 385 /**************************************************
 386  * Scheduling class queueing methods:
 387  */
 388
 389 static void
 390 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 391 {
 392         update_load_add(&cfs_rq->load, se->load.weight);
 393         cfs_rq->nr_running++;
 394         se->on_rq = 1;
 395 }
 396
 397 static void
 398 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 399 {
 400         update_load_sub(&cfs_rq->load, se->load.weight);
 401         cfs_rq->nr_running--;
 402         se->on_rq = 0;
 403 }
 404
 405 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 406 {
 407 #ifdef CONFIG_SCHEDSTATS
 408         if (se->sleep_start) {
 409                 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
 410
 411                 if ((s64)delta < 0)
 412                         delta = 0;
 413
 414                 if (unlikely(delta > se->sleep_max))
 415                         se->sleep_max = delta;
 416
 417                 se->sleep_start = 0;
 418                 se->sum_sleep_runtime += delta;
 419         }
 420         if (se->block_start) {
 421                 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
 422
 423                 if ((s64)delta < 0)
 424                         delta = 0;
 425
 426                 if (unlikely(delta > se->block_max))
 427                         se->block_max = delta;
 428
 429                 se->block_start = 0;
 430                 se->sum_sleep_runtime += delta;
 431
 432                 /*
 433                  * Blocking time is in units of nanosecs, so shift by 20 to
 434                  * get a milliseconds-range estimation of the amount of
 435                  * time that the task spent sleeping:
 436                  */
 437                 if (unlikely(prof_on == SLEEP_PROFILING)) {
 438                         struct task_struct *tsk = task_of(se);
 439
 440                         profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 441                                      delta >> 20);
 442                 }
 443         }
 444 #endif
 445 }
 446
 447 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 448 {
 449 #ifdef CONFIG_SCHED_DEBUG
 450         s64 d = se->vruntime - cfs_rq->min_vruntime;
 451
 452         if (d < 0)
 453                 d = -d;
 454
 455         if (d > 3*sysctl_sched_latency)
 456                 schedstat_inc(cfs_rq, nr_spread_over);
 457 #endif
 458 }
 459
 460 static void
 461 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 462 {
 463         u64 vruntime;
 464
 465         vruntime = cfs_rq->min_vruntime;
 466
 467         if (sched_feat(USE_TREE_AVG)) {
 468                 struct sched_entity *last = __pick_last_entity(cfs_rq);
 469                 if (last) {
 470                         vruntime += last->vruntime;
 471                         vruntime >>= 1;
 472                 }
 473         } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 474                 vruntime += __sched_vslice(cfs_rq->nr_running)/2;
 475
 476         if (initial && sched_feat(START_DEBIT))
 477                 vruntime += __sched_vslice(cfs_rq->nr_running + 1);
 478
 479         if (!initial) {
 480                 if (sched_feat(NEW_FAIR_SLEEPERS))
 481                         vruntime -= sysctl_sched_latency;
 482
 483                 vruntime = max_t(s64, vruntime, se->vruntime);
 484         }
 485
 486         se->vruntime = vruntime;
 487
 488 }
 489
 490 static void
 491 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 492 {
 493         /*
 494          * Update the fair clock.
 495          */
 496         update_curr(cfs_rq);
 497
 498         if (wakeup) {
 499                 /* se->vruntime += cfs_rq->min_vruntime; */
 500                 place_entity(cfs_rq, se, 0);
 501                 enqueue_sleeper(cfs_rq, se);
 502         }
 503
 504         update_stats_enqueue(cfs_rq, se);
 505         check_spread(cfs_rq, se);
 506         if (se != cfs_rq->curr)
 507                 __enqueue_entity(cfs_rq, se);
 508         account_entity_enqueue(cfs_rq, se);
 509 }
 510
 511 static void
 512 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 513 {
 514         update_stats_dequeue(cfs_rq, se);
 515         if (sleep) {
 516 #ifdef CONFIG_SCHEDSTATS
 517                 if (entity_is_task(se)) {
 518                         struct task_struct *tsk = task_of(se);
 519
 520                         if (tsk->state & TASK_INTERRUPTIBLE)
 521                                 se->sleep_start = rq_of(cfs_rq)->clock;
 522                         if (tsk->state & TASK_UNINTERRUPTIBLE)
 523                                 se->block_start = rq_of(cfs_rq)->clock;
 524                 }
 525 #endif
 526         }
 527
 528         if (se != cfs_rq->curr)
 529                 __dequeue_entity(cfs_rq, se);
 530         account_entity_dequeue(cfs_rq, se);
 531 }
 532
 533 /*
 534  * Preempt the current task with a newly woken task if needed:
 535  */
 536 static void
 537 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 538 {
 539         unsigned long ideal_runtime, delta_exec;
 540
 541         ideal_runtime = sched_slice(cfs_rq, curr);
 542         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 543         if (delta_exec > ideal_runtime)
 544                 resched_task(rq_of(cfs_rq)->curr);
 545 }
 546
 547 static void
 548 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 549 {
 550         /* 'current' is not kept within the tree. */
 551         if (se->on_rq) {
 552                 /*
 553                  * Any task has to be enqueued before it get to execute on
 554                  * a CPU. So account for the time it spent waiting on the
 555                  * runqueue.
 556                  */
 557                 update_stats_wait_end(cfs_rq, se);
 558                 __dequeue_entity(cfs_rq, se);
 559         }
 560
 561         update_stats_curr_start(cfs_rq, se);
 562         cfs_rq->curr = se;
 563 #ifdef CONFIG_SCHEDSTATS
 564         /*
 565          * Track our maximum slice length, if the CPU's load is at
 566          * least twice that of our own weight (i.e. dont track it
 567          * when there are only lesser-weight tasks around):
 568          */
 569         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 570                 se->slice_max = max(se->slice_max,
 571                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
 572         }
 573 #endif
 574         se->prev_sum_exec_runtime = se->sum_exec_runtime;
 575 }
 576
 577 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 578 {
 579         struct sched_entity *se = __pick_next_entity(cfs_rq);
 580
 581         set_next_entity(cfs_rq, se);
 582
 583         return se;
 584 }
 585
 586 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 587 {
 588         /*
 589          * If still on the runqueue then deactivate_task()
 590          * was not called and update_curr() has to be done:
 591          */
 592         if (prev->on_rq)
 593                 update_curr(cfs_rq);
 594
 595         update_stats_curr_end(cfs_rq, prev);
 596
 597         check_spread(cfs_rq, prev);
 598         if (prev->on_rq) {
 599                 update_stats_wait_start(cfs_rq, prev);
 600                 /* Put 'current' back into the tree. */
 601                 __enqueue_entity(cfs_rq, prev);
 602         }
 603         cfs_rq->curr = NULL;
 604 }
 605
 606 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 607 {
 608         /*
 609          * Update run-time statistics of the 'current'.
 610          */
 611         update_curr(cfs_rq);
 612
 613         if (cfs_rq->nr_running > 1)
 614                 check_preempt_tick(cfs_rq, curr);
 615 }
 616
 617 /**************************************************
 618  * CFS operations on tasks:
 619  */
 620
 621 #ifdef CONFIG_FAIR_GROUP_SCHED
 622
 623 /* Walk up scheduling entities hierarchy */
 624 #define for_each_sched_entity(se) \
 625                 for (; se; se = se->parent)
 626
 627 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 628 {
 629         return p->se.cfs_rq;
 630 }
 631
 632 /* runqueue on which this entity is (to be) queued */
 633 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 634 {
 635         return se->cfs_rq;
 636 }
 637
 638 /* runqueue "owned" by this group */
 639 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 640 {
 641         return grp->my_q;
 642 }
 643
 644 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 645  * another cpu ('this_cpu')
 646  */
 647 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 648 {
 649         return cfs_rq->tg->cfs_rq[this_cpu];
 650 }
 651
 652 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 653 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 654         list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 655
 656 /* Do the two (enqueued) tasks belong to the same group ? */
 657 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
 658 {
 659         if (curr->se.cfs_rq == p->se.cfs_rq)
 660                 return 1;
 661
 662         return 0;
 663 }
 664
 665 #else   /* CONFIG_FAIR_GROUP_SCHED */
 666
 667 #define for_each_sched_entity(se) \
 668                 for (; se; se = NULL)
 669
 670 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 671 {
 672         return &task_rq(p)->cfs;
 673 }
 674
 675 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 676 {
 677         struct task_struct *p = task_of(se);
 678         struct rq *rq = task_rq(p);
 679
 680         return &rq->cfs;
 681 }
 682
 683 /* runqueue "owned" by this group */
 684 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 685 {
 686         return NULL;
 687 }
 688
 689 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 690 {
 691         return &cpu_rq(this_cpu)->cfs;
 692 }
 693
 694 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 695                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 696
 697 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
 698 {
 699         return 1;
 700 }
 701
 702 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 703
 704 /*
 705  * The enqueue_task method is called before nr_running is
 706  * increased. Here we update the fair scheduling stats and
 707  * then put the task into the rbtree:
 708  */
 709 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 710 {
 711         struct cfs_rq *cfs_rq;
 712         struct sched_entity *se = &p->se;
 713
 714         for_each_sched_entity(se) {
 715                 if (se->on_rq)
 716                         break;
 717                 cfs_rq = cfs_rq_of(se);
 718                 enqueue_entity(cfs_rq, se, wakeup);
 719         }
 720 }
 721
 722 /*
 723  * The dequeue_task method is called before nr_running is
 724  * decreased. We remove the task from the rbtree and
 725  * update the fair scheduling stats:
 726  */
 727 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 728 {
 729         struct cfs_rq *cfs_rq;
 730         struct sched_entity *se = &p->se;
 731
 732         for_each_sched_entity(se) {
 733                 cfs_rq = cfs_rq_of(se);
 734                 dequeue_entity(cfs_rq, se, sleep);
 735                 /* Don't dequeue parent if it has other entities besides us */
 736                 if (cfs_rq->load.weight)
 737                         break;
 738         }
 739 }
 740
 741 /*
 742  * sched_yield() support is very simple - we dequeue and enqueue.
 743  *
 744  * If compat_yield is turned on then we requeue to the end of the tree.
 745  */
 746 static void yield_task_fair(struct rq *rq)
 747 {
 748         struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
 749         struct sched_entity *rightmost, *se = &rq->curr->se;
 750
 751         /*
 752          * Are we the only task in the tree?
 753          */
 754         if (unlikely(cfs_rq->nr_running == 1))
 755                 return;
 756
 757         if (likely(!sysctl_sched_compat_yield)) {
 758                 __update_rq_clock(rq);
 759                 /*
 760                  * Dequeue and enqueue the task to update its
 761                  * position within the tree:
 762                  */
 763                 update_curr(cfs_rq);
 764
 765                 return;
 766         }
 767         /*
 768          * Find the rightmost entry in the rbtree:
 769          */
 770         rightmost = __pick_last_entity(cfs_rq);
 771         /*
 772          * Already in the rightmost position?
 773          */
 774         if (unlikely(rightmost->vruntime < se->vruntime))
 775                 return;
 776
 777         /*
 778          * Minimally necessary key value to be last in the tree:
 779          * Upon rescheduling, sched_class::put_prev_task() will place
 780          * 'current' within the tree based on its new key value.
 781          */
 782         se->vruntime = rightmost->vruntime + 1;
 783 }
 784
 785 /*
 786  * Preempt the current task with a newly woken task if needed:
 787  */
 788 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 789 {
 790         struct task_struct *curr = rq->curr;
 791         struct cfs_rq *cfs_rq = task_cfs_rq(curr), *pcfs_rq;
 792         struct sched_entity *se = &curr->se, *pse = &p->se;
 793
 794         if (unlikely(rt_prio(p->prio))) {
 795                 update_rq_clock(rq);
 796                 update_curr(cfs_rq);
 797                 resched_task(curr);
 798                 return;
 799         }
 800
 801         for_each_sched_entity(se) {
 802                 cfs_rq = cfs_rq_of(se);
 803                 pcfs_rq = cfs_rq_of(pse);
 804
 805                 if (cfs_rq == pcfs_rq) {
 806                         s64 delta = se->vruntime - pse->vruntime;
 807
 808                         if (delta > (s64)sysctl_sched_wakeup_granularity)
 809                                 resched_task(curr);
 810                         break;
 811                 }
 812 #ifdef CONFIG_FAIR_GROUP_SCHED
 813                 pse = pse->parent;
 814 #endif
 815         }
 816 }
 817
 818 static struct task_struct *pick_next_task_fair(struct rq *rq)
 819 {
 820         struct cfs_rq *cfs_rq = &rq->cfs;
 821         struct sched_entity *se;
 822
 823         if (unlikely(!cfs_rq->nr_running))
 824                 return NULL;
 825
 826         do {
 827                 se = pick_next_entity(cfs_rq);
 828                 cfs_rq = group_cfs_rq(se);
 829         } while (cfs_rq);
 830
 831         return task_of(se);
 832 }
 833
 834 /*
 835  * Account for a descheduled task:
 836  */
 837 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 838 {
 839         struct sched_entity *se = &prev->se;
 840         struct cfs_rq *cfs_rq;
 841
 842         for_each_sched_entity(se) {
 843                 cfs_rq = cfs_rq_of(se);
 844                 put_prev_entity(cfs_rq, se);
 845         }
 846 }
 847
 848 /**************************************************
 849  * Fair scheduling class load-balancing methods:
 850  */
 851
 852 /*
 853  * Load-balancing iterator. Note: while the runqueue stays locked
 854  * during the whole iteration, the current task might be
 855  * dequeued so the iterator has to be dequeue-safe. Here we
 856  * achieve that by always pre-iterating before returning
 857  * the current task:
 858  */
 859 static inline struct task_struct *
 860 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 861 {
 862         struct task_struct *p;
 863
 864         if (!curr)
 865                 return NULL;
 866
 867         p = rb_entry(curr, struct task_struct, se.run_node);
 868         cfs_rq->rb_load_balance_curr = rb_next(curr);
 869
 870         return p;
 871 }
 872
 873 static struct task_struct *load_balance_start_fair(void *arg)
 874 {
 875         struct cfs_rq *cfs_rq = arg;
 876
 877         return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 878 }
 879
 880 static struct task_struct *load_balance_next_fair(void *arg)
 881 {
 882         struct cfs_rq *cfs_rq = arg;
 883
 884         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 885 }
 886
 887 #ifdef CONFIG_FAIR_GROUP_SCHED
 888 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 889 {
 890         struct sched_entity *curr;
 891         struct task_struct *p;
 892
 893         if (!cfs_rq->nr_running)
 894                 return MAX_PRIO;
 895
 896         curr = cfs_rq->curr;
 897         if (!curr)
 898                 curr = __pick_next_entity(cfs_rq);
 899
 900         p = task_of(curr);
 901
 902         return p->prio;
 903 }
 904 #endif
 905
 906 static unsigned long
 907 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 908                   unsigned long max_nr_move, unsigned long max_load_move,
 909                   struct sched_domain *sd, enum cpu_idle_type idle,
 910                   int *all_pinned, int *this_best_prio)
 911 {
 912         struct cfs_rq *busy_cfs_rq;
 913         unsigned long load_moved, total_nr_moved = 0, nr_moved;
 914         long rem_load_move = max_load_move;
 915         struct rq_iterator cfs_rq_iterator;
 916
 917         cfs_rq_iterator.start = load_balance_start_fair;
 918         cfs_rq_iterator.next = load_balance_next_fair;
 919
 920         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 921 #ifdef CONFIG_FAIR_GROUP_SCHED
 922                 struct cfs_rq *this_cfs_rq;
 923                 long imbalance;
 924                 unsigned long maxload;
 925
 926                 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 927
 928                 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
 929                 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
 930                 if (imbalance <= 0)
 931                         continue;
 932
 933                 /* Don't pull more than imbalance/2 */
 934                 imbalance /= 2;
 935                 maxload = min(rem_load_move, imbalance);
 936
 937                 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
 938 #else
 939 # define maxload rem_load_move
 940 #endif
 941                 /* pass busy_cfs_rq argument into
 942                  * load_balance_[start|next]_fair iterators
 943                  */
 944                 cfs_rq_iterator.arg = busy_cfs_rq;
 945                 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
 946                                 max_nr_move, maxload, sd, idle, all_pinned,
 947                                 &load_moved, this_best_prio, &cfs_rq_iterator);
 948
 949                 total_nr_moved += nr_moved;
 950                 max_nr_move -= nr_moved;
 951                 rem_load_move -= load_moved;
 952
 953                 if (max_nr_move <= 0 || rem_load_move <= 0)
 954                         break;
 955         }
 956
 957         return max_load_move - rem_load_move;
 958 }
 959
 960 /*
 961  * scheduler tick hitting a task of our scheduling class:
 962  */
 963 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 964 {
 965         struct cfs_rq *cfs_rq;
 966         struct sched_entity *se = &curr->se;
 967
 968         for_each_sched_entity(se) {
 969                 cfs_rq = cfs_rq_of(se);
 970                 entity_tick(cfs_rq, se);
 971         }
 972 }
 973
 974 #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
 975
 976 /*
 977  * Share the fairness runtime between parent and child, thus the
 978  * total amount of pressure for CPU stays equal - new tasks
 979  * get a chance to run but frequent forkers are not allowed to
 980  * monopolize the CPU. Note: the parent runqueue is locked,
 981  * the child is not running yet.
 982  */
 983 static void task_new_fair(struct rq *rq, struct task_struct *p)
 984 {
 985         struct cfs_rq *cfs_rq = task_cfs_rq(p);
 986         struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 987
 988         sched_info_queued(p);
 989
 990         update_curr(cfs_rq);
 991         place_entity(cfs_rq, se, 1);
 992
 993         if (sysctl_sched_child_runs_first &&
 994                         curr->vruntime < se->vruntime) {
 995                 /*
 996                  * Upon rescheduling, sched_class::put_prev_task() will place
 997                  * 'current' within the tree based on its new key value.
 998                  */
 999                 swap(curr->vruntime, se->vruntime);
1000         }
1001
1002         update_stats_enqueue(cfs_rq, se);
1003         check_spread(cfs_rq, se);
1004         check_spread(cfs_rq, curr);
1005         __enqueue_entity(cfs_rq, se);
1006         account_entity_enqueue(cfs_rq, se);
1007         resched_task(rq->curr);
1008 }
1009
1010 /* Account for a task changing its policy or group.
1011  *
1012  * This routine is mostly called to set cfs_rq->curr field when a task
1013  * migrates between groups/classes.
1014  */
1015 static void set_curr_task_fair(struct rq *rq)
1016 {
1017         struct sched_entity *se = &rq->curr->se;
1018
1019         for_each_sched_entity(se)
1020                 set_next_entity(cfs_rq_of(se), se);
1021 }
1022
1023 /*
1024  * All the scheduling class methods:
1025  */
1026 struct sched_class fair_sched_class __read_mostly = {
1027         .enqueue_task           = enqueue_task_fair,
1028         .dequeue_task           = dequeue_task_fair,
1029         .yield_task             = yield_task_fair,
1030
1031         .check_preempt_curr     = check_preempt_wakeup,
1032
1033         .pick_next_task         = pick_next_task_fair,
1034         .put_prev_task          = put_prev_task_fair,
1035
1036         .load_balance           = load_balance_fair,
1037
1038         .set_curr_task          = set_curr_task_fair,
1039         .task_tick              = task_tick_fair,
1040         .task_new               = task_new_fair,
1041 };
1042
#ifdef CONFIG_SCHED_DEBUG
/*
 * Print the CFS runqueue statistics of one cpu: with group scheduling
 * the runqueue's own cfs_rq first, then every leaf cfs_rq.
 */
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct cfs_rq *cfs_rq;

#ifdef CONFIG_FAIR_GROUP_SCHED
	print_cfs_rq(m, cpu, &rq->cfs);
#endif
	for_each_leaf_cfs_rq(rq, cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
}
#endif