cfq-iosched: slice offset should take ioprio into account
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index bfb3967..4a03970 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -26,7 +26,16 @@ static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
 
+/*
+ * grace period before allowing idle class to get disk access
+ */
 #define CFQ_IDLE_GRACE         (HZ / 10)
+
+/*
+ * below this threshold, we consider thinktime immediate
+ */
+#define CFQ_MIN_TT             (2)
+
 #define CFQ_SLICE_SCALE                (5)
 
 #define CFQ_KEY_ASYNC          (0)
@@ -56,16 +65,22 @@ static struct completion *ioc_gone;
 #define ASYNC                  (0)
 #define SYNC                   (1)
 
-#define cfq_cfqq_dispatched(cfqq)      \
-       ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
-
-#define cfq_cfqq_class_sync(cfqq)      ((cfqq)->key != CFQ_KEY_ASYNC)
-
-#define cfq_cfqq_sync(cfqq)            \
-       (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
+#define cfq_cfqq_sync(cfqq)    ((cfqq)->key != CFQ_KEY_ASYNC)
 
 #define sample_valid(samples)  ((samples) > 80)
 
+/*
+ * Most of our rbtree usage is for sorting with min extraction, so
+ * if we cache the leftmost node we don't have to walk down the tree
+ * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
+ * move this into the elevator for the rq sorting as well.
+ */
+struct cfq_rb_root {
+       struct rb_root rb;
+       struct rb_node *left;
+};
+#define CFQ_RB_ROOT    (struct cfq_rb_root) { RB_ROOT, NULL, }
+
 /*
  * Per block device queue structure
  */
@@ -75,10 +90,8 @@ struct cfq_data {
        /*
         * rr list of queues with requests and the count of them
         */
-       struct list_head rr_list[CFQ_PRIO_LISTS];
-       struct list_head busy_rr;
+       struct cfq_rb_root service_tree;
        struct list_head cur_rr;
-       struct list_head idle_rr;
        unsigned int busy_queues;
 
        /*
@@ -97,12 +110,11 @@ struct cfq_data {
 
        struct cfq_queue *active_queue;
        struct cfq_io_context *active_cic;
-       int cur_prio, cur_end_prio;
        unsigned int dispatch_slice;
 
        struct timer_list idle_class_timer;
 
-       sector_t last_sector;
+       sector_t last_position;
        unsigned long last_end_request;
 
        /*
@@ -117,6 +129,9 @@ struct cfq_data {
        unsigned int cfq_slice_idle;
 
        struct list_head cic_list;
+
+       sector_t new_seek_mean;
+       u64 new_seek_total;
 };
 
 /*
@@ -133,6 +148,10 @@ struct cfq_queue {
        unsigned int key;
        /* member of the rr/busy/cur/idle cfqd list */
        struct list_head cfq_list;
+       /* service_tree member */
+       struct rb_node rb_node;
+       /* service_tree key */
+       unsigned long rb_key;
        /* sorted list of pending requests */
        struct rb_root sort_list;
        /* if fifo isn't expired, next request to serve */
@@ -147,11 +166,10 @@ struct cfq_queue {
        struct list_head fifo;
 
        unsigned long slice_end;
-       unsigned long service_last;
        long slice_resid;
 
-       /* number of requests that are on the dispatch list */
-       int on_dispatch[2];
+       /* number of requests that are on the dispatch list or inside driver */
+       int dispatched;
 
        /* io prio of this group */
        unsigned short ioprio, org_ioprio;
@@ -159,6 +177,8 @@ struct cfq_queue {
 
        /* various state flags, see below */
        unsigned int flags;
+
+       sector_t last_request_pos;
 };
 
 enum cfqq_state_flags {
@@ -202,7 +222,7 @@ CFQ_CFQQ_FNS(slice_new);
 
 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_insert(request_queue_t *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t);
 
 /*
  * scheduler run of queue, if there are requests pending and no one in the
@@ -237,28 +257,26 @@ static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
  * if a queue is marked sync and has sync io queued. A sync queue with async
  * io only, should not get full sync slice length.
  */
-static inline int
-cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
+                                unsigned short prio)
 {
-       const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+       const int base_slice = cfqd->cfq_slice[sync];
 
-       WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
+       WARN_ON(prio >= IOPRIO_BE_NR);
 
-       return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
+       return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
+}
+
+static inline int
+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
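
With the defaults visible above (cfq_slice_async = HZ / 25, plus the usual
cfq_slice_sync = HZ / 10), the formula spreads sync slices linearly around
the middle priority. A quick worked example, assuming HZ=1000:

	/*
	 * base_slice = 100 jiffies, base_slice / CFQ_SLICE_SCALE = 20:
	 *
	 *	prio 0: 100 + 20 * (4 - 0) = 180 jiffies
	 *	prio 4: 100 + 20 * (4 - 4) = 100 jiffies
	 *	prio 7: 100 + 20 * (4 - 7) =  40 jiffies
	 */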
 
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
        cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
-       cfqq->slice_end += cfqq->slice_resid;
-
-       /*
-        * Don't carry over residual for more than one slice, we only want
-        * to slightly correct the fairness. Carrying over forever would
-        * easily introduce oscillations.
-        */
-       cfqq->slice_resid = 0;
 }
 
 /*
@@ -307,7 +325,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
        s1 = rq1->sector;
        s2 = rq2->sector;
 
-       last = cfqd->last_sector;
+       last = cfqd->last_position;
 
        /*
         * by definition, 1KiB is 2 sectors
@@ -371,6 +389,26 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
        }
 }
 
+/*
+ * Below is the leftmost-cache rbtree addon
+ */
+static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
+{
+       if (!root->left)
+               root->left = rb_first(&root->rb);
+
+       return root->left;
+}
+
+static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
+{
+       if (root->left == n)
+               root->left = NULL;
+
+       rb_erase(n, &root->rb);
+       RB_CLEAR_NODE(n);
+}
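
A minimal sketch of how the cached-leftmost tree is meant to be consumed
(service() is a stand-in, not a function from this patch); extracting the
minimum avoids a full tree walk except right after an erase:

	struct cfq_rb_root tree = CFQ_RB_ROOT;
	struct rb_node *n;

	while ((n = cfq_rb_first(&tree)) != NULL) {
		struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);

		cfq_rb_erase(n, &tree);	/* clears tree.left; next call re-walks */
		service(cfqq);		/* hypothetical consumer */
	}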
+
 /*
  * would be nice to take fifo expire time into account as well
  */
@@ -398,78 +436,93 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        return cfq_choose_req(cfqd, next, prev);
 }
 
-static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
+static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+                                     struct cfq_queue *cfqq)
 {
-       struct cfq_data *cfqd = cfqq->cfqd;
-       struct list_head *list, *n;
-       struct cfq_queue *__cfqq;
-
        /*
-        * Resorting requires the cfqq to be on the RR list already.
+        * just an approximation, should be ok.
         */
-       if (!cfq_cfqq_on_rr(cfqq))
-               return;
+       return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
+                      cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
+}
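
Tying this to the slice lengths worked out earlier, assuming HZ=1000 defaults
and busy_queues = 4:

	/*
	 * offset = (busy_queues - 1) * (slice(sync, 0) - slice(queue)):
	 *
	 *	sync, prio 0: 3 * (180 - 180) =   0	(leftmost, served first)
	 *	sync, prio 4: 3 * (180 - 100) = 240
	 *	sync, prio 7: 3 * (180 -  40) = 420	(pushed furthest out)
	 *
	 * rb_key = offset + jiffies (+ any slice_resid), so lower priority
	 * sorts later in the service tree.
	 */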
 
-       list_del(&cfqq->cfq_list);
+/*
+ * The cfqd->service_tree holds all pending cfq_queue's that have
+ * requests waiting to be processed. It is sorted in the order that
+ * we will service the queues.
+ */
+static void cfq_service_tree_add(struct cfq_data *cfqd,
+                                   struct cfq_queue *cfqq)
+{
+       struct rb_node **p = &cfqd->service_tree.rb.rb_node;
+       struct rb_node *parent = NULL;
+       unsigned long rb_key;
+       int left;
 
-       if (cfq_class_rt(cfqq))
-               list = &cfqd->cur_rr;
-       else if (cfq_class_idle(cfqq))
-               list = &cfqd->idle_rr;
-       else {
+       rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
+       rb_key += cfqq->slice_resid;
+       cfqq->slice_resid = 0;
+
+       if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
                /*
-                * if cfqq has requests in flight, don't allow it to be
-                * found in cfq_set_active_queue before it has finished them.
-                * this is done to increase fairness between a process that
-                * has lots of io pending vs one that only generates one
-                * sporadically or synchronously
+                * same position, nothing more to do
                 */
-               if (cfq_cfqq_dispatched(cfqq))
-                       list = &cfqd->busy_rr;
-               else
-                       list = &cfqd->rr_list[cfqq->ioprio];
+               if (rb_key == cfqq->rb_key)
+                       return;
+
+               cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
        }
 
-       if (preempted || cfq_cfqq_queue_new(cfqq)) {
-               /*
-                * If this queue was preempted or is new (never been serviced),
-                * let it be added first for fairness but beind other new
-                * queues.
-                */
-               n = list;
-               while (n->next != list) {
-                       __cfqq = list_entry_cfqq(n->next);
-                       if (!cfq_cfqq_queue_new(__cfqq))
-                               break;
+       left = 1;
+       while (*p) {
+               struct cfq_queue *__cfqq;
+               struct rb_node **n;
+
+               parent = *p;
+               __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
-                       n = n->next;
-               }
-               list_add_tail(&cfqq->cfq_list, n);
-       } else if (!cfq_cfqq_class_sync(cfqq)) {
-               /*
-                * async queue always goes to the end. this wont be overly
-                * unfair to writes, as the sort of the sync queue wont be
-                * allowed to pass the async queue again.
-                */
-               list_add_tail(&cfqq->cfq_list, list);
-       } else {
                /*
-                * sort by last service, but don't cross a new or async
-                * queue. we don't cross a new queue because it hasn't been
-                * service before, and we don't cross an async queue because
-                * it gets added to the end on expire.
+                * sort RT queues first, we always want to give
+                * preference to them. IDLE queues go to the back.
+                * after that, sort on the next service time.
                 */
-               n = list;
-               while ((n = n->prev) != list) {
-                       struct cfq_queue *__cfqq = list_entry_cfqq(n);
+               if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
+                       n = &(*p)->rb_left;
+               else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
+                       n = &(*p)->rb_right;
+               else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
+                       n = &(*p)->rb_left;
+               else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
+                       n = &(*p)->rb_right;
+               else if (rb_key < __cfqq->rb_key)
+                       n = &(*p)->rb_left;
+               else
+                       n = &(*p)->rb_right;
 
-                       if (!cfq_cfqq_class_sync(cfqq) || !__cfqq->service_last)
-                               break;
-                       if (time_before(__cfqq->service_last, cfqq->service_last))
-                               break;
-               }
-               list_add(&cfqq->cfq_list, n);
+               if (n == &(*p)->rb_right)
+                       left = 0;
+
+               p = n;
        }
+
+       if (left)
+               cfqd->service_tree.left = &cfqq->rb_node;
+
+       cfqq->rb_key = rb_key;
+       rb_link_node(&cfqq->rb_node, parent, p);
+       rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
+}
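
The insertion loop is equivalent to the following three-level comparator,
shown only to make the ordering explicit (not part of the patch):

	/* true if a should sort before (to the left of) b */
	static int cfqq_before(struct cfq_queue *a, struct cfq_queue *b)
	{
		if (cfq_class_rt(a) != cfq_class_rt(b))
			return cfq_class_rt(a);		/* RT queues first */
		if (cfq_class_idle(a) != cfq_class_idle(b))
			return !cfq_class_idle(a);	/* idle queues last */
		return a->rb_key < b->rb_key;		/* then service time */
	}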
+
+/*
+ * Update cfqq's position in the service tree.
+ */
+static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
+{
+       /*
+        * Resorting requires the cfqq to be on the RR list already.
+        */
+       if (cfq_cfqq_on_rr(cfqq))
+               cfq_service_tree_add(cfqq->cfqd, cfqq);
 }
 
 /*
@@ -486,6 +539,10 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        cfq_resort_rr_list(cfqq, 0);
 }
 
+/*
+ * Called when the cfqq no longer has requests pending, remove it from
+ * the service tree.
+ */
 static inline void
 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
@@ -493,6 +550,9 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        cfq_clear_cfqq_on_rr(cfqq);
        list_del_init(&cfqq->cfq_list);
 
+       if (!RB_EMPTY_NODE(&cfqq->rb_node))
+               cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+
        BUG_ON(!cfqd->busy_queues);
        cfqd->busy_queues--;
 }
@@ -579,6 +639,8 @@ static void cfq_activate_request(request_queue_t *q, struct request *rq)
         */
        if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
                cfqd->hw_tag = 1;
+
+       cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
 }
 
 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
@@ -605,8 +667,7 @@ static void cfq_remove_request(struct request *rq)
        }
 }
 
-static int
-cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
+static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
        struct cfq_data *cfqd = q->elevator->elevator_data;
        struct request *__rq;
@@ -684,6 +745,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
                cfq_clear_cfqq_must_alloc_slice(cfqq);
                cfq_clear_cfqq_fifo_expire(cfqq);
                cfq_mark_cfqq_slice_new(cfqq);
+               cfq_clear_cfqq_queue_new(cfqq);
        }
 
        cfqd->active_queue = cfqq;
@@ -701,7 +763,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
        cfq_clear_cfqq_must_dispatch(cfqq);
        cfq_clear_cfqq_wait_request(cfqq);
-       cfq_clear_cfqq_queue_new(cfqq);
 
        /*
         * store what was left of this slice, if the queue idled out
@@ -733,144 +794,139 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted,
 }
 
 /*
- * 0
- * 0,1
- * 0,1,2
- * 0,1,2,3
- * 0,1,2,3,4
- * 0,1,2,3,4,5
- * 0,1,2,3,4,5,6
- * 0,1,2,3,4,5,6,7
+ * Get next queue for service. Unless we have a queue preemption,
+ * we'll simply select the first cfqq in the service tree.
  */
-static int cfq_get_next_prio_level(struct cfq_data *cfqd)
+static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
-       int prio, wrap;
+       struct cfq_queue *cfqq = NULL;
 
-       prio = -1;
-       wrap = 0;
-       do {
-               int p;
+       if (!list_empty(&cfqd->cur_rr)) {
+               /*
+                * if current list is non-empty, grab first entry.
+                */
+               cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+       } else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
+               struct rb_node *n = cfq_rb_first(&cfqd->service_tree);
 
-               for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
-                       if (!list_empty(&cfqd->rr_list[p])) {
-                               prio = p;
-                               break;
-                       }
-               }
+               cfqq = rb_entry(n, struct cfq_queue, rb_node);
+               if (cfq_class_idle(cfqq)) {
+                       unsigned long end;
 
-               if (prio != -1)
-                       break;
-               cfqd->cur_prio = 0;
-               if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-                       cfqd->cur_end_prio = 0;
-                       if (wrap)
-                               break;
-                       wrap = 1;
+                       /*
+                        * if we have idle queues and no rt or be queues had
+                        * pending requests, either allow immediate service if
+                        * the grace period has passed or arm the idle grace
+                        * timer
+                        */
+                       end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+                       if (time_before(jiffies, end)) {
+                               mod_timer(&cfqd->idle_class_timer, end);
+                               cfqq = NULL;
+                       }
                }
-       } while (1);
-
-       if (unlikely(prio == -1))
-               return -1;
+       }
 
-       BUG_ON(prio >= CFQ_PRIO_LISTS);
+       return cfqq;
+}
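
With CFQ_IDLE_GRACE = HZ / 10 this gives idle-class queues a 100ms exclusion
window (at HZ=1000) after the last rt/be completion:

	/*
	 * last_end_request = T  ->  idle eligible at T + HZ/10
	 *
	 * if called earlier, idle_class_timer is armed for that deadline
	 * and NULL is returned, so rt/be requests arriving in the
	 * meantime are still served first.
	 */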
 
-       list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
+/*
+ * Get and set a new active queue for service.
+ */
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+{
+       struct cfq_queue *cfqq;
 
-       cfqd->cur_prio = prio + 1;
-       if (cfqd->cur_prio > cfqd->cur_end_prio) {
-               cfqd->cur_end_prio = cfqd->cur_prio;
-               cfqd->cur_prio = 0;
-       }
-       if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
-               cfqd->cur_prio = 0;
-               cfqd->cur_end_prio = 0;
-       }
+       cfqq = cfq_get_next_queue(cfqd);
+       __cfq_set_active_queue(cfqd, cfqq);
+       return cfqq;
+}
 
-       return prio;
+static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+                                         struct request *rq)
+{
+       if (rq->sector >= cfqd->last_position)
+               return rq->sector - cfqd->last_position;
+       else
+               return cfqd->last_position - rq->sector;
 }
 
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 {
-       struct cfq_queue *cfqq = NULL;
+       struct cfq_io_context *cic = cfqd->active_cic;
 
-       if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
-               /*
-                * if current list is non-empty, grab first entry. if it is
-                * empty, get next prio level and grab first entry then if any
-                * are spliced
-                */
-               cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-       } else if (!list_empty(&cfqd->busy_rr)) {
-               /*
-                * If no new queues are available, check if the busy list has
-                * some before falling back to idle io.
-                */
-               cfqq = list_entry_cfqq(cfqd->busy_rr.next);
-       } else if (!list_empty(&cfqd->idle_rr)) {
-               /*
-                * if we have idle queues and no rt or be queues had pending
-                * requests, either allow immediate service if the grace period
-                * has passed or arm the idle grace timer
-                */
-               unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+       if (!sample_valid(cic->seek_samples))
+               return 0;
 
-               if (time_after_eq(jiffies, end))
-                       cfqq = list_entry_cfqq(cfqd->idle_rr.next);
-               else
-                       mod_timer(&cfqd->idle_class_timer, end);
-       }
+       return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
+}
 
-       __cfq_set_active_queue(cfqd, cfqq);
-       return cfqq;
+static int cfq_close_cooperator(struct cfq_data *cfq_data,
+                               struct cfq_queue *cfqq)
+{
+       /*
+        * We should notice if some of the queues are cooperating, e.g.
+        * working closely on the same area of the disk. In that case,
+        * we can group them together and don't waste time idling.
+        */
+       return 0;
 }
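
The hook is deliberately a stub here (it always returns 0). Purely as a
hypothetical sketch of the idea in the comment above (none of this is in the
patch), a cooperator scan could walk the service tree for a queue whose next
request lies within the mean seek distance of the current head position:

	/* hypothetical, not implemented by this patch */
	static struct cfq_queue *cfq_find_close_queue(struct cfq_data *cfqd)
	{
		struct rb_node *n;

		/* assumes an active_cic, as cfq_rq_close() itself does */
		for (n = cfq_rb_first(&cfqd->service_tree); n; n = rb_next(n)) {
			struct cfq_queue *__cfqq;

			__cfqq = rb_entry(n, struct cfq_queue, rb_node);
			if (__cfqq->next_rq && cfq_rq_close(cfqd, __cfqq->next_rq))
				return __cfqq;
		}

		return NULL;
	}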
 
-#define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))
+#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
 
-static int cfq_arm_slice_timer(struct cfq_data *cfqd)
+static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
        struct cfq_queue *cfqq = cfqd->active_queue;
        struct cfq_io_context *cic;
        unsigned long sl;
 
        WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
+       WARN_ON(cfq_cfqq_slice_new(cfqq));
 
        /*
         * idle is disabled, either manually or by past process history
         */
-       if (!cfqd->cfq_slice_idle)
-               return 0;
-       if (!cfq_cfqq_idle_window(cfqq))
-               return 0;
+       if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
+               return;
+
        /*
         * task has exited, don't wait
         */
        cic = cfqd->active_cic;
        if (!cic || !cic->ioc->task)
-               return 0;
+               return;
+
+       /*
+        * See if this prio level has a good candidate
+        */
+       if (cfq_close_cooperator(cfqd, cfqq) &&
+           (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
+               return;
 
        cfq_mark_cfqq_must_dispatch(cfqq);
        cfq_mark_cfqq_wait_request(cfqq);
 
-       sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
-
        /*
         * we don't want to idle for seeks, but we do want to allow
         * fair distribution of slice time for a process doing back-to-back
         * seeks. so allow a little bit of time for him to submit a new rq
         */
+       sl = cfqd->cfq_slice_idle;
        if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
-               sl = min(sl, msecs_to_jiffies(2));
+               sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-       return 1;
 }
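
For seeky processes the idle window is clamped hard. Assuming the defaults
above (cfq_slice_idle = HZ / 125, CFQ_MIN_TT = 2) and HZ=1000:

	/*
	 * non-seeky sync queue:	sl = cfq_slice_idle	(8ms)
	 * seeky (CIC_SEEKY):		sl = min(sl, 2ms)
	 *
	 * a seek-bound process still gets a small window to submit its
	 * next request, but the disk is never held idle long on its behalf.
	 */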
 
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
 static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 {
        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
        cfq_remove_request(rq);
-       cfqq->on_dispatch[rq_is_sync(rq)]++;
+       cfqq->dispatched++;
        elv_dispatch_sort(q, rq);
 }
 
@@ -891,13 +947,13 @@ static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
        if (list_empty(&cfqq->fifo))
                return NULL;
 
-       fifo = cfq_cfqq_class_sync(cfqq);
+       fifo = cfq_cfqq_sync(cfqq);
        rq = rq_entry_fifo(cfqq->fifo.next);
 
-       if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
-               return rq;
+       if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
+               return NULL;
 
-       return NULL;
+       return rq;
 }
 
 static inline int
@@ -911,7 +967,8 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 }
 
 /*
- * get next queue for service
+ * Select a queue for service. If we have a current active queue,
+ * check whether to continue servicing it, or retrieve and set a new one.
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
@@ -922,23 +979,26 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
                goto new_queue;
 
        /*
-        * slice has expired
+        * The active queue has run out of time, expire it and select new.
         */
-       if (!cfq_cfqq_must_dispatch(cfqq) && cfq_slice_used(cfqq))
+       if (cfq_slice_used(cfqq))
                goto expire;
 
        /*
-        * if queue has requests, dispatch one. if not, check if
-        * enough slice is left to wait for one
+        * The active queue has requests and isn't expired, allow it to
+        * dispatch.
         */
        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
                goto keep_queue;
-       else if (cfq_cfqq_slice_new(cfqq) || cfq_cfqq_dispatched(cfqq)) {
+
+       /*
+        * No requests pending. If the active queue still has requests in
+        * flight or is idling for a new request, allow either of these
+        * conditions to happen (or time out) before selecting a new queue.
+        */
+       if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) {
                cfqq = NULL;
                goto keep_queue;
-       } else if (cfq_cfqq_class_sync(cfqq)) {
-               if (cfq_arm_slice_timer(cfqd))
-                       return NULL;
        }
 
 expire:
@@ -949,6 +1009,10 @@ keep_queue:
        return cfqq;
 }
 
+/*
+ * Dispatch some requests from cfqq, moving them to the request queue
+ * dispatch list.
+ */
 static int
 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                        int max_dispatch)
@@ -998,35 +1062,47 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        return dispatched;
 }
 
-static int
-cfq_forced_dispatch_cfqqs(struct list_head *list)
+static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
+{
+       int dispatched = 0;
+
+       while (cfqq->next_rq) {
+               cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
+               dispatched++;
+       }
+
+       BUG_ON(!list_empty(&cfqq->fifo));
+       return dispatched;
+}
+
+static int cfq_forced_dispatch_cfqqs(struct list_head *list)
 {
        struct cfq_queue *cfqq, *next;
        int dispatched;
 
        dispatched = 0;
-       list_for_each_entry_safe(cfqq, next, list, cfq_list) {
-               while (cfqq->next_rq) {
-                       cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
-                       dispatched++;
-               }
-               BUG_ON(!list_empty(&cfqq->fifo));
-       }
+       list_for_each_entry_safe(cfqq, next, list, cfq_list)
+               dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 
        return dispatched;
 }
 
-static int
-cfq_forced_dispatch(struct cfq_data *cfqd)
+/*
+ * Drain our current requests. Used for barriers and when switching
+ * io schedulers on-the-fly.
+ */
+static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
-       int i, dispatched = 0;
+       int dispatched = 0;
+       struct rb_node *n;
 
-       for (i = 0; i < CFQ_PRIO_LISTS; i++)
-               dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
+       while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) {
+               struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
+
+               dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+       }
 
-       dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
        dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
-       dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
 
        cfq_slice_expired(cfqd, 0, 0);
 
@@ -1035,11 +1111,10 @@ cfq_forced_dispatch(struct cfq_data *cfqd)
        return dispatched;
 }
 
-static int
-cfq_dispatch_requests(request_queue_t *q, int force)
+static int cfq_dispatch_requests(request_queue_t *q, int force)
 {
        struct cfq_data *cfqd = q->elevator->elevator_data;
-       struct cfq_queue *cfqq, *prev_cfqq;
+       struct cfq_queue *cfqq;
        int dispatched;
 
        if (!cfqd->busy_queues)
@@ -1049,23 +1124,19 @@ cfq_dispatch_requests(request_queue_t *q, int force)
                return cfq_forced_dispatch(cfqd);
 
        dispatched = 0;
-       prev_cfqq = NULL;
        while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
                int max_dispatch;
 
                if (cfqd->busy_queues > 1) {
-                       /*
-                        * Don't repeat dispatch from the previous queue.
-                        */
-                       if (prev_cfqq == cfqq)
-                               break;
-
                        /*
                         * So we have dispatched before in this round, if the
                         * next queue has idling enabled (must be sync), don't
-                        * allow it service until the previous have continued.
+                        * allow it service until the previous have completed.
                         */
-                       if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq))
+                       if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) &&
+                           dispatched)
+                               break;
+                       if (cfqq->dispatched >= cfqd->cfq_quantum)
                                break;
                }
 
@@ -1078,7 +1149,6 @@ cfq_dispatch_requests(request_queue_t *q, int force)
                        max_dispatch = 1;
 
                dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
-               prev_cfqq = cfqq;
        }
 
        return dispatched;
@@ -1111,7 +1181,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
        /*
         * it's on the empty list and still hashed
         */
-       list_del(&cfqq->cfq_list);
        hlist_del(&cfqq->cfq_hash);
        kmem_cache_free(cfq_pool, cfqq);
 }
@@ -1187,10 +1256,6 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
        }
 }
 
-
-/*
- * Called with interrupts disabled
- */
 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
        struct cfq_data *cfqd = cic->key;
@@ -1204,6 +1269,10 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
        }
 }
 
+/*
+ * The process that ioc belongs to has exited; we need to clean up
+ * and put the internal structures we have that belong to that process.
+ */
 static void cfq_exit_io_context(struct io_context *ioc)
 {
        struct cfq_io_context *__cic;
@@ -1280,8 +1349,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
         */
        cfqq->org_ioprio = cfqq->ioprio;
        cfqq->org_ioprio_class = cfqq->ioprio_class;
-
-       cfq_resort_rr_list(cfqq, 0);
        cfq_clear_cfqq_prio_changed(cfqq);
 }
 
@@ -1367,6 +1434,7 @@ retry:
 
                INIT_HLIST_NODE(&cfqq->cfq_hash);
                INIT_LIST_HEAD(&cfqq->cfq_list);
+               RB_CLEAR_NODE(&cfqq->rb_node);
                INIT_LIST_HEAD(&cfqq->fifo);
 
                cfqq->key = key;
@@ -1391,6 +1459,9 @@ out:
        return cfqq;
 }
 
+/*
+ * We drop cfq io contexts lazily, so we may find a dead one.
+ */
 static void
 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
 {
@@ -1520,7 +1591,8 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 }
 
 static void
-cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq)
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+                      struct request *rq)
 {
        sector_t sdist;
        u64 total;
@@ -1530,6 +1602,11 @@ cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq)
        else
                sdist = cic->last_request_pos - rq->sector;
 
+       if (!cic->seek_samples) {
+               cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+               cfqd->new_seek_mean = cfqd->new_seek_total / 256;
+       }
+
        /*
         * Don't allow the seek distance to get too large from the
         * odd fragment, pagein, etc
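
The seek statistics are kept as a decaying average in 1/256-sector fixed
point. A worked first-sample example of the added lines above, assuming a
prior seek_total of 0 and sdist = 1024 sectors:

	/*
	 * new_seek_total = (7 * 0 + 256 * 1024) / 8 = 32768
	 * new_seek_mean  = 32768 / 256              = 128 sectors
	 *
	 * each later sample again contributes 1/8 of its (x256) distance,
	 * so the mean tracks recent behaviour.
	 */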
@@ -1580,13 +1657,16 @@ static int
 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
                   struct request *rq)
 {
-       struct cfq_queue *cfqq = cfqd->active_queue;
-       sector_t dist;
+       struct cfq_queue *cfqq;
 
-       if (cfq_class_idle(new_cfqq))
+       cfqq = cfqd->active_queue;
+       if (!cfqq)
                return 0;
 
-       if (!cfqq)
+       if (cfq_slice_used(cfqq))
+               return 1;
+
+       if (cfq_class_idle(new_cfqq))
                return 0;
 
        if (cfq_class_idle(cfqq))
@@ -1613,12 +1693,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
         * if this request is as-good as one we would expect from the
         * current cfqq, let it preempt
         */
-       if (rq->sector > cfqd->last_sector)
-               dist = rq->sector - cfqd->last_sector;
-       else
-               dist = cfqd->last_sector - rq->sector;
-
-       if (dist <= cfqd->active_cic->seek_mean)
+       if (cfq_rq_close(cfqd, rq))
                return 1;
 
        return 0;
@@ -1637,7 +1712,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
         * so we know that it will be selected next.
         */
        BUG_ON(!cfq_cfqq_on_rr(cfqq));
-       list_move(&cfqq->cfq_list, &cfqd->cur_rr);
+       list_del_init(&cfqq->cfq_list);
+       list_add(&cfqq->cfq_list, &cfqd->cur_rr);
 
        cfqq->slice_end = 0;
        cfq_mark_cfqq_slice_new(cfqq);
@@ -1656,28 +1732,12 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        if (rq_is_meta(rq))
                cfqq->meta_pending++;
 
-       /*
-        * we never wait for an async request and we don't allow preemption
-        * of an async request. so just return early
-        */
-       if (!rq_is_sync(rq)) {
-               /*
-                * sync process issued an async request, if it's waiting
-                * then expire it and kick rq handling.
-                */
-               if (cic == cfqd->active_cic &&
-                   del_timer(&cfqd->idle_slice_timer)) {
-                       cfq_slice_expired(cfqd, 0, 0);
-                       blk_start_queueing(cfqd->queue);
-               }
-               return;
-       }
-
        cfq_update_io_thinktime(cfqd, cic);
-       cfq_update_io_seektime(cic, rq);
+       cfq_update_io_seektime(cfqd, cic, rq);
        cfq_update_idle_window(cfqd, cfqq, cic);
 
        cic->last_request_pos = rq->sector + rq->nr_sectors;
+       cfqq->last_request_pos = cic->last_request_pos;
 
        if (cfqq == cfqd->active_queue) {
                /*
@@ -1726,18 +1786,13 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
        now = jiffies;
 
        WARN_ON(!cfqd->rq_in_driver);
-       WARN_ON(!cfqq->on_dispatch[sync]);
+       WARN_ON(!cfqq->dispatched);
        cfqd->rq_in_driver--;
-       cfqq->on_dispatch[sync]--;
-       cfqq->service_last = now;
-
-       cfqd->last_sector = rq->hard_sector + rq->hard_nr_sectors;
+       cfqq->dispatched--;
 
        if (!cfq_class_idle(cfqq))
                cfqd->last_end_request = now;
 
-       cfq_resort_rr_list(cfqq, 0);
-
        if (sync)
                RQ_CIC(rq)->last_end_request = now;
 
@@ -1752,11 +1807,12 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
                }
                if (cfq_slice_used(cfqq))
                        cfq_slice_expired(cfqd, 0, 1);
-               else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
-                       if (!cfq_arm_slice_timer(cfqd))
-                               cfq_schedule_dispatch(cfqd);
-               }
+               else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
+                       cfq_arm_slice_timer(cfqd);
        }
+
+       if (!cfqd->rq_in_driver)
+               cfq_schedule_dispatch(cfqd);
 }
 
 /*
@@ -1765,9 +1821,6 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
-       const int ioprio_class = cfqq->ioprio_class;
-       const int ioprio = cfqq->ioprio;
-
        if (has_fs_excl()) {
                /*
                 * boost idle prio on transactions that would lock out other
@@ -1786,12 +1839,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
                if (cfqq->ioprio != cfqq->org_ioprio)
                        cfqq->ioprio = cfqq->org_ioprio;
        }
-
-       /*
-        * refile between round-robin lists if we moved the priority class
-        */
-       if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio))
-               cfq_resort_rr_list(cfqq, 0);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
@@ -2029,12 +2076,8 @@ static void *cfq_init_queue(request_queue_t *q)
 
        memset(cfqd, 0, sizeof(*cfqd));
 
-       for (i = 0; i < CFQ_PRIO_LISTS; i++)
-               INIT_LIST_HEAD(&cfqd->rr_list[i]);
-
-       INIT_LIST_HEAD(&cfqd->busy_rr);
+       cfqd->service_tree = CFQ_RB_ROOT;
        INIT_LIST_HEAD(&cfqd->cur_rr);
-       INIT_LIST_HEAD(&cfqd->idle_rr);
        INIT_LIST_HEAD(&cfqd->cic_list);
 
        cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
@@ -2101,7 +2144,6 @@ fail:
 /*
  * sysfs parts below -->
  */
-
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {