[PATCH] md: add a ->congested_fn function for raid5/6
[powerpc.git] / drivers / md / raid5.c
index 9ba7307..37e4ff6 100644 (file)
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches.  Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
+ *    new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is seq_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ *   we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment seq_flush, thus closing the current
+ *   batch.
+ * When we notice that seq_flush > seq_write, we write out all pending updates
+ * to the bitmap, and advance seq_write to where seq_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ * miss any bits.
+ */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
@@ -89,12 +112,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                BUG_ON(!list_empty(&sh->lru));
                BUG_ON(atomic_read(&conf->active_stripes)==0);
                if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                       if (test_bit(STRIPE_DELAYED, &sh->state))
+                       if (test_bit(STRIPE_DELAYED, &sh->state)) {
                                list_add_tail(&sh->lru, &conf->delayed_list);
-                       else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                conf->seq_write == sh->bm_seq)
+                               blk_plug_device(conf->mddev->queue);
+                       } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                  sh->bm_seq - conf->seq_write > 0) {
                                list_add_tail(&sh->lru, &conf->bitmap_list);
-                       else {
+                               blk_plug_device(conf->mddev->queue);
+                       } else {
                                clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                list_add_tail(&sh->lru, &conf->handle_list);
                        }
@@ -271,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                                                     < (conf->max_nr_stripes *3/4)
                                                     || !conf->inactive_blocked),
                                                    conf->device_lock,
-                                                   unplug_slaves(conf->mddev)
+                                                   raid5_unplug_device(conf->mddev->queue)
                                        );
                                conf->inactive_blocked = 0;
                        } else
@@ -282,7 +307,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
-                               if (list_empty(&sh->lru))
+                               if (list_empty(&sh->lru) &&
+                                   !test_bit(STRIPE_EXPANDING, &sh->state))
                                        BUG();
                                list_del_init(&sh->lru);
                        }
@@ -497,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
        raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks, i;
        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+       char b[BDEVNAME_SIZE];
+       mdk_rdev_t *rdev;
 
        if (bi->bi_size)
                return 1;
@@ -544,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                set_bit(R5_UPTODATE, &sh->dev[i].flags);
 #endif
                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-                       printk(KERN_INFO "raid5: read error corrected!!\n");
+                       rdev = conf->disks[i].rdev;
+                       printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+                              mdname(conf->mddev), STRIPE_SECTORS,
+                              (unsigned long long)sh->sector + rdev->data_offset,
+                              bdevname(rdev->bdev, b));
                        clear_bit(R5_ReadError, &sh->dev[i].flags);
                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
                }
                if (atomic_read(&conf->disks[i].rdev->read_errors))
                        atomic_set(&conf->disks[i].rdev->read_errors, 0);
        } else {
+               const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
                int retry = 0;
+               rdev = conf->disks[i].rdev;
+
                clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-               atomic_inc(&conf->disks[i].rdev->read_errors);
+               atomic_inc(&rdev->read_errors);
                if (conf->mddev->degraded)
-                       printk(KERN_WARNING "raid5: read error not correctable.\n");
+                       printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+                              mdname(conf->mddev),
+                              (unsigned long long)sh->sector + rdev->data_offset,
+                              bdn);
                else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                        /* Oh, no!!! */
-                       printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
-               else if (atomic_read(&conf->disks[i].rdev->read_errors)
+                       printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+                              mdname(conf->mddev),
+                              (unsigned long long)sh->sector + rdev->data_offset,
+                              bdn);
+               else if (atomic_read(&rdev->read_errors)
                         > conf->max_nr_stripes)
                        printk(KERN_WARNING
-                              "raid5: Too many read errors, failing device.\n");
+                              "raid5:%s: Too many read errors, failing device %s.\n",
+                              mdname(conf->mddev), bdn);
                else
                        retry = 1;
                if (retry)
@@ -570,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                else {
                        clear_bit(R5_ReadError, &sh->dev[i].flags);
                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
-                       md_error(conf->mddev, conf->disks[i].rdev);
+                       md_error(conf->mddev, rdev);
                }
        }
        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
@@ -594,7 +636,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
        struct stripe_head *sh = bi->bi_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks, i;
-       unsigned long flags;
        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 
        if (bi->bi_size)
@@ -612,7 +653,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
                return 0;
        }
 
-       spin_lock_irqsave(&conf->device_lock, flags);
        if (!uptodate)
                md_error(conf->mddev, conf->disks[i].rdev);
 
@@ -620,8 +660,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
        
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
-       __release_stripe(conf, sh);
-       spin_unlock_irqrestore(&conf->device_lock, flags);
+       release_stripe(sh);
        return 0;
 }
 
@@ -654,12 +693,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
        PRINTK("raid5: error called\n");
 
        if (!test_bit(Faulty, &rdev->flags)) {
-               mddev->sb_dirty = 1;
-               if (test_bit(In_sync, &rdev->flags)) {
-                       conf->working_disks--;
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               if (test_and_clear_bit(In_sync, &rdev->flags)) {
+                       unsigned long flags;
+                       spin_lock_irqsave(&conf->device_lock, flags);
                        mddev->degraded++;
-                       conf->failed_disks++;
-                       clear_bit(In_sync, &rdev->flags);
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
                        /*
                         * if recovery was running, make sure it aborts.
                         */
@@ -669,7 +708,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                printk (KERN_ALERT
                        "raid5: Disk failure on %s, disabling device."
                        " Operation continuing on %d devices\n",
-                       bdevname(rdev->bdev,b), conf->working_disks);
+                       bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
        }
 }
 
@@ -1271,9 +1310,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                (unsigned long long)sh->sector, dd_idx);
 
        if (conf->mddev->bitmap && firstwrite) {
-               sh->bm_seq = conf->seq_write;
                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                  STRIPE_SECTORS, 0);
+               sh->bm_seq = conf->seq_flush+1;
                set_bit(STRIPE_BIT_DELAY, &sh->state);
        }
 
@@ -1311,10 +1350,9 @@ static int page_is_zero(struct page *p)
 static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 {
        int sectors_per_chunk = conf->chunk_size >> 9;
-       sector_t x = stripe;
        int pd_idx, dd_idx;
-       int chunk_offset = sector_div(x, sectors_per_chunk);
-       stripe = x;
+       int chunk_offset = sector_div(stripe, sectors_per_chunk);
+
        raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
                             + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
        return pd_idx;
@@ -2555,11 +2593,22 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
        return ret;
 }
 
-static inline void raid5_plug_device(raid5_conf_t *conf)
+static int raid5_congested(void *data, int bits)
 {
-       spin_lock_irq(&conf->device_lock);
-       blk_plug_device(conf->mddev->queue);
-       spin_unlock_irq(&conf->device_lock);
+       mddev_t *mddev = data;
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       /* No difference between reads and writes.  Just check
+        * how busy the stripe_cache is
+        */
+       if (conf->inactive_blocked)
+               return 1;
+       if (conf->quiesce)
+               return 1;
+       if (list_empty_careful(&conf->inactive_list))
+               return 1;
+
+       return 0;
 }
 
 static int make_request(request_queue_t *q, struct bio * bi)
@@ -2671,7 +2720,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
                                goto retry;
                        }
                        finish_wait(&conf->wait_for_overlap, &w);
-                       raid5_plug_device(conf);
                        handle_stripe(sh, NULL);
                        release_stripe(sh);
                } else {
@@ -2696,15 +2744,137 @@ static int make_request(request_queue_t *q, struct bio * bi)
        return 0;
 }
 
-/* FIXME go_faster isn't used */
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
 {
+       /* reshaping is quite different to recovery/resync so it is
+        * handled quite separately ... here.
+        *
+        * On each call to sync_request, we gather one chunk worth of
+        * destination stripes and flag them as expanding.
+        * Then we find all the source stripes and request reads.
+        * As the reads complete, handle_stripe will copy the data
+        * into the destination stripe and release that stripe.
+        */
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        struct stripe_head *sh;
        int pd_idx;
        sector_t first_sector, last_sector;
+       int raid_disks;
+       int data_disks;
+       int i;
+       int dd_idx;
+       sector_t writepos, safepos, gap;
+
+       if (sector_nr == 0 &&
+           conf->expand_progress != 0) {
+               /* restarting in the middle, skip the initial sectors */
+               sector_nr = conf->expand_progress;
+               sector_div(sector_nr, conf->raid_disks-1);
+               *skipped = 1;
+               return sector_nr;
+       }
+
+       /* we update the metadata when there is more than 3Meg
+        * in the block range (that is rather arbitrary, should
+        * probably be time based) or when the data about to be
+        * copied would over-write the source of the data at
+        * the front of the range.
+        * i.e. when one new stripe past expand_progress (new layout)
+        * maps at or beyond where expand_lo maps in the old layout.
+        */
+       writepos = conf->expand_progress +
+               conf->chunk_size/512*(conf->raid_disks-1);
+       sector_div(writepos, conf->raid_disks-1);
+       safepos = conf->expand_lo;
+       sector_div(safepos, conf->previous_raid_disks-1);
+       gap = conf->expand_progress - conf->expand_lo;
+
+       if (writepos >= safepos ||
+           gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+               /* Cannot proceed until we've updated the superblock... */
+               wait_event(conf->wait_for_overlap,
+                          atomic_read(&conf->reshape_stripes)==0);
+               mddev->reshape_position = conf->expand_progress;
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               md_wakeup_thread(mddev->thread);
+               wait_event(mddev->sb_wait, mddev->flags == 0 ||
+                          kthread_should_stop());
+               spin_lock_irq(&conf->device_lock);
+               conf->expand_lo = mddev->reshape_position;
+               spin_unlock_irq(&conf->device_lock);
+               wake_up(&conf->wait_for_overlap);
+       }
+
+       for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+               int j;
+               int skipped = 0;
+               pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
+               sh = get_active_stripe(conf, sector_nr+i,
+                                      conf->raid_disks, pd_idx, 0);
+               set_bit(STRIPE_EXPANDING, &sh->state);
+               atomic_inc(&conf->reshape_stripes);
+               /* If any of this stripe is beyond the end of the old
+                * array, then we need to zero those blocks
+                */
+               for (j=sh->disks; j--;) {
+                       sector_t s;
+                       if (j == sh->pd_idx)
+                               continue;
+                       s = compute_blocknr(sh, j);
+                       if (s < (mddev->array_size<<1)) {
+                               skipped = 1;
+                               continue;
+                       }
+                       memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+                       set_bit(R5_Expanded, &sh->dev[j].flags);
+                       set_bit(R5_UPTODATE, &sh->dev[j].flags);
+               }
+               if (!skipped) {
+                       set_bit(STRIPE_EXPAND_READY, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+               }
+               release_stripe(sh);
+       }
+       spin_lock_irq(&conf->device_lock);
+       conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+       spin_unlock_irq(&conf->device_lock);
+       /* Ok, those stripes are ready. We can start scheduling
+        * reads on the source stripes.
+        * The source stripes are determined by mapping the first and last
+        * block on the destination stripes.
+        */
+       raid_disks = conf->previous_raid_disks;
+       data_disks = raid_disks - 1;
+       first_sector =
+               raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+                                    raid_disks, data_disks,
+                                    &dd_idx, &pd_idx, conf);
+       last_sector =
+               raid5_compute_sector((sector_nr+conf->chunk_size/512)
+                                    *(conf->raid_disks-1) -1,
+                                    raid_disks, data_disks,
+                                    &dd_idx, &pd_idx, conf);
+       if (last_sector >= (mddev->size<<1))
+               last_sector = (mddev->size<<1)-1;
+       while (first_sector <= last_sector) {
+               pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+               sh = get_active_stripe(conf, first_sector,
+                                      conf->previous_raid_disks, pd_idx, 0);
+               set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+               set_bit(STRIPE_HANDLE, &sh->state);
+               release_stripe(sh);
+               first_sector += STRIPE_SECTORS;
+       }
+       return conf->chunk_size>>9;
+}
+
+/* FIXME go_faster isn't used */
+static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+{
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+       struct stripe_head *sh;
+       int pd_idx;
        int raid_disks = conf->raid_disks;
-       int data_disks = raid_disks - conf->max_degraded;
        sector_t max_sector = mddev->size << 1;
        int sync_blocks;
        int still_degraded = 0;
@@ -2728,127 +2898,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                return 0;
        }
 
-       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
-               /* reshaping is quite different to recovery/resync so it is
-                * handled quite separately ... here.
-                *
-                * On each call to sync_request, we gather one chunk worth of
-                * destination stripes and flag them as expanding.
-                * Then we find all the source stripes and request reads.
-                * As the reads complete, handle_stripe will copy the data
-                * into the destination stripe and release that stripe.
-                */
-               int i;
-               int dd_idx;
-               sector_t writepos, safepos, gap;
-
-               if (sector_nr == 0 &&
-                   conf->expand_progress != 0) {
-                       /* restarting in the middle, skip the initial sectors */
-                       sector_nr = conf->expand_progress;
-                       sector_div(sector_nr, conf->raid_disks-1);
-                       *skipped = 1;
-                       return sector_nr;
-               }
-
-               /* we update the metadata when there is more than 3Meg
-                * in the block range (that is rather arbitrary, should
-                * probably be time based) or when the data about to be
-                * copied would over-write the source of the data at
-                * the front of the range.
-                * i.e. one new_stripe forward from expand_progress new_maps
-                * to after where expand_lo old_maps to
-                */
-               writepos = conf->expand_progress +
-                       conf->chunk_size/512*(conf->raid_disks-1);
-               sector_div(writepos, conf->raid_disks-1);
-               safepos = conf->expand_lo;
-               sector_div(safepos, conf->previous_raid_disks-1);
-               gap = conf->expand_progress - conf->expand_lo;
-
-               if (writepos >= safepos ||
-                   gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
-                       /* Cannot proceed until we've updated the superblock... */
-                       wait_event(conf->wait_for_overlap,
-                                  atomic_read(&conf->reshape_stripes)==0);
-                       mddev->reshape_position = conf->expand_progress;
-                       mddev->sb_dirty = 1;
-                       md_wakeup_thread(mddev->thread);
-                       wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
-                                  kthread_should_stop());
-                       spin_lock_irq(&conf->device_lock);
-                       conf->expand_lo = mddev->reshape_position;
-                       spin_unlock_irq(&conf->device_lock);
-                       wake_up(&conf->wait_for_overlap);
-               }
-
-               for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
-                       int j;
-                       int skipped = 0;
-                       pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
-                       sh = get_active_stripe(conf, sector_nr+i,
-                                              conf->raid_disks, pd_idx, 0);
-                       set_bit(STRIPE_EXPANDING, &sh->state);
-                       atomic_inc(&conf->reshape_stripes);
-                       /* If any of this stripe is beyond the end of the old
-                        * array, then we need to zero those blocks
-                        */
-                       for (j=sh->disks; j--;) {
-                               sector_t s;
-                               if (j == sh->pd_idx)
-                                       continue;
-                               s = compute_blocknr(sh, j);
-                               if (s < (mddev->array_size<<1)) {
-                                       skipped = 1;
-                                       continue;
-                               }
-                               memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
-                               set_bit(R5_Expanded, &sh->dev[j].flags);
-                               set_bit(R5_UPTODATE, &sh->dev[j].flags);
-                       }
-                       if (!skipped) {
-                               set_bit(STRIPE_EXPAND_READY, &sh->state);
-                               set_bit(STRIPE_HANDLE, &sh->state);
-                       }
-                       release_stripe(sh);
-               }
-               spin_lock_irq(&conf->device_lock);
-               conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
-               spin_unlock_irq(&conf->device_lock);
-               /* Ok, those stripe are ready. We can start scheduling
-                * reads on the source stripes.
-                * The source stripes are determined by mapping the first and last
-                * block on the destination stripes.
-                */
-               raid_disks = conf->previous_raid_disks;
-               data_disks = raid_disks - 1;
-               first_sector =
-                       raid5_compute_sector(sector_nr*(conf->raid_disks-1),
-                                            raid_disks, data_disks,
-                                            &dd_idx, &pd_idx, conf);
-               last_sector =
-                       raid5_compute_sector((sector_nr+conf->chunk_size/512)
-                                              *(conf->raid_disks-1) -1,
-                                            raid_disks, data_disks,
-                                            &dd_idx, &pd_idx, conf);
-               if (last_sector >= (mddev->size<<1))
-                       last_sector = (mddev->size<<1)-1;
-               while (first_sector <= last_sector) {
-                       pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
-                       sh = get_active_stripe(conf, first_sector,
-                                              conf->previous_raid_disks, pd_idx, 0);
-                       set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-                       set_bit(STRIPE_HANDLE, &sh->state);
-                       release_stripe(sh);
-                       first_sector += STRIPE_SECTORS;
-               }
-               return conf->chunk_size>>9;
-       }
+       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+               return reshape_request(mddev, sector_nr, skipped);
+
        /* if there is too many failed drives and we are trying
         * to resync, then assert that we are finished, because there is
         * nothing we can do.
         */
-       if (mddev->degraded >= (data_disks - raid_disks) &&
+       if (mddev->degraded >= conf->max_degraded &&
            test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                sector_t rv = (mddev->size << 1) - sector_nr;
                *skipped = 1;
@@ -2915,7 +2972,7 @@ static void raid5d (mddev_t *mddev)
        while (1) {
                struct list_head *first;
 
-               if (conf->seq_flush - conf->seq_write > 0) {
+               if (conf->seq_flush != conf->seq_write) {
                        int seq = conf->seq_flush;
                        spin_unlock_irq(&conf->device_lock);
                        bitmap_unplug(mddev->bitmap);
@@ -3031,6 +3088,7 @@ static int run(mddev_t *mddev)
        mdk_rdev_t *rdev;
        struct disk_info *disk;
        struct list_head *tmp;
+       int working_disks = 0;
 
        if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
                printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
@@ -3133,14 +3191,14 @@ static int run(mddev_t *mddev)
                        printk(KERN_INFO "raid5: device %s operational as raid"
                                " disk %d\n", bdevname(rdev->bdev,b),
                                raid_disk);
-                       conf->working_disks++;
+                       working_disks++;
                }
        }
 
        /*
         * 0 for a fully functional array, 1 or 2 for a degraded array.
         */
-       mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
+       mddev->degraded = conf->raid_disks - working_disks;
        conf->mddev = mddev;
        conf->chunk_size = mddev->chunk_size;
        conf->level = mddev->level;
@@ -3175,7 +3233,7 @@ static int run(mddev_t *mddev)
        if (mddev->degraded > conf->max_degraded) {
                printk(KERN_ERR "raid5: not enough operational devices for %s"
                        " (%d/%d failed)\n",
-                       mdname(mddev), conf->failed_disks, conf->raid_disks);
+                       mdname(mddev), mddev->degraded, conf->raid_disks);
                goto abort;
        }
 
@@ -3238,9 +3296,6 @@ static int run(mddev_t *mddev)
                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                                                        "%s_reshape");
-               /* FIXME if md_register_thread fails?? */
-               md_wakeup_thread(mddev->sync_thread);
-
        }
 
        /* read-ahead size must cover two whole stripes, which is
@@ -3259,6 +3314,9 @@ static int run(mddev_t *mddev)
 
        mddev->queue->unplug_fn = raid5_unplug_device;
        mddev->queue->issue_flush_fn = raid5_issue_flush;
+       mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+       mddev->queue->backing_dev_info.congested_data = mddev;
+
        mddev->array_size =  mddev->size * (conf->previous_raid_disks -
                                            conf->max_degraded);
 
@@ -3335,7 +3393,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
        int i;
 
        seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
-       seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+       seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf (seq, "%s",
                               conf->disks[i].rdev &&
@@ -3357,8 +3415,8 @@ static void print_raid5_conf (raid5_conf_t *conf)
                printk("(conf==NULL)\n");
                return;
        }
-       printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
-                conf->working_disks, conf->failed_disks);
+       printk(" --- rd:%d wd:%d\n", conf->raid_disks,
+                conf->raid_disks - conf->mddev->degraded);
 
        for (i = 0; i < conf->raid_disks; i++) {
                char b[BDEVNAME_SIZE];
@@ -3380,11 +3438,11 @@ static int raid5_spare_active(mddev_t *mddev)
                tmp = conf->disks + i;
                if (tmp->rdev
                    && !test_bit(Faulty, &tmp->rdev->flags)
-                   && !test_bit(In_sync, &tmp->rdev->flags)) {
+                   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+                       unsigned long flags;
+                       spin_lock_irqsave(&conf->device_lock, flags);
                        mddev->degraded--;
-                       conf->failed_disks--;
-                       conf->working_disks++;
-                       set_bit(In_sync, &tmp->rdev->flags);
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
                }
        }
        print_raid5_conf(conf);
@@ -3520,6 +3578,7 @@ static int raid5_start_reshape(mddev_t *mddev)
        struct list_head *rtmp;
        int spares = 0;
        int added_devices = 0;
+       unsigned long flags;
 
        if (mddev->degraded ||
            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3553,18 +3612,20 @@ static int raid5_start_reshape(mddev_t *mddev)
                        if (raid5_add_disk(mddev, rdev)) {
                                char nm[20];
                                set_bit(In_sync, &rdev->flags);
-                               conf->working_disks++;
                                added_devices++;
+                               rdev->recovery_offset = 0;
                                sprintf(nm, "rd%d", rdev->raid_disk);
                                sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
                        } else
                                break;
                }
 
+       spin_lock_irqsave(&conf->device_lock, flags);
        mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+       spin_unlock_irqrestore(&conf->device_lock, flags);
        mddev->raid_disks = conf->raid_disks;
        mddev->reshape_position = 0;
-       mddev->sb_dirty = 1;
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);