drivers/md/raid5.c

   1 /*
   2  * raid5.c : Multiple Devices driver for Linux
   3  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4  *         Copyright (C) 1999, 2000 Ingo Molnar
   5  *
   6  * RAID-5 management functions.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2, or (at your option)
  11  * any later version.
  12  *
  13  * You should have received a copy of the GNU General Public License
  14  * (for example /usr/src/linux/COPYING); if not, write to the Free
  15  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16  */
  17
  18
  19 #include <linux/config.h>
  20 #include <linux/module.h>
  21 #include <linux/locks.h>
  22 #include <linux/slab.h>
  23 #include <linux/raid/raid5.h>
  24 #include <asm/bitops.h>
  25 #include <asm/atomic.h>
  26
  27 static mdk_personality_t raid5_personality;
  28
  29 /*
  30  * Stripe cache
  31  */
  32
  33 #define NR_STRIPES              256
  34 #define IO_THRESHOLD            1
  35 #define HASH_PAGES              1
  36 #define HASH_PAGES_ORDER        0
  37 #define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
  38 #define HASH_MASK               (NR_HASH - 1)
  39 #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
  40
  41 /*
  42  * The following can be used to debug the driver
  43  */
  44 #define RAID5_DEBUG     0
  45 #define RAID5_PARANOIA  1
  46 #if RAID5_PARANOIA && CONFIG_SMP
  47 # define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
  48 #else
  49 # define CHECK_DEVLOCK()
  50 #endif
  51
  52 #if RAID5_DEBUG
  53 #define PRINTK(x...) printk(x)
  54 #define inline
  55 #define __inline__
  56 #else
  57 #define PRINTK(x...) do { } while (0)
  58 #endif
  59
  60 static void print_raid5_conf (raid5_conf_t *conf);
  61
  62 static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
  63 {
  64         if (atomic_dec_and_test(&sh->count)) {
  65                 if (!list_empty(&sh->lru))
  66                         BUG();
  67                 if (atomic_read(&conf->active_stripes)==0)
  68                         BUG();
  69                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
  70                         if (test_bit(STRIPE_DELAYED, &sh->state))
  71                                 list_add_tail(&sh->lru, &conf->delayed_list);
  72                         else
  73                                 list_add_tail(&sh->lru, &conf->handle_list);
  74                         md_wakeup_thread(conf->thread);
  75                 } else {
  76                         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
  77                                 atomic_dec(&conf->preread_active_stripes);
  78                                 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
  79                                         md_wakeup_thread(conf->thread);
  80                         }
  81                         list_add_tail(&sh->lru, &conf->inactive_list);
  82                         atomic_dec(&conf->active_stripes);
  83                         if (!conf->inactive_blocked ||
  84                             atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
  85                                 wake_up(&conf->wait_for_stripe);
  86                 }
  87         }
  88 }
  89 static void release_stripe(struct stripe_head *sh)
  90 {
  91         raid5_conf_t *conf = sh->raid_conf;
  92         unsigned long flags;
  93
  94         spin_lock_irqsave(&conf->device_lock, flags);
  95         __release_stripe(conf, sh);
  96         spin_unlock_irqrestore(&conf->device_lock, flags);
  97 }
  98
  99 static void remove_hash(struct stripe_head *sh)
 100 {
 101         PRINTK("remove_hash(), stripe %lu\n", sh->sector);
 102
 103         if (sh->hash_pprev) {
 104                 if (sh->hash_next)
 105                         sh->hash_next->hash_pprev = sh->hash_pprev;
 106                 *sh->hash_pprev = sh->hash_next;
 107                 sh->hash_pprev = NULL;
 108         }
 109 }
 110
 111 static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 112 {
 113         struct stripe_head **shp = &stripe_hash(conf, sh->sector);
 114
 115         PRINTK("insert_hash(), stripe %lu\n",sh->sector);
 116
 117         CHECK_DEVLOCK();
 118         if ((sh->hash_next = *shp) != NULL)
 119                 (*shp)->hash_pprev = &sh->hash_next;
 120         *shp = sh;
 121         sh->hash_pprev = shp;
 122 }
 123
 124
 125 /* find an idle stripe, make sure it is unhashed, and return it. */
 126 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
 127 {
 128         struct stripe_head *sh = NULL;
 129         struct list_head *first;
 130
 131         CHECK_DEVLOCK();
 132         if (list_empty(&conf->inactive_list))
 133                 goto out;
 134         first = conf->inactive_list.next;
 135         sh = list_entry(first, struct stripe_head, lru);
 136         list_del_init(first);
 137         remove_hash(sh);
 138         atomic_inc(&conf->active_stripes);
 139 out:
 140         return sh;
 141 }
 142
 143 static void shrink_buffers(struct stripe_head *sh, int num)
 144 {
 145         struct buffer_head *bh;
 146         int i;
 147
 148         for (i=0; i<num ; i++) {
 149                 bh = sh->bh_cache[i];
 150                 if (!bh)
 151                         return;
 152                 sh->bh_cache[i] = NULL;
 153                 free_page((unsigned long) bh->b_data);
 154                 kfree(bh);
 155         }
 156 }
 157
 158 static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
 159 {
 160         struct buffer_head *bh;
 161         int i;
 162
 163         for (i=0; i<num; i++) {
 164                 struct page *page;
 165                 bh = kmalloc(sizeof(struct buffer_head), priority);
 166                 if (!bh)
 167                         return 1;
 168                 memset(bh, 0, sizeof (struct buffer_head));
 169                 init_waitqueue_head(&bh->b_wait);
 170                 if ((page = alloc_page(priority)))
 171                         bh->b_data = page_address(page);
 172                 else {
 173                         kfree(bh);
 174                         return 1;
 175                 }
 176                 atomic_set(&bh->b_count, 0);
 177                 bh->b_page = page;
 178                 sh->bh_cache[i] = bh;
 179
 180         }
 181         return 0;
 182 }
 183
 184 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
 185
 186 static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
 187 {
 188         raid5_conf_t *conf = sh->raid_conf;
 189         int disks = conf->raid_disks, i;
 190
 191         if (atomic_read(&sh->count) != 0)
 192                 BUG();
 193         if (test_bit(STRIPE_HANDLE, &sh->state))
 194                 BUG();
 195
 196         CHECK_DEVLOCK();
 197         PRINTK("init_stripe called, stripe %lu\n", sh->sector);
 198
 199         remove_hash(sh);
 200
 201         sh->sector = sector;
 202         sh->size = conf->buffer_size;
 203         sh->state = 0;
 204
 205         for (i=disks; i--; ) {
 206                 if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
 207                     buffer_locked(sh->bh_cache[i])) {
 208                         printk("sector=%lx i=%d %p %p %p %d\n",
 209                                sh->sector, i, sh->bh_read[i],
 210                                sh->bh_write[i], sh->bh_written[i],
 211                                buffer_locked(sh->bh_cache[i]));
 212                         BUG();
 213                 }
 214                 clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
 215                 raid5_build_block(sh, i);
 216         }
 217         insert_hash(conf, sh);
 218 }
 219
 220 /* the buffer size has changed, so unhash all stripes
 221  * as active stripes complete, they will go onto inactive list
 222  */
 223 static void shrink_stripe_cache(raid5_conf_t *conf)
 224 {
 225         int i;
 226         CHECK_DEVLOCK();
 227         if (atomic_read(&conf->active_stripes))
 228                 BUG();
 229         for (i=0; i < NR_HASH; i++) {
 230                 struct stripe_head *sh;
 231                 while ((sh = conf->stripe_hashtbl[i]))
 232                         remove_hash(sh);
 233         }
 234 }
 235
 236 static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
 237 {
 238         struct stripe_head *sh;
 239
 240         CHECK_DEVLOCK();
 241         PRINTK("__find_stripe, sector %lu\n", sector);
 242         for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
 243                 if (sh->sector == sector)
 244                         return sh;
 245         PRINTK("__stripe %lu not in cache\n", sector);
 246         return NULL;
 247 }
 248
 249 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
 250 {
 251         struct stripe_head *sh;
 252
 253         PRINTK("get_stripe, sector %lu\n", sector);
 254
 255         md_spin_lock_irq(&conf->device_lock);
 256
 257         do {
 258                 if (conf->buffer_size == 0 ||
 259                     (size && size != conf->buffer_size)) {
 260                         /* either the size is being changed (buffer_size==0) or
 261                          * we need to change it.
 262                          * If size==0, we can proceed as soon as buffer_size gets set.
 263                          * If size>0, we can proceed when active_stripes reaches 0, or
 264                          * when someone else sets the buffer_size to size.
 265                          * If someone sets the buffer size to something else, we will need to
 266                          * assert that we want to change it again
 267                          */
 268                         int oldsize = conf->buffer_size;
 269                         PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
 270                         if (size==0)
 271                                 wait_event_lock_irq(conf->wait_for_stripe,
 272                                                     conf->buffer_size,
 273                                                     conf->device_lock);
 274                         else {
 275                                 while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
 276                                         conf->buffer_size = 0;
 277                                         wait_event_lock_irq(conf->wait_for_stripe,
 278                                                             atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
 279                                                             conf->device_lock);
 280                                         PRINTK("waited and now  %ld/%d buffer_size is %d - %d active\n", sector, size,
 281                                                conf->buffer_size, atomic_read(&conf->active_stripes));
 282                                 }
 283
 284                                 if (conf->buffer_size != size) {
 285                                         printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
 286                                         shrink_stripe_cache(conf);
 287                                         if (size==0) BUG();
 288                                         conf->buffer_size = size;
 289                                         PRINTK("size now %d\n", conf->buffer_size);
 290                                 }
 291                         }
 292                 }
 293                 if (size == 0)
 294                         sector -= sector & ((conf->buffer_size>>9)-1);
 295
 296                 sh = __find_stripe(conf, sector);
 297                 if (!sh) {
 298                         if (!conf->inactive_blocked)
 299                                 sh = get_free_stripe(conf);
 300                         if (noblock && sh == NULL)
 301                                 break;
 302                         if (!sh) {
 303                                 conf->inactive_blocked = 1;
 304                                 wait_event_lock_irq(conf->wait_for_stripe,
 305                                                     !list_empty(&conf->inactive_list) &&
 306                                                     (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 307                                                      || !conf->inactive_blocked),
 308                                                     conf->device_lock);
 309                                 conf->inactive_blocked = 0;
 310                         } else
 311                                 init_stripe(sh, sector);
 312                 } else {
 313                         if (atomic_read(&sh->count)) {
 314                                 if (!list_empty(&sh->lru))
 315                                         BUG();
 316                         } else {
 317                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
 318                                         atomic_inc(&conf->active_stripes);
 319                                 if (list_empty(&sh->lru))
 320                                         BUG();
 321                                 list_del_init(&sh->lru);
 322                         }
 323                 }
 324         } while (sh == NULL);
 325
 326         if (sh)
 327                 atomic_inc(&sh->count);
 328
 329         md_spin_unlock_irq(&conf->device_lock);
 330         return sh;
 331 }
 332
 333 static int grow_stripes(raid5_conf_t *conf, int num, int priority)
 334 {
 335         struct stripe_head *sh;
 336
 337         while (num--) {
 338                 sh = kmalloc(sizeof(struct stripe_head), priority);
 339                 if (!sh)
 340                         return 1;
 341                 memset(sh, 0, sizeof(*sh));
 342                 sh->raid_conf = conf;
 343                 sh->lock = SPIN_LOCK_UNLOCKED;
 344
 345                 if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
 346                         shrink_buffers(sh, conf->raid_disks);
 347                         kfree(sh);
 348                         return 1;
 349                 }
 350                 /* we just created an active stripe so... */
 351                 atomic_set(&sh->count, 1);
 352                 atomic_inc(&conf->active_stripes);
 353                 INIT_LIST_HEAD(&sh->lru);
 354                 release_stripe(sh);
 355         }
 356         return 0;
 357 }
 358
 359 static void shrink_stripes(raid5_conf_t *conf, int num)
 360 {
 361         struct stripe_head *sh;
 362
 363         while (num--) {
 364                 spin_lock_irq(&conf->device_lock);
 365                 sh = get_free_stripe(conf);
 366                 spin_unlock_irq(&conf->device_lock);
 367                 if (!sh)
 368                         break;
 369                 if (atomic_read(&sh->count))
 370                         BUG();
 371                 shrink_buffers(sh, conf->raid_disks);
 372                 kfree(sh);
 373                 atomic_dec(&conf->active_stripes);
 374         }
 375 }
 376
 377
 378 static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
 379 {
 380         struct stripe_head *sh = bh->b_private;
 381         raid5_conf_t *conf = sh->raid_conf;
 382         int disks = conf->raid_disks, i;
 383         unsigned long flags;
 384
 385         for (i=0 ; i<disks; i++)
 386                 if (bh == sh->bh_cache[i])
 387                         break;
 388
 389         PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
 390         if (i == disks) {
 391                 BUG();
 392                 return;
 393         }
 394
 395         if (uptodate) {
 396                 struct buffer_head *buffer;
 397                 spin_lock_irqsave(&conf->device_lock, flags);
 398                 /* we can return a buffer if we bypassed the cache or
 399                  * if the top buffer is not in highmem.  If there are
 400                  * multiple buffers, leave the extra work to
 401                  * handle_stripe
 402                  */
 403                 buffer = sh->bh_read[i];
 404                 if (buffer &&
 405                     (!PageHighMem(buffer->b_page)
 406                      || buffer->b_page == bh->b_page )
 407                         ) {
 408                         sh->bh_read[i] = buffer->b_reqnext;
 409                         buffer->b_reqnext = NULL;
 410                 } else
 411                         buffer = NULL;
 412                 spin_unlock_irqrestore(&conf->device_lock, flags);
 413                 if (sh->bh_page[i]==NULL)
 414                         set_bit(BH_Uptodate, &bh->b_state);
 415                 if (buffer) {
 416                         if (buffer->b_page != bh->b_page)
 417                                 memcpy(buffer->b_data, bh->b_data, bh->b_size);
 418                         buffer->b_end_io(buffer, 1);
 419                 }
 420         } else {
 421                 md_error(conf->mddev, bh->b_dev);
 422                 clear_bit(BH_Uptodate, &bh->b_state);
 423         }
 424         /* must restore b_page before unlocking buffer... */
 425         if (sh->bh_page[i]) {
 426                 bh->b_page = sh->bh_page[i];
 427                 bh->b_data = page_address(bh->b_page);
 428                 sh->bh_page[i] = NULL;
 429                 clear_bit(BH_Uptodate, &bh->b_state);
 430         }
 431         clear_bit(BH_Lock, &bh->b_state);
 432         set_bit(STRIPE_HANDLE, &sh->state);
 433         release_stripe(sh);
 434 }
 435
 436 static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
 437 {
 438         struct stripe_head *sh = bh->b_private;
 439         raid5_conf_t *conf = sh->raid_conf;
 440         int disks = conf->raid_disks, i;
 441         unsigned long flags;
 442
 443         for (i=0 ; i<disks; i++)
 444                 if (bh == sh->bh_cache[i])
 445                         break;
 446
 447         PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
 448         if (i == disks) {
 449                 BUG();
 450                 return;
 451         }
 452
 453         md_spin_lock_irqsave(&conf->device_lock, flags);
 454         if (!uptodate)
 455                 md_error(conf->mddev, bh->b_dev);
 456         clear_bit(BH_Lock, &bh->b_state);
 457         set_bit(STRIPE_HANDLE, &sh->state);
 458         __release_stripe(conf, sh);
 459         md_spin_unlock_irqrestore(&conf->device_lock, flags);
 460 }
 461
 462
 463
 464 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
 465 {
 466         raid5_conf_t *conf = sh->raid_conf;
 467         struct buffer_head *bh = sh->bh_cache[i];
 468         unsigned long block = sh->sector / (sh->size >> 9);
 469
 470         init_buffer(bh, raid5_end_read_request, sh);
 471         bh->b_dev       = conf->disks[i].dev;
 472         bh->b_blocknr   = block;
 473
 474         bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
 475         bh->b_size      = sh->size;
 476         bh->b_list      = BUF_LOCKED;
 477         return bh;
 478 }
 479
 480 static int raid5_error (mddev_t *mddev, kdev_t dev)
 481 {
 482         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 483         mdp_super_t *sb = mddev->sb;
 484         struct disk_info *disk;
 485         int i;
 486
 487         PRINTK("raid5_error called\n");
 488
 489         for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
 490                 if (disk->dev == dev) {
 491                         if (disk->operational) {
 492                                 disk->operational = 0;
 493                                 mark_disk_faulty(sb->disks+disk->number);
 494                                 mark_disk_nonsync(sb->disks+disk->number);
 495                                 mark_disk_inactive(sb->disks+disk->number);
 496                                 sb->active_disks--;
 497                                 sb->working_disks--;
 498                                 sb->failed_disks++;
 499                                 mddev->sb_dirty = 1;
 500                                 conf->working_disks--;
 501                                 conf->failed_disks++;
 502                                 md_wakeup_thread(conf->thread);
 503                                 printk (KERN_ALERT
 504                                         "raid5: Disk failure on %s, disabling device."
 505                                         " Operation continuing on %d devices\n",
 506                                         partition_name (dev), conf->working_disks);
 507                         }
 508                         return 0;
 509                 }
 510         }
 511         /*
 512          * handle errors in spares (during reconstruction)
 513          */
 514         if (conf->spare) {
 515                 disk = conf->spare;
 516                 if (disk->dev == dev) {
 517                         printk (KERN_ALERT
 518                                 "raid5: Disk failure on spare %s\n",
 519                                 partition_name (dev));
 520                         if (!conf->spare->operational) {
 521                                 /* probably a SET_DISK_FAULTY ioctl */
 522                                 return -EIO;
 523                         }
 524                         disk->operational = 0;
 525                         disk->write_only = 0;
 526                         conf->spare = NULL;
 527                         mark_disk_faulty(sb->disks+disk->number);
 528                         mark_disk_nonsync(sb->disks+disk->number);
 529                         mark_disk_inactive(sb->disks+disk->number);
 530                         sb->spare_disks--;
 531                         sb->working_disks--;
 532                         sb->failed_disks++;
 533
 534                         mddev->sb_dirty = 1;
 535                         md_wakeup_thread(conf->thread);
 536
 537                         return 0;
 538                 }
 539         }
 540         MD_BUG();
 541         return -EIO;
 542 }
 543
 544 /*
 545  * Input: a 'big' sector number,
 546  * Output: index of the data and parity disk, and the sector # in them.
 547  */
 548 static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
 549                         unsigned int data_disks, unsigned int * dd_idx,
 550                         unsigned int * pd_idx, raid5_conf_t *conf)
 551 {
 552         unsigned long stripe;
 553         unsigned long chunk_number;
 554         unsigned int chunk_offset;
 555         unsigned long new_sector;
 556         int sectors_per_chunk = conf->chunk_size >> 9;
 557
 558         /* First compute the information on this sector */
 559
 560         /*
 561          * Compute the chunk number and the sector offset inside the chunk
 562          */
 563         chunk_number = r_sector / sectors_per_chunk;
 564         chunk_offset = r_sector % sectors_per_chunk;
 565
 566         /*
 567          * Compute the stripe number
 568          */
 569         stripe = chunk_number / data_disks;
 570
 571         /*
 572          * Compute the data disk and parity disk indexes inside the stripe
 573          */
 574         *dd_idx = chunk_number % data_disks;
 575
 576         /*
 577          * Select the parity disk based on the user selected algorithm.
 578          */
 579         if (conf->level == 4)
 580                 *pd_idx = data_disks;
 581         else switch (conf->algorithm) {
 582                 case ALGORITHM_LEFT_ASYMMETRIC:
 583                         *pd_idx = data_disks - stripe % raid_disks;
 584                         if (*dd_idx >= *pd_idx)
 585                                 (*dd_idx)++;
 586                         break;
 587                 case ALGORITHM_RIGHT_ASYMMETRIC:
 588                         *pd_idx = stripe % raid_disks;
 589                         if (*dd_idx >= *pd_idx)
 590                                 (*dd_idx)++;
 591                         break;
 592                 case ALGORITHM_LEFT_SYMMETRIC:
 593                         *pd_idx = data_disks - stripe % raid_disks;
 594                         *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 595                         break;
 596                 case ALGORITHM_RIGHT_SYMMETRIC:
 597                         *pd_idx = stripe % raid_disks;
 598                         *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 599                         break;
 600                 default:
 601                         printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
 602         }
 603
 604         /*
 605          * Finally, compute the new sector number
 606          */
 607         new_sector = stripe * sectors_per_chunk + chunk_offset;
 608         return new_sector;
 609 }
 610
 611 #if 0
 612 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
 613 {
 614         raid5_conf_t *conf = sh->raid_conf;
 615         int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
 616         unsigned long new_sector = sh->sector, check;
 617         int sectors_per_chunk = conf->chunk_size >> 9;
 618         unsigned long stripe = new_sector / sectors_per_chunk;
 619         int chunk_offset = new_sector % sectors_per_chunk;
 620         int chunk_number, dummy1, dummy2, dd_idx = i;
 621         unsigned long r_sector, blocknr;
 622
 623         switch (conf->algorithm) {
 624                 case ALGORITHM_LEFT_ASYMMETRIC:
 625                 case ALGORITHM_RIGHT_ASYMMETRIC:
 626                         if (i > sh->pd_idx)
 627                                 i--;
 628                         break;
 629                 case ALGORITHM_LEFT_SYMMETRIC:
 630                 case ALGORITHM_RIGHT_SYMMETRIC:
 631                         if (i < sh->pd_idx)
 632                                 i += raid_disks;
 633                         i -= (sh->pd_idx + 1);
 634                         break;
 635                 default:
 636                         printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
 637         }
 638
 639         chunk_number = stripe * data_disks + i;
 640         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
 641         blocknr = r_sector / (sh->size >> 9);
 642
 643         check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
 644         if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
 645                 printk("compute_blocknr: map not correct\n");
 646                 return 0;
 647         }
 648         return blocknr;
 649 }
 650 #endif
 651
 652 #define check_xor()     do {                                    \
 653                            if (count == MAX_XOR_BLOCKS) {       \
 654                                 xor_block(count, bh_ptr);       \
 655                                 count = 1;                      \
 656                            }                                    \
 657                         } while(0)
 658
 659
 660 static void compute_block(struct stripe_head *sh, int dd_idx)
 661 {
 662         raid5_conf_t *conf = sh->raid_conf;
 663         int i, count, disks = conf->raid_disks;
 664         struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
 665
 666         PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
 667
 668
 669         memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
 670         bh_ptr[0] = sh->bh_cache[dd_idx];
 671         count = 1;
 672         for (i = disks ; i--; ) {
 673                 if (i == dd_idx)
 674                         continue;
 675                 bh = sh->bh_cache[i];
 676                 if (buffer_uptodate(bh))
 677                         bh_ptr[count++] = bh;
 678                 else
 679                         printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
 680
 681                 check_xor();
 682         }
 683         if (count != 1)
 684                 xor_block(count, bh_ptr);
 685         set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
 686 }
 687
 688 static void compute_parity(struct stripe_head *sh, int method)
 689 {
 690         raid5_conf_t *conf = sh->raid_conf;
 691         int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
 692         struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
 693         struct buffer_head *chosen[MD_SB_DISKS];
 694
 695         PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
 696         memset(chosen, 0, sizeof(chosen));
 697
 698         count = 1;
 699         bh_ptr[0] = sh->bh_cache[pd_idx];
 700         switch(method) {
 701         case READ_MODIFY_WRITE:
 702                 if (!buffer_uptodate(sh->bh_cache[pd_idx]))
 703                         BUG();
 704                 for (i=disks ; i-- ;) {
 705                         if (i==pd_idx)
 706                                 continue;
 707                         if (sh->bh_write[i] &&
 708                             buffer_uptodate(sh->bh_cache[i])) {
 709                                 bh_ptr[count++] = sh->bh_cache[i];
 710                                 chosen[i] = sh->bh_write[i];
 711                                 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
 712                                 chosen[i]->b_reqnext = sh->bh_written[i];
 713                                 sh->bh_written[i] = chosen[i];
 714                                 check_xor();
 715                         }
 716                 }
 717                 break;
 718         case RECONSTRUCT_WRITE:
 719                 memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
 720                 for (i= disks; i-- ;)
 721                         if (i!=pd_idx && sh->bh_write[i]) {
 722                                 chosen[i] = sh->bh_write[i];
 723                                 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
 724                                 chosen[i]->b_reqnext = sh->bh_written[i];
 725                                 sh->bh_written[i] = chosen[i];
 726                         }
 727                 break;
 728         case CHECK_PARITY:
 729                 break;
 730         }
 731         if (count>1) {
 732                 xor_block(count, bh_ptr);
 733                 count = 1;
 734         }
 735
 736         for (i = disks; i--;)
 737                 if (chosen[i]) {
 738                         struct buffer_head *bh = sh->bh_cache[i];
 739                         char *bdata;
 740                         bdata = bh_kmap(chosen[i]);
 741                         memcpy(bh->b_data,
 742                                bdata,sh->size);
 743                         bh_kunmap(chosen[i]);
 744                         set_bit(BH_Lock, &bh->b_state);
 745                         mark_buffer_uptodate(bh, 1);
 746                 }
 747
 748         switch(method) {
 749         case RECONSTRUCT_WRITE:
 750         case CHECK_PARITY:
 751                 for (i=disks; i--;)
 752                         if (i != pd_idx) {
 753                                 bh_ptr[count++] = sh->bh_cache[i];
 754                                 check_xor();
 755                         }
 756                 break;
 757         case READ_MODIFY_WRITE:
 758                 for (i = disks; i--;)
 759                         if (chosen[i]) {
 760                                 bh_ptr[count++] = sh->bh_cache[i];
 761                                 check_xor();
 762                         }
 763         }
 764         if (count != 1)
 765                 xor_block(count, bh_ptr);
 766
 767         if (method != CHECK_PARITY) {
 768                 mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
 769                 set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
 770         } else
 771                 mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
 772 }
 773
 774 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
 775 {
 776         struct buffer_head **bhp;
 777         raid5_conf_t *conf = sh->raid_conf;
 778
 779         PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
 780
 781
 782         spin_lock(&sh->lock);
 783         spin_lock_irq(&conf->device_lock);
 784         bh->b_reqnext = NULL;
 785         if (rw == READ)
 786                 bhp = &sh->bh_read[dd_idx];
 787         else
 788                 bhp = &sh->bh_write[dd_idx];
 789         while (*bhp) {
 790                 printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
 791                 bhp = & (*bhp)->b_reqnext;
 792         }
 793         *bhp = bh;
 794         spin_unlock_irq(&conf->device_lock);
 795         spin_unlock(&sh->lock);
 796
 797         PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
 798 }
 799
 800
 801
 802
 803
 804 /*
 805  * handle_stripe - do things to a stripe.
 806  *
 807  * We lock the stripe and then examine the state of various bits
 808  * to see what needs to be done.
 809  * Possible results:
 810  *    return some read request which now have data
 811  *    return some write requests which are safely on disc
 812  *    schedule a read on some buffers
 813  *    schedule a write of some buffers
 814  *    return confirmation of parity correctness
 815  *
 816  * Parity calculations are done inside the stripe lock
 817  * buffers are taken off read_list or write_list, and bh_cache buffers
 818  * get BH_Lock set before the stripe lock is released.
 819  *
 820  */
 821
 822 static void handle_stripe(struct stripe_head *sh)
 823 {
 824         raid5_conf_t *conf = sh->raid_conf;
 825         int disks = conf->raid_disks;
 826         struct buffer_head *return_ok= NULL, *return_fail = NULL;
 827         int action[MD_SB_DISKS];
 828         int i;
 829         int syncing;
 830         int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
 831         int failed_num=0;
 832         struct buffer_head *bh;
 833
 834         PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
 835         memset(action, 0, sizeof(action));
 836
 837         spin_lock(&sh->lock);
 838         clear_bit(STRIPE_HANDLE, &sh->state);
 839         clear_bit(STRIPE_DELAYED, &sh->state);
 840
 841         syncing = test_bit(STRIPE_SYNCING, &sh->state);
 842         /* Now to look around and see what can be done */
 843
 844         for (i=disks; i--; ) {
 845                 bh = sh->bh_cache[i];
 846                 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
 847                 /* maybe we can reply to a read */
 848                 if (buffer_uptodate(bh) && sh->bh_read[i]) {
 849                         struct buffer_head *rbh, *rbh2;
 850                         PRINTK("Return read for disc %d\n", i);
 851                         spin_lock_irq(&conf->device_lock);
 852                         rbh = sh->bh_read[i];
 853                         sh->bh_read[i] = NULL;
 854                         spin_unlock_irq(&conf->device_lock);
 855                         while (rbh) {
 856                                 char *bdata;
 857                                 bdata = bh_kmap(rbh);
 858                                 memcpy(bdata, bh->b_data, bh->b_size);
 859                                 bh_kunmap(rbh);
 860                                 rbh2 = rbh->b_reqnext;
 861                                 rbh->b_reqnext = return_ok;
 862                                 return_ok = rbh;
 863                                 rbh = rbh2;
 864                         }
 865                 }
 866
 867                 /* now count some things */
 868                 if (buffer_locked(bh)) locked++;
 869                 if (buffer_uptodate(bh)) uptodate++;
 870
 871
 872                 if (sh->bh_read[i]) to_read++;
 873                 if (sh->bh_write[i]) to_write++;
 874                 if (sh->bh_written[i]) written++;
 875                 if (!conf->disks[i].operational) {
 876                         failed++;
 877                         failed_num = i;
 878                 }
 879         }
 880         PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
 881                locked, uptodate, to_read, to_write, failed, failed_num);
 882         /* check if the array has lost two devices and, if so, some requests might
 883          * need to be failed
 884          */
 885         if (failed > 1 && to_read+to_write+written) {
 886                 for (i=disks; i--; ) {
 887                         /* fail all writes first */
 888                         if (sh->bh_write[i]) to_write--;
 889                         while ((bh = sh->bh_write[i])) {
 890                                 sh->bh_write[i] = bh->b_reqnext;
 891                                 bh->b_reqnext = return_fail;
 892                                 return_fail = bh;
 893                         }
 894                         /* and fail all 'written' */
 895                         if (sh->bh_written[i]) written--;
 896                         while ((bh = sh->bh_written[i])) {
 897                                 sh->bh_written[i] = bh->b_reqnext;
 898                                 bh->b_reqnext = return_fail;
 899                                 return_fail = bh;
 900                         }
 901
 902                         /* fail any reads if this device is non-operational */
 903                         if (!conf->disks[i].operational) {
 904                                 spin_lock_irq(&conf->device_lock);
 905                                 if (sh->bh_read[i]) to_read--;
 906                                 while ((bh = sh->bh_read[i])) {
 907                                         sh->bh_read[i] = bh->b_reqnext;
 908                                         bh->b_reqnext = return_fail;
 909                                         return_fail = bh;
 910                                 }
 911                                 spin_unlock_irq(&conf->device_lock);
 912                         }
 913                 }
 914         }
 915         if (failed > 1 && syncing) {
 916                 md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
 917                 clear_bit(STRIPE_SYNCING, &sh->state);
 918                 syncing = 0;
 919         }
 920
 921         /* might be able to return some write requests if the parity block
 922          * is safe, or on a failed drive
 923          */
 924         bh = sh->bh_cache[sh->pd_idx];
 925         if ( written &&
 926              ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
 927                || (failed == 1 && failed_num == sh->pd_idx))
 928             ) {
 929             /* any written block on a uptodate or failed drive can be returned */
 930             for (i=disks; i--; )
 931                 if (sh->bh_written[i]) {
 932                     bh = sh->bh_cache[i];
 933                     if (!conf->disks[sh->pd_idx].operational ||
 934                         (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
 935                         /* maybe we can return some write requests */
 936                         struct buffer_head *wbh, *wbh2;
 937                         PRINTK("Return write for disc %d\n", i);
 938                         wbh = sh->bh_written[i];
 939                         sh->bh_written[i] = NULL;
 940                         while (wbh) {
 941                             wbh2 = wbh->b_reqnext;
 942                             wbh->b_reqnext = return_ok;
 943                             return_ok = wbh;
 944                             wbh = wbh2;
 945                         }
 946                     }
 947                 }
 948         }
 949
 950         /* Now we might consider reading some blocks, either to check/generate
 951          * parity, or to satisfy requests
 952          */
 953         if (to_read || (syncing && (uptodate < disks))) {
 954                 for (i=disks; i--;) {
 955                         bh = sh->bh_cache[i];
 956                         if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
 957                             (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
 958                                 /* we would like to get this block, possibly
 959                                  * by computing it, but we might not be able to
 960                                  */
 961                                 if (uptodate == disks-1) {
 962                                         PRINTK("Computing block %d\n", i);
 963                                         compute_block(sh, i);
 964                                         uptodate++;
 965                                 } else if (conf->disks[i].operational) {
 966                                         set_bit(BH_Lock, &bh->b_state);
 967                                         action[i] = READ+1;
 968                                         /* if I am just reading this block and we don't have
 969                                            a failed drive, or any pending writes then sidestep the cache */
 970                                         if (sh->bh_page[i]) BUG();
 971                                         if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
 972                                             ! syncing && !failed && !to_write) {
 973                                                 sh->bh_page[i] = sh->bh_cache[i]->b_page;
 974                                                 sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
 975                                                 sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
 976                                         }
 977                                         locked++;
 978                                         PRINTK("Reading block %d (sync=%d)\n", i, syncing);
 979                                         if (syncing)
 980                                                 md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
 981                                 }
 982                         }
 983                 }
 984                 set_bit(STRIPE_HANDLE, &sh->state);
 985         }
 986
 987         /* now to consider writing and what else, if anything should be read */
 988         if (to_write) {
 989                 int rmw=0, rcw=0;
 990                 for (i=disks ; i--;) {
 991                         /* would I have to read this buffer for read_modify_write */
 992                         bh = sh->bh_cache[i];
 993                         if ((sh->bh_write[i] || i == sh->pd_idx) &&
 994                             (!buffer_locked(bh) || sh->bh_page[i]) &&
 995                             !buffer_uptodate(bh)) {
 996                                 if (conf->disks[i].operational
 997 /*                                  && !(conf->resync_parity && i == sh->pd_idx) */
 998                                         )
 999                                         rmw++;
1000                                 else rmw += 2*disks;  /* cannot read it */
1001                         }
1002                         /* Would I have to read this buffer for reconstruct_write */
1003                         if (!sh->bh_write[i] && i != sh->pd_idx &&
1004                             (!buffer_locked(bh) || sh->bh_page[i]) &&
1005                             !buffer_uptodate(bh)) {
1006                                 if (conf->disks[i].operational) rcw++;
1007                                 else rcw += 2*disks;
1008                         }
1009                 }
1010                 PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
1011                 set_bit(STRIPE_HANDLE, &sh->state);
1012                 if (rmw < rcw && rmw > 0)
1013                         /* prefer read-modify-write, but need to get some data */
1014                         for (i=disks; i--;) {
1015                                 bh = sh->bh_cache[i];
1016                                 if ((sh->bh_write[i] || i == sh->pd_idx) &&
1017                                     !buffer_locked(bh) && !buffer_uptodate(bh) &&
1018                                     conf->disks[i].operational) {
1019                                         if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1020                                         {
1021                                                 PRINTK("Read_old block %d for r-m-w\n", i);
1022                                                 set_bit(BH_Lock, &bh->b_state);
1023                                                 action[i] = READ+1;
1024                                                 locked++;
1025                                         } else {
1026                                                 set_bit(STRIPE_DELAYED, &sh->state);
1027                                                 set_bit(STRIPE_HANDLE, &sh->state);
1028                                         }
1029                                 }
1030                         }
1031                 if (rcw <= rmw && rcw > 0)
1032                         /* want reconstruct write, but need to get some data */
1033                         for (i=disks; i--;) {
1034                                 bh = sh->bh_cache[i];
1035                                 if (!sh->bh_write[i]  && i != sh->pd_idx &&
1036                                     !buffer_locked(bh) && !buffer_uptodate(bh) &&
1037                                     conf->disks[i].operational) {
1038                                         if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1039                                         {
1040                                                 PRINTK("Read_old block %d for Reconstruct\n", i);
1041                                                 set_bit(BH_Lock, &bh->b_state);
1042                                                 action[i] = READ+1;
1043                                                 locked++;
1044                                         } else {
1045                                                 set_bit(STRIPE_DELAYED, &sh->state);
1046                                                 set_bit(STRIPE_HANDLE, &sh->state);
1047                                         }
1048                                 }
1049                         }
1050                 /* now if nothing is locked, and if we have enough data, we can start a write request */
1051                 if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1052                         PRINTK("Computing parity...\n");
1053                         compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1054                         /* now every locked buffer is ready to be written */
1055                         for (i=disks; i--;)
1056                                 if (buffer_locked(sh->bh_cache[i])) {
1057                                         PRINTK("Writing block %d\n", i);
1058                                         locked++;
1059                                         action[i] = WRITE+1;
1060                                         if (!conf->disks[i].operational
1061                                             || (i==sh->pd_idx && failed == 0))
1062                                                 set_bit(STRIPE_INSYNC, &sh->state);
1063                                 }
1064                         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1065                                 atomic_dec(&conf->preread_active_stripes);
1066                                 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1067                                         md_wakeup_thread(conf->thread);
1068                         }
1069                 }
1070         }
1071
1072         /* maybe we need to check and possibly fix the parity for this stripe
1073          * Any reads will already have been scheduled, so we just see if enough data
1074          * is available
1075          */
1076         if (syncing && locked == 0 &&
1077             !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1078                 set_bit(STRIPE_HANDLE, &sh->state);
1079                 if (failed == 0) {
1080                         if (uptodate != disks)
1081                                 BUG();
1082                         compute_parity(sh, CHECK_PARITY);
1083                         uptodate--;
1084                         bh = sh->bh_cache[sh->pd_idx];
1085                         if ((*(u32*)bh->b_data) == 0 &&
1086                             !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
1087                                 /* parity is correct (on disc, not in buffer any more) */
1088                                 set_bit(STRIPE_INSYNC, &sh->state);
1089                         }
1090                 }
1091                 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1092                         struct disk_info *spare;
1093                         if (failed==0)
1094                                 failed_num = sh->pd_idx;
1095                         /* should be able to compute the missing block and write it to spare */
1096                         if (!buffer_uptodate(sh->bh_cache[failed_num])) {
1097                                 if (uptodate+1 != disks)
1098                                         BUG();
1099                                 compute_block(sh, failed_num);
1100                                 uptodate++;
1101                         }
1102                         if (uptodate != disks)
1103                                 BUG();
1104                         bh = sh->bh_cache[failed_num];
1105                         set_bit(BH_Lock, &bh->b_state);
1106                         action[failed_num] = WRITE+1;
1107                         locked++;
1108                         set_bit(STRIPE_INSYNC, &sh->state);
1109                         if (conf->disks[failed_num].operational)
1110                                 md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
1111                         else if ((spare=conf->spare))
1112                                 md_sync_acct(spare->dev, bh->b_size>>9);
1113
1114                 }
1115         }
1116         if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1117                 md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
1118                 clear_bit(STRIPE_SYNCING, &sh->state);
1119         }
1120
1121
1122         spin_unlock(&sh->lock);
1123
1124         while ((bh=return_ok)) {
1125                 return_ok = bh->b_reqnext;
1126                 bh->b_reqnext = NULL;
1127                 bh->b_end_io(bh, 1);
1128         }
1129         while ((bh=return_fail)) {
1130                 return_fail = bh->b_reqnext;
1131                 bh->b_reqnext = NULL;
1132                 bh->b_end_io(bh, 0);
1133         }
1134         for (i=disks; i-- ;)
1135                 if (action[i]) {
1136                         struct buffer_head *bh = sh->bh_cache[i];
1137                         struct disk_info *spare = conf->spare;
1138                         int skip = 0;
1139                         if (action[i] == READ+1)
1140                                 bh->b_end_io = raid5_end_read_request;
1141                         else
1142                                 bh->b_end_io = raid5_end_write_request;
1143                         if (conf->disks[i].operational)
1144                                 bh->b_dev = conf->disks[i].dev;
1145                         else if (spare && action[i] == WRITE+1)
1146                                 bh->b_dev = spare->dev;
1147                         else skip=1;
1148                         if (!skip) {
1149                                 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
1150                                 atomic_inc(&sh->count);
1151                                 bh->b_rdev = bh->b_dev;
1152                                 bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
1153                                 generic_make_request(action[i]-1, bh);
1154                         } else {
1155                                 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
1156                                 clear_bit(BH_Lock, &bh->b_state);
1157                                 set_bit(STRIPE_HANDLE, &sh->state);
1158                         }
1159                 }
1160 }
1161
1162 static inline void raid5_activate_delayed(raid5_conf_t *conf)
1163 {
1164         if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1165                 while (!list_empty(&conf->delayed_list)) {
1166                         struct list_head *l = conf->delayed_list.next;
1167                         struct stripe_head *sh;
1168                         sh = list_entry(l, struct stripe_head, lru);
1169                         list_del_init(l);
1170                         clear_bit(STRIPE_DELAYED, &sh->state);
1171                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1172                                 atomic_inc(&conf->preread_active_stripes);
1173                         list_add_tail(&sh->lru, &conf->handle_list);
1174                 }
1175         }
1176 }
1177 static void raid5_unplug_device(void *data)
1178 {
1179         raid5_conf_t *conf = (raid5_conf_t *)data;
1180         unsigned long flags;
1181
1182         spin_lock_irqsave(&conf->device_lock, flags);
1183
1184         raid5_activate_delayed(conf);
1185
1186         conf->plugged = 0;
1187         md_wakeup_thread(conf->thread);
1188
1189         spin_unlock_irqrestore(&conf->device_lock, flags);
1190 }
1191
1192 static inline void raid5_plug_device(raid5_conf_t *conf)
1193 {
1194         spin_lock_irq(&conf->device_lock);
1195         if (list_empty(&conf->delayed_list))
1196                 if (!conf->plugged) {
1197                         conf->plugged = 1;
1198                         queue_task(&conf->plug_tq, &tq_disk);
1199                 }
1200         spin_unlock_irq(&conf->device_lock);
1201 }
1202
1203 static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1204 {
1205         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1206         const unsigned int raid_disks = conf->raid_disks;
1207         const unsigned int data_disks = raid_disks - 1;
1208         unsigned int dd_idx, pd_idx;
1209         unsigned long new_sector;
1210         int read_ahead = 0;
1211
1212         struct stripe_head *sh;
1213
1214         if (rw == READA) {
1215                 rw = READ;
1216                 read_ahead=1;
1217         }
1218
1219         new_sector = raid5_compute_sector(bh->b_rsector,
1220                         raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1221
1222         PRINTK("raid5_make_request, sector %lu\n", new_sector);
1223         sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1224         if (sh) {
1225                 sh->pd_idx = pd_idx;
1226
1227                 add_stripe_bh(sh, bh, dd_idx, rw);
1228
1229                 raid5_plug_device(conf);
1230                 handle_stripe(sh);
1231                 release_stripe(sh);
1232         } else
1233                 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1234         return 0;
1235 }
1236
1237 /*
1238  * Determine correct block size for this device.
1239  */
1240 unsigned int device_bsize (kdev_t dev)
1241 {
1242         unsigned int i, correct_size;
1243
1244         correct_size = BLOCK_SIZE;
1245         if (blksize_size[MAJOR(dev)]) {
1246                 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1247                 if (i)
1248                         correct_size = i;
1249         }
1250
1251         return correct_size;
1252 }
1253
1254 static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
1255 {
1256         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1257         struct stripe_head *sh;
1258         int sectors_per_chunk = conf->chunk_size >> 9;
1259         unsigned long stripe = sector_nr/sectors_per_chunk;
1260         int chunk_offset = sector_nr % sectors_per_chunk;
1261         int dd_idx, pd_idx;
1262         unsigned long first_sector;
1263         int raid_disks = conf->raid_disks;
1264         int data_disks = raid_disks-1;
1265         int redone = 0;
1266         int bufsize;
1267
1268         sh = get_active_stripe(conf, sector_nr, 0, 0);
1269         bufsize = sh->size;
1270         redone = sector_nr - sh->sector;
1271         first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1272                 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1273         sh->pd_idx = pd_idx;
1274         spin_lock(&sh->lock);
1275         set_bit(STRIPE_SYNCING, &sh->state);
1276         clear_bit(STRIPE_INSYNC, &sh->state);
1277         sh->sync_redone = redone;
1278         spin_unlock(&sh->lock);
1279
1280         handle_stripe(sh);
1281         release_stripe(sh);
1282
1283         return (bufsize>>9)-redone;
1284 }
1285
1286 /*
1287  * This is our raid5 kernel thread.
1288  *
1289  * We scan the hash table for stripes which can be handled now.
1290  * During the scan, completed stripes are saved for us by the interrupt
1291  * handler, so that they will not have to wait for our next wakeup.
1292  */
1293 static void raid5d (void *data)
1294 {
1295         struct stripe_head *sh;
1296         raid5_conf_t *conf = data;
1297         mddev_t *mddev = conf->mddev;
1298         int handled;
1299
1300         PRINTK("+++ raid5d active\n");
1301
1302         handled = 0;
1303
1304         if (mddev->sb_dirty)
1305                 md_update_sb(mddev);
1306         md_spin_lock_irq(&conf->device_lock);
1307         while (1) {
1308                 struct list_head *first;
1309
1310                 if (list_empty(&conf->handle_list) &&
1311                     atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1312                     !conf->plugged &&
1313                     !list_empty(&conf->delayed_list))
1314                         raid5_activate_delayed(conf);
1315
1316                 if (list_empty(&conf->handle_list))
1317                         break;
1318
1319                 first = conf->handle_list.next;
1320                 sh = list_entry(first, struct stripe_head, lru);
1321
1322                 list_del_init(first);
1323                 atomic_inc(&sh->count);
1324                 if (atomic_read(&sh->count)!= 1)
1325                         BUG();
1326                 md_spin_unlock_irq(&conf->device_lock);
1327
1328                 handled++;
1329                 handle_stripe(sh);
1330                 release_stripe(sh);
1331
1332                 md_spin_lock_irq(&conf->device_lock);
1333         }
1334         PRINTK("%d stripes handled\n", handled);
1335
1336         md_spin_unlock_irq(&conf->device_lock);
1337
1338         PRINTK("--- raid5d inactive\n");
1339 }
1340
1341 /*
1342  * Private kernel thread for parity reconstruction after an unclean
1343  * shutdown. Reconstruction on spare drives in case of a failed drive
1344  * is done by the generic mdsyncd.
1345  */
1346 static void raid5syncd (void *data)
1347 {
1348         raid5_conf_t *conf = data;
1349         mddev_t *mddev = conf->mddev;
1350
1351         if (!conf->resync_parity)
1352                 return;
1353         if (conf->resync_parity == 2)
1354                 return;
1355         down(&mddev->recovery_sem);
1356         if (md_do_sync(mddev,NULL)) {
1357                 up(&mddev->recovery_sem);
1358                 printk("raid5: resync aborted!\n");
1359                 return;
1360         }
1361         conf->resync_parity = 0;
1362         up(&mddev->recovery_sem);
1363         printk("raid5: resync finished.\n");
1364 }
1365
1366 static int raid5_run (mddev_t *mddev)
1367 {
1368         raid5_conf_t *conf;
1369         int i, j, raid_disk, memory;
1370         mdp_super_t *sb = mddev->sb;
1371         mdp_disk_t *desc;
1372         mdk_rdev_t *rdev;
1373         struct disk_info *disk;
1374         struct md_list_head *tmp;
1375         int start_recovery = 0;
1376
1377         MOD_INC_USE_COUNT;
1378
1379         if (sb->level != 5 && sb->level != 4) {
1380                 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1381                 MOD_DEC_USE_COUNT;
1382                 return -EIO;
1383         }
1384
1385         mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1386         if ((conf = mddev->private) == NULL)
1387                 goto abort;
1388         memset (conf, 0, sizeof (*conf));
1389         conf->mddev = mddev;
1390
1391         if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1392                 goto abort;
1393         memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1394
1395         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1396         md_init_waitqueue_head(&conf->wait_for_stripe);
1397         INIT_LIST_HEAD(&conf->handle_list);
1398         INIT_LIST_HEAD(&conf->delayed_list);
1399         INIT_LIST_HEAD(&conf->inactive_list);
1400         atomic_set(&conf->active_stripes, 0);
1401         atomic_set(&conf->preread_active_stripes, 0);
1402         conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
1403
1404         conf->plugged = 0;
1405         conf->plug_tq.sync = 0;
1406         conf->plug_tq.routine = &raid5_unplug_device;
1407         conf->plug_tq.data = conf;
1408
1409         PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1410
1411         ITERATE_RDEV(mddev,rdev,tmp) {
1412                 /*
1413                  * This is important -- we are using the descriptor on
1414                  * the disk only to get a pointer to the descriptor on
1415                  * the main superblock, which might be more recent.
1416                  */
1417                 desc = sb->disks + rdev->desc_nr;
1418                 raid_disk = desc->raid_disk;
1419                 disk = conf->disks + raid_disk;
1420
1421                 if (disk_faulty(desc)) {
1422                         printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1423                         if (!rdev->faulty) {
1424                                 MD_BUG();
1425                                 goto abort;
1426                         }
1427                         disk->number = desc->number;
1428                         disk->raid_disk = raid_disk;
1429                         disk->dev = rdev->dev;
1430
1431                         disk->operational = 0;
1432                         disk->write_only = 0;
1433                         disk->spare = 0;
1434                         disk->used_slot = 1;
1435                         continue;
1436                 }
1437                 if (disk_active(desc)) {
1438                         if (!disk_sync(desc)) {
1439                                 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1440                                 MD_BUG();
1441                                 goto abort;
1442                         }
1443                         if (raid_disk > sb->raid_disks) {
1444                                 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1445                                 continue;
1446                         }
1447                         if (disk->operational) {
1448                                 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1449                                 continue;
1450                         }
1451                         printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1452
1453                         disk->number = desc->number;
1454                         disk->raid_disk = raid_disk;
1455                         disk->dev = rdev->dev;
1456                         disk->operational = 1;
1457                         disk->used_slot = 1;
1458
1459                         conf->working_disks++;
1460                 } else {
1461                         /*
1462                          * Must be a spare disk ..
1463                          */
1464                         printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1465                         disk->number = desc->number;
1466                         disk->raid_disk = raid_disk;
1467                         disk->dev = rdev->dev;
1468
1469                         disk->operational = 0;
1470                         disk->write_only = 0;
1471                         disk->spare = 1;
1472                         disk->used_slot = 1;
1473                 }
1474         }
1475
1476         for (i = 0; i < MD_SB_DISKS; i++) {
1477                 desc = sb->disks + i;
1478                 raid_disk = desc->raid_disk;
1479                 disk = conf->disks + raid_disk;
1480
1481                 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1482                         !conf->disks[raid_disk].used_slot) {
1483
1484                         disk->number = desc->number;
1485                         disk->raid_disk = raid_disk;
1486                         disk->dev = MKDEV(0,0);
1487
1488                         disk->operational = 0;
1489                         disk->write_only = 0;
1490                         disk->spare = 0;
1491                         disk->used_slot = 1;
1492                 }
1493         }
1494
1495         conf->raid_disks = sb->raid_disks;
1496         /*
1497          * 0 for a fully functional array, 1 for a degraded array.
1498          */
1499         conf->failed_disks = conf->raid_disks - conf->working_disks;
1500         conf->mddev = mddev;
1501         conf->chunk_size = sb->chunk_size;
1502         conf->level = sb->level;
1503         conf->algorithm = sb->layout;
1504         conf->max_nr_stripes = NR_STRIPES;
1505
1506 #if 0
1507         for (i = 0; i < conf->raid_disks; i++) {
1508                 if (!conf->disks[i].used_slot) {
1509                         MD_BUG();
1510                         goto abort;
1511                 }
1512         }
1513 #endif
1514         if (!conf->chunk_size || conf->chunk_size % 4) {
1515                 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1516                 goto abort;
1517         }
1518         if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1519                 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1520                 goto abort;
1521         }
1522         if (conf->failed_disks > 1) {
1523                 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1524                 goto abort;
1525         }
1526
1527         if (conf->working_disks != sb->raid_disks) {
1528                 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1529                 start_recovery = 1;
1530         }
1531
1532         {
1533                 const char * name = "raid5d";
1534
1535                 conf->thread = md_register_thread(raid5d, conf, name);
1536                 if (!conf->thread) {
1537                         printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1538                         goto abort;
1539                 }
1540         }
1541
1542         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1543                  conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1544         if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1545                 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1546                 shrink_stripes(conf, conf->max_nr_stripes);
1547                 goto abort;
1548         } else
1549                 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1550
1551         /*
1552          * Regenerate the "device is in sync with the raid set" bit for
1553          * each device.
1554          */
1555         for (i = 0; i < MD_SB_DISKS ; i++) {
1556                 mark_disk_nonsync(sb->disks + i);
1557                 for (j = 0; j < sb->raid_disks; j++) {
1558                         if (!conf->disks[j].operational)
1559                                 continue;
1560                         if (sb->disks[i].number == conf->disks[j].number)
1561                                 mark_disk_sync(sb->disks + i);
1562                 }
1563         }
1564         sb->active_disks = conf->working_disks;
1565
1566         if (sb->active_disks == sb->raid_disks)
1567                 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1568         else
1569                 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1570
1571         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1572                 const char * name = "raid5syncd";
1573
1574                 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1575                 if (!conf->resync_thread) {
1576                         printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1577                         goto abort;
1578                 }
1579
1580                 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1581                 conf->resync_parity = 1;
1582                 md_wakeup_thread(conf->resync_thread);
1583         }
1584
1585         print_raid5_conf(conf);
1586         if (start_recovery)
1587                 md_recover_arrays();
1588         print_raid5_conf(conf);
1589
1590         /* Ok, everything is just fine now */
1591         return (0);
1592 abort:
1593         if (conf) {
1594                 print_raid5_conf(conf);
1595                 if (conf->stripe_hashtbl)
1596                         free_pages((unsigned long) conf->stripe_hashtbl,
1597                                                         HASH_PAGES_ORDER);
1598                 kfree(conf);
1599         }
1600         mddev->private = NULL;
1601         printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1602         MOD_DEC_USE_COUNT;
1603         return -EIO;
1604 }
1605
1606 static int raid5_stop_resync (mddev_t *mddev)
1607 {
1608         raid5_conf_t *conf = mddev_to_conf(mddev);
1609         mdk_thread_t *thread = conf->resync_thread;
1610
1611         if (thread) {
1612                 if (conf->resync_parity) {
1613                         conf->resync_parity = 2;
1614                         md_interrupt_thread(thread);
1615                         printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1616                         return 1;
1617                 }
1618                 return 0;
1619         }
1620         return 0;
1621 }
1622
1623 static int raid5_restart_resync (mddev_t *mddev)
1624 {
1625         raid5_conf_t *conf = mddev_to_conf(mddev);
1626
1627         if (conf->resync_parity) {
1628                 if (!conf->resync_thread) {
1629                         MD_BUG();
1630                         return 0;
1631                 }
1632                 printk("raid5: waking up raid5resync.\n");
1633                 conf->resync_parity = 1;
1634                 md_wakeup_thread(conf->resync_thread);
1635                 return 1;
1636         } else
1637                 printk("raid5: no restart-resync needed.\n");
1638         return 0;
1639 }
1640
1641
1642 static int raid5_stop (mddev_t *mddev)
1643 {
1644         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1645
1646         if (conf->resync_thread)
1647                 md_unregister_thread(conf->resync_thread);
1648         md_unregister_thread(conf->thread);
1649         shrink_stripes(conf, conf->max_nr_stripes);
1650         free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1651         kfree(conf);
1652         mddev->private = NULL;
1653         MOD_DEC_USE_COUNT;
1654         return 0;
1655 }
1656
1657 #if RAID5_DEBUG
1658 static void print_sh (struct stripe_head *sh)
1659 {
1660         int i;
1661
1662         printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
1663         printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
1664         printk("sh %lu, ", sh->sector);
1665         for (i = 0; i < MD_SB_DISKS; i++) {
1666                 if (sh->bh_cache[i])
1667                         printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
1668         }
1669         printk("\n");
1670 }
1671
1672 static void printall (raid5_conf_t *conf)
1673 {
1674         struct stripe_head *sh;
1675         int i;
1676
1677         md_spin_lock_irq(&conf->device_lock);
1678         for (i = 0; i < NR_HASH; i++) {
1679                 sh = conf->stripe_hashtbl[i];
1680                 for (; sh; sh = sh->hash_next) {
1681                         if (sh->raid_conf != conf)
1682                                 continue;
1683                         print_sh(sh);
1684                 }
1685         }
1686         md_spin_unlock_irq(&conf->device_lock);
1687
1688         PRINTK("--- raid5d inactive\n");
1689 }
1690 #endif
1691
1692 static void raid5_status (struct seq_file *seq, mddev_t *mddev)
1693 {
1694         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1695         mdp_super_t *sb = mddev->sb;
1696         int i;
1697
1698         seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1699         seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1700         for (i = 0; i < conf->raid_disks; i++)
1701                 seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
1702         seq_printf (seq, "]");
1703 #if RAID5_DEBUG
1704 #define D(x) \
1705         seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
1706         printall(conf);
1707 #endif
1708
1709 }
1710
1711 static void print_raid5_conf (raid5_conf_t *conf)
1712 {
1713         int i;
1714         struct disk_info *tmp;
1715
1716         printk("RAID5 conf printout:\n");
1717         if (!conf) {
1718                 printk("(conf==NULL)\n");
1719                 return;
1720         }
1721         printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1722                  conf->working_disks, conf->failed_disks);
1723
1724 #if RAID5_DEBUG
1725         for (i = 0; i < MD_SB_DISKS; i++) {
1726 #else
1727         for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1728 #endif
1729                 tmp = conf->disks + i;
1730                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
1731                         i, tmp->spare,tmp->operational,
1732                         tmp->number,tmp->raid_disk,tmp->used_slot,
1733                         partition_name(tmp->dev));
1734         }
1735 }
1736
1737 static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
1738 {
1739         int err = 0;
1740         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
1741         raid5_conf_t *conf = mddev->private;
1742         struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
1743         mdp_super_t *sb = mddev->sb;
1744         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
1745         mdk_rdev_t *spare_rdev, *failed_rdev;
1746
1747         print_raid5_conf(conf);
1748         md_spin_lock_irq(&conf->device_lock);
1749         /*
1750          * find the disk ...
1751          */
1752         switch (state) {
1753
1754         case DISKOP_SPARE_ACTIVE:
1755
1756                 /*
1757                  * Find the failed disk within the RAID5 configuration ...
1758                  * (this can only be in the first conf->raid_disks part)
1759                  */
1760                 for (i = 0; i < conf->raid_disks; i++) {
1761                         tmp = conf->disks + i;
1762                         if ((!tmp->operational && !tmp->spare) ||
1763                                         !tmp->used_slot) {
1764                                 failed_disk = i;
1765                                 break;
1766                         }
1767                 }
1768                 /*
1769                  * When we activate a spare disk we _must_ have a disk in
1770                  * the lower (active) part of the array to replace.
1771                  */
1772                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
1773                         MD_BUG();
1774                         err = 1;
1775                         goto abort;
1776                 }
1777                 /* fall through */
1778
1779         case DISKOP_SPARE_WRITE:
1780         case DISKOP_SPARE_INACTIVE:
1781
1782                 /*
1783                  * Find the spare disk ... (can only be in the 'high'
1784                  * area of the array)
1785                  */
1786                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1787                         tmp = conf->disks + i;
1788                         if (tmp->spare && tmp->number == (*d)->number) {
1789                                 spare_disk = i;
1790                                 break;
1791                         }
1792                 }
1793                 if (spare_disk == -1) {
1794                         MD_BUG();
1795                         err = 1;
1796                         goto abort;
1797                 }
1798                 break;
1799
1800         case DISKOP_HOT_REMOVE_DISK:
1801
1802                 for (i = 0; i < MD_SB_DISKS; i++) {
1803                         tmp = conf->disks + i;
1804                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
1805                                 if (tmp->operational) {
1806                                         err = -EBUSY;
1807                                         goto abort;
1808                                 }
1809                                 removed_disk = i;
1810                                 break;
1811                         }
1812                 }
1813                 if (removed_disk == -1) {
1814                         MD_BUG();
1815                         err = 1;
1816                         goto abort;
1817                 }
1818                 break;
1819
1820         case DISKOP_HOT_ADD_DISK:
1821
1822                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1823                         tmp = conf->disks + i;
1824                         if (!tmp->used_slot) {
1825                                 added_disk = i;
1826                                 break;
1827                         }
1828                 }
1829                 if (added_disk == -1) {
1830                         MD_BUG();
1831                         err = 1;
1832                         goto abort;
1833                 }
1834                 break;
1835         }
1836
1837         switch (state) {
1838         /*
1839          * Switch the spare disk to write-only mode:
1840          */
1841         case DISKOP_SPARE_WRITE:
1842                 if (conf->spare) {
1843                         MD_BUG();
1844                         err = 1;
1845                         goto abort;
1846                 }
1847                 sdisk = conf->disks + spare_disk;
1848                 sdisk->operational = 1;
1849                 sdisk->write_only = 1;
1850                 conf->spare = sdisk;
1851                 break;
1852         /*
1853          * Deactivate a spare disk:
1854          */
1855         case DISKOP_SPARE_INACTIVE:
1856                 sdisk = conf->disks + spare_disk;
1857                 sdisk->operational = 0;
1858                 sdisk->write_only = 0;
1859                 /*
1860                  * Was the spare being resynced?
1861                  */
1862                 if (conf->spare == sdisk)
1863                         conf->spare = NULL;
1864                 break;
1865         /*
1866          * Activate (mark read-write) the (now sync) spare disk,
1867          * which means we switch it's 'raid position' (->raid_disk)
1868          * with the failed disk. (only the first 'conf->raid_disks'
1869          * slots are used for 'real' disks and we must preserve this
1870          * property)
1871          */
1872         case DISKOP_SPARE_ACTIVE:
1873                 if (!conf->spare) {
1874                         MD_BUG();
1875                         err = 1;
1876                         goto abort;
1877                 }
1878                 sdisk = conf->disks + spare_disk;
1879                 fdisk = conf->disks + failed_disk;
1880
1881                 spare_desc = &sb->disks[sdisk->number];
1882                 failed_desc = &sb->disks[fdisk->number];
1883
1884                 if (spare_desc != *d) {
1885                         MD_BUG();
1886                         err = 1;
1887                         goto abort;
1888                 }
1889
1890                 if (spare_desc->raid_disk != sdisk->raid_disk) {
1891                         MD_BUG();
1892                         err = 1;
1893                         goto abort;
1894                 }
1895
1896                 if (sdisk->raid_disk != spare_disk) {
1897                         MD_BUG();
1898                         err = 1;
1899                         goto abort;
1900                 }
1901
1902                 if (failed_desc->raid_disk != fdisk->raid_disk) {
1903                         MD_BUG();
1904                         err = 1;
1905                         goto abort;
1906                 }
1907
1908                 if (fdisk->raid_disk != failed_disk) {
1909                         MD_BUG();
1910                         err = 1;
1911                         goto abort;
1912                 }
1913
1914                 /*
1915                  * do the switch finally
1916                  */
1917                 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1918                 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1919
1920                 /* There must be a spare_rdev, but there may not be a
1921                  * failed_rdev.  That slot might be empty...
1922                  */
1923                 spare_rdev->desc_nr = failed_desc->number;
1924                 if (failed_rdev)
1925                         failed_rdev->desc_nr = spare_desc->number;
1926
1927                 xchg_values(*spare_desc, *failed_desc);
1928                 xchg_values(*fdisk, *sdisk);
1929
1930                 /*
1931                  * (careful, 'failed' and 'spare' are switched from now on)
1932                  *
1933                  * we want to preserve linear numbering and we want to
1934                  * give the proper raid_disk number to the now activated
1935                  * disk. (this means we switch back these values)
1936                  */
1937
1938                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1939                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1940                 xchg_values(spare_desc->number, failed_desc->number);
1941                 xchg_values(sdisk->number, fdisk->number);
1942
1943                 *d = failed_desc;
1944
1945                 if (sdisk->dev == MKDEV(0,0))
1946                         sdisk->used_slot = 0;
1947
1948                 /*
1949                  * this really activates the spare.
1950                  */
1951                 fdisk->spare = 0;
1952                 fdisk->write_only = 0;
1953
1954                 /*
1955                  * if we activate a spare, we definitely replace a
1956                  * non-operational disk slot in the 'low' area of
1957                  * the disk array.
1958                  */
1959                 conf->failed_disks--;
1960                 conf->working_disks++;
1961                 conf->spare = NULL;
1962
1963                 break;
1964
1965         case DISKOP_HOT_REMOVE_DISK:
1966                 rdisk = conf->disks + removed_disk;
1967
1968                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1969                         MD_BUG();
1970                         err = 1;
1971                         goto abort;
1972                 }
1973                 rdisk->dev = MKDEV(0,0);
1974                 rdisk->used_slot = 0;
1975
1976                 break;
1977
1978         case DISKOP_HOT_ADD_DISK:
1979                 adisk = conf->disks + added_disk;
1980                 added_desc = *d;
1981
1982                 if (added_disk != added_desc->number) {
1983                         MD_BUG();
1984                         err = 1;
1985                         goto abort;
1986                 }
1987
1988                 adisk->number = added_desc->number;
1989                 adisk->raid_disk = added_desc->raid_disk;
1990                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1991
1992                 adisk->operational = 0;
1993                 adisk->write_only = 0;
1994                 adisk->spare = 1;
1995                 adisk->used_slot = 1;
1996
1997
1998                 break;
1999
2000         default:
2001                 MD_BUG();
2002                 err = 1;
2003                 goto abort;
2004         }
2005 abort:
2006         md_spin_unlock_irq(&conf->device_lock);
2007         print_raid5_conf(conf);
2008         return err;
2009 }
2010
2011 static mdk_personality_t raid5_personality=
2012 {
2013         name:           "raid5",
2014         make_request:   raid5_make_request,
2015         run:            raid5_run,
2016         stop:           raid5_stop,
2017         status:         raid5_status,
2018         error_handler:  raid5_error,
2019         diskop:         raid5_diskop,
2020         stop_resync:    raid5_stop_resync,
2021         restart_resync: raid5_restart_resync,
2022         sync_request:   raid5_sync_request
2023 };
2024
2025 static int md__init raid5_init (void)
2026 {
2027         return register_md_personality (RAID5, &raid5_personality);
2028 }
2029
2030 static void raid5_exit (void)
2031 {
2032         unregister_md_personality (RAID5);
2033 }
2034
2035 module_init(raid5_init);
2036 module_exit(raid5_exit);
2037 MODULE_LICENSE("GPL");