linux-2.4: drivers/md/multipath.c
/*
 * multipath.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * MULTIPATH management functions.
 *
 * derived from raid1.c.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/multipath.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define NR_RESERVED_BUFS        32
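
/*
 * NR_RESERVED_BUFS is the size of the pre-allocated pool of
 * struct multipath_bh kept on conf->freer1; it is filled by
 * multipath_grow_mpbh() at array start-up and drained/refilled by
 * multipath_alloc_mpbh()/multipath_free_mpbh() below.
 */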


/*
 * The following can be used to debug the driver
 */
#define MULTIPATH_DEBUG 0

#if MULTIPATH_DEBUG
#define PRINTK(x...)   printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)  do { } while (0)
#endif


static mdk_personality_t multipath_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;

static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);


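/*
 * Allocate a multipath_bh, preferring the pre-allocated pool on
 * conf->freer1; fall back to kmalloc(GFP_NOIO), and as a last
 * resort sleep on conf->wait_buffer until enough pool entries
 * have been returned via multipath_free_mpbh().
 */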
static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
        struct multipath_bh *mp_bh = NULL;

        do {
                md_spin_lock_irq(&conf->device_lock);
                if (!conf->freer1_blocked && conf->freer1) {
                        mp_bh = conf->freer1;
                        conf->freer1 = mp_bh->next_mp;
                        conf->freer1_cnt--;
                        mp_bh->next_mp = NULL;
                        mp_bh->state = (1 << MPBH_PreAlloc);
                        mp_bh->bh_req.b_state = 0;
                }
                md_spin_unlock_irq(&conf->device_lock);
                if (mp_bh)
                        return mp_bh;
                mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
                                        GFP_NOIO);
                if (mp_bh) {
                        memset(mp_bh, 0, sizeof(*mp_bh));
                        return mp_bh;
                }
                conf->freer1_blocked = 1;
                wait_disk_event(conf->wait_buffer,
                                !conf->freer1_blocked ||
                                conf->freer1_cnt > NR_RESERVED_BUFS/2
                    );
                conf->freer1_blocked = 0;
        } while (1);
}

static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
{
        multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);

        if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
                unsigned long flags;
                spin_lock_irqsave(&conf->device_lock, flags);
                mp_bh->next_mp = conf->freer1;
                conf->freer1 = mp_bh;
                conf->freer1_cnt++;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                wake_up(&conf->wait_buffer);
        } else {
                kfree(mp_bh);
        }
}

static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
{
        int i = 0;

        while (i < cnt) {
                struct multipath_bh *mp_bh;
                mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
                if (!mp_bh)
                        break;
                memset(mp_bh, 0, sizeof(*mp_bh));
                set_bit(MPBH_PreAlloc, &mp_bh->state);
                mp_bh->mddev = conf->mddev;

                multipath_free_mpbh(mp_bh);
                i++;
        }
        return i;
}

static void multipath_shrink_mpbh(multipath_conf_t *conf)
{
        md_spin_lock_irq(&conf->device_lock);
        while (conf->freer1) {
                struct multipath_bh *mp_bh = conf->freer1;
                conf->freer1 = mp_bh->next_mp;
                conf->freer1_cnt--;
                kfree(mp_bh);
        }
        md_spin_unlock_irq(&conf->device_lock);
}


static int multipath_map (mddev_t *mddev, kdev_t *rdev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int i, disks = MD_SB_DISKS;

        /*
         * Later we will do read balancing on the read side;
         * for now we use the first operational disk.
         */

        for (i = 0; i < disks; i++) {
                if (conf->multipaths[i].operational) {
                        *rdev = conf->multipaths[i].dev;
                        return (0);
                }
        }

        printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
        return (-1);
}

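/*
 * Queue a failed request on the global retry list and wake the
 * multipathd thread, which will remap it to a working path and
 * resubmit it (see multipathd() below).
 */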
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
        unsigned long flags;
        mddev_t *mddev = mp_bh->mddev;
        multipath_conf_t *conf = mddev_to_conf(mddev);

        md_spin_lock_irqsave(&retry_list_lock, flags);
        if (multipath_retry_list == NULL)
                multipath_retry_tail = &multipath_retry_list;
        *multipath_retry_tail = mp_bh;
        multipath_retry_tail = &mp_bh->next_mp;
        mp_bh->next_mp = NULL;
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
        md_wakeup_thread(conf->thread);
}


/*
 * multipath_end_bh_io() is called when we have finished servicing a multipathed
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
{
        struct buffer_head *bh = mp_bh->master_bh;

        bh->b_end_io(bh, uptodate);
        multipath_free_mpbh(mp_bh);
}

void multipath_end_request (struct buffer_head *bh, int uptodate)
{
        struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);

        /*
         * this branch is our 'one multipath IO has finished' event handler:
         */
        if (!uptodate)
                md_error (mp_bh->mddev, bh->b_dev);
        else
                /*
                 * Set MPBH_Uptodate in our master buffer_head, so that
                 * we will return a good error code to the higher
                 * levels even if IO on some other multipathed buffer fails.
                 *
                 * The 'master' represents the complex operation to
                 * user-side. So if something waits for IO, then it will
                 * wait for the 'master' buffer_head.
                 */
                set_bit (MPBH_Uptodate, &mp_bh->state);

        if (uptodate) {
                multipath_end_bh_io(mp_bh, uptodate);
                return;
        }
        /*
         * oops, IO error:
         */
        printk(KERN_ERR "multipath: %s: rescheduling block %lu\n",
                 partition_name(bh->b_dev), bh->b_blocknr);
        multipath_reschedule_retry(mp_bh);
        return;
}

/*
 * This routine returns the disk from which the requested read should
 * be done.
 */
static int multipath_read_balance (multipath_conf_t *conf)
{
        int disk;

        for (disk = 0; disk < conf->raid_disks; disk++)
                if (conf->multipaths[disk].operational)
                        return disk;
        BUG();
        return 0;
}

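/*
 * Request path: clone the master buffer_head into mp_bh->bh_req,
 * redirect it to the path chosen by multipath_read_balance() and
 * hand it to generic_make_request(); completion comes back through
 * multipath_end_request() above.
 */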
static int multipath_make_request (mddev_t *mddev, int rw,
                               struct buffer_head * bh)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct buffer_head *bh_req;
        struct multipath_bh * mp_bh;
        struct multipath_info *multipath;

        if (!buffer_locked(bh))
                BUG();

/*
 * make_request() can abort the operation when READA is being
 * used and no empty request is available.
 *
 * Currently, just replace the command with READ/WRITE.
 */
        if (rw == READA)
                rw = READ;

        mp_bh = multipath_alloc_mpbh (conf);

        mp_bh->master_bh = bh;
        mp_bh->mddev = mddev;
        mp_bh->cmd = rw;

        /*
         * read balancing logic:
         */
        multipath = conf->multipaths + multipath_read_balance(conf);

        bh_req = &mp_bh->bh_req;
        memcpy(bh_req, bh, sizeof(*bh));
        bh_req->b_blocknr = bh->b_rsector;
        bh_req->b_dev = multipath->dev;
        bh_req->b_rdev = multipath->dev;
/*      bh_req->b_rsector = bh->n_rsector; */
        bh_req->b_end_io = multipath_end_request;
        bh_req->b_private = mp_bh;
        generic_make_request (rw, bh_req);
        return 0;
}

static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int i;

        seq_printf (seq, " [%d/%d] [", conf->raid_disks,
                                                 conf->working_disks);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf (seq, "%s",
                        conf->multipaths[i].operational ? "U" : "_");
        seq_printf (seq, "]");
}

#define LAST_DISK KERN_ALERT \
"multipath: only one IO path left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"multipath: no spare IO path left!\n"

#define DISK_FAILED KERN_ALERT \
"multipath: IO failure on %s, disabling IO path.\n" \
"       Operation continuing on %d IO paths.\n"

static void mark_disk_bad (mddev_t *mddev, int failed)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct multipath_info *multipath = conf->multipaths+failed;
        mdp_super_t *sb = mddev->sb;

        multipath->operational = 0;
        mark_disk_faulty(sb->disks+multipath->number);
        mark_disk_nonsync(sb->disks+multipath->number);
        mark_disk_inactive(sb->disks+multipath->number);
        sb->active_disks--;
        sb->working_disks--;
        sb->failed_disks++;
        mddev->sb_dirty = 1;
        md_wakeup_thread(conf->thread);
        conf->working_disks--;
        printk (DISK_FAILED, partition_name (multipath->dev),
                                 conf->working_disks);
}

/*
 * Careful, this can execute in IRQ contexts as well!
 */
static int multipath_error (mddev_t *mddev, kdev_t dev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct multipath_info * multipaths = conf->multipaths;
        int disks = MD_SB_DISKS;
        int other_paths = 1;
        int i;

        if (conf->working_disks == 1) {
                other_paths = 0;
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].spare) {
                                other_paths = 1;
                                break;
                        }
                }
        }

        if (!other_paths) {
                /*
                 * Uh oh, we can do nothing if this is our last path, but
                 * first check if this is a queued request for a device
                 * which has just failed.
                 */
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].dev==dev && !multipaths[i].operational)
                                return 0;
                }
                printk (LAST_DISK);
        } else {
                /*
                 * Mark disk as unusable
                 */
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].dev==dev && multipaths[i].operational) {
                                mark_disk_bad(mddev, i);
                                break;
                        }
                }
                if (!conf->working_disks) {
                        int err = 1;
                        mdp_disk_t *spare;
                        mdp_super_t *sb = mddev->sb;

                        spare = get_spare(mddev);
                        if (spare) {
                                err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
                                printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
                        }
                        if (!err && !disk_faulty(spare)) {
                                multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
                                mark_disk_sync(spare);
                                mark_disk_active(spare);
                                sb->active_disks++;
                                sb->spare_disks--;
                        }
                }
        }
        return 0;
}

#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED


static void print_multipath_conf (multipath_conf_t *conf)
{
        int i;
        struct multipath_info *tmp;

        printk("MULTIPATH conf printout:\n");
        if (!conf) {
                printk("(conf==NULL)\n");
                return;
        }
        printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
                         conf->raid_disks, conf->nr_disks);

        for (i = 0; i < MD_SB_DISKS; i++) {
                tmp = conf->multipaths + i;
                if (tmp->spare || tmp->operational || tmp->number ||
                                tmp->raid_disk || tmp->used_slot)
                        printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
                                i, tmp->spare,tmp->operational,
                                tmp->number,tmp->raid_disk,tmp->used_slot,
                                partition_name(tmp->dev));
        }
}

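/*
 * Disk operations (spare activation/deactivation, hot add/remove)
 * arrive from the md core with a disk descriptor and a DISKOP_*
 * state.  The first switch below only locates the affected slot(s);
 * the second one carries out the operation.  Both run under
 * conf->device_lock.
 */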
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
        int err = 0;
        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
        multipath_conf_t *conf = mddev->private;
        struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
        mdk_rdev_t *spare_rdev, *failed_rdev;

        print_multipath_conf(conf);
        md_spin_lock_irq(&conf->device_lock);
        /*
         * find the disk ...
         */
        switch (state) {

        case DISKOP_SPARE_ACTIVE:

                /*
                 * Find the failed disk within the MULTIPATH configuration ...
                 * (this can only be in the first conf->working_disks part)
                 */
                for (i = 0; i < conf->raid_disks; i++) {
                        tmp = conf->multipaths + i;
                        if ((!tmp->operational && !tmp->spare) ||
                                        !tmp->used_slot) {
                                failed_disk = i;
                                break;
                        }
                }
                /*
                 * When we activate a spare disk we _must_ have a disk in
                 * the lower (active) part of the array to replace.
                 */
                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                /* fall through */

        case DISKOP_SPARE_WRITE:
        case DISKOP_SPARE_INACTIVE:

                /*
                 * Find the spare disk ... (can only be in the 'high'
                 * area of the array)
                 */
                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (tmp->spare && tmp->number == (*d)->number) {
                                spare_disk = i;
                                break;
                        }
                }
                if (spare_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_REMOVE_DISK:

                for (i = 0; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
                                if (tmp->operational) {
                                        printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number);
                                        err = -EBUSY;
                                        goto abort;
                                }
                                removed_disk = i;
                                break;
                        }
                }
                if (removed_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_ADD_DISK:

                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (!tmp->used_slot) {
                                added_disk = i;
                                break;
                        }
                }
                if (added_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;
        }

        switch (state) {
        /*
         * Switch the spare disk to write-only mode:
         */
        case DISKOP_SPARE_WRITE:
                sdisk = conf->multipaths + spare_disk;
                sdisk->operational = 1;
                break;
        /*
         * Deactivate a spare disk:
         */
        case DISKOP_SPARE_INACTIVE:
                sdisk = conf->multipaths + spare_disk;
                sdisk->operational = 0;
                break;
        /*
         * Activate (mark read-write) the (now sync) spare disk,
         * which means we switch its 'raid position' (->raid_disk)
         * with the failed disk. (only the first 'conf->nr_disks'
         * slots are used for 'real' disks and we must preserve this
         * property)
         */
        case DISKOP_SPARE_ACTIVE:
                sdisk = conf->multipaths + spare_disk;
                fdisk = conf->multipaths + failed_disk;

                spare_desc = &sb->disks[sdisk->number];
                failed_desc = &sb->disks[fdisk->number];

                if (spare_desc != *d) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (spare_desc->raid_disk != sdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (sdisk->raid_disk != spare_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (failed_desc->raid_disk != fdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (fdisk->raid_disk != failed_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                /*
                 * do the switch finally
                 */
                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
                failed_rdev = find_rdev_nr(mddev, failed_desc->number);
                xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
                spare_rdev->alias_device = 0;
                failed_rdev->alias_device = 1;

                xchg_values(*spare_desc, *failed_desc);
                xchg_values(*fdisk, *sdisk);

                /*
                 * (careful, 'failed' and 'spare' are switched from now on)
                 *
                 * we want to preserve linear numbering and we want to
                 * give the proper raid_disk number to the now activated
                 * disk. (this means we switch back these values)
                 */

                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
                xchg_values(spare_desc->number, failed_desc->number);
                xchg_values(sdisk->number, fdisk->number);

                *d = failed_desc;

                if (sdisk->dev == MKDEV(0,0))
                        sdisk->used_slot = 0;
                /*
                 * this really activates the spare.
                 */
                fdisk->spare = 0;

                /*
                 * if we activate a spare, we definitely replace a
                 * non-operational disk slot in the 'low' area of
                 * the disk array.
                 */

                conf->working_disks++;

                break;

        case DISKOP_HOT_REMOVE_DISK:
                rdisk = conf->multipaths + removed_disk;

                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                rdisk->dev = MKDEV(0,0);
                rdisk->used_slot = 0;
                conf->nr_disks--;
                break;

        case DISKOP_HOT_ADD_DISK:
                adisk = conf->multipaths + added_disk;
                added_desc = *d;

                if (added_disk != added_desc->number) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                adisk->number = added_desc->number;
                adisk->raid_disk = added_desc->raid_disk;
                adisk->dev = MKDEV(added_desc->major,added_desc->minor);

                adisk->operational = 0;
                adisk->spare = 1;
                adisk->used_slot = 1;
                conf->nr_disks++;

                break;

        default:
                MD_BUG();
                err = 1;
                goto abort;
        }
abort:
        md_spin_unlock_irq(&conf->device_lock);

        print_multipath_conf(conf);
        return err;
}


#define IO_ERROR KERN_ALERT \
"multipath: %s: unrecoverable IO read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"multipath: %s: redirecting sector %lu to another IO path\n"

/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working multipaths.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */

static void multipathd (void *data)
{
        struct multipath_bh *mp_bh;
        struct buffer_head *bh;
        unsigned long flags;
        mddev_t *mddev;
        kdev_t dev;


        for (;;) {
                md_spin_lock_irqsave(&retry_list_lock, flags);
                mp_bh = multipath_retry_list;
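                /*
                 * NB: on an empty list we leave the loop with
                 * retry_list_lock still held; the unlock below the
                 * loop drops it.
                 */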
                if (!mp_bh)
                        break;
                multipath_retry_list = mp_bh->next_mp;
                md_spin_unlock_irqrestore(&retry_list_lock, flags);

                mddev = mp_bh->mddev;
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
                bh = &mp_bh->bh_req;
                dev = bh->b_dev;

                multipath_map (mddev, &bh->b_dev);
                if (bh->b_dev == dev) {
                        printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
                        multipath_end_bh_io(mp_bh, 0);
                } else {
                        printk (REDIRECT_SECTOR,
                                partition_name(bh->b_dev), bh->b_blocknr);
                        bh->b_rdev = bh->b_dev;
                        bh->b_rsector = bh->b_blocknr;
                        generic_make_request (mp_bh->cmd, bh);
                }
        }
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR

/*
 * This will catch the scenario in which one of the multipaths was
 * mounted as a normal device rather than as a part of a raid set.
 *
 * check_consistency is very personality-dependent, e.g. RAID5 cannot
 * do this check, it uses another method.
 */
static int __check_consistency (mddev_t *mddev, int row)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int disks = MD_SB_DISKS;
        kdev_t dev;
        struct buffer_head *bh = NULL;
        int i, rc = 0;
        char *buffer = NULL;

        for (i = 0; i < disks; i++) {
                if (!conf->multipaths[i].operational)
                        continue;
                printk("(checking disk %d)\n",i);
                dev = conf->multipaths[i].dev;
                set_blocksize(dev, 4096);
                if ((bh = bread(dev, row / 4, 4096)) == NULL)
                        break;
                if (!buffer) {
                        buffer = (char *) __get_free_page(GFP_KERNEL);
                        if (!buffer)
                                break;
                        memcpy(buffer, bh->b_data, 4096);
                } else if (memcmp(buffer, bh->b_data, 4096)) {
                        rc = 1;
                        break;
                }
                bforget(bh);
                fsync_dev(dev);
                invalidate_buffers(dev);
                bh = NULL;
        }
        if (buffer)
                free_page((unsigned long) buffer);
        if (bh) {
                dev = bh->b_dev;
                bforget(bh);
                fsync_dev(dev);
                invalidate_buffers(dev);
        }
        return rc;
}

static int check_consistency (mddev_t *mddev)
{
        if (__check_consistency(mddev, 0))
/*
 * we do not do this currently, as it's perfectly possible to
 * have an inconsistent array when it's freshly created. Only
 * newly written data has to be consistent.
 */
                return 0;

        return 0;
}

#define INVALID_LEVEL KERN_WARNING \
"multipath: md%d: raid level not set to multipath IO (%d)\n"

#define NO_SB KERN_ERR \
"multipath: disabled IO path %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"multipath: disabled IO path %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"multipath: making IO path %s a spare path (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"multipath: disabled IO path %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"multipath: disabled IO path %s (multipath %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"multipath: device %s operational as IO path %d\n"

#define MEM_ERROR KERN_ERR \
"multipath: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"multipath: spare IO path %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"multipath: no operational IO paths for md%d\n"

#define SB_DIFFERENCES KERN_ERR \
"multipath: detected IO path differences!\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"

#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"

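/*
 * multipath_run() builds the private multipath_conf_t from the
 * already-verified rdevs, picks one operational path (moving it
 * into descriptor slot 0), pre-allocates NR_RESERVED_BUFS request
 * structures and starts the multipathd recovery thread.
 */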
static int multipath_run (mddev_t *mddev)
{
        multipath_conf_t *conf;
        int i, j, disk_idx;
        struct multipath_info *disk, *disk2;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *desc, *desc2;
        mdk_rdev_t *rdev, *def_rdev = NULL;
        struct md_list_head *tmp;
        int num_rdevs = 0;

        MOD_INC_USE_COUNT;

        if (sb->level != -4) {
                printk(INVALID_LEVEL, mdidx(mddev), sb->level);
                goto out;
        }
        /*
         * copy the already verified devices into our private MULTIPATH
         * bookkeeping area. [whatever we allocate in multipath_run(),
         * should be freed in multipath_stop()]
         */

        conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out;
        }
        memset(conf, 0, sizeof(*conf));

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->faulty) {
                        /* this is a "should never happen" case and if it */
                        /* ever does happen, a continue; won't help */
                        printk(ERRORS, partition_name(rdev->dev));
                        continue;
                } else {
                        /* this is a "should never happen" case and if it */
                        /* ever does happen, a continue; won't help */
                        if (!rdev->sb) {
                                MD_BUG();
                                continue;
                        }
                }
                if (rdev->desc_nr == -1) {
                        MD_BUG();
                        continue;
                }

                desc = &sb->disks[rdev->desc_nr];
                disk_idx = desc->raid_disk;
                disk = conf->multipaths + disk_idx;

                if (!disk_sync(desc))
                        printk(NOT_IN_SYNC, partition_name(rdev->dev));

                /*
                 * Mark all disks as spare to start with, then pick our
                 * active disk.  If we have a disk that is marked active
                 * in the sb, then use it, else use the first rdev.
                 */
                disk->number = desc->number;
                disk->raid_disk = desc->raid_disk;
                disk->dev = rdev->dev;
                disk->operational = 0;
                disk->spare = 1;
                disk->used_slot = 1;
                mark_disk_sync(desc);

                if (disk_active(desc)) {
                        if(!conf->working_disks) {
                                printk(OPERATIONAL, partition_name(rdev->dev),
                                        desc->raid_disk);
                                disk->operational = 1;
                                disk->spare = 0;
                                conf->working_disks++;
                                def_rdev = rdev;
                        } else {
                                mark_disk_spare(desc);
                        }
                } else
                        mark_disk_spare(desc);

                if(!num_rdevs++) def_rdev = rdev;
        }
        if(!conf->working_disks && num_rdevs) {
                desc = &sb->disks[def_rdev->desc_nr];
                disk = conf->multipaths + desc->raid_disk;
                printk(OPERATIONAL, partition_name(def_rdev->dev),
                        disk->raid_disk);
                disk->operational = 1;
                disk->spare = 0;
                conf->working_disks++;
                mark_disk_active(desc);
        }
        /*
         * Make sure our active path is in desc spot 0
         */
        if(def_rdev->desc_nr != 0) {
                rdev = find_rdev_nr(mddev, 0);
                desc = &sb->disks[def_rdev->desc_nr];
                desc2 = sb->disks;
                disk = conf->multipaths + desc->raid_disk;
                disk2 = conf->multipaths + desc2->raid_disk;
                xchg_values(*desc2,*desc);
                xchg_values(*disk2,*disk);
                xchg_values(desc2->number, desc->number);
                xchg_values(disk2->number, disk->number);
                xchg_values(desc2->raid_disk, desc->raid_disk);
                xchg_values(disk2->raid_disk, disk->raid_disk);
                if(rdev) {
                        xchg_values(def_rdev->desc_nr,rdev->desc_nr);
                } else {
                        def_rdev->desc_nr = 0;
                }
        }
        conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
        conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
        sb->failed_disks = 0;
        sb->spare_disks = num_rdevs - 1;
        mddev->sb_dirty = 1;
        conf->mddev = mddev;
        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;

        init_waitqueue_head(&conf->wait_buffer);

        if (!conf->working_disks) {
                printk(NONE_OPERATIONAL, mdidx(mddev));
                goto out_free_conf;
        }


        /* pre-allocate some buffer_head structures.
         * As a minimum, 1 mpbh and raid_disks buffer_heads
         * would probably get us by in tight memory situations,
         * but a few more is probably a good idea.
         * For now, try NR_RESERVED_BUFS mpbh and
         * NR_RESERVED_BUFS*raid_disks buffer_heads.
         * This will allow at least NR_RESERVED_BUFS concurrent
         * reads or writes even if kmalloc starts failing.
         */
        if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out_free_conf;
        }

        if ((sb->state & (1 << MD_SB_CLEAN))) {
                /*
                 * we do sanity checks even if the device says
                 * it's clean ...
                 */
                if (check_consistency(mddev)) {
                        printk(SB_DIFFERENCES);
                        sb->state &= ~(1 << MD_SB_CLEAN);
                }
        }

        {
                const char * name = "multipathd";

                conf->thread = md_register_thread(multipathd, conf, name);
                if (!conf->thread) {
                        printk(THREAD_ERROR, mdidx(mddev));
                        goto out_free_conf;
                }
        }

        /*
         * Regenerate the "device is in sync with the raid set" bit for
         * each device.
         */
        for (i = 0; i < MD_SB_DISKS; i++) {
                mark_disk_nonsync(sb->disks+i);
                for (j = 0; j < sb->raid_disks; j++) {
                        if (sb->disks[i].number == conf->multipaths[j].number)
                                mark_disk_sync(sb->disks+i);
                }
        }

        printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
                        sb->raid_disks, sb->spare_disks);
        /*
         * Ok, everything is just fine now
         */
        return 0;

out_free_conf:
        multipath_shrink_mpbh(conf);
        kfree(conf);
        mddev->private = NULL;
out:
        MOD_DEC_USE_COUNT;
        return -EIO;
}

#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef SB_DIFFERENCES
#undef ARRAY_IS_ACTIVE

static int multipath_stop (mddev_t *mddev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);

        md_unregister_thread(conf->thread);
        multipath_shrink_mpbh(conf);
        kfree(conf);
        mddev->private = NULL;
        MOD_DEC_USE_COUNT;
        return 0;
}

static mdk_personality_t multipath_personality=
{
        name:           "multipath",
        make_request:   multipath_make_request,
        run:            multipath_run,
        stop:           multipath_stop,
        status:         multipath_status,
        error_handler:  multipath_error,
        diskop:         multipath_diskop,
};

static int md__init multipath_init (void)
{
        return register_md_personality (MULTIPATH, &multipath_personality);
}

static void multipath_exit (void)
{
        unregister_md_personality (MULTIPATH);
}

module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
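
/*
 * Userspace usage sketch (illustrative, not part of this driver):
 * once the personality is registered, a multipath set is typically
 * assembled from user space, e.g. with a "raid-level multipath"
 * entry in /etc/raidtab for raidtools, or with mdadm:
 *
 *      mdadm --create /dev/md0 --level=multipath --raid-devices=2 \
 *              /dev/sdc1 /dev/sdd1
 *
 * Device names above are examples only.
 */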