linux-2.4: drivers/md/multipath.c
/*
 * multipath.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * MULTIPATH management functions.
 *
 * derived from raid1.c.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/multipath.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define NR_RESERVED_BUFS        32
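
/*
 * NR_RESERVED_BUFS is the size of the pre-allocated pool of
 * struct multipath_bh kept on conf->freer1; it is filled by
 * multipath_grow_mpbh() at array start-up and drained/refilled by
 * multipath_alloc_mpbh()/multipath_free_mpbh() below.
 */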


/*
 * The following can be used to debug the driver
 */
#define MULTIPATH_DEBUG 0

#if MULTIPATH_DEBUG
#define PRINTK(x...)   printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)  do { } while (0)
#endif


static mdk_personality_t multipath_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;

static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);


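/*
 * Allocate a multipath_bh, preferring the pre-allocated pool on
 * conf->freer1; fall back to kmalloc(GFP_NOIO), and as a last
 * resort sleep on conf->wait_buffer until enough pool entries
 * have been returned via multipath_free_mpbh().
 */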
static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
        struct multipath_bh *mp_bh = NULL;

        do {
                md_spin_lock_irq(&conf->device_lock);
                if (!conf->freer1_blocked && conf->freer1) {
                        mp_bh = conf->freer1;
                        conf->freer1 = mp_bh->next_mp;
                        conf->freer1_cnt--;
                        mp_bh->next_mp = NULL;
                        mp_bh->state = (1 << MPBH_PreAlloc);
                        mp_bh->bh_req.b_state = 0;
                }
                md_spin_unlock_irq(&conf->device_lock);
                if (mp_bh)
                        return mp_bh;
                mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
                                        GFP_NOIO);
                if (mp_bh) {
                        memset(mp_bh, 0, sizeof(*mp_bh));
                        return mp_bh;
                }
                conf->freer1_blocked = 1;
                wait_disk_event(conf->wait_buffer,
                                !conf->freer1_blocked ||
                                conf->freer1_cnt > NR_RESERVED_BUFS/2
                    );
                conf->freer1_blocked = 0;
        } while (1);
}

static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
{
        multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);

        if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
                unsigned long flags;
                spin_lock_irqsave(&conf->device_lock, flags);
                mp_bh->next_mp = conf->freer1;
                conf->freer1 = mp_bh;
                conf->freer1_cnt++;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                wake_up(&conf->wait_buffer);
        } else {
                kfree(mp_bh);
        }
}

static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
{
        int i = 0;

        while (i < cnt) {
                struct multipath_bh *mp_bh;
                mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
                if (!mp_bh)
                        break;
                memset(mp_bh, 0, sizeof(*mp_bh));
                set_bit(MPBH_PreAlloc, &mp_bh->state);
                mp_bh->mddev = conf->mddev;

                multipath_free_mpbh(mp_bh);
                i++;
        }
        return i;
}

static void multipath_shrink_mpbh(multipath_conf_t *conf)
{
        md_spin_lock_irq(&conf->device_lock);
        while (conf->freer1) {
                struct multipath_bh *mp_bh = conf->freer1;
                conf->freer1 = mp_bh->next_mp;
                conf->freer1_cnt--;
                kfree(mp_bh);
        }
        md_spin_unlock_irq(&conf->device_lock);
}


static int multipath_map (mddev_t *mddev, kdev_t *rdev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int i, disks = MD_SB_DISKS;

        /*
         * Later we will do read balancing on the read side;
         * for now we use the first operational disk.
         */

        for (i = 0; i < disks; i++) {
                if (conf->multipaths[i].operational) {
                        *rdev = conf->multipaths[i].dev;
                        return (0);
                }
        }

        printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
        return (-1);
}

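/*
 * Queue a failed request on the global retry list and wake the
 * multipathd thread, which will remap it to a working path and
 * resubmit it (see multipathd() below).
 */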
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
        unsigned long flags;
        mddev_t *mddev = mp_bh->mddev;
        multipath_conf_t *conf = mddev_to_conf(mddev);

        md_spin_lock_irqsave(&retry_list_lock, flags);
        if (multipath_retry_list == NULL)
                multipath_retry_tail = &multipath_retry_list;
        *multipath_retry_tail = mp_bh;
        multipath_retry_tail = &mp_bh->next_mp;
        mp_bh->next_mp = NULL;
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
        md_wakeup_thread(conf->thread);
}


/*
 * multipath_end_bh_io() is called when we have finished servicing a multipathed
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
{
        struct buffer_head *bh = mp_bh->master_bh;

        bh->b_end_io(bh, uptodate);
        multipath_free_mpbh(mp_bh);
}

void multipath_end_request (struct buffer_head *bh, int uptodate)
{
        struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);

        /*
         * this branch is our 'one multipath IO has finished' event handler:
         */
        if (!uptodate)
                md_error (mp_bh->mddev, bh->b_dev);
        else
                /*
                 * Set MPBH_Uptodate in our master buffer_head, so that
                 * we will return a good error code to the higher
                 * levels even if IO on some other multipathed buffer fails.
                 *
                 * The 'master' represents the complex operation to
                 * user-side. So if something waits for IO, then it will
                 * wait for the 'master' buffer_head.
                 */
                set_bit (MPBH_Uptodate, &mp_bh->state);

        if (uptodate) {
                multipath_end_bh_io(mp_bh, uptodate);
                return;
        }
        /*
         * oops, IO error:
         */
        printk(KERN_ERR "multipath: %s: rescheduling block %lu\n",
                 partition_name(bh->b_dev), bh->b_blocknr);
        multipath_reschedule_retry(mp_bh);
        return;
}

/*
 * This routine returns the disk from which the requested read should
 * be done.
 */
static int multipath_read_balance (multipath_conf_t *conf)
{
        int disk;

        for (disk = 0; disk < conf->raid_disks; disk++)
                if (conf->multipaths[disk].operational)
                        return disk;
        BUG();
        return 0;
}

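/*
 * Request path: clone the master buffer_head into mp_bh->bh_req,
 * redirect it to the path chosen by multipath_read_balance() and
 * hand it to generic_make_request(); completion comes back through
 * multipath_end_request() above.
 */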
static int multipath_make_request (mddev_t *mddev, int rw,
                               struct buffer_head * bh)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct buffer_head *bh_req;
        struct multipath_bh * mp_bh;
        struct multipath_info *multipath;

        if (!buffer_locked(bh))
                BUG();

/*
 * make_request() can abort the operation when READA is being
 * used and no empty request is available.
 *
 * Currently, just replace the command with READ/WRITE.
 */
        if (rw == READA)
                rw = READ;

        mp_bh = multipath_alloc_mpbh (conf);

        mp_bh->master_bh = bh;
        mp_bh->mddev = mddev;
        mp_bh->cmd = rw;

        /*
         * read balancing logic:
         */
        multipath = conf->multipaths + multipath_read_balance(conf);

        bh_req = &mp_bh->bh_req;
        memcpy(bh_req, bh, sizeof(*bh));
        bh_req->b_blocknr = bh->b_rsector;
        bh_req->b_dev = multipath->dev;
        bh_req->b_rdev = multipath->dev;
/*      bh_req->b_rsector = bh->n_rsector; */
        bh_req->b_end_io = multipath_end_request;
        bh_req->b_private = mp_bh;
        generic_make_request (rw, bh_req);
        return 0;
}

static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int i;

        seq_printf (seq, " [%d/%d] [", conf->raid_disks,
                                                 conf->working_disks);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf (seq, "%s",
                        conf->multipaths[i].operational ? "U" : "_");
        seq_printf (seq, "]");
}

#define LAST_DISK KERN_ALERT \
"multipath: only one IO path left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"multipath: no spare IO path left!\n"

#define DISK_FAILED KERN_ALERT \
"multipath: IO failure on %s, disabling IO path.\n" \
"       Operation continuing on %d IO paths.\n"

static void mark_disk_bad (mddev_t *mddev, int failed)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct multipath_info *multipath = conf->multipaths+failed;
        mdp_super_t *sb = mddev->sb;

        multipath->operational = 0;
        mark_disk_faulty(sb->disks+multipath->number);
        mark_disk_nonsync(sb->disks+multipath->number);
        mark_disk_inactive(sb->disks+multipath->number);
        sb->active_disks--;
        sb->working_disks--;
        sb->failed_disks++;
        mddev->sb_dirty = 1;
        md_wakeup_thread(conf->thread);
        conf->working_disks--;
        printk (DISK_FAILED, partition_name (multipath->dev),
                                 conf->working_disks);
}

/*
 * Careful, this can execute in IRQ contexts as well!
 */
static int multipath_error (mddev_t *mddev, kdev_t dev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct multipath_info * multipaths = conf->multipaths;
        int disks = MD_SB_DISKS;
        int other_paths = 1;
        int i;

        if (conf->working_disks == 1) {
                other_paths = 0;
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].spare) {
                                other_paths = 1;
                                break;
                        }
                }
        }

        if (!other_paths) {
                /*
                 * Uh oh, we can do nothing if this is our last path, but
                 * first check if this is a queued request for a device
                 * which has just failed.
                 */
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].dev==dev && !multipaths[i].operational)
                                return 0;
                }
                printk (LAST_DISK);
        } else {
                /*
                 * Mark disk as unusable
                 */
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].dev==dev && multipaths[i].operational) {
                                mark_disk_bad(mddev, i);
                                break;
                        }
                }
                if (!conf->working_disks) {
                        int err = 1;
                        mdp_disk_t *spare;
                        mdp_super_t *sb = mddev->sb;

                        spare = get_spare(mddev);
                        if (spare) {
                                err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
                                printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
                        }
                        if (!err && !disk_faulty(spare)) {
                                multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
                                mark_disk_sync(spare);
                                mark_disk_active(spare);
                                sb->active_disks++;
                                sb->spare_disks--;
                        }
                }
        }
        return 0;
}

#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED


static void print_multipath_conf (multipath_conf_t *conf)
{
        int i;
        struct multipath_info *tmp;

        printk("MULTIPATH conf printout:\n");
        if (!conf) {
                printk("(conf==NULL)\n");
                return;
        }
        printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
                         conf->raid_disks, conf->nr_disks);

        for (i = 0; i < MD_SB_DISKS; i++) {
                tmp = conf->multipaths + i;
                if (tmp->spare || tmp->operational || tmp->number ||
                                tmp->raid_disk || tmp->used_slot)
                        printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
                                i, tmp->spare,tmp->operational,
                                tmp->number,tmp->raid_disk,tmp->used_slot,
                                partition_name(tmp->dev));
        }
}

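/*
 * Disk operations (spare activation/deactivation, hot add/remove)
 * arrive from the md core with a disk descriptor and a DISKOP_*
 * state.  The first switch below only locates the affected slot(s);
 * the second one carries out the operation.  Both run under
 * conf->device_lock.
 */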
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
        int err = 0;
        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
        multipath_conf_t *conf = mddev->private;
        struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
        mdk_rdev_t *spare_rdev, *failed_rdev;

        print_multipath_conf(conf);
        md_spin_lock_irq(&conf->device_lock);
        /*
         * find the disk ...
         */
        switch (state) {

        case DISKOP_SPARE_ACTIVE:

                /*
                 * Find the failed disk within the MULTIPATH configuration ...
                 * (this can only be in the first conf->working_disks part)
                 */
                for (i = 0; i < conf->raid_disks; i++) {
                        tmp = conf->multipaths + i;
                        if ((!tmp->operational && !tmp->spare) ||
                                        !tmp->used_slot) {
                                failed_disk = i;
                                break;
                        }
                }
                /*
                 * When we activate a spare disk we _must_ have a disk in
                 * the lower (active) part of the array to replace.
                 */
                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                /* fall through */

        case DISKOP_SPARE_WRITE:
        case DISKOP_SPARE_INACTIVE:

                /*
                 * Find the spare disk ... (can only be in the 'high'
                 * area of the array)
                 */
                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (tmp->spare && tmp->number == (*d)->number) {
                                spare_disk = i;
                                break;
                        }
                }
                if (spare_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_REMOVE_DISK:

                for (i = 0; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
                                if (tmp->operational) {
                                        printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number);
                                        err = -EBUSY;
                                        goto abort;
                                }
                                removed_disk = i;
                                break;
                        }
                }
                if (removed_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_ADD_DISK:

                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (!tmp->used_slot) {
                                added_disk = i;
                                break;
                        }
                }
                if (added_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;
        }

        switch (state) {
        /*
         * Switch the spare disk to write-only mode:
         */
        case DISKOP_SPARE_WRITE:
                sdisk = conf->multipaths + spare_disk;
                sdisk->operational = 1;
                break;
        /*
         * Deactivate a spare disk:
         */
        case DISKOP_SPARE_INACTIVE:
                sdisk = conf->multipaths + spare_disk;
                sdisk->operational = 0;
                break;
        /*
         * Activate (mark read-write) the (now sync) spare disk,
         * which means we switch its 'raid position' (->raid_disk)
         * with the failed disk. (only the first 'conf->nr_disks'
         * slots are used for 'real' disks and we must preserve this
         * property)
         */
        case DISKOP_SPARE_ACTIVE:
                sdisk = conf->multipaths + spare_disk;
                fdisk = conf->multipaths + failed_disk;

                spare_desc = &sb->disks[sdisk->number];
                failed_desc = &sb->disks[fdisk->number];

                if (spare_desc != *d) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (spare_desc->raid_disk != sdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (sdisk->raid_disk != spare_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (failed_desc->raid_disk != fdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (fdisk->raid_disk != failed_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                /*
                 * do the switch finally
                 */
                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
                failed_rdev = find_rdev_nr(mddev, failed_desc->number);
                xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
                spare_rdev->alias_device = 0;
                failed_rdev->alias_device = 1;

                xchg_values(*spare_desc, *failed_desc);
                xchg_values(*fdisk, *sdisk);

                /*
                 * (careful, 'failed' and 'spare' are switched from now on)
                 *
                 * we want to preserve linear numbering and we want to
                 * give the proper raid_disk number to the now activated
                 * disk. (this means we switch back these values)
                 */

                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
                xchg_values(spare_desc->number, failed_desc->number);
                xchg_values(sdisk->number, fdisk->number);

                *d = failed_desc;

                if (sdisk->dev == MKDEV(0,0))
                        sdisk->used_slot = 0;
                /*
                 * this really activates the spare.
                 */
                fdisk->spare = 0;

                /*
                 * if we activate a spare, we definitely replace a
                 * non-operational disk slot in the 'low' area of
                 * the disk array.
                 */

                conf->working_disks++;

                break;

        case DISKOP_HOT_REMOVE_DISK:
                rdisk = conf->multipaths + removed_disk;

                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                rdisk->dev = MKDEV(0,0);
                rdisk->used_slot = 0;
                conf->nr_disks--;
                break;

        case DISKOP_HOT_ADD_DISK:
                adisk = conf->multipaths + added_disk;
                added_desc = *d;

                if (added_disk != added_desc->number) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                adisk->number = added_desc->number;
                adisk->raid_disk = added_desc->raid_disk;
                adisk->dev = MKDEV(added_desc->major,added_desc->minor);

                adisk->operational = 0;
                adisk->spare = 1;
                adisk->used_slot = 1;
                conf->nr_disks++;

                break;

        default:
                MD_BUG();
                err = 1;
                goto abort;
        }
abort:
        md_spin_unlock_irq(&conf->device_lock);

        print_multipath_conf(conf);
        return err;
}


#define IO_ERROR KERN_ALERT \
"multipath: %s: unrecoverable IO read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"multipath: %s: redirecting sector %lu to another IO path\n"

/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working multipaths.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */

static void multipathd (void *data)
{
        struct multipath_bh *mp_bh;
        struct buffer_head *bh;
        unsigned long flags;
        mddev_t *mddev;
        kdev_t dev;


        for (;;) {
                md_spin_lock_irqsave(&retry_list_lock, flags);
                mp_bh = multipath_retry_list;
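                /*
                 * NB: on an empty list we leave the loop with
                 * retry_list_lock still held; the unlock below the
                 * loop drops it.
                 */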
                if (!mp_bh)
                        break;
                multipath_retry_list = mp_bh->next_mp;
                md_spin_unlock_irqrestore(&retry_list_lock, flags);

                mddev = mp_bh->mddev;
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
                bh = &mp_bh->bh_req;
                dev = bh->b_dev;

                multipath_map (mddev, &bh->b_dev);
                if (bh->b_dev == dev) {
                        printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
                        multipath_end_bh_io(mp_bh, 0);
                } else {
                        printk (REDIRECT_SECTOR,
                                partition_name(bh->b_dev), bh->b_blocknr);
                        bh->b_rdev = bh->b_dev;
                        bh->b_rsector = bh->b_blocknr;
                        generic_make_request (mp_bh->cmd, bh);
                }
        }
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR

/*
 * This will catch the scenario in which one of the multipaths was
 * mounted as a normal device rather than as a part of a raid set.
 *
 * check_consistency is very personality-dependent, e.g. RAID5 cannot
 * do this check, it uses another method.
 */
static int __check_consistency (mddev_t *mddev, int row)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int disks = MD_SB_DISKS;
        kdev_t dev;
        struct buffer_head *bh = NULL;
        int i, rc = 0;
        char *buffer = NULL;

        for (i = 0; i < disks; i++) {
                if (!conf->multipaths[i].operational)
                        continue;
                printk("(checking disk %d)\n",i);
                dev = conf->multipaths[i].dev;
                set_blocksize(dev, 4096);
                if ((bh = bread(dev, row / 4, 4096)) == NULL)
                        break;
                if (!buffer) {
                        buffer = (char *) __get_free_page(GFP_KERNEL);
                        if (!buffer)
                                break;
                        memcpy(buffer, bh->b_data, 4096);
                } else if (memcmp(buffer, bh->b_data, 4096)) {
                        rc = 1;
                        break;
                }
                bforget(bh);
                fsync_dev(dev);
                invalidate_buffers(dev);
                bh = NULL;
        }
        if (buffer)
                free_page((unsigned long) buffer);
        if (bh) {
                dev = bh->b_dev;
                bforget(bh);
                fsync_dev(dev);
                invalidate_buffers(dev);
        }
        return rc;
}

static int check_consistency (mddev_t *mddev)
{
        if (__check_consistency(mddev, 0))
/*
 * we do not do this currently, as it's perfectly possible to
 * have an inconsistent array when it's freshly created. Only
 * newly written data has to be consistent.
 */
                return 0;

        return 0;
}

#define INVALID_LEVEL KERN_WARNING \
"multipath: md%d: raid level not set to multipath IO (%d)\n"

#define NO_SB KERN_ERR \
"multipath: disabled IO path %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"multipath: disabled IO path %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"multipath: making IO path %s a spare path (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"multipath: disabled IO path %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"multipath: disabled IO path %s (multipath %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"multipath: device %s operational as IO path %d\n"

#define MEM_ERROR KERN_ERR \
"multipath: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"multipath: spare IO path %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"multipath: no operational IO paths for md%d\n"

#define SB_DIFFERENCES KERN_ERR \
"multipath: detected IO path differences!\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"

#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"

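/*
 * multipath_run() builds the private multipath_conf_t from the
 * already-verified rdevs, picks one operational path (moving it
 * into descriptor slot 0), pre-allocates NR_RESERVED_BUFS request
 * structures and starts the multipathd recovery thread.
 */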
static int multipath_run (mddev_t *mddev)
{
        multipath_conf_t *conf;
        int i, j, disk_idx;
        struct multipath_info *disk, *disk2;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *desc, *desc2;
        mdk_rdev_t *rdev, *def_rdev = NULL;
        struct md_list_head *tmp;
        int num_rdevs = 0;

        MOD_INC_USE_COUNT;

        if (sb->level != -4) {
                printk(INVALID_LEVEL, mdidx(mddev), sb->level);
                goto out;
        }
        /*
         * copy the already verified devices into our private MULTIPATH
         * bookkeeping area. [whatever we allocate in multipath_run(),
         * should be freed in multipath_stop()]
         */

        conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out;
        }
        memset(conf, 0, sizeof(*conf));

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->faulty) {
                        /* this is a "should never happen" case and if it */
                        /* ever does happen, a continue; won't help */
                        printk(ERRORS, partition_name(rdev->dev));
                        continue;
                } else {
                        /* this is a "should never happen" case and if it */
                        /* ever does happen, a continue; won't help */
                        if (!rdev->sb) {
                                MD_BUG();
                                continue;
                        }
                }
                if (rdev->desc_nr == -1) {
                        MD_BUG();
                        continue;
                }

                desc = &sb->disks[rdev->desc_nr];
                disk_idx = desc->raid_disk;
                disk = conf->multipaths + disk_idx;

                if (!disk_sync(desc))
                        printk(NOT_IN_SYNC, partition_name(rdev->dev));

                /*
                 * Mark all disks as spare to start with, then pick our
                 * active disk.  If we have a disk that is marked active
                 * in the sb, then use it, else use the first rdev.
                 */
                disk->number = desc->number;
                disk->raid_disk = desc->raid_disk;
                disk->dev = rdev->dev;
                disk->operational = 0;
                disk->spare = 1;
                disk->used_slot = 1;
                mark_disk_sync(desc);

                if (disk_active(desc)) {
                        if(!conf->working_disks) {
                                printk(OPERATIONAL, partition_name(rdev->dev),
                                        desc->raid_disk);
                                disk->operational = 1;
                                disk->spare = 0;
                                conf->working_disks++;
                                def_rdev = rdev;
                        } else {
                                mark_disk_spare(desc);
                        }
                } else
                        mark_disk_spare(desc);

                if(!num_rdevs++) def_rdev = rdev;
        }
        if(!conf->working_disks && num_rdevs) {
                desc = &sb->disks[def_rdev->desc_nr];
                disk = conf->multipaths + desc->raid_disk;
                printk(OPERATIONAL, partition_name(def_rdev->dev),
                        disk->raid_disk);
                disk->operational = 1;
                disk->spare = 0;
                conf->working_disks++;
                mark_disk_active(desc);
        }
        /*
         * Make sure our active path is in desc spot 0
         */
        if(def_rdev->desc_nr != 0) {
                rdev = find_rdev_nr(mddev, 0);
                desc = &sb->disks[def_rdev->desc_nr];
                desc2 = sb->disks;
                disk = conf->multipaths + desc->raid_disk;
                disk2 = conf->multipaths + desc2->raid_disk;
                xchg_values(*desc2,*desc);
                xchg_values(*disk2,*disk);
                xchg_values(desc2->number, desc->number);
                xchg_values(disk2->number, disk->number);
                xchg_values(desc2->raid_disk, desc->raid_disk);
                xchg_values(disk2->raid_disk, disk->raid_disk);
                if(rdev) {
                        xchg_values(def_rdev->desc_nr,rdev->desc_nr);
                } else {
                        def_rdev->desc_nr = 0;
                }
        }
        conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
        conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
        sb->failed_disks = 0;
        sb->spare_disks = num_rdevs - 1;
        mddev->sb_dirty = 1;
        conf->mddev = mddev;
        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;

        init_waitqueue_head(&conf->wait_buffer);

        if (!conf->working_disks) {
                printk(NONE_OPERATIONAL, mdidx(mddev));
                goto out_free_conf;
        }


        /* pre-allocate some buffer_head structures.
         * As a minimum, 1 mpbh and raid_disks buffer_heads
         * would probably get us by in tight memory situations,
         * but a few more is probably a good idea.
         * For now, try NR_RESERVED_BUFS mpbh and
         * NR_RESERVED_BUFS*raid_disks buffer_heads.
         * This will allow at least NR_RESERVED_BUFS concurrent
         * reads or writes even if kmalloc starts failing.
         */
        if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out_free_conf;
        }

        if ((sb->state & (1 << MD_SB_CLEAN))) {
                /*
                 * we do sanity checks even if the device says
                 * it's clean ...
                 */
                if (check_consistency(mddev)) {
                        printk(SB_DIFFERENCES);
                        sb->state &= ~(1 << MD_SB_CLEAN);
                }
        }

        {
                const char * name = "multipathd";

                conf->thread = md_register_thread(multipathd, conf, name);
                if (!conf->thread) {
                        printk(THREAD_ERROR, mdidx(mddev));
                        goto out_free_conf;
                }
        }

        /*
         * Regenerate the "device is in sync with the raid set" bit for
         * each device.
         */
        for (i = 0; i < MD_SB_DISKS; i++) {
                mark_disk_nonsync(sb->disks+i);
                for (j = 0; j < sb->raid_disks; j++) {
                        if (sb->disks[i].number == conf->multipaths[j].number)
                                mark_disk_sync(sb->disks+i);
                }
        }

        printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
                        sb->raid_disks, sb->spare_disks);
        /*
         * Ok, everything is just fine now
         */
        return 0;

out_free_conf:
        multipath_shrink_mpbh(conf);
        kfree(conf);
        mddev->private = NULL;
out:
        MOD_DEC_USE_COUNT;
        return -EIO;
}

#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef SB_DIFFERENCES
#undef ARRAY_IS_ACTIVE

static int multipath_stop (mddev_t *mddev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);

        md_unregister_thread(conf->thread);
        multipath_shrink_mpbh(conf);
        kfree(conf);
        mddev->private = NULL;
        MOD_DEC_USE_COUNT;
        return 0;
}

static mdk_personality_t multipath_personality=
{
        name:           "multipath",
        make_request:   multipath_make_request,
        run:            multipath_run,
        stop:           multipath_stop,
        status:         multipath_status,
        error_handler:  multipath_error,
        diskop:         multipath_diskop,
};

static int md__init multipath_init (void)
{
        return register_md_personality (MULTIPATH, &multipath_personality);
}

static void multipath_exit (void)
{
        unregister_md_personality (MULTIPATH);
}

module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
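
/*
 * Userspace usage sketch (illustrative, not part of this driver):
 * once the personality is registered, a multipath set is typically
 * assembled from user space, e.g. with a "raid-level multipath"
 * entry in /etc/raidtab for raidtools, or with mdadm:
 *
 *      mdadm --create /dev/md0 --level=multipath --raid-devices=2 \
 *              /dev/sdc1 /dev/sdd1
 *
 * Device names above are examples only.
 */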