drivers/md/md.c (linux-2.4)
1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
11    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12    - kmod support by: Cyrus Durgin
13    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
14    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
15
16    - lots of fixes and improvements to the RAID1/RAID5 and generic
17      RAID code (such as request based resynchronization):
18
19      Neil Brown <neilb@cse.unsw.edu.au>.
20
21    This program is free software; you can redistribute it and/or modify
22    it under the terms of the GNU General Public License as published by
23    the Free Software Foundation; either version 2, or (at your option)
24    any later version.
25
26    You should have received a copy of the GNU General Public License
27    (for example /usr/src/linux/COPYING); if not, write to the Free
28    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 */
30
31 #include <linux/module.h>
32 #include <linux/config.h>
33 #include <linux/raid/md.h>
34 #include <linux/sysctl.h>
35 #include <linux/raid/xor.h>
36 #include <linux/devfs_fs_kernel.h>
37
38 #include <linux/init.h>
39
40 #ifdef CONFIG_KMOD
41 #include <linux/kmod.h>
42 #endif
43
44 #define __KERNEL_SYSCALLS__
45 #include <linux/unistd.h>
46
47 #include <asm/unaligned.h>
48
49 #define MAJOR_NR MD_MAJOR
50 #define MD_DRIVER
51
52 #include <linux/blk.h>
53
54 #define DEBUG 0
55 #if DEBUG
56 # define dprintk(x...) printk(x)
57 #else
58 # define dprintk(x...) do { } while(0)
59 #endif
60
61 #ifndef MODULE
62 static void autostart_arrays (void);
63 #endif
64
65 static mdk_personality_t *pers[MAX_PERSONALITY];
66
67 /*
68  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
69  * is 100 KB/sec, so the extra system load does not show up that much.
70  * Increase it if you want to have more _guaranteed_ speed. Note that
71  * the RAID driver will use the maximum available bandwidth if the IO
72  * subsystem is idle. There is also an 'absolute maximum' reconstruction
73  * speed limit - in case reconstruction slows down your system despite
74  * idle IO detection.
75  *
76  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
77  */
78
79 static int sysctl_speed_limit_min = 100;
80 static int sysctl_speed_limit_max = 100000;
81
82 static struct ctl_table_header *raid_table_header;
83
84 static ctl_table raid_table[] = {
85         {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
86          &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
87         {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
88          &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
89         {0}
90 };
91
92 static ctl_table raid_dir_table[] = {
93         {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
94         {0}
95 };
96
97 static ctl_table raid_root_table[] = {
98         {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
99         {0}
100 };
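/*
 * The three-level table chain above hangs speed_limit_min and
 * speed_limit_max under /proc/sys/dev/raid/ once raid_root_table is
 * registered.  A minimal user-space sketch of tuning a limit through
 * that interface; the helper name is illustrative only:
 */
#if 0
#include <stdio.h>

static int raid_set_limit(const char *knob, int kb_per_sec)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/dev/raid/%s", knob);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d\n", kb_per_sec);
        return fclose(f);   /* e.g. raid_set_limit("speed_limit_min", 1000) */
}
#endif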
101
102 /*
103  * these have to be allocated separately because external
104  * subsystems want to have a pre-defined structure
105  */
106 struct hd_struct md_hd_struct[MAX_MD_DEVS];
107 static int md_blocksizes[MAX_MD_DEVS];
108 static int md_hardsect_sizes[MAX_MD_DEVS];
109 static int md_maxreadahead[MAX_MD_DEVS];
110 static mdk_thread_t *md_recovery_thread;
111
112 int md_size[MAX_MD_DEVS];
113
114 static struct block_device_operations md_fops;
115 static devfs_handle_t devfs_handle;
116
117 static struct gendisk md_gendisk=
118 {
119         major: MD_MAJOR,
120         major_name: "md",
121         minor_shift: 0,
122         max_p: 1,
123         part: md_hd_struct,
124         sizes: md_size,
125         nr_real: MAX_MD_DEVS,
126         real_devices: NULL,
127         next: NULL,
128         fops: &md_fops,
129 };
130
131 /*
132  * Enables iteration over all existing md arrays
133  */
134 static MD_LIST_HEAD(all_mddevs);
135
136 /*
137  * The mapping between kdev and mddev is not necessarily a simple
138  * one! E.g. HSM uses several sub-devices to implement Logical
139  * Volumes. All these sub-devices map to the same mddev.
140  */
141 dev_mapping_t mddev_map[MAX_MD_DEVS];
142
143 void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
144 {
145         unsigned int minor = MINOR(dev);
146
147         if (MAJOR(dev) != MD_MAJOR) {
148                 MD_BUG();
149                 return;
150         }
151         if (mddev_map[minor].mddev) {
152                 MD_BUG();
153                 return;
154         }
155         mddev_map[minor].mddev = mddev;
156         mddev_map[minor].data = data;
157 }
158
159 void del_mddev_mapping(mddev_t * mddev, kdev_t dev)
160 {
161         unsigned int minor = MINOR(dev);
162
163         if (MAJOR(dev) != MD_MAJOR) {
164                 MD_BUG();
165                 return;
166         }
167         if (mddev_map[minor].mddev != mddev) {
168                 MD_BUG();
169                 return;
170         }
171         mddev_map[minor].mddev = NULL;
172         mddev_map[minor].data = NULL;
173 }
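/*
 * Lookup is the mirror image of the two helpers above.  A sketch of
 * what the kdev_to_mddev() helper used by md_make_request() below
 * amounts to, assuming the same minor-indexed mddev_map table (the
 * real helper lives in the raid headers):
 */
#if 0
static inline mddev_t *kdev_to_mddev_sketch(kdev_t dev)
{
        if (MAJOR(dev) != MD_MAJOR || MINOR(dev) >= MAX_MD_DEVS)
                return NULL;
        return mddev_map[MINOR(dev)].mddev;
}
#endif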
174
175 static int md_make_request(request_queue_t *q, int rw, struct buffer_head * bh)
176 {
177         mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
178
179         if (mddev && mddev->pers)
180                 return mddev->pers->make_request(mddev, rw, bh);
181         else {
182                 buffer_IO_error(bh);
183                 return 0;
184         }
185 }
186
187 static mddev_t * alloc_mddev(kdev_t dev)
188 {
189         mddev_t *mddev;
190
191         if (MAJOR(dev) != MD_MAJOR) {
192                 MD_BUG();
193                 return 0;
194         }
195         mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
196         if (!mddev)
197                 return NULL;
198
199         memset(mddev, 0, sizeof(*mddev));
200
201         mddev->__minor = MINOR(dev);
202         init_MUTEX(&mddev->reconfig_sem);
203         init_MUTEX(&mddev->recovery_sem);
204         init_MUTEX(&mddev->resync_sem);
205         MD_INIT_LIST_HEAD(&mddev->disks);
206         MD_INIT_LIST_HEAD(&mddev->all_mddevs);
207         atomic_set(&mddev->active, 0);
208
209         /*
210          * The 'base' mddev is the one with data NULL.
211          * personalities can create additional mddevs
212          * if necessary.
213          */
214         add_mddev_mapping(mddev, dev, 0);
215         md_list_add(&mddev->all_mddevs, &all_mddevs);
216
217         MOD_INC_USE_COUNT;
218
219         return mddev;
220 }
221
222 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
223 {
224         mdk_rdev_t * rdev;
225         struct md_list_head *tmp;
226
227         ITERATE_RDEV(mddev,rdev,tmp) {
228                 if (rdev->desc_nr == nr)
229                         return rdev;
230         }
231         return NULL;
232 }
233
234 mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
235 {
236         struct md_list_head *tmp;
237         mdk_rdev_t *rdev;
238
239         ITERATE_RDEV(mddev,rdev,tmp) {
240                 if (rdev->dev == dev)
241                         return rdev;
242         }
243         return NULL;
244 }
245
246 static MD_LIST_HEAD(device_names);
247
248 char * partition_name(kdev_t dev)
249 {
250         struct gendisk *hd;
251         static char nomem [] = "<nomem>";
252         dev_name_t *dname;
253         struct md_list_head *tmp = device_names.next;
254
255         while (tmp != &device_names) {
256                 dname = md_list_entry(tmp, dev_name_t, list);
257                 if (dname->dev == dev)
258                         return dname->name;
259                 tmp = tmp->next;
260         }
261
262         dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
263
264         if (!dname)
265                 return nomem;
266         /*
267          * ok, add this new device name to the list
268          */
269         hd = get_gendisk (dev);
270         dname->name = NULL;
271         if (hd)
272                 dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
273         if (!dname->name) {
274                 sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
275                 dname->name = dname->namebuf;
276         }
277
278         dname->dev = dev;
279         MD_INIT_LIST_HEAD(&dname->list);
280         md_list_add(&dname->list, &device_names);
281
282         return dname->name;
283 }
284
285 static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
286                                                 int persistent)
287 {
288         unsigned int size = 0;
289
290         if (blk_size[MAJOR(dev)])
291                 size = blk_size[MAJOR(dev)][MINOR(dev)];
292         if (persistent)
293                 size = MD_NEW_SIZE_BLOCKS(size);
294         return size;
295 }
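/*
 * Worked example, assuming the usual 64K tail reservation from the
 * raid headers (MD_RESERVED_BLOCKS == 64 with 1K blocks): for a
 * 1000100-block device, MD_NEW_SIZE_BLOCKS() first rounds down to a
 * 64-block boundary (1000064) and then steps back one reservation,
 * so the superblock lives at offset 1000000 blocks from the start.
 */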
296
297 static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
298 {
299         unsigned int size;
300
301         size = calc_dev_sboffset(dev, mddev, persistent);
302         if (!mddev->sb) {
303                 MD_BUG();
304                 return size;
305         }
306         if (mddev->sb->chunk_size)
307                 size &= ~(mddev->sb->chunk_size/1024 - 1);
308         return size;
309 }
310
311 static unsigned int zoned_raid_size(mddev_t *mddev)
312 {
313         unsigned int mask;
314         mdk_rdev_t * rdev;
315         struct md_list_head *tmp;
316
317         if (!mddev->sb) {
318                 MD_BUG();
319                 return -EINVAL;
320         }
321         /*
322          * do size and offset calculations.
323          */
324         mask = ~(mddev->sb->chunk_size/1024 - 1);
325
326         ITERATE_RDEV(mddev,rdev,tmp) {
327                 rdev->size &= mask;
328                 md_size[mdidx(mddev)] += rdev->size;
329         }
330         return 0;
331 }
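/*
 * E.g. with chunk_size == 65536 the mask above is ~63, so a member of
 * 100030 blocks contributes 99968: zones are built from whole chunks
 * and the clipped tail is never addressed.
 */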
332
333 /*
334  * We check whether all devices are numbered from 0 to nb_dev-1. The
335  * order is guaranteed even after device name changes.
336  *
337  * Some personalities (raid0, linear) use this. Personalities that
338  * provide data have to be able to deal with loss of individual
339  * disks, so they do their checking themselves.
340  */
341 int md_check_ordering(mddev_t *mddev)
342 {
343         int i, c;
344         mdk_rdev_t *rdev;
345         struct md_list_head *tmp;
346
347         /*
348          * First, all devices must be fully functional
349          */
350         ITERATE_RDEV(mddev,rdev,tmp) {
351                 if (rdev->faulty) {
352                         printk(KERN_ERR "md: md%d's device %s faulty, aborting.\n",
353                                mdidx(mddev), partition_name(rdev->dev));
354                         goto abort;
355                 }
356         }
357
358         c = 0;
359         ITERATE_RDEV(mddev,rdev,tmp) {
360                 c++;
361         }
362         if (c != mddev->nb_dev) {
363                 MD_BUG();
364                 goto abort;
365         }
366         if (mddev->nb_dev != mddev->sb->raid_disks) {
367                 printk(KERN_ERR "md: md%d, array needs %d disks, has %d, aborting.\n",
368                         mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
369                 goto abort;
370         }
371         /*
372          * Now the numbering check
373          */
374         for (i = 0; i < mddev->nb_dev; i++) {
375                 c = 0;
376                 ITERATE_RDEV(mddev,rdev,tmp) {
377                         if (rdev->desc_nr == i)
378                                 c++;
379                 }
380                 if (!c) {
381                         printk(KERN_ERR "md: md%d, missing disk #%d, aborting.\n",
382                                mdidx(mddev), i);
383                         goto abort;
384                 }
385                 if (c > 1) {
386                         printk(KERN_ERR "md: md%d, too many disks #%d, aborting.\n",
387                                mdidx(mddev), i);
388                         goto abort;
389                 }
390         }
391         return 0;
392 abort:
393         return 1;
394 }
395
396 static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
397 {
398         if (disk_active(disk)) {
399                 sb->working_disks--;
400         } else {
401                 if (disk_spare(disk)) {
402                         sb->spare_disks--;
403                         sb->working_disks--;
404                 } else  {
405                         sb->failed_disks--;
406                 }
407         }
408         sb->nr_disks--;
409         disk->major = 0;
410         disk->minor = 0;
411         mark_disk_removed(disk);
412 }
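/*
 * Note the counting convention above: a spare is carried in both
 * spare_disks and working_disks (hence the double decrement), while a
 * failed disk is accounted for only in failed_disks.
 */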
413
414 #define BAD_MAGIC KERN_ERR \
415 "md: invalid raid superblock magic on %s\n"
416
417 #define BAD_MINOR KERN_ERR \
418 "md: %s: invalid raid minor (%x)\n"
419
420 #define OUT_OF_MEM KERN_ALERT \
421 "md: out of memory.\n"
422
423 #define NO_SB KERN_ERR \
424 "md: disabled device %s, could not read superblock.\n"
425
426 #define BAD_CSUM KERN_WARNING \
427 "md: invalid superblock checksum on %s\n"
428
429 static int alloc_array_sb(mddev_t * mddev)
430 {
431         if (mddev->sb) {
432                 MD_BUG();
433                 return 0;
434         }
435
436         mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
437         if (!mddev->sb)
438                 return -ENOMEM;
439         md_clear_page(mddev->sb);
440         return 0;
441 }
442
443 static int alloc_disk_sb(mdk_rdev_t * rdev)
444 {
445         if (rdev->sb)
446                 MD_BUG();
447
448         rdev->sb_page = alloc_page(GFP_KERNEL);
449         if (!rdev->sb_page) {
450                 printk(OUT_OF_MEM);
451                 return -EINVAL;
452         }
453         rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
454
455         return 0;
456 }
457
458 static void free_disk_sb(mdk_rdev_t * rdev)
459 {
460         if (rdev->sb_page) {
461                 page_cache_release(rdev->sb_page);
462                 rdev->sb = NULL;
463                 rdev->sb_page = NULL;
464                 rdev->sb_offset = 0;
465                 rdev->size = 0;
466         } else {
467                 if (!rdev->faulty)
468                         MD_BUG();
469         }
470 }
471
472
473 static void bh_complete(struct buffer_head *bh, int uptodate)
474 {
475
476         if (uptodate)
477                 set_bit(BH_Uptodate, &bh->b_state);
478
479         complete((struct completion*)bh->b_private);
480 }
481
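/*
 * Synchronous page I/O on a stack-allocated buffer_head: the request
 * is handed to the block layer, tq_disk is run so it actually reaches
 * the driver, and the caller sleeps on a completion until
 * bh_complete() above signals it.  'sector' is in 512-byte units,
 * which is why callers shift the 1K-block sb_offset left by one.
 */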
482 static int sync_page_io(kdev_t dev, unsigned long sector, int size,
483                         struct page *page, int rw)
484 {
485         struct buffer_head bh;
486         struct completion event;
487
488         init_completion(&event);
489         init_buffer(&bh, bh_complete, &event);
490         bh.b_rdev = dev;
491         bh.b_rsector = sector;
492         bh.b_state      = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
493         bh.b_size = size;
494         bh.b_page = page;
495         bh.b_reqnext = NULL;
496         bh.b_data = page_address(page);
497         generic_make_request(rw, &bh);
498
499         run_task_queue(&tq_disk);
500         wait_for_completion(&event);
501
502         return test_bit(BH_Uptodate, &bh.b_state);
503 }
504
505 static int read_disk_sb(mdk_rdev_t * rdev)
506 {
507         int ret = -EINVAL;
508         kdev_t dev = rdev->dev;
509         unsigned long sb_offset;
510
511         if (!rdev->sb) {
512                 MD_BUG();
513                 goto abort;
514         }
515
516         /*
517          * Calculate the position of the superblock,
518          * it's at the end of the disk
519          */
520         sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
521         rdev->sb_offset = sb_offset;
522
523         if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
524                 printk(NO_SB,partition_name(dev));
525                 return -EINVAL;
526         }
527         printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
528         ret = 0;
529 abort:
530         return ret;
531 }
532
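/*
 * The checksum is defined over the superblock with sb_csum itself
 * treated as zero: the on-disk value is parked in disk_csum while
 * csum_partial() runs over all MD_SB_BYTES, then restored.
 */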
533 static unsigned int calc_sb_csum(mdp_super_t * sb)
534 {
535         unsigned int disk_csum, csum;
536
537         disk_csum = sb->sb_csum;
538         sb->sb_csum = 0;
539         csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
540         sb->sb_csum = disk_csum;
541         return csum;
542 }
543
544 /*
545  * Check one RAID superblock for generic plausibility
546  */
547
548 static int check_disk_sb(mdk_rdev_t * rdev)
549 {
550         mdp_super_t *sb;
551         int ret = -EINVAL;
552
553         sb = rdev->sb;
554         if (!sb) {
555                 MD_BUG();
556                 goto abort;
557         }
558
559         if (sb->md_magic != MD_SB_MAGIC) {
560                 printk(BAD_MAGIC, partition_name(rdev->dev));
561                 goto abort;
562         }
563
564         if (sb->md_minor >= MAX_MD_DEVS) {
565                 printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
566                 goto abort;
567         }
568
569         if (calc_sb_csum(sb) != sb->sb_csum) {
570                 printk(BAD_CSUM, partition_name(rdev->dev));
571                 goto abort;
572         }
573         ret = 0;
574 abort:
575         return ret;
576 }
577
578 static kdev_t dev_unit(kdev_t dev)
579 {
580         unsigned int mask;
581         struct gendisk *hd = get_gendisk(dev);
582
583         if (!hd)
584                 return 0;
585         mask = ~((1 << hd->minor_shift) - 1);
586
587         return MKDEV(MAJOR(dev), MINOR(dev) & mask);
588 }
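/*
 * Example, assuming the stock IDE gendisk with minor_shift == 6:
 * hda1 (MKDEV(3,1)) and hda2 (MKDEV(3,2)) both mask down to
 * MKDEV(3,0), so match_dev_unit() below reports them as living on
 * the same physical disk.
 */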
589
590 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
591 {
592         struct md_list_head *tmp;
593         mdk_rdev_t *rdev;
594
595         ITERATE_RDEV(mddev,rdev,tmp)
596                 if (dev_unit(rdev->dev) == dev_unit(dev))
597                         return rdev;
598
599         return NULL;
600 }
601
602 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
603 {
604         struct md_list_head *tmp;
605         mdk_rdev_t *rdev;
606
607         ITERATE_RDEV(mddev1,rdev,tmp)
608                 if (match_dev_unit(mddev2, rdev->dev))
609                         return 1;
610
611         return 0;
612 }
613
614 static MD_LIST_HEAD(all_raid_disks);
615 static MD_LIST_HEAD(pending_raid_disks);
616
617 static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
618 {
619         mdk_rdev_t *same_pdev;
620
621         if (rdev->mddev) {
622                 MD_BUG();
623                 return;
624         }
625         same_pdev = match_dev_unit(mddev, rdev->dev);
626         if (same_pdev)
627                 printk( KERN_WARNING
628 "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
629 "     protection against single-disk failure might be compromised.\n",
630                         mdidx(mddev), partition_name(rdev->dev),
631                                 partition_name(same_pdev->dev));
632
633         md_list_add(&rdev->same_set, &mddev->disks);
634         rdev->mddev = mddev;
635         mddev->nb_dev++;
636         printk(KERN_INFO "md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
637 }
638
639 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
640 {
641         if (!rdev->mddev) {
642                 MD_BUG();
643                 return;
644         }
645         md_list_del(&rdev->same_set);
646         MD_INIT_LIST_HEAD(&rdev->same_set);
647         rdev->mddev->nb_dev--;
648         printk(KERN_INFO "md: unbind<%s,%d>\n", partition_name(rdev->dev),
649                                                  rdev->mddev->nb_dev);
650         rdev->mddev = NULL;
651 }
652
653 /*
654  * prevent the device from being mounted, repartitioned or
655  * otherwise reused by a RAID array (or any other kernel
656  * subsystem), by opening the device. [simply getting an
657  * inode is not enough, the SCSI module usage code needs
658  * an explicit open() on the device]
659  */
660 static int lock_rdev(mdk_rdev_t *rdev)
661 {
662         int err = 0;
663         struct block_device *bdev;
664
665         bdev = bdget(rdev->dev);
666         if (!bdev)
667                 return -ENOMEM;
668         err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
669         if (!err)
670                 rdev->bdev = bdev;
671         return err;
672 }
673
674 static void unlock_rdev(mdk_rdev_t *rdev)
675 {
676         struct block_device *bdev = rdev->bdev;
677         rdev->bdev = NULL;
678         if (!bdev)
679                 MD_BUG();
680         blkdev_put(bdev, BDEV_RAW);
681 }
682
683 void md_autodetect_dev(kdev_t dev);
684
685 static void export_rdev(mdk_rdev_t * rdev)
686 {
687         printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
688         if (rdev->mddev)
689                 MD_BUG();
690         unlock_rdev(rdev);
691         free_disk_sb(rdev);
692         md_list_del(&rdev->all);
693         MD_INIT_LIST_HEAD(&rdev->all);
694         if (rdev->pending.next != &rdev->pending) {
695                 printk(KERN_INFO "md: (%s was pending)\n",
696                         partition_name(rdev->dev));
697                 md_list_del(&rdev->pending);
698                 MD_INIT_LIST_HEAD(&rdev->pending);
699         }
700 #ifndef MODULE
701         md_autodetect_dev(rdev->dev);
702 #endif
703         rdev->dev = 0;
704         rdev->faulty = 0;
705         kfree(rdev);
706 }
707
708 static void kick_rdev_from_array(mdk_rdev_t * rdev)
709 {
710         unbind_rdev_from_array(rdev);
711         export_rdev(rdev);
712 }
713
714 static void export_array(mddev_t *mddev)
715 {
716         struct md_list_head *tmp;
717         mdk_rdev_t *rdev;
718         mdp_super_t *sb = mddev->sb;
719
720         if (mddev->sb) {
721                 mddev->sb = NULL;
722                 free_page((unsigned long) sb);
723         }
724
725         ITERATE_RDEV(mddev,rdev,tmp) {
726                 if (!rdev->mddev) {
727                         MD_BUG();
728                         continue;
729                 }
730                 kick_rdev_from_array(rdev);
731         }
732         if (mddev->nb_dev)
733                 MD_BUG();
734 }
735
736 static void free_mddev(mddev_t *mddev)
737 {
738         if (!mddev) {
739                 MD_BUG();
740                 return;
741         }
742
743         export_array(mddev);
744         md_size[mdidx(mddev)] = 0;
745         md_hd_struct[mdidx(mddev)].nr_sects = 0;
746
747         /*
748          * Make sure nobody else is using this mddev
749          * (careful, we rely on the global kernel lock here)
750          */
751         while (sem_getcount(&mddev->resync_sem) != 1)
752                 schedule();
753         while (sem_getcount(&mddev->recovery_sem) != 1)
754                 schedule();
755
756         del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
757         md_list_del(&mddev->all_mddevs);
758         MD_INIT_LIST_HEAD(&mddev->all_mddevs);
759         kfree(mddev);
760         MOD_DEC_USE_COUNT;
761 }
762
763 #undef BAD_CSUM
764 #undef BAD_MAGIC
765 #undef OUT_OF_MEM
766 #undef NO_SB
767
768 static void print_desc(mdp_disk_t *desc)
769 {
770         printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
771                 partition_name(MKDEV(desc->major,desc->minor)),
772                 desc->major,desc->minor,desc->raid_disk,desc->state);
773 }
774
775 static void print_sb(mdp_super_t *sb)
776 {
777         int i;
778
779         printk(KERN_INFO "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
780                 sb->major_version, sb->minor_version, sb->patch_version,
781                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
782                 sb->ctime);
783         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
784                 sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
785                 sb->layout, sb->chunk_size);
786         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
787                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
788                 sb->failed_disks, sb->spare_disks,
789                 sb->sb_csum, (unsigned long)sb->events_lo);
790
791         printk(KERN_INFO);
792         for (i = 0; i < MD_SB_DISKS; i++) {
793                 mdp_disk_t *desc;
794
795                 desc = sb->disks + i;
796                 if (desc->number || desc->major || desc->minor ||
797                     desc->raid_disk || (desc->state && (desc->state != 4))) {
798                         printk("     D %2d: ", i);
799                         print_desc(desc);
800                 }
801         }
802         printk(KERN_INFO "md:     THIS: ");
803         print_desc(&sb->this_disk);
804
805 }
806
807 static void print_rdev(mdk_rdev_t *rdev)
808 {
809         printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
810                 partition_name(rdev->dev), partition_name(rdev->old_dev),
811                 rdev->size, rdev->faulty, rdev->desc_nr);
812         if (rdev->sb) {
813                 printk(KERN_INFO "md: rdev superblock:\n");
814                 print_sb(rdev->sb);
815         } else
816                 printk(KERN_INFO "md: no rdev superblock!\n");
817 }
818
819 void md_print_devices(void)
820 {
821         struct md_list_head *tmp, *tmp2;
822         mdk_rdev_t *rdev;
823         mddev_t *mddev;
824
825         printk("\n");
826         printk("md:     **********************************\n");
827         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
828         printk("md:     **********************************\n");
829         ITERATE_MDDEV(mddev,tmp) {
830                 printk("md%d: ", mdidx(mddev));
831
832                 ITERATE_RDEV(mddev,rdev,tmp2)
833                         printk("<%s>", partition_name(rdev->dev));
834
835                 if (mddev->sb) {
836                         printk(" array superblock:\n");
837                         print_sb(mddev->sb);
838                 } else
839                         printk(" no array superblock.\n");
840
841                 ITERATE_RDEV(mddev,rdev,tmp2)
842                         print_rdev(rdev);
843         }
844         printk("md:     **********************************\n");
845         printk("\n");
846 }
847
848 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
849 {
850         int ret;
851         mdp_super_t *tmp1, *tmp2;
852
853         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
854         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
855
856         if (!tmp1 || !tmp2) {
857                 ret = 0;
858                 printk(KERN_ERR "md.c: out of memory in sb_equal()\n");
859                 goto abort;
860         }
861
862         *tmp1 = *sb1;
863         *tmp2 = *sb2;
864
865         /*
866          * nr_disks is not constant
867          */
868         tmp1->nr_disks = 0;
869         tmp2->nr_disks = 0;
870
871         if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
872                 ret = 0;
873         else
874                 ret = 1;
875
876 abort:
877         if (tmp1)
878                 kfree(tmp1);
879         if (tmp2)
880                 kfree(tmp2);
881
882         return ret;
883 }
884
885 static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
886 {
887         if (    (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
888                 (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
889                 (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
890                 (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
891
892                 return 1;
893
894         return 0;
895 }
896
897 static mdk_rdev_t * find_rdev_all(kdev_t dev)
898 {
899         struct md_list_head *tmp;
900         mdk_rdev_t *rdev;
901
902         tmp = all_raid_disks.next;
903         while (tmp != &all_raid_disks) {
904                 rdev = md_list_entry(tmp, mdk_rdev_t, all);
905                 if (rdev->dev == dev)
906                         return rdev;
907                 tmp = tmp->next;
908         }
909         return NULL;
910 }
911
912 #define GETBLK_FAILED KERN_ERR \
913 "md: getblk failed for device %s\n"
914
915 static int write_disk_sb(mdk_rdev_t * rdev)
916 {
917         kdev_t dev;
918         unsigned long sb_offset, size;
919
920         if (!rdev->sb) {
921                 MD_BUG();
922                 return 1;
923         }
924         if (rdev->faulty) {
925                 MD_BUG();
926                 return 1;
927         }
928         if (rdev->sb->md_magic != MD_SB_MAGIC) {
929                 MD_BUG();
930                 return 1;
931         }
932
933         dev = rdev->dev;
934         sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
935         if (rdev->sb_offset != sb_offset) {
936                 printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
937                        partition_name(dev), rdev->sb_offset, sb_offset);
938                 goto skip;
939         }
940         /*
941          * If the disk went offline meanwhile and it's just a spare, then
942          * its size has changed to zero silently, and the MD code does
943          * not yet know that it's faulty.
944          */
945         size = calc_dev_size(dev, rdev->mddev, 1);
946         if (size != rdev->size) {
947                 printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
948                        partition_name(dev), rdev->size, size);
949                 goto skip;
950         }
951
952         printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
953
954         if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
955                 printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
956                 return 1;
957         }
958 skip:
959         return 0;
960 }
961 #undef GETBLK_FAILED
962
963 static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
964 {
965         int i, ok = 0;
966         mdp_disk_t *desc;
967
968         for (i = 0; i < MD_SB_DISKS; i++) {
969                 desc = mddev->sb->disks + i;
970 #if 0
971                 if (disk_faulty(desc)) {
972                         if (MKDEV(desc->major,desc->minor) == rdev->dev)
973                                 ok = 1;
974                         continue;
975                 }
976 #endif
977                 if (MKDEV(desc->major,desc->minor) == rdev->dev) {
978                         rdev->sb->this_disk = *desc;
979                         rdev->desc_nr = desc->number;
980                         ok = 1;
981                         break;
982                 }
983         }
984
985         if (!ok) {
986                 MD_BUG();
987         }
988 }
989
990 static int sync_sbs(mddev_t * mddev)
991 {
992         mdk_rdev_t *rdev;
993         mdp_super_t *sb;
994         struct md_list_head *tmp;
995
996         ITERATE_RDEV(mddev,rdev,tmp) {
997                 if (rdev->faulty || rdev->alias_device)
998                         continue;
999                 sb = rdev->sb;
1000                 *sb = *mddev->sb;
1001                 set_this_disk(mddev, rdev);
1002                 sb->sb_csum = calc_sb_csum(sb);
1003         }
1004         return 0;
1005 }
1006
1007 int md_update_sb(mddev_t * mddev)
1008 {
1009         int err, count = 100;
1010         struct md_list_head *tmp;
1011         mdk_rdev_t *rdev;
1012
1013         if (!mddev->sb_dirty) {
1014                 printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
1015                 return 0;
1016         }
1017         mddev->sb_dirty = 0;
1018 repeat:
1019         mddev->sb->utime = CURRENT_TIME;
1020         if ((++mddev->sb->events_lo)==0)
1021                 ++mddev->sb->events_hi;
1022
1023         if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
1024                 /*
1025                  * oops, this 64-bit counter should never wrap.
1026                  * Either we are in around ~1 trillion A.C., assuming
1027                  * 1 reboot per second, or we have a bug:
1028                  */
1029                 MD_BUG();
1030                 mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
1031         }
1032         sync_sbs(mddev);
1033
1034         /*
1035          * do not write anything to disk if using
1036          * nonpersistent superblocks
1037          */
1038         if (mddev->sb->not_persistent)
1039                 return 0;
1040
1041         printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
1042                                         mdidx(mddev));
1043
1044         err = 0;
1045         ITERATE_RDEV(mddev,rdev,tmp) {
1046                 printk(KERN_INFO "md: ");
1047                 if (rdev->faulty)
1048                         printk("(skipping faulty ");
1049                 if (rdev->alias_device)
1050                         printk("(skipping alias ");
1051                 if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
1052                         printk("(skipping new-faulty %s )\n",
1053                                partition_name(rdev->dev));
1054                         continue;
1055                 }
1056                 printk("%s ", partition_name(rdev->dev));
1057                 if (!rdev->faulty && !rdev->alias_device) {
1058                         printk("[events: %08lx]",
1059                                 (unsigned long)rdev->sb->events_lo);
1060                         err += write_disk_sb(rdev);
1061                 } else
1062                         printk(")\n");
1063         }
1064         if (err) {
1065                 if (--count) {
1066                         printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
1067                         goto repeat;
1068                 }
1069                 printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
1070         }
1071         return 0;
1072 }
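/*
 * events_lo/events_hi behave as a single 64-bit generation counter,
 * as the wrap check above implies.  A sketch of the md_event() helper
 * used by analyze_sbs() below, assuming it simply glues the two
 * halves together (the real macro lives in the raid headers):
 */
#if 0
static inline __u64 md_event_sketch(mdp_super_t *sb)
{
        return ((__u64) sb->events_hi << 32) | sb->events_lo;
}
#endif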
1073
1074 /*
1075  * Import a device. If 'on_disk', then sanity check the superblock
1076  *
1077  * mark the device faulty if:
1078  *
1079  *   - the device is nonexistent (zero size)
1080  *   - the device has no valid superblock
1081  *
1082  */
1083 static int md_import_device(kdev_t newdev, int on_disk)
1084 {
1085         int err;
1086         mdk_rdev_t *rdev;
1087         unsigned int size;
1088
1089         if (find_rdev_all(newdev))
1090                 return -EEXIST;
1091
1092         rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1093         if (!rdev) {
1094                 printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
1095                 return -ENOMEM;
1096         }
1097         memset(rdev, 0, sizeof(*rdev));
1098
1099         if (is_mounted(newdev)) {
1100                 printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
1101                         partition_name(newdev));
1102                 err = -EBUSY;
1103                 goto abort_free;
1104         }
1105
1106         if ((err = alloc_disk_sb(rdev)))
1107                 goto abort_free;
1108
1109         rdev->dev = newdev;
1110         if (lock_rdev(rdev)) {
1111                 printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
1112                         partition_name(newdev));
1113                 err = -EINVAL;
1114                 goto abort_free;
1115         }
1116         rdev->desc_nr = -1;
1117         rdev->faulty = 0;
1118
1119         size = 0;
1120         if (blk_size[MAJOR(newdev)])
1121                 size = blk_size[MAJOR(newdev)][MINOR(newdev)];
1122         if (!size) {
1123                 printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
1124                                 partition_name(newdev));
1125                 err = -EINVAL;
1126                 goto abort_free;
1127         }
1128
1129         if (on_disk) {
1130                 if ((err = read_disk_sb(rdev))) {
1131                         printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
1132                                partition_name(newdev));
1133                         goto abort_free;
1134                 }
1135                 if ((err = check_disk_sb(rdev))) {
1136                         printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
1137                                partition_name(newdev));
1138                         goto abort_free;
1139                 }
1140
1141                 if (rdev->sb->level != -4) {
1142                         rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
1143                                                 rdev->sb->this_disk.minor);
1144                         rdev->desc_nr = rdev->sb->this_disk.number;
1145                 } else {
1146                         rdev->old_dev = MKDEV(0, 0);
1147                         rdev->desc_nr = -1;
1148                 }
1149         }
1150         md_list_add(&rdev->all, &all_raid_disks);
1151         MD_INIT_LIST_HEAD(&rdev->pending);
1152
1153         return 0;
1154
1155 abort_free:
1156         if (rdev->sb) {
1157                 if (rdev->bdev)
1158                         unlock_rdev(rdev);
1159                 free_disk_sb(rdev);
1160         }
1161         kfree(rdev);
1162         return err;
1163 }
1164
1165 /*
1166  * Check a full RAID array for plausibility
1167  */
1168
1169 #define INCONSISTENT KERN_ERR \
1170 "md: fatal superblock inconsistency in %s -- removing from array\n"
1171
1172 #define OUT_OF_DATE KERN_ERR \
1173 "md: superblock update time inconsistency -- using the most recent one\n"
1174
1175 #define OLD_VERSION KERN_ALERT \
1176 "md: md%d: unsupported raid array version %d.%d.%d\n"
1177
1178 #define NOT_CLEAN_IGNORE KERN_ERR \
1179 "md: md%d: raid array is not clean -- starting background reconstruction\n"
1180
1181 #define UNKNOWN_LEVEL KERN_ERR \
1182 "md: md%d: unsupported raid level %d\n"
1183
1184 static int analyze_sbs(mddev_t * mddev)
1185 {
1186         int out_of_date = 0, i, first;
1187         struct md_list_head *tmp, *tmp2;
1188         mdk_rdev_t *rdev, *rdev2, *freshest;
1189         mdp_super_t *sb;
1190
1191         /*
1192          * Verify the RAID superblock on each real device
1193          */
1194         ITERATE_RDEV(mddev,rdev,tmp) {
1195                 if (rdev->faulty) {
1196                         MD_BUG();
1197                         goto abort;
1198                 }
1199                 if (!rdev->sb) {
1200                         MD_BUG();
1201                         goto abort;
1202                 }
1203                 if (check_disk_sb(rdev))
1204                         goto abort;
1205         }
1206
1207         /*
1208          * The superblock constant part has to be the same
1209          * for all disks in the array.
1210          */
1211         sb = NULL;
1212
1213         ITERATE_RDEV(mddev,rdev,tmp) {
1214                 if (!sb) {
1215                         sb = rdev->sb;
1216                         continue;
1217                 }
1218                 if (!sb_equal(sb, rdev->sb)) {
1219                         printk(INCONSISTENT, partition_name(rdev->dev));
1220                         kick_rdev_from_array(rdev);
1221                         continue;
1222                 }
1223         }
1224
1225         /*
1226          * OK, we have all disks and the array is ready to run. Let's
1227          * find the freshest superblock, that one will be the superblock
1228          * that represents the whole array.
1229          */
1230         if (!mddev->sb)
1231                 if (alloc_array_sb(mddev))
1232                         goto abort;
1233         sb = mddev->sb;
1234         freshest = NULL;
1235
1236         ITERATE_RDEV(mddev,rdev,tmp) {
1237                 __u64 ev1, ev2;
1238                 /*
1239                  * if the checksum is invalid, use the superblock
1240                  * only as a last resort. (decrease its age by
1241                  * one event)
1242                  */
1243                 if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
1244                         if (rdev->sb->events_lo || rdev->sb->events_hi)
1245                                 if ((rdev->sb->events_lo--)==0)
1246                                         rdev->sb->events_hi--;
1247                 }
1248
1249                 printk(KERN_INFO "md: %s's event counter: %08lx\n",
1250                        partition_name(rdev->dev),
1251                         (unsigned long)rdev->sb->events_lo);
1252                 if (!freshest) {
1253                         freshest = rdev;
1254                         continue;
1255                 }
1256                 /*
1257                  * Find the newest superblock version
1258                  */
1259                 ev1 = md_event(rdev->sb);
1260                 ev2 = md_event(freshest->sb);
1261                 if (ev1 != ev2) {
1262                         out_of_date = 1;
1263                         if (ev1 > ev2)
1264                                 freshest = rdev;
1265                 }
1266         }
1267         if (out_of_date) {
1268                 printk(OUT_OF_DATE);
1269                 printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
1270         }
1271         memcpy (sb, freshest->sb, sizeof(*sb));
1272
1273         /*
1274          * For multipathing, lots of things are different from "true"
1275          * RAIDs.
1276          * All rdev's could be read, so they are no longer faulty.
1277          * As there is just one sb, trying to find changed devices via the
1278          * this_disk pointer is useless too.
1279          *
1280          * lmb@suse.de, 2002-09-12
1281          */
1282
1283         if (sb->level == -4) {
1284                 int desc_nr = 0;
1285
1286                 /* ... and initialize from the current rdevs instead */
1287                 ITERATE_RDEV(mddev,rdev,tmp) {
1288                         mdp_disk_t *desc;
1289
1290                         rdev->desc_nr=desc_nr;
1291
1292                         desc = &sb->disks[rdev->desc_nr];
1293
1294                         desc->number = desc_nr;
1295                         desc->major = MAJOR(rdev->dev);
1296                         desc->minor = MINOR(rdev->dev);
1297                         desc->raid_disk = desc_nr;
1298
1299                         /* We could read from it, so it isn't faulty
1300                          * any longer */
1301                         if (disk_faulty(desc))
1302                                 mark_disk_spare(desc);
1303
1304                         memcpy(&rdev->sb->this_disk,desc,sizeof(*desc));
1305
1306                         desc_nr++;
1307                 }
1308
1309                 /* Kick out all old info about disks we used to have,
1310                  * if any */
1311                 for (i = desc_nr; i < MD_SB_DISKS; i++)
1312                         memset(&(sb->disks[i]),0,sizeof(mdp_disk_t));
1313         } else {
1314                 /*
1315                  * at this point we have picked the 'best' superblock
1316                  * from all available superblocks.
1317                  * now we validate this superblock and kick out possibly
1318                  * failed disks.
1319                  */
1320                 ITERATE_RDEV(mddev,rdev,tmp) {
1321                         /*
1322                          * Kick all non-fresh devices
1323                          */
1324                         __u64 ev1, ev2;
1325                         ev1 = md_event(rdev->sb);
1326                         ev2 = md_event(sb);
1327                         ++ev1;
1328                         if (ev1 < ev2) {
1329                                 printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
1330                                                         partition_name(rdev->dev));
1331                                 kick_rdev_from_array(rdev);
1332                                 continue;
1333                         }
1334                 }
1335
1336                 /*
1337                  * Fix up changed device names ... but only if this disk has a
1338                  * recent update time. Use faulty checksum ones too.
1339                  */
1340                 ITERATE_RDEV(mddev,rdev,tmp) {
1341                         __u64 ev1, ev2, ev3;
1342                         if (rdev->faulty || rdev->alias_device) {
1343                                 MD_BUG();
1344                                 goto abort;
1345                         }
1346                         ev1 = md_event(rdev->sb);
1347                         ev2 = md_event(sb);
1348                         ev3 = ev2;
1349                         --ev3;
1350                         if ((rdev->dev != rdev->old_dev) &&
1351                                 ((ev1 == ev2) || (ev1 == ev3))) {
1352                                 mdp_disk_t *desc;
1353
1354                                 printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
1355                                        partition_name(rdev->old_dev), partition_name(rdev->dev));
1356                                 if (rdev->desc_nr == -1) {
1357                                         MD_BUG();
1358                                         goto abort;
1359                                 }
1360                                 desc = &sb->disks[rdev->desc_nr];
1361                                 if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
1362                                         MD_BUG();
1363                                         goto abort;
1364                                 }
1365                                 desc->major = MAJOR(rdev->dev);
1366                                 desc->minor = MINOR(rdev->dev);
1367                                 desc = &rdev->sb->this_disk;
1368                                 desc->major = MAJOR(rdev->dev);
1369                                 desc->minor = MINOR(rdev->dev);
1370                         }
1371                 }
1372
1373                 /*
1374                  * Remove unavailable and faulty devices ...
1375                  *
1376                  * note that if an array becomes completely unrunnable due to
1377                  * missing devices, we do not write the superblock back, so the
1378                  * administrator has a chance to fix things up. The removal thus
1379                  * only happens if it's nonfatal to the contents of the array.
1380                  */
1381                 for (i = 0; i < MD_SB_DISKS; i++) {
1382                         int found;
1383                         mdp_disk_t *desc;
1384                         kdev_t dev;
1385
1386                         desc = sb->disks + i;
1387                         dev = MKDEV(desc->major, desc->minor);
1388
1389                         /*
1390                          * We kick faulty devices/descriptors immediately.
1391                          */
1392                         if (disk_faulty(desc)) {
1393                                 found = 0;
1394                                 ITERATE_RDEV(mddev,rdev,tmp) {
1395                                         if (rdev->desc_nr != desc->number)
1396                                                 continue;
1397                                         printk(KERN_WARNING "md%d: kicking faulty %s!\n",
1398                                                 mdidx(mddev),partition_name(rdev->dev));
1399                                         kick_rdev_from_array(rdev);
1400                                         found = 1;
1401                                         break;
1402                                 }
1403                                 if (!found) {
1404                                         if (dev == MKDEV(0,0))
1405                                                 continue;
1406                                         printk(KERN_WARNING "md%d: removing former faulty %s!\n",
1407                                                 mdidx(mddev), partition_name(dev));
1408                                 }
1409                                 remove_descriptor(desc, sb);
1410                                 continue;
1411                         }
1412
1413                         if (dev == MKDEV(0,0))
1414                                 continue;
1415                         /*
1416                          * Is this device present in the rdev ring?
1417                          */
1418                         found = 0;
1419                         ITERATE_RDEV(mddev,rdev,tmp) {
1420                                 if (rdev->desc_nr == desc->number) {
1421                                         found = 1;
1422                                         break;
1423                                 }
1424                         }
1425                         if (found)
1426                                 continue;
1427
1428                         printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
1429                                mdidx(mddev), partition_name(dev));
1430                         remove_descriptor(desc, sb);
1431                 }
1432         }
1433
1434         /*
1435          * Double check whether all devices mentioned in the
1436          * superblock are in the rdev ring.
1437          */
1438         first = 1;
1439         for (i = 0; i < MD_SB_DISKS; i++) {
1440                 mdp_disk_t *desc;
1441                 kdev_t dev;
1442
1443                 desc = sb->disks + i;
1444                 dev = MKDEV(desc->major, desc->minor);
1445
1446                 if (dev == MKDEV(0,0))
1447                         continue;
1448
1449                 if (disk_faulty(desc)) {
1450                         MD_BUG();
1451                         goto abort;
1452                 }
1453
1454                 rdev = find_rdev(mddev, dev);
1455                 if (!rdev) {
1456                         MD_BUG();
1457                         goto abort;
1458                 }
1459                 /*
1460                  * In the case of Multipath-IO, we have no
1461                  * other information source to find out which
1462                  * disk is which, only the position of the device
1463                  * in the superblock:
1464                  */
1465                 if (mddev->sb->level == -4) {
1466                         if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
1467                                 MD_BUG();
1468                                 goto abort;
1469                         }
1470                         rdev->desc_nr = i;
1471                         if (!first)
1472                                 rdev->alias_device = 1;
1473                         else
1474                                 first = 0;
1475                 }
1476         }
1477
1478         /*
1479          * Kick all rdevs that are not in the
1480          * descriptor array:
1481          */
1482         ITERATE_RDEV(mddev,rdev,tmp) {
1483                 if (rdev->desc_nr == -1)
1484                         kick_rdev_from_array(rdev);
1485         }
1486
1487         /*
1488          * Do a final reality check.
1489          */
1490         if (mddev->sb->level != -4) {
1491                 ITERATE_RDEV(mddev,rdev,tmp) {
1492                         if (rdev->desc_nr == -1) {
1493                                 MD_BUG();
1494                                 goto abort;
1495                         }
1496                         /*
1497                          * is the desc_nr unique?
1498                          */
1499                         ITERATE_RDEV(mddev,rdev2,tmp2) {
1500                                 if ((rdev2 != rdev) &&
1501                                                 (rdev2->desc_nr == rdev->desc_nr)) {
1502                                         MD_BUG();
1503                                         goto abort;
1504                                 }
1505                         }
1506                         /*
1507                          * is the device unique?
1508                          */
1509                         ITERATE_RDEV(mddev,rdev2,tmp2) {
1510                                 if ((rdev2 != rdev) &&
1511                                                 (rdev2->dev == rdev->dev)) {
1512                                         MD_BUG();
1513                                         goto abort;
1514                                 }
1515                         }
1516                 }
1517         }
1518
1519         /*
1520          * Check if we can support this RAID array
1521          */
1522         if (sb->major_version != MD_MAJOR_VERSION ||
1523                         sb->minor_version > MD_MINOR_VERSION) {
1524
1525                 printk(OLD_VERSION, mdidx(mddev), sb->major_version,
1526                                 sb->minor_version, sb->patch_version);
1527                 goto abort;
1528         }
1529
1530         if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
1531                         (sb->level == 4) || (sb->level == 5)))
1532                 printk(NOT_CLEAN_IGNORE, mdidx(mddev));
1533
1534         return 0;
1535 abort:
1536         return 1;
1537 }
1538
1539 #undef INCONSISTENT
1540 #undef OUT_OF_DATE
1541 #undef OLD_VERSION
1542 #undef OLD_LEVEL
1543
1544 static int device_size_calculation(mddev_t * mddev)
1545 {
1546         int data_disks = 0, persistent;
1547         unsigned int readahead;
1548         mdp_super_t *sb = mddev->sb;
1549         struct md_list_head *tmp;
1550         mdk_rdev_t *rdev;
1551
1552         /*
1553          * Do device size calculation. Bail out if too small.
1554          * (we have to do this after having validated chunk_size,
1555          * because the device size has to be a multiple of chunk_size)
1556          */
1557         persistent = !mddev->sb->not_persistent;
1558         ITERATE_RDEV(mddev,rdev,tmp) {
1559                 if (rdev->faulty)
1560                         continue;
1561                 if (rdev->size) {
1562                         MD_BUG();
1563                         continue;
1564                 }
1565                 rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
1566                 if (rdev->size < sb->chunk_size / 1024) {
1567                         printk(KERN_WARNING
1568                                 "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
1569                                 partition_name(rdev->dev),
1570                                 rdev->size, sb->chunk_size / 1024);
1571                         return -EINVAL;
1572                 }
1573         }
1574
1575         switch (sb->level) {
1576                 case -4:
1577                         data_disks = 1;
1578                         break;
1579                 case -3:
1580                         data_disks = 1;
1581                         break;
1582                 case -2:
1583                         data_disks = 1;
1584                         break;
1585                 case -1:
1586                         zoned_raid_size(mddev);
1587                         data_disks = 1;
1588                         break;
1589                 case 0:
1590                         zoned_raid_size(mddev);
1591                         data_disks = sb->raid_disks;
1592                         break;
1593                 case 1:
1594                         data_disks = 1;
1595                         break;
1596                 case 4:
1597                 case 5:
1598                         data_disks = sb->raid_disks-1;
1599                         break;
1600                 default:
1601                         printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
1602                         goto abort;
1603         }
1604         if (!md_size[mdidx(mddev)])
1605                 md_size[mdidx(mddev)] = sb->size * data_disks;
1606
1607         readahead = MD_READAHEAD;
1608         if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
1609                 readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
1610                 if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
1611                         readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
1612         } else {
1613                 /* (no multipath branch - it uses the default setting) */
1614                 if (sb->level == -3)
1615                         readahead = 0;
1616         }
1617         md_maxreadahead[mdidx(mddev)] = readahead;
1618
1619         printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
1620                 mdidx(mddev), readahead*(PAGE_SIZE/1024));
1621
1622         printk(KERN_INFO
1623                 "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1624                         mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
1625         return 0;
1626 abort:
1627         return 1;
1628 }
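/*
 * Worked example (figures illustrative only): a 4-disk RAID5 array
 * with sb->size == 1000000 (1K blocks per disk) has
 * data_disks == raid_disks - 1 == 3, so the exported size becomes
 * md_size = 1000000 * 3 = 3000000 blocks; one disk's worth of
 * capacity goes to parity.
 */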
1629
1630
1631 #define TOO_BIG_CHUNKSIZE KERN_ERR \
1632 "too big chunk_size: %d > %d\n"
1633
1634 #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1635 "too small chunk_size: %d < %ld\n"
1636
1637 #define BAD_CHUNKSIZE KERN_ERR \
1638 "no chunksize specified, see 'man raidtab'\n"
1639
1640 static int do_md_run(mddev_t * mddev)
1641 {
1642         int pnum, err;
1643         int chunk_size;
1644         struct md_list_head *tmp;
1645         mdk_rdev_t *rdev;
1646
1647
1648         if (!mddev->nb_dev) {
1649                 MD_BUG();
1650                 return -EINVAL;
1651         }
1652
1653         if (mddev->pers)
1654                 return -EBUSY;
1655
1656         /*
1657          * Resize disks so that partition sizes are aligned on
1658          * the given chunk size.
1659          */
1660         md_size[mdidx(mddev)] = 0;
1661
1662         /*
1663          * Analyze all RAID superblock(s)
1664          */
1665         if (analyze_sbs(mddev)) {
1666                 MD_BUG();
1667                 return -EINVAL;
1668         }
1669
1670         chunk_size = mddev->sb->chunk_size;
1671         pnum = level_to_pers(mddev->sb->level);
1672
1673         mddev->param.chunk_size = chunk_size;
1674         mddev->param.personality = pnum;
1675
1676         if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1677                 if (!chunk_size) {
1678                         /*
1679                          * 'default chunksize' in the old md code used to
1680                          * be PAGE_SIZE, baaad.
1681                          * we abort here to be on the safe side. We don't
1682                          * want to continue the bad practice.
1683                          */
1684                         printk(BAD_CHUNKSIZE);
1685                         return -EINVAL;
1686                 }
1687                 if (chunk_size > MAX_CHUNK_SIZE) {
1688                         printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
1689                         return -EINVAL;
1690                 }
1691                 /*
1692                  * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
1693                  */
1694                 if ( (1 << ffz(~chunk_size)) != chunk_size) {
1695                         MD_BUG();
1696                         return -EINVAL;
1697                 }
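                /*
                 * (ffz(~x) yields the index of the lowest set bit of x,
                 *  so (1 << ffz(~x)) == x holds exactly when x has a
                 *  single bit set, i.e. when x is a power of two.)
                 */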
1698                 if (chunk_size < PAGE_SIZE) {
1699                         printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
1700                         return -EINVAL;
1701                 }
1702         } else if (chunk_size)
1703                 printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
1704                        mddev->sb->level);
1706
1707         if (pnum >= MAX_PERSONALITY) {
1708                 MD_BUG();
1709                 return -EINVAL;
1710         }
1711
1712         if (!pers[pnum]) {
1714 #ifdef CONFIG_KMOD
1715                 char module_name[80];
1716                 sprintf(module_name, "md-personality-%d", pnum);
1717                 request_module(module_name);
1718                 if (!pers[pnum])
1719 #endif
1720                 {
1721                         printk(KERN_ERR "md: personality %d is not loaded!\n",
1722                                 pnum);
1723                         return -EINVAL;
1724                 }
1725         }
1726
1727         if (device_size_calculation(mddev))
1728                 return -EINVAL;
1729
1730         /*
1731          * Drop all container device buffers, from now on
1732          * the only valid external interface is through the md
1733          * device.
1734          * Also find largest hardsector size
1735          */
1736         md_hardsect_sizes[mdidx(mddev)] = 512;
1737         ITERATE_RDEV(mddev,rdev,tmp) {
1738                 if (rdev->faulty)
1739                         continue;
1740                 invalidate_device(rdev->dev, 1);
1741                 if (get_hardsect_size(rdev->dev)
1742                         > md_hardsect_sizes[mdidx(mddev)])
1743                         md_hardsect_sizes[mdidx(mddev)] =
1744                                 get_hardsect_size(rdev->dev);
1745         }
1746         md_blocksizes[mdidx(mddev)] = 1024;
1747         if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
1748                 md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
1749         mddev->pers = pers[pnum];
1750
1751         err = mddev->pers->run(mddev);
1752         if (err) {
1753                 printk(KERN_ERR "md: pers->run() failed ...\n");
1754                 mddev->pers = NULL;
1755                 return -EINVAL;
1756         }
1757
1758         mddev->sb->state &= ~(1 << MD_SB_CLEAN);
1759         mddev->sb_dirty = 1;
1760         md_update_sb(mddev);
1761
1762         /*
1763          * md_size has units of 1K blocks, which are
1764          * twice as large as sectors.
1765          */
1766         md_hd_struct[mdidx(mddev)].start_sect = 0;
1767         register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
1768                         1, &md_fops, md_size[mdidx(mddev)]<<1);
1769
1770         read_ahead[MD_MAJOR] = 1024;
1771         return (0);
1772 }
1773
1774 #undef TOO_BIG_CHUNKSIZE
#undef TOO_SMALL_CHUNKSIZE
1775 #undef BAD_CHUNKSIZE
1776
1777 #define OUT(x) do { err = (x); goto out; } while (0)
1778
1779 static int restart_array(mddev_t *mddev)
1780 {
1781         int err = 0;
1782
1783         /*
1784          * Complain if it has no devices
1785          */
1786         if (!mddev->nb_dev)
1787                 OUT(-ENXIO);
1788
1789         if (mddev->pers) {
1790                 if (!mddev->ro)
1791                         OUT(-EBUSY);
1792
1793                 mddev->ro = 0;
1794                 set_device_ro(mddev_to_kdev(mddev), 0);
1795
1796                 printk(KERN_INFO
1797                         "md: md%d switched to read-write mode.\n", mdidx(mddev));
1798                 /*
1799                  * Kick recovery or resync if necessary
1800                  */
1801                 md_recover_arrays();
1802                 if (mddev->pers->restart_resync)
1803                         mddev->pers->restart_resync(mddev);
1804         } else {
1805                 printk(KERN_ERR "md: md%d has no personality assigned.\n",
1806                         mdidx(mddev));
1807                 err = -EINVAL;
1808         }
1809
1810 out:
1811         return err;
1812 }
1813
1814 #define STILL_MOUNTED KERN_WARNING \
1815 "md: md%d still mounted.\n"
1816 #define STILL_IN_USE KERN_WARNING \
1817 "md: md%d still in use.\n"
1818
1819 static int do_md_stop(mddev_t * mddev, int ro)
1820 {
1821         int err = 0, resync_interrupted = 0;
1822         kdev_t dev = mddev_to_kdev(mddev);
1823
1824 #if 0 /* ->active is not currently reliable */
1825         if (atomic_read(&mddev->active)>1) {
1826                 printk(STILL_IN_USE, mdidx(mddev));
1827                 OUT(-EBUSY);
1828         }
1829 #endif
1830
1831         if (mddev->pers) {
1832                 /*
1833                  * It is safe to call stop here, it only frees private
1834                  * data. Also, it tells us if a device is unstoppable
1835                  * (e.g. resyncing is in progress)
1836                  */
1837                 if (mddev->pers->stop_resync)
1838                         if (mddev->pers->stop_resync(mddev))
1839                                 resync_interrupted = 1;
1840
1841                 if (mddev->recovery_running)
1842                         md_interrupt_thread(md_recovery_thread);
1843
1844                 /*
1845                  * This synchronizes with signal delivery to the
1846                  * resync or reconstruction thread. It also nicely
1847                  * hangs the process if some reconstruction has not
1848                  * finished.
1849                  */
1850                 down(&mddev->recovery_sem);
1851                 up(&mddev->recovery_sem);
1852
1853                 invalidate_device(dev, 1);
1854
1855                 if (ro) {
1856                         if (mddev->ro)
1857                                 OUT(-ENXIO);
1858                         mddev->ro = 1;
1859                 } else {
1860                         if (mddev->ro)
1861                                 set_device_ro(dev, 0);
1862                         if (mddev->pers->stop(mddev)) {
1863                                 if (mddev->ro)
1864                                         set_device_ro(dev, 1);
1865                                 OUT(-EBUSY);
1866                         }
1867                         if (mddev->ro)
1868                                 mddev->ro = 0;
1869                 }
1870                 if (mddev->sb) {
1871                         /*
1872                          * mark it clean only if no resync was
1873                          * interrupted.
1874                          */
1875                         if (!mddev->recovery_running && !resync_interrupted) {
1876                                 printk(KERN_INFO "md: marking sb clean...\n");
1877                                 mddev->sb->state |= 1 << MD_SB_CLEAN;
1878                         }
1879                         mddev->sb_dirty = 1;
1880                         md_update_sb(mddev);
1881                 }
1882                 if (ro)
1883                         set_device_ro(dev, 1);
1884         }
1885
1886         /*
1887          * Free resources if final stop
1888          */
1889         if (!ro) {
1890                 printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
1891                 free_mddev(mddev);
1892
1893         } else
1894                 printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
1895 out:
1896         return err;
1897 }
1898
1899 #undef OUT
1900
1901 /*
1902  * We have to safely support old arrays too.
1903  */
1904 int detect_old_array(mdp_super_t *sb)
1905 {
1906         if (sb->major_version > 0)
1907                 return 0;
1908         if (sb->minor_version >= 90)
1909                 return 0;
1910
1911         return -EINVAL;
1912 }
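/*
 * Illustration: a v0.90.0 (or any major version >= 1) superblock
 * passes the check above and returns 0, while e.g. a 0.80 superblock
 * returns -EINVAL, since pre-0.90 formats cannot be autostarted
 * safely.
 */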
1913
1914
1915 static void autorun_array(mddev_t *mddev)
1916 {
1917         mdk_rdev_t *rdev;
1918         struct md_list_head *tmp;
1919         int err;
1920
1921         if (mddev->disks.prev == &mddev->disks) {
1922                 MD_BUG();
1923                 return;
1924         }
1925
1926         printk(KERN_INFO "md: running: ");
1927
1928         ITERATE_RDEV(mddev,rdev,tmp) {
1929                 printk("<%s>", partition_name(rdev->dev));
1930         }
1931         printk("\n");
1932
1933         err = do_md_run (mddev);
1934         if (err) {
1935                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
1936                 /*
1937                  * prevent the writeback of an unrunnable array
1938                  */
1939                 mddev->sb_dirty = 0;
1940                 do_md_stop (mddev, 0);
1941         }
1942 }
1943
1944 /*
1945  * let's try to run arrays based on all disks that have arrived
1946  * until now. (those are in the ->pending list)
1947  *
1948  * the method: pick the first pending disk, collect all disks with
1949  * the same UUID, remove all from the pending list and put them into
1950  * the 'same_array' list. Then order this list based on superblock
1951  * update time (freshest comes first), kick out 'old' disks and
1952  * compare superblocks. If everything's fine then run it.
1953  *
1954  * If the md device that "countdev" names gets allocated, bump its reference count
1955  */
1956 static void autorun_devices(kdev_t countdev)
1957 {
1958         struct md_list_head candidates;
1959         struct md_list_head *tmp;
1960         mdk_rdev_t *rdev0, *rdev;
1961         mddev_t *mddev;
1962         kdev_t md_kdev;
1963
1964
1965         printk(KERN_INFO "md: autorun ...\n");
1966         while (pending_raid_disks.next != &pending_raid_disks) {
1967                 rdev0 = md_list_entry(pending_raid_disks.next,
1968                                          mdk_rdev_t, pending);
1969
1970                 printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
1971                 MD_INIT_LIST_HEAD(&candidates);
1972                 ITERATE_RDEV_PENDING(rdev,tmp) {
1973                         if (uuid_equal(rdev0, rdev)) {
1974                                 if (!sb_equal(rdev0->sb, rdev->sb)) {
1975                                         printk(KERN_WARNING
1976                                                "md: %s has same UUID as %s, but superblocks differ ...\n",
1977                                                partition_name(rdev->dev), partition_name(rdev0->dev));
1978                                         continue;
1979                                 }
1980                                 printk(KERN_INFO "md:  adding %s ...\n", partition_name(rdev->dev));
1981                                 md_list_del(&rdev->pending);
1982                                 md_list_add(&rdev->pending, &candidates);
1983                         }
1984                 }
1985                 /*
1986                  * now we have a set of devices, with all of them having
1987                  * mostly sane superblocks. It's time to allocate the
1988                  * mddev.
1989                  */
1990                 md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
1991                 mddev = kdev_to_mddev(md_kdev);
1992                 if (mddev) {
1993                         printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
1994                                mdidx(mddev), partition_name(rdev0->dev));
1995                         ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
1996                                 export_rdev(rdev);
1997                         continue;
1998                 }
1999                 mddev = alloc_mddev(md_kdev);
2000                 if (!mddev) {
2001                         printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
2002                         break;
2003                 }
2004                 if (md_kdev == countdev)
2005                         atomic_inc(&mddev->active);
2006                 printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
2007                 ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
2008                         bind_rdev_to_array(rdev, mddev);
2009                         md_list_del(&rdev->pending);
2010                         MD_INIT_LIST_HEAD(&rdev->pending);
2011                 }
2012                 autorun_array(mddev);
2013         }
2014         printk(KERN_INFO "md: ... autorun DONE.\n");
2015 }
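/*
 * Example of the grouping above (illustrative device names): if the
 * pending list holds sda1 and sdb1 with UUID A and sdc1 with UUID B,
 * the first pass moves sda1 and sdb1 onto 'candidates', binds them to
 * the md unit named by their sb->md_minor and runs that array; the
 * loop then starts over with sdc1.
 */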
2016
2017 /*
2018  * import RAID devices based on one partition
2019  * if possible, the array gets run as well.
2020  */
2021
2022 #define BAD_VERSION KERN_ERR \
2023 "md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
2024
2025 #define OUT_OF_MEM KERN_ALERT \
2026 "md: out of memory.\n"
2027
2028 #define NO_DEVICE KERN_ERR \
2029 "md: disabled device %s\n"
2030
2031 #define AUTOADD_FAILED KERN_ERR \
2032 "md: auto-adding devices to md%d FAILED (error %d).\n"
2033
2034 #define AUTOADD_FAILED_USED KERN_ERR \
2035 "md: cannot auto-add device %s to md%d, already used.\n"
2036
2037 #define AUTORUN_FAILED KERN_ERR \
2038 "md: auto-running md%d FAILED (error %d).\n"
2039
2040 #define MDDEV_BUSY KERN_ERR \
2041 "md: cannot auto-add to md%d, already running.\n"
2042
2043 #define AUTOADDING KERN_INFO \
2044 "md: auto-adding devices to md%d, based on %s's superblock.\n"
2045
2046 #define AUTORUNNING KERN_INFO \
2047 "md: auto-running md%d.\n"
2048
2049 static int autostart_array(kdev_t startdev, kdev_t countdev)
2050 {
2051         int err = -EINVAL, i;
2052         mdp_super_t *sb = NULL;
2053         mdk_rdev_t *start_rdev = NULL, *rdev;
2054
2055         if (md_import_device(startdev, 1)) {
2056                 printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
2057                 goto abort;
2058         }
2059
2060         start_rdev = find_rdev_all(startdev);
2061         if (!start_rdev) {
2062                 MD_BUG();
2063                 goto abort;
2064         }
2065         if (start_rdev->faulty) {
2066                 printk(KERN_WARNING "md: cannot autostart based on faulty %s!\n",
2067                                                 partition_name(startdev));
2068                 goto abort;
2069         }
2070         md_list_add(&start_rdev->pending, &pending_raid_disks);
2071
2072         sb = start_rdev->sb;
2073
2074         err = detect_old_array(sb);
2075         if (err) {
2076                 printk(KERN_WARNING "md: array version is too old to be autostarted, "
2077                        "use raidtools 0.90 mkraid --upgrade to upgrade the array "
2078                        "without data loss!\n");
2079                 goto abort;
2080         }
2081
2082         for (i = 0; i < MD_SB_DISKS; i++) {
2083                 mdp_disk_t *desc;
2084                 kdev_t dev;
2085
2086                 desc = sb->disks + i;
2087                 dev = MKDEV(desc->major, desc->minor);
2088
2089                 if (dev == MKDEV(0,0))
2090                         continue;
2091                 if (dev == startdev)
2092                         continue;
2093                 if (md_import_device(dev, 1)) {
2094                         printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
2095                                partition_name(dev));
2096                         continue;
2097                 }
2098                 rdev = find_rdev_all(dev);
2099                 if (!rdev) {
2100                         MD_BUG();
2101                         goto abort;
2102                 }
2103                 md_list_add(&rdev->pending, &pending_raid_disks);
2104         }
2105
2106         /*
2107          * FIXME: possibly make autorun_devices() return a status
2108          */
2109         autorun_devices(countdev);
2110         return 0;
2111
2112 abort:
2113         if (start_rdev)
2114                 export_rdev(start_rdev);
2115         return err;
2116 }
2117
2118 #undef BAD_VERSION
2119 #undef OUT_OF_MEM
2120 #undef NO_DEVICE
2121 #undef AUTOADD_FAILED_USED
2122 #undef AUTOADD_FAILED
2123 #undef AUTORUN_FAILED
2124 #undef AUTOADDING
2125 #undef AUTORUNNING
2126
2127
2128 static int get_version(void * arg)
2129 {
2130         mdu_version_t ver;
2131
2132         ver.major = MD_MAJOR_VERSION;
2133         ver.minor = MD_MINOR_VERSION;
2134         ver.patchlevel = MD_PATCHLEVEL_VERSION;
2135
2136         if (md_copy_to_user(arg, &ver, sizeof(ver)))
2137                 return -EFAULT;
2138
2139         return 0;
2140 }
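/*
 * User-space sketch (not part of the driver; assumes the mdu_*
 * definitions and ioctl numbers from <linux/raid/md_u.h>):
 *
 *	mdu_version_t ver;
 *	int fd = open("/dev/md0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, RAID_VERSION, &ver) == 0)
 *		printf("md driver %d.%d.%d\n",
 *			ver.major, ver.minor, ver.patchlevel);
 */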
2141
2142 #define SET_FROM_SB(x) info.x = mddev->sb->x
2143 static int get_array_info(mddev_t * mddev, void * arg)
2144 {
2145         mdu_array_info_t info;
2146
2147         if (!mddev->sb) {
2148                 MD_BUG();
2149                 return -EINVAL;
2150         }
2151
2152         SET_FROM_SB(major_version);
2153         SET_FROM_SB(minor_version);
2154         SET_FROM_SB(patch_version);
2155         SET_FROM_SB(ctime);
2156         SET_FROM_SB(level);
2157         SET_FROM_SB(size);
2158         SET_FROM_SB(nr_disks);
2159         SET_FROM_SB(raid_disks);
2160         SET_FROM_SB(md_minor);
2161         SET_FROM_SB(not_persistent);
2162
2163         SET_FROM_SB(utime);
2164         SET_FROM_SB(state);
2165         SET_FROM_SB(active_disks);
2166         SET_FROM_SB(working_disks);
2167         SET_FROM_SB(failed_disks);
2168         SET_FROM_SB(spare_disks);
2169
2170         SET_FROM_SB(layout);
2171         SET_FROM_SB(chunk_size);
2172
2173         if (md_copy_to_user(arg, &info, sizeof(info)))
2174                 return -EFAULT;
2175
2176         return 0;
2177 }
2178 #undef SET_FROM_SB
2179
2180 #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2181 static int get_disk_info(mddev_t * mddev, void * arg)
2182 {
2183         mdu_disk_info_t info;
2184         unsigned int nr;
2185
2186         if (!mddev->sb)
2187                 return -EINVAL;
2188
2189         if (md_copy_from_user(&info, arg, sizeof(info)))
2190                 return -EFAULT;
2191
2192         nr = info.number;
2193         if (nr >= MD_SB_DISKS)
2194                 return -EINVAL;
2195
2196         SET_FROM_SB(major);
2197         SET_FROM_SB(minor);
2198         SET_FROM_SB(raid_disk);
2199         SET_FROM_SB(state);
2200
2201         if (md_copy_to_user(arg, &info, sizeof(info)))
2202                 return -EFAULT;
2203
2204         return 0;
2205 }
2206 #undef SET_FROM_SB
2207
2208 #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2209
2210 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2211 {
2212         int err, size, persistent;
2213         mdk_rdev_t *rdev;
2214         unsigned int nr;
2215         kdev_t dev;
2216         dev = MKDEV(info->major,info->minor);
2217
2218         if (find_rdev_all(dev)) {
2219                 printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
2220                        partition_name(dev));
2221                 return -EBUSY;
2222         }
2223         if (!mddev->sb) {
2224                 /* expecting a device which has a superblock */
2225                 err = md_import_device(dev, 1);
2226                 if (err) {
2227                         printk(KERN_WARNING "md: md_import_device returned %d\n", err);
2228                         return -EINVAL;
2229                 }
2230                 rdev = find_rdev_all(dev);
2231                 if (!rdev) {
2232                         MD_BUG();
2233                         return -EINVAL;
2234                 }
2235                 if (mddev->nb_dev) {
2236                         mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
2237                                                         mdk_rdev_t, same_set);
2238                         if (!uuid_equal(rdev0, rdev)) {
2239                                 printk(KERN_WARNING "md: %s has a different UUID from %s\n",
2240                                        partition_name(rdev->dev), partition_name(rdev0->dev));
2241                                 export_rdev(rdev);
2242                                 return -EINVAL;
2243                         }
2244                         if (!sb_equal(rdev0->sb, rdev->sb)) {
2245                                 printk(KERN_WARNING "md: %s has the same UUID as %s but a different superblock\n",
2246                                        partition_name(rdev->dev), partition_name(rdev0->dev));
2247                                 export_rdev(rdev);
2248                                 return -EINVAL;
2249                         }
2250                 }
2251                 bind_rdev_to_array(rdev, mddev);
2252                 return 0;
2253         }
2254
2255         nr = info->number;
2256         if (nr >= mddev->sb->nr_disks) {
2257                 MD_BUG();
2258                 return -EINVAL;
2259         }
2260
2261
2262         SET_SB(number);
2263         SET_SB(major);
2264         SET_SB(minor);
2265         SET_SB(raid_disk);
2266         SET_SB(state);
2267
2268         if ((info->state & (1<<MD_DISK_FAULTY))==0) {
2269                 err = md_import_device (dev, 0);
2270                 if (err) {
2271                         printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
2272                         return -EINVAL;
2273                 }
2274                 rdev = find_rdev_all(dev);
2275                 if (!rdev) {
2276                         MD_BUG();
2277                         return -EINVAL;
2278                 }
2279
2280                 rdev->old_dev = dev;
2281                 rdev->desc_nr = info->number;
2282
2283                 bind_rdev_to_array(rdev, mddev);
2284
2285                 persistent = !mddev->sb->not_persistent;
2286                 if (!persistent)
2287                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
2288
2289                 size = calc_dev_size(dev, mddev, persistent);
2290                 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2291
2292                 if (!mddev->sb->size || (mddev->sb->size > size))
2293                         mddev->sb->size = size;
2294         }
2295
2296         /*
2297          * sync all other superblocks with the main superblock
2298          */
2299         sync_sbs(mddev);
2300
2301         return 0;
2302 }
2303 #undef SET_SB
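/*
 * User-space sketch for ADD_NEW_DISK on an array that already has a
 * superblock; the (8,1) device, i.e. sda1, is a made-up example:
 *
 *	mdu_disk_info_t dinfo;
 *	memset(&dinfo, 0, sizeof(dinfo));
 *	dinfo.number = 0;
 *	dinfo.major = 8;
 *	dinfo.minor = 1;
 *	dinfo.raid_disk = 0;
 *	dinfo.state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
 *	ioctl(fd, ADD_NEW_DISK, &dinfo);
 */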
2304
2305 static int hot_generate_error(mddev_t * mddev, kdev_t dev)
2306 {
2307         struct request_queue *q;
2308         mdk_rdev_t *rdev;
2309         mdp_disk_t *disk;
2310
2311         if (!mddev->pers)
2312                 return -ENODEV;
2313
2314         printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
2315                 partition_name(dev), mdidx(mddev));
2316
2317         rdev = find_rdev(mddev, dev);
2318         if (!rdev) {
2319                 MD_BUG();
2320                 return -ENXIO;
2321         }
2322
2323         if (rdev->desc_nr == -1) {
2324                 MD_BUG();
2325                 return -EINVAL;
2326         }
2327         disk = &mddev->sb->disks[rdev->desc_nr];
2328         if (!disk_active(disk))
2329                 return -ENODEV;
2330
2331         q = blk_get_queue(rdev->dev);
2332         if (!q) {
2333                 MD_BUG();
2334                 return -ENODEV;
2335         }
2336         printk(KERN_INFO "md: okay, generating error!\n");
2337 /*      q->oneshot_error = 1;   -- disabled for now */
2338
2339         return 0;
2340 }
2341
2342 static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
2343 {
2344         int err;
2345         mdk_rdev_t *rdev;
2346         mdp_disk_t *disk;
2347
2348         if (!mddev->pers)
2349                 return -ENODEV;
2350
2351         printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
2352                 partition_name(dev), mdidx(mddev));
2353
2354         if (!mddev->pers->diskop) {
2355                 printk(KERN_WARNING "md%d: personality does not support diskops!\n",
2356                        mdidx(mddev));
2357                 return -EINVAL;
2358         }
2359
2360         rdev = find_rdev(mddev, dev);
2361         if (!rdev)
2362                 return -ENXIO;
2363
2364         if (rdev->desc_nr == -1) {
2365                 MD_BUG();
2366                 return -EINVAL;
2367         }
2368         disk = &mddev->sb->disks[rdev->desc_nr];
2369         if (disk_active(disk))
2370                 goto busy;
2371
2372         if (disk_removed(disk))
2373                 return -EINVAL;
2374
2375         err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
2376         if (err == -EBUSY)
2377                 goto busy;
2378
2379         if (err) {
2380                 MD_BUG();
2381                 return -EINVAL;
2382         }
2383
2384         remove_descriptor(disk, mddev->sb);
2385         kick_rdev_from_array(rdev);
2386         mddev->sb_dirty = 1;
2387         md_update_sb(mddev);
2388
2389         return 0;
2390 busy:
2391         printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
2392                 partition_name(dev), mdidx(mddev));
2393         return -EBUSY;
2394 }
2395
2396 static int hot_add_disk(mddev_t * mddev, kdev_t dev)
2397 {
2398         int i, err, persistent;
2399         unsigned int size;
2400         mdk_rdev_t *rdev;
2401         mdp_disk_t *disk;
2402
2403         if (!mddev->pers)
2404                 return -ENODEV;
2405
2406         printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
2407                 partition_name(dev), mdidx(mddev));
2408
2409         if (!mddev->pers->diskop) {
2410                 printk(KERN_WARNING "md%d: personality does not support diskops!\n",
2411                        mdidx(mddev));
2412                 return -EINVAL;
2413         }
2414
2415         persistent = !mddev->sb->not_persistent;
2416
2417         rdev = find_rdev(mddev, dev);
2418         if (rdev)
2419                 return -EBUSY;
2420
2421         err = md_import_device (dev, 0);
2422         if (err) {
2423                 printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
2424                 return -EINVAL;
2425         }
2426         rdev = find_rdev_all(dev);
2427         if (!rdev) {
2428                 MD_BUG();
2429                 return -EINVAL;
2430         }
2431         if (rdev->faulty) {
2432                 printk(KERN_WARNING "md: cannot hot-add faulty %s disk to md%d!\n",
2433                                 partition_name(dev), mdidx(mddev));
2434                 err = -EINVAL;
2435                 goto abort_export;
2436         }
2437         size = calc_dev_size(dev, mddev, persistent);
2438
2439         if (size < mddev->sb->size) {
2440                 printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
2441                                 mdidx(mddev), size, mddev->sb->size);
2442                 err = -ENOSPC;
2443                 goto abort_export;
2444         }
2445         bind_rdev_to_array(rdev, mddev);
2446
2447         /*
2448          * The rest should better be atomic, we can have disk failures
2449          * noticed in interrupt contexts ...
2450          */
2451         rdev->old_dev = dev;
2452         rdev->size = size;
2453         rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2454
2455         disk = mddev->sb->disks + mddev->sb->raid_disks;
2456         for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
2457                 disk = mddev->sb->disks + i;
2458
2459                 if (!disk->major && !disk->minor)
2460                         break;
2461                 if (disk_removed(disk))
2462                         break;
2463         }
2464         if (i == MD_SB_DISKS) {
2465                 printk(KERN_WARNING "md%d: cannot hot-add to a full array!\n",
2466                        mdidx(mddev));
2467                 err = -EBUSY;
2468                 goto abort_unbind_export;
2469         }
2470
2471         if (disk_removed(disk)) {
2472                 /*
2473                  * reuse slot
2474                  */
2475                 if (disk->number != i) {
2476                         MD_BUG();
2477                         err = -EINVAL;
2478                         goto abort_unbind_export;
2479                 }
2480         } else {
2481                 disk->number = i;
2482         }
2483
2484         disk->raid_disk = disk->number;
2485         disk->major = MAJOR(dev);
2486         disk->minor = MINOR(dev);
2487
2488         if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
2489                 MD_BUG();
2490                 err = -EINVAL;
2491                 goto abort_unbind_export;
2492         }
2493
2494         mark_disk_spare(disk);
2495         mddev->sb->nr_disks++;
2496         mddev->sb->spare_disks++;
2497         mddev->sb->working_disks++;
2498
2499         mddev->sb_dirty = 1;
2500         md_update_sb(mddev);
2501
2502         /*
2503          * Kick recovery, maybe this spare has to be added to the
2504          * array immediately.
2505          */
2506         md_recover_arrays();
2507
2508         return 0;
2509
2510 abort_unbind_export:
2511         unbind_rdev_from_array(rdev);
2512
2513 abort_export:
2514         export_rdev(rdev);
2515         return err;
2516 }
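/*
 * Hot-adding from user space is a single ioctl on the open array
 * device; the new disk enters as a spare, and the recovery thread
 * kicked via md_recover_arrays() above decides whether to use it.
 * Sketch, using the kernel's MKDEV encoding with a made-up (8,17)
 * device number:
 *
 *	ioctl(fd, HOT_ADD_DISK, (unsigned long) MKDEV(8, 17));
 */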
2517
2518 #define SET_SB(x) mddev->sb->x = info->x
2519 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2520 {
2521
2522         if (alloc_array_sb(mddev))
2523                 return -ENOMEM;
2524
2525         mddev->sb->major_version = MD_MAJOR_VERSION;
2526         mddev->sb->minor_version = MD_MINOR_VERSION;
2527         mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2528         mddev->sb->ctime = CURRENT_TIME;
2529
2530         SET_SB(level);
2531         SET_SB(size);
2532         SET_SB(nr_disks);
2533         SET_SB(raid_disks);
2534         SET_SB(md_minor);
2535         SET_SB(not_persistent);
2536
2537         SET_SB(state);
2538         SET_SB(active_disks);
2539         SET_SB(working_disks);
2540         SET_SB(failed_disks);
2541         SET_SB(spare_disks);
2542
2543         SET_SB(layout);
2544         SET_SB(chunk_size);
2545
2546         mddev->sb->md_magic = MD_SB_MAGIC;
2547
2548         /*
2549          * Generate a 128-bit UUID
2550          */
2551         get_random_bytes(&mddev->sb->set_uuid0, 4);
2552         get_random_bytes(&mddev->sb->set_uuid1, 4);
2553         get_random_bytes(&mddev->sb->set_uuid2, 4);
2554         get_random_bytes(&mddev->sb->set_uuid3, 4);
2555
2556         return 0;
2557 }
2558 #undef SET_SB
2559
2560 static int set_disk_info(mddev_t * mddev, void * arg)
2561 {
2562         printk(KERN_INFO "md: not yet implemented\n");
2563         return -EINVAL;
2564 }
2565
2566 static int clear_array(mddev_t * mddev)
2567 {
2568         printk(KERN_INFO "md: not yet implemented\n");
2569         return -EINVAL;
2570 }
2571
2572 static int write_raid_info(mddev_t * mddev)
2573 {
2574         printk(KERN_INFO "md: not yet implemented\n");
2575         return -EINVAL;
2576 }
2577
2578 static int protect_array(mddev_t * mddev)
2579 {
2580         printk(KERN_INFO "md: not yet implemented\n");
2581         return -EINVAL;
2582 }
2583
2584 static int unprotect_array(mddev_t * mddev)
2585 {
2586         printk(KERN_INFO "md: not yet implemented\n");
2587         return -EINVAL;
2588 }
2589
2590 static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
2591 {
2592         int ret;
2593
2594         ret = md_error(mddev, dev);
2595         return ret;
2596 }
2597
2598 static int md_ioctl(struct inode *inode, struct file *file,
2599                         unsigned int cmd, unsigned long arg)
2600 {
2601         unsigned int minor;
2602         int err = 0;
2603         struct hd_geometry *loc = (struct hd_geometry *) arg;
2604         mddev_t *mddev = NULL;
2605         kdev_t dev;
2606
2607         if (!md_capable_admin())
2608                 return -EACCES;
2609
2610         dev = inode->i_rdev;
2611         minor = MINOR(dev);
2612         if (minor >= MAX_MD_DEVS) {
2613                 MD_BUG();
2614                 return -EINVAL;
2615         }
2616
2617         /*
2618          * Commands dealing with the RAID driver but not any
2619          * particular array:
2620          */
2621         switch (cmd)
2622         {
2623                 case RAID_VERSION:
2624                         err = get_version((void *)arg);
2625                         goto done;
2626
2627                 case PRINT_RAID_DEBUG:
2628                         err = 0;
2629                         md_print_devices();
2630                         goto done_unlock;
2631
2632 #ifndef MODULE
2633                 case RAID_AUTORUN:
2634                         err = 0;
2635                         autostart_arrays();
2636                         goto done;
2637 #endif
2638
2639                 case BLKGETSIZE:
2640                 case BLKGETSIZE64:
2641                 case BLKRAGET:
2642                 case BLKRASET:
2643                 case BLKFLSBUF:
2644                 case BLKSSZGET:
2645                 case BLKBSZGET:
2646                 case BLKBSZSET:
2647                         err = blk_ioctl (dev, cmd, arg);
2648                         goto abort;
2649
2650                 default:;
2651         }
2652
2653         /*
2654          * Commands creating/starting a new array:
2655          */
2656
2657         mddev = kdev_to_mddev(dev);
2658
2659         switch (cmd)
2660         {
2661                 case SET_ARRAY_INFO:
2662                 case START_ARRAY:
2663                         if (mddev) {
2664                                 printk(KERN_WARNING "md: array md%d already exists!\n",
2665                                                                 mdidx(mddev));
2666                                 err = -EEXIST;
2667                                 goto abort;
2668                         }
2669                 default:;
2670         }
2671         switch (cmd)
2672         {
2673                 case SET_ARRAY_INFO:
2674                         mddev = alloc_mddev(dev);
2675                         if (!mddev) {
2676                                 err = -ENOMEM;
2677                                 goto abort;
2678                         }
2679                         atomic_inc(&mddev->active);
2680
2681                         /*
2682                          * alloc_mddev() should possibly self-lock.
2683                          */
2684                         err = lock_mddev(mddev);
2685                         if (err) {
2686                                 printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
2687                                        err, cmd);
2688                                 goto abort;
2689                         }
2690
2691                         if (mddev->sb) {
2692                                 printk(KERN_WARNING "md: array md%d already has a superblock!\n",
2693                                         mdidx(mddev));
2694                                 err = -EBUSY;
2695                                 goto abort_unlock;
2696                         }
2697                         if (arg) {
2698                                 mdu_array_info_t info;
2699                                 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2700                                         err = -EFAULT;
2701                                         goto abort_unlock;
2702                                 }
2703                                 err = set_array_info(mddev, &info);
2704                                 if (err) {
2705                                         printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
2706                                         goto abort_unlock;
2707                                 }
2708                         }
2709                         goto done_unlock;
2710
2711                 case START_ARRAY:
2712                         /*
2713                          * possibly make it lock the array ...
2714                          */
2715                         err = autostart_array((kdev_t)arg, dev);
2716                         if (err) {
2717                                 printk(KERN_WARNING "md: autostart %s failed!\n",
2718                                         partition_name((kdev_t)arg));
2719                                 goto abort;
2720                         }
2721                         goto done;
2722
2723                 default:;
2724         }
2725
2726         /*
2727          * Commands querying/configuring an existing array:
2728          */
2729
2730         if (!mddev) {
2731                 err = -ENODEV;
2732                 goto abort;
2733         }
2734         err = lock_mddev(mddev);
2735         if (err) {
2736                 printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2737                 goto abort;
2738         }
2739         /* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
2740         if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2741                 err = -ENODEV;
2742                 goto abort_unlock;
2743         }
2744
2745         /*
2746          * Commands even a read-only array can execute:
2747          */
2748         switch (cmd)
2749         {
2750                 case GET_ARRAY_INFO:
2751                         err = get_array_info(mddev, (void *)arg);
2752                         goto done_unlock;
2753
2754                 case GET_DISK_INFO:
2755                         err = get_disk_info(mddev, (void *)arg);
2756                         goto done_unlock;
2757
2758                 case RESTART_ARRAY_RW:
2759                         err = restart_array(mddev);
2760                         goto done_unlock;
2761
2762                 case STOP_ARRAY:
2763                         if (inode->i_bdev->bd_openers > 1)
2764                                 err = -EBUSY;
2765                         else if (!(err = do_md_stop (mddev, 0)))
2766                                 mddev = NULL;
2767                         goto done_unlock;
2768
2769                 case STOP_ARRAY_RO:
2770                         if (inode->i_bdev->bd_openers > 1)
2771                                 err = -EBUSY;
2772                         else
2773                                 err = do_md_stop (mddev, 1);
2774                         goto done_unlock;
2775
2776          * We have a problem here: there is no easy way to give a CHS
2777          * virtual geometry. We currently pretend that we have a 2-head,
2778          * 4-sector geometry (with a BIG number of cylinders...). This
2779          * drives dosfs just mad... ;-)
2780          * dosfs just mad... ;-)
2781          */
2782                 case HDIO_GETGEO:
2783                         if (!loc) {
2784                                 err = -EINVAL;
2785                                 goto abort_unlock;
2786                         }
2787                         err = md_put_user (2, (char *) &loc->heads);
2788                         if (err)
2789                                 goto abort_unlock;
2790                         err = md_put_user (4, (char *) &loc->sectors);
2791                         if (err)
2792                                 goto abort_unlock;
2793                         err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2794                                                 (short *) &loc->cylinders);
2795                         if (err)
2796                                 goto abort_unlock;
2797                         err = md_put_user (md_hd_struct[minor].start_sect,
2798                                                 (long *) &loc->start);
2799                         goto done_unlock;
2800         }
2801
2802         /*
2803          * The remaining ioctls are changing the state of the
2804          * superblock, so we do not allow read-only arrays
2805          * here:
2806          */
2807         if (mddev->ro) {
2808                 err = -EROFS;
2809                 goto abort_unlock;
2810         }
2811
2812         switch (cmd)
2813         {
2814                 case CLEAR_ARRAY:
2815                         err = clear_array(mddev);
2816                         goto done_unlock;
2817
2818                 case ADD_NEW_DISK:
2819                 {
2820                         mdu_disk_info_t info;
2821                         if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2822                                 err = -EFAULT;
2823                         else
2824                                 err = add_new_disk(mddev, &info);
2825                         goto done_unlock;
2826                 }
2827                 case HOT_GENERATE_ERROR:
2828                         err = hot_generate_error(mddev, (kdev_t)arg);
2829                         goto done_unlock;
2830                 case HOT_REMOVE_DISK:
2831                         err = hot_remove_disk(mddev, (kdev_t)arg);
2832                         goto done_unlock;
2833
2834                 case HOT_ADD_DISK:
2835                         err = hot_add_disk(mddev, (kdev_t)arg);
2836                         goto done_unlock;
2837
2838                 case SET_DISK_INFO:
2839                         err = set_disk_info(mddev, (void *)arg);
2840                         goto done_unlock;
2841
2842                 case WRITE_RAID_INFO:
2843                         err = write_raid_info(mddev);
2844                         goto done_unlock;
2845
2846                 case UNPROTECT_ARRAY:
2847                         err = unprotect_array(mddev);
2848                         goto done_unlock;
2849
2850                 case PROTECT_ARRAY:
2851                         err = protect_array(mddev);
2852                         goto done_unlock;
2853
2854                 case SET_DISK_FAULTY:
2855                         err = set_disk_faulty(mddev, (kdev_t)arg);
2856                         goto done_unlock;
2857
2858                 case RUN_ARRAY:
2859                 {
2860 /* The data is never used....
2861                         mdu_param_t param;
2862                         err = md_copy_from_user(&param, (mdu_param_t *)arg,
2863                                                          sizeof(param));
2864                         if (err)
2865                                 goto abort_unlock;
2866 */
2867                         err = do_md_run (mddev);
2868                         /*
2869                          * we have to clean up the mess if
2870                          * the array cannot be run for some
2871                          * reason ...
2872                          */
2873                         if (err) {
2874                                 mddev->sb_dirty = 0;
2875                                 if (!do_md_stop (mddev, 0))
2876                                         mddev = NULL;
2877                         }
2878                         goto done_unlock;
2879                 }
2880
2881                 default:
2882                         printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
2883                                "upgrade your software to use new ioctls.\n",
2884                                current->comm, current->pid);
2885                         err = -EINVAL;
2886                         goto abort_unlock;
2887         }
2888
2889 done_unlock:
2890 abort_unlock:
2891         if (mddev)
2892                 unlock_mddev(mddev);
2893
2894         return err;
2895 done:
2896         if (err)
2897                 MD_BUG();
2898 abort:
2899         return err;
2900 }
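/*
 * Querying a running array through the interface above (user-space
 * sketch, assuming <linux/raid/md_u.h> for mdu_array_info_t):
 *
 *	mdu_array_info_t ainfo;
 *	if (ioctl(fd, GET_ARRAY_INFO, &ainfo) == 0)
 *		printf("level %d, %d raid disks, %d active\n",
 *			ainfo.level, ainfo.raid_disks, ainfo.active_disks);
 */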
2901
2902 static int md_open(struct inode *inode, struct file *file)
2903 {
2904         /*
2905          * Always succeed, but increment the usage count
2906          */
2907         mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2908         if (mddev)
2909                 atomic_inc(&mddev->active);
2910         return (0);
2911 }
2912
2913 static int md_release(struct inode *inode, struct file * file)
2914 {
2915         mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2916         if (mddev)
2917                 atomic_dec(&mddev->active);
2918         return 0;
2919 }
2920
2921 static struct block_device_operations md_fops=
2922 {
2923         owner:          THIS_MODULE,
2924         open:           md_open,
2925         release:        md_release,
2926         ioctl:          md_ioctl,
2927 };
2928
2929
2930 int md_thread(void * arg)
2931 {
2932         mdk_thread_t *thread = arg;
2933
2934         md_lock_kernel();
2935
2936         /*
2937          * Detach thread
2938          */
2939
2940         daemonize();
2941
2942         sprintf(current->comm, "%s", thread->name);
2943         md_init_signals();
2944         md_flush_signals();
2945         thread->tsk = current;
2946
2947         /*
2948          * md_thread is a 'system-thread', its priority should be very
2949          * high. We avoid resource deadlocks individually in each
2950          * raid personality. (RAID5 does preallocation) We run as
2951          * SCHED_OTHER with the maximum nice boost (-20), thus we will
2952          * never get into a priority inversion deadlock.
2953          *
2954          * we definitely have to have equal or higher priority than
2955          * bdflush, otherwise bdflush will deadlock if there are too
2956          * many dirty RAID5 blocks.
2957          */
2958         current->policy = SCHED_OTHER;
2959         current->nice = -20;
2960         md_unlock_kernel();
2961
2962         complete(thread->event);
2963         while (thread->run) {
2964                 void (*run)(void *data);
2965                 DECLARE_WAITQUEUE(wait, current);
2966
2967                 add_wait_queue(&thread->wqueue, &wait);
2968                 set_task_state(current, TASK_INTERRUPTIBLE);
2969                 if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2970                         dprintk("md: thread %p went to sleep.\n", thread);
2971                         schedule();
2972                         dprintk("md: thread %p woke up.\n", thread);
2973                 }
2974                 current->state = TASK_RUNNING;
2975                 remove_wait_queue(&thread->wqueue, &wait);
2976                 clear_bit(THREAD_WAKEUP, &thread->flags);
2977
2978                 run = thread->run;
2979                 if (run) {
2980                         run(thread->data);
2981                         run_task_queue(&tq_disk);
2982                 }
2983                 if (md_signal_pending(current))
2984                         md_flush_signals();
2985         }
2986         complete(thread->event);
2987         return 0;
2988 }
2989
2990 void md_wakeup_thread(mdk_thread_t *thread)
2991 {
2992         dprintk("md: waking up MD thread %p.\n", thread);
2993         set_bit(THREAD_WAKEUP, &thread->flags);
2994         wake_up(&thread->wqueue);
2995 }
2996
2997 mdk_thread_t *md_register_thread(void (*run) (void *),
2998                                                 void *data, const char *name)
2999 {
3000         mdk_thread_t *thread;
3001         int ret;
3002         struct completion event;
3003
3004         thread = (mdk_thread_t *) kmalloc
3005                                 (sizeof(mdk_thread_t), GFP_KERNEL);
3006         if (!thread)
3007                 return NULL;
3008
3009         memset(thread, 0, sizeof(mdk_thread_t));
3010         md_init_waitqueue_head(&thread->wqueue);
3011
3012         init_completion(&event);
3013         thread->event = &event;
3014         thread->run = run;
3015         thread->data = data;
3016         thread->name = name;
3017         ret = kernel_thread(md_thread, thread, 0);
3018         if (ret < 0) {
3019                 kfree(thread);
3020                 return NULL;
3021         }
3022         wait_for_completion(&event);
3023         return thread;
3024 }
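/*
 * Typical usage by a personality (names below are hypothetical):
 *
 *	mdk_thread_t *t = md_register_thread(myraid_d, conf, "myraidd");
 *	...
 *	md_wakeup_thread(t);
 *
 * md_register_thread() only returns after the new thread has called
 * complete(thread->event), so thread->tsk is guaranteed to be set up.
 */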
3025
3026 void md_interrupt_thread(mdk_thread_t *thread)
3027 {
3028         if (!thread->tsk) {
3029                 MD_BUG();
3030                 return;
3031         }
3032         dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
3033         send_sig(SIGKILL, thread->tsk, 1);
3034 }
3035
3036 void md_unregister_thread(mdk_thread_t *thread)
3037 {
3038         struct completion event;
3039
3040         init_completion(&event);
3041
3042         thread->event = &event;
3043         thread->run = NULL;
3044         thread->name = NULL;
3045         md_interrupt_thread(thread);
3046         wait_for_completion(&event);
3047         kfree(thread);
3048 }
3049
3050 void md_recover_arrays(void)
3051 {
3052         if (!md_recovery_thread) {
3053                 MD_BUG();
3054                 return;
3055         }
3056         md_wakeup_thread(md_recovery_thread);
3057 }
3058
3059
3060 int md_error(mddev_t *mddev, kdev_t rdev)
3061 {
3062         mdk_rdev_t * rrdev;
3063
3064         dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
3065                 MD_MAJOR, mdidx(mddev), MAJOR(rdev), MINOR(rdev),
3066                 __builtin_return_address(0),__builtin_return_address(1),
3067                 __builtin_return_address(2),__builtin_return_address(3));
3068
3069         if (!mddev) {
3070                 MD_BUG();
3071                 return 0;
3072         }
3073         rrdev = find_rdev(mddev, rdev);
3074         if (!rrdev || rrdev->faulty)
3075                 return 0;
3076         if (!mddev->pers->error_handler
3077                         || mddev->pers->error_handler(mddev,rdev) <= 0) {
3078                 rrdev->faulty = 1;
3079         } else
3080                 return 1;
3081         /*
3082          * if recovery was running, stop it now.
3083          */
3084         if (mddev->pers->stop_resync)
3085                 mddev->pers->stop_resync(mddev);
3086         if (mddev->recovery_running)
3087                 md_interrupt_thread(md_recovery_thread);
3088         md_recover_arrays();
3089
3090         return 0;
3091 }
3092
3093 static void status_unused(struct seq_file *seq)
3094 {
3095         int i = 0;
3096         mdk_rdev_t *rdev;
3097         struct md_list_head *tmp;
3098
3099         seq_printf(seq, "unused devices: ");
3100
3101         ITERATE_RDEV_ALL(rdev,tmp) {
3102                 if (!rdev->same_set.next && !rdev->same_set.prev) {
3103                         /*
3104                          * The device is not yet used by any array.
3105                          */
3106                         i++;
3107                         seq_printf(seq, "%s ",
3108                                 partition_name(rdev->dev));
3109                 }
3110         }
3111         if (!i)
3112                 seq_printf(seq, "<none>");
3113
3114         seq_printf(seq, "\n");
3115 }
3116
3117
3118 static void status_resync(struct seq_file *seq, mddev_t * mddev)
3119 {
3120         unsigned long max_blocks, resync, res, dt, db, rt;
3121
3122         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
3123         max_blocks = mddev->sb->size;
3124
3125         /*
3126          * Should not happen.
3127          */
3128         if (!max_blocks)
3129                 MD_BUG();
3130
3131         res = (resync/1024)*1000/(max_blocks/1024 + 1);
3132         {
3133                 int i, x = res/50, y = 20-x;
3134                 seq_printf(seq, "[");
3135                 for (i = 0; i < x; i++)
3136                         seq_printf(seq, "=");
3137                 seq_printf(seq, ">");
3138                 for (i = 0; i < y; i++)
3139                         seq_printf(seq, ".");
3140                 seq_printf(seq, "] ");
3141         }
3142         if (!mddev->recovery_running)
3143                 /*
3144                  * true resync
3145                  */
3146                 seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
3147                                 res/10, res % 10, resync, max_blocks);
3148         else
3149                 /*
3150                  * recovery ...
3151                  */
3152                 seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
3153                                 res/10, res % 10, resync, max_blocks);
3154
3155         /*
3156          * We do not want to overflow, so the order of operands and
3157          * the * 100 / 100 trick are important. We do a +1 to be
3158          * safe against division by zero. We only estimate anyway.
3159          *
3160          * dt: time from mark until now
3161          * db: blocks written from mark until now
3162          * rt: remaining time
3163          */
3164         dt = ((jiffies - mddev->resync_mark) / HZ);
3165         if (!dt) dt++;
3166         db = resync - (mddev->resync_mark_cnt/2);
3167         rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3168
3169         seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3170
3171         seq_printf(seq, " speed=%ldK/sec", db/dt);
3172
3173 }
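/*
 * Worked example of the estimate above (numbers illustrative): with
 * dt = 100 seconds since the mark, db = 10000 blocks done and
 * max_blocks - resync = 90000 blocks left,
 *
 *	rt = (100 * (90000 / (10000/100 + 1))) / 100 = 891 seconds,
 *
 * shown as "finish=14.8min", with "speed=100K/sec" from db/dt.
 */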
3174
3175
3176 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
3177 {
3178         struct list_head *tmp;
3179         loff_t l = *pos;
3180         mddev_t *mddev;
3181
3182         if (l >= 0x10000)
3183                 return NULL;
3184         if (!l--)
3185                 /* header */
3186                 return (void*)1;
3187
3188         list_for_each(tmp,&all_mddevs)
3189                 if (!l--) {
3190                         mddev = list_entry(tmp, mddev_t, all_mddevs);
3191                         return mddev;
3192                 }
3193         if (!l--)
3194                 return (void*)2; /* tail */
3195         return NULL;
3196 }
3197
3198 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3199 {
3200         struct list_head *tmp;
3201         mddev_t *next_mddev, *mddev = v;
3202         
3203         ++*pos;
3204         if (v == (void*)2)
3205                 return NULL;
3206
3207         if (v == (void*)1)
3208                 tmp = all_mddevs.next;
3209         else
3210                 tmp = mddev->all_mddevs.next;
3211         if (tmp != &all_mddevs)
3212                 next_mddev = list_entry(tmp,mddev_t,all_mddevs);
3213         else {
3214                 next_mddev = (void*)2;
3215                 *pos = 0x10000;
3216         }               
3217
3218         return next_mddev;
3219
3220 }
3221
3222 static void md_seq_stop(struct seq_file *seq, void *v)
3223 {
3224
3225 }
3226
3227 static int md_seq_show(struct seq_file *seq, void *v)
3228 {
3229         int j, size;
3230         struct md_list_head *tmp2;
3231         mdk_rdev_t *rdev;
3232         mddev_t *mddev = v;
3233
3234         if (v == (void*)1) {
3235                 seq_printf(seq, "Personalities : ");
3236                 for (j = 0; j < MAX_PERSONALITY; j++)
3237                         if (pers[j])
3238                                 seq_printf(seq, "[%s] ", pers[j]->name);
3239
3240                 seq_printf(seq, "\n");
3241                 seq_printf(seq, "read_ahead ");
3242                 if (read_ahead[MD_MAJOR] == INT_MAX)
3243                         seq_printf(seq, "not set\n");
3244                 else
3245                         seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
3246                 return 0;
3247         }
3248         if (v == (void*)2) {
3249                 status_unused(seq);
3250                 return 0;
3251         }
3252
3253         seq_printf(seq, "md%d : %sactive", mdidx(mddev),
3254                    mddev->pers ? "" : "in");
3255         if (mddev->pers) {
3256                 if (mddev->ro)
3257                         seq_printf(seq, " (read-only)");
3258                 seq_printf(seq, " %s", mddev->pers->name);
3259         }
3260         
3261         size = 0;
3262         ITERATE_RDEV(mddev,rdev,tmp2) {
3263                 seq_printf(seq, " %s[%d]",
3264                            partition_name(rdev->dev), rdev->desc_nr);
3265                 if (rdev->faulty) {
3266                         seq_printf(seq, "(F)");
3267                         continue;
3268                 }
3269                 size += rdev->size;
3270         }
3271
3272         if (mddev->nb_dev) {
3273                 if (mddev->pers)
3274                         seq_printf(seq, "\n      %d blocks",
3275                                    md_size[mdidx(mddev)]);
3276                 else
3277                         seq_printf(seq, "\n      %d blocks", size);
3278         }
3279
3280         if (mddev->pers) {
3281
3282                 mddev->pers->status (seq, mddev);
3283
3284                 seq_printf(seq, "\n      ");
3285                 if (mddev->curr_resync) {
3286                         status_resync (seq, mddev);
3287                 } else {
3288                         if (sem_getcount(&mddev->resync_sem) != 1)
3289                                 seq_printf(seq, "       resync=DELAYED");
3290                 }
3291         }
3292         seq_printf(seq, "\n");
3293
3294         return 0;
3295 }
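/*
 * Taken together, a hypothetical /proc/mdstat produced by this iterator
 * might read (illustrative only; the "[2/2] [UU]" detail comes from the
 * personality's status method):
 *
 *   Personalities : [raid1]
 *   read_ahead 1024 sectors
 *   md0 : active raid1 sdb1[1] sda1[0]
 *         8388544 blocks [2/2] [UU]
 *
 *   unused devices: <none>
 */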
3296
3297   
3298 static struct seq_operations md_seq_ops = {
3299         .start  = md_seq_start,
3300         .next   = md_seq_next,
3301         .stop   = md_seq_stop,
3302         .show   = md_seq_show,
3303 };
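/*
 * Sketch of how the seq_file core drives these methods (generic
 * protocol, simplified; not md-specific code):
 *
 *   void *p = op->start(seq, &pos);
 *   while (p) {
 *           op->show(seq, p);
 *           p = op->next(seq, p, &pos);
 *   }
 *   op->stop(seq, p);
 *
 * The cookies (void*)1 and (void*)2 make show() emit the
 * "Personalities :" header and the unused-devices tail around the
 * per-array records.
 */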
3304
3305 static int md_seq_open(struct inode *inode, struct file *file)
3306 {
3307         int error;
3308
3309         error = seq_open(file, &md_seq_ops);
3310         return error;
3311 }
3312
3313 static struct file_operations md_seq_fops = {
3314         .open           = md_seq_open,
3315         .read           = seq_read,
3316         .llseek         = seq_lseek,
3317         .release        = seq_release,
3318 };
3319
3320
3321 int register_md_personality(int pnum, mdk_personality_t *p)
3322 {
3323         if (pnum >= MAX_PERSONALITY) {
3324                 MD_BUG();
3325                 return -EINVAL;
3326         }
3327
3328         if (pers[pnum]) {
3329                 MD_BUG();
3330                 return -EBUSY;
3331         }
3332
3333         pers[pnum] = p;
3334         printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3335         return 0;
3336 }
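/*
 * A personality typically registers itself from its init code, e.g.
 * (sketch modelled on the raid1 driver; details may differ):
 *
 *   static int md__init raid1_init (void)
 *   {
 *           return register_md_personality (RAID1, &raid1_personality);
 *   }
 *
 * with a matching unregister_md_personality(RAID1) on unload.
 */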
3337
3338 int unregister_md_personality(int pnum)
3339 {
3340         if (pnum >= MAX_PERSONALITY || !pers[pnum]) {
3341                 MD_BUG();
3342                 return -EINVAL;
3343         }
3344
3345         printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3346         pers[pnum] = NULL;
3347         return 0;
3348 }
3349
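/*
 * Return the first disk in the superblock that is neither faulty nor
 * active, i.e. a spare that recovery can start writing to, or NULL if
 * none is available.
 */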
3350 mdp_disk_t *get_spare(mddev_t *mddev)
3351 {
3352         mdp_super_t *sb = mddev->sb;
3353         mdp_disk_t *disk;
3354         mdk_rdev_t *rdev;
3355         struct md_list_head *tmp;
3356
3357         ITERATE_RDEV(mddev,rdev,tmp) {
3358                 if (rdev->faulty)
3359                         continue;
3360                 if (!rdev->sb) {
3361                         MD_BUG();
3362                         continue;
3363                 }
3364                 disk = &sb->disks[rdev->desc_nr];
3365                 if (disk_faulty(disk)) {
3366                         MD_BUG();
3367                         continue;
3368                 }
3369                 if (disk_active(disk))
3370                         continue;
3371                 return disk;
3372         }
3373         return NULL;
3374 }
3375
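/*
 * Per-disk count of sectors written by resync itself.  Personalities
 * account their resync IO via md_sync_acct(); is_mddev_idle() then
 * subtracts these sectors from the kstat counters so reconstruction
 * traffic does not make the member disks look busy.
 */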
3376 static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3377 void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3378 {
3379         unsigned int major = MAJOR(dev);
3380         unsigned int index;
3381
3382         index = disk_index(dev);
3383         if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3384                 return;
3385
3386         sync_io[major][index] += nr_sectors;
3387 }
3388
3389 static int is_mddev_idle(mddev_t *mddev)
3390 {
3391         mdk_rdev_t * rdev;
3392         struct md_list_head *tmp;
3393         int idle;
3394         unsigned long curr_events;
3395
3396         idle = 1;
3397         ITERATE_RDEV(mddev,rdev,tmp) {
3398                 int major = MAJOR(rdev->dev);
3399                 int idx = disk_index(rdev->dev);
3400
3401                 if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3402                         continue;
3403
3404                 curr_events = kstat.dk_drive_rblk[major][idx] +
3405                               kstat.dk_drive_wblk[major][idx];
3406                 curr_events -= sync_io[major][idx];
3407                 if ((curr_events - rdev->last_events) > 32) {
3408                         rdev->last_events = curr_events;
3409                         idle = 0;
3410                 }
3411         }
3412         return idle;
3413 }
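/*
 * Worked example for the check above (hypothetical numbers): if a member
 * disk shows dk_drive_rblk + dk_drive_wblk = 5000 sectors, of which 4990
 * were accounted via md_sync_acct(), curr_events is 10.  With last_events
 * at 0 the delta is 10, below the 32-sector slack, so the disk still
 * counts as idle; more than 32 sectors of external IO since the last
 * check marks the array busy.
 */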
3414
3415 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3416
3417 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3418 {
3419         /* another "blocks" worth of 512-byte blocks has been synced */
3420         atomic_sub(blocks, &mddev->recovery_active);
3421         wake_up(&mddev->recovery_wait);
3422         if (!ok) {
3423                 /* stop recovery, signal do_sync ... */
3424                 if (mddev->pers->stop_resync)
3425                         mddev->pers->stop_resync(mddev);
3426                 if (mddev->recovery_running)
3427                         md_interrupt_thread(md_recovery_thread);
3428         }
3429 }
3430
3431 #define SYNC_MARKS      10
3432 #define SYNC_MARK_STEP  (3*HZ)
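/*
 * md_do_sync() below keeps SYNC_MARKS timestamp/sector-count pairs and
 * steps to the next pair every SYNC_MARK_STEP jiffies, so resync_mark
 * and resync_mark_cnt always describe a sliding window of up to
 * SYNC_MARKS * 3 seconds (~30s) over which currspeed is averaged.
 */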
3433 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3434 {
3435         mddev_t *mddev2;
3436         unsigned int max_sectors, currspeed,
3437                 j, window, err, serialize;
3438         unsigned long mark[SYNC_MARKS];
3439         unsigned long mark_cnt[SYNC_MARKS];
3440         int last_mark,m;
3441         struct md_list_head *tmp;
3442         unsigned long last_check;
3443
3444
3445         err = down_interruptible(&mddev->resync_sem);
3446         if (err)
3447                 goto out_nolock;
3448
3449 recheck:
3450         serialize = 0;
3451         ITERATE_MDDEV(mddev2,tmp) {
3452                 if (mddev2 == mddev)
3453                         continue;
3454                 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3455                         printk(KERN_INFO "md: delaying resync of md%d until md%d "
3456                                "has finished resync (they share one or more physical units)\n",
3457                                mdidx(mddev), mdidx(mddev2));
3458                         serialize = 1;
3459                         break;
3460                 }
3461         }
3462         if (serialize) {
3463                 interruptible_sleep_on(&resync_wait);
3464                 if (md_signal_pending(current)) {
3465                         md_flush_signals();
3466                         err = -EINTR;
3467                         goto out;
3468                 }
3469                 goto recheck;
3470         }
3471
3472         mddev->curr_resync = 1;
3473
3474         max_sectors = mddev->sb->size<<1;
3475
3476         printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3477         printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3478                                                 sysctl_speed_limit_min);
3479         printk(KERN_INFO "md: using maximum available idle IO bandwidth "
3480                "(but not more than %d KB/sec) for reconstruction.\n",
3481                sysctl_speed_limit_max);
3482
3483         /*
3484          * Resync has low priority.
3485          */
3486         current->nice = 19;
3487
3488         is_mddev_idle(mddev); /* this also initializes IO event counters */
3489         for (m = 0; m < SYNC_MARKS; m++) {
3490                 mark[m] = jiffies;
3491                 mark_cnt[m] = 0;
3492         }
3493         last_mark = 0;
3494         mddev->resync_mark = mark[last_mark];
3495         mddev->resync_mark_cnt = mark_cnt[last_mark];
3496
3497         /*
3498          * Tune reconstruction:
3499          */
3500         window = vm_max_readahead*(PAGE_SIZE/512);
3501         printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
3502                window/2,max_sectors/2);
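        /*
         * Example (assuming 4K pages and the common vm_max_readahead
         * default of 31 pages): window = 31*8 = 248 sectors, so the
         * message above reports a 124k window.
         */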
3503
3504         atomic_set(&mddev->recovery_active, 0);
3505         init_waitqueue_head(&mddev->recovery_wait);
3506         last_check = 0;
3507         for (j = 0; j < max_sectors;) {
3508                 int sectors;
3509
3510                 sectors = mddev->pers->sync_request(mddev, j);
3511
3512                 if (sectors < 0) {
3513                         err = sectors;
3514                         goto out;
3515                 }
3516                 atomic_add(sectors, &mddev->recovery_active);
3517                 j += sectors;
3518                 mddev->curr_resync = j;
3519
3520                 if (last_check + window > j)
3521                         continue;
3522
3523                 last_check = j;
3524
3525                 run_task_queue(&tq_disk);
3526
3527         repeat:
3528                 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3529                         /* step marks */
3530                         int next = (last_mark+1) % SYNC_MARKS;
3531
3532                         mddev->resync_mark = mark[next];
3533                         mddev->resync_mark_cnt = mark_cnt[next];
3534                         mark[next] = jiffies;
3535                         mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3536                         last_mark = next;
3537                 }
3538
3539
3540                 if (md_signal_pending(current)) {
3541                         /*
3542                          * got a signal, exit.
3543                          */
3544                         mddev->curr_resync = 0;
3545                         printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
3546                         md_flush_signals();
3547                         err = -EINTR;
3548                         goto out;
3549                 }
3550
3551                 /*
3552                  * this loop exits only when we are slower than the
3553                  * 'hard' speed limit, or when the system was IO-idle
3554                  * for a jiffy.
3555                  * the system might be non-idle CPU-wise, but we only care
3556                  * about not overloading the IO subsystem. (things like an
3557                  * e2fsck being done on the RAID array should execute fast)
3558                  */
3559                 if (md_need_resched(current))
3560                         schedule();
3561
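                /*
                 * currspeed is KB/sec since the oldest mark: sectors done
                 * since that mark (j - resync_mark_cnt), halved to KB,
                 * divided by elapsed seconds (+1 guards against division
                 * by zero).
                 */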
3562                 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3563
3564                 if (currspeed > sysctl_speed_limit_min) {
3565                         current->nice = 19;
3566
3567                         if ((currspeed > sysctl_speed_limit_max) ||
3568                                         !is_mddev_idle(mddev)) {
3569                                 current->state = TASK_INTERRUPTIBLE;
3570                                 md_schedule_timeout(HZ/4);
3571                                 goto repeat;
3572                         }
3573                 } else
3574                         current->nice = -20;
3575         }
3576         printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3577         err = 0;
3578         /*
3579          * this also signals 'finished resyncing' to md_stop
3580          */
3581 out:
3582         wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3583         up(&mddev->resync_sem);
3584 out_nolock:
3585         mddev->curr_resync = 0;
3586         wake_up(&resync_wait);
3587         return err;
3588 }
3589
3590
3591 /*
3592  * This is a kernel thread which syncs a spare disk with the active array
3593  *
3594  * the amount of foolproofing might seem to be a tad excessive, but an
3595  * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3596  * of my root partition with the first 0.5 gigs of my /home partition ... so
3597  * i'm a bit nervous ;)
3598  */
3599 void md_do_recovery(void *data)
3600 {
3601         int err;
3602         mddev_t *mddev;
3603         mdp_super_t *sb;
3604         mdp_disk_t *spare;
3605         struct md_list_head *tmp;
3606
3607         printk(KERN_INFO "md: recovery thread got woken up ...\n");
3608 restart:
3609         ITERATE_MDDEV(mddev,tmp) {
3610                 sb = mddev->sb;
3611                 if (!sb)
3612                         continue;
3613                 if (mddev->recovery_running)
3614                         continue;
3615                 if (sb->active_disks == sb->raid_disks)
3616                         continue;
3617                 if (mddev->sb_dirty)
3618                         md_update_sb(mddev);
3619                 if (!sb->spare_disks) {
3620                         printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
3621                                "-- continuing in degraded mode\n", mdidx(mddev));
3622                         continue;
3623                 }
3624                 /*
3625                  * now here we get the spare and resync it.
3626                  */
3627                 spare = get_spare(mddev);
3628                 if (!spare)
3629                         continue;
3630                 printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
3631                        mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3632                 if (!mddev->pers->diskop)
3633                         continue;
3634                 if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
3635                         continue;
3636                 down(&mddev->recovery_sem);
3637                 mddev->recovery_running = 1;
3638                 err = md_do_sync(mddev, spare);
3639                 if (err == -EIO) {
3640                         printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
3641                                mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3642                         if (!disk_faulty(spare)) {
3643                                 mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
3644                                 mark_disk_faulty(spare);
3645                                 mark_disk_nonsync(spare);
3646                                 mark_disk_inactive(spare);
3647                                 sb->spare_disks--;
3648                                 sb->working_disks--;
3649                                 sb->failed_disks++;
3650                         }
3651                 } else
3652                         if (disk_faulty(spare))
3653                                 mddev->pers->diskop(mddev, &spare,
3654                                                 DISKOP_SPARE_INACTIVE);
3655                 if (err == -EINTR || err == -ENOMEM) {
3656                         /*
3657                          * Recovery got interrupted, or ran out of mem ...
3658                          * signal back that we have finished using the array.
3659                          */
3660                         mddev->pers->diskop(mddev, &spare,
3661                                                          DISKOP_SPARE_INACTIVE);
3662                         up(&mddev->recovery_sem);
3663                         mddev->recovery_running = 0;
3664                         continue;
3665                 } else {
3666                         mddev->recovery_running = 0;
3667                         up(&mddev->recovery_sem);
3668                 }
3669                 if (!disk_faulty(spare)) {
3670                         /*
3671                          * the SPARE_ACTIVE diskop possibly changes the
3672                          * pointer too
3673                          */
3674                         mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
3675                         mark_disk_sync(spare);
3676                         mark_disk_active(spare);
3677                         sb->active_disks++;
3678                         sb->spare_disks--;
3679                 }
3680                 mddev->sb_dirty = 1;
3681                 md_update_sb(mddev);
3682                 goto restart;
3683         }
3684         printk(KERN_INFO "md: recovery thread finished ...\n");
3685
3686 }
3687
3688 int md_notify_reboot(struct notifier_block *this,
3689                                         unsigned long code, void *x)
3690 {
3691         struct md_list_head *tmp;
3692         mddev_t *mddev;
3693
3694         if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3695                                         || (code == MD_SYS_POWER_OFF)) {
3696
3697                 printk(KERN_INFO "md: stopping all md devices.\n");
3698
3699                 ITERATE_MDDEV(mddev,tmp)
3700                         do_md_stop (mddev, 1);
3701                 /*
3702                  * certain more exotic SCSI devices are known to be
3703                  * volatile if the system is rebooted too early. While
3704                  * the right place to handle this issue is the individual
3705                  * driver, we do want to have a safe RAID driver ...
3706                  */
3707                 md_mdelay(1000*1);
3708         }
3709         return NOTIFY_DONE;
3710 }
3711
3712 struct notifier_block md_notifier = {
3713         notifier_call:  md_notify_reboot,
3714         next:           NULL,
3715         priority:       INT_MAX, /* before any real devices */
3716 };
3717
3718 static void md_geninit(void)
3719 {
3720         struct proc_dir_entry *p;
3721         int i;
3722
3723         for(i = 0; i < MAX_MD_DEVS; i++) {
3724                 md_blocksizes[i] = 1024;
3725                 md_size[i] = 0;
3726                 md_hardsect_sizes[i] = 512;
3727                 md_maxreadahead[i] = MD_READAHEAD;
3728         }
3729         blksize_size[MAJOR_NR] = md_blocksizes;
3730         blk_size[MAJOR_NR] = md_size;
3731         max_readahead[MAJOR_NR] = md_maxreadahead;
3732         hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3733
3734         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3735
3736 #ifdef CONFIG_PROC_FS
3737         p = create_proc_entry("mdstat", S_IRUGO, NULL);
3738         if (p)
3739                 p->proc_fops = &md_seq_fops;
3740 #endif
3741 }
3742
3743 int md__init md_init(void)
3744 {
3745         static char * name = "mdrecoveryd";
3746         int minor;
3747
3748         printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
3749                         MD_MAJOR_VERSION, MD_MINOR_VERSION,
3750                         MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3751
3752         if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3753         {
3754                 printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
3755                 return (-1);
3756         }
3757         devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3758         /* we don't use devfs_register_series because we want to fill md_hd_struct */
3759         for (minor=0; minor < MAX_MD_DEVS; ++minor) {
3760                 char devname[128];
3761                 sprintf (devname, "%u", minor);
3762                 md_hd_struct[minor].de = devfs_register (devfs_handle,
3763                         devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
3764                         S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
3765         }
3766
3767         /* forward all md requests to md_make_request */
3768         blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3769
3770
3771         read_ahead[MAJOR_NR] = INT_MAX;
3772
3773         add_gendisk(&md_gendisk);
3774
3775         md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3776         if (!md_recovery_thread)
3777                 printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
3778
3779         md_register_reboot_notifier(&md_notifier);
3780         raid_table_header = register_sysctl_table(raid_root_table, 1);
3781
3782         md_geninit();
3783         return (0);
3784 }
3785
3786
3787 #ifndef MODULE
3788
3789 /*
3790  * When md (and any required personalities) are compiled into the kernel
3791  * (not a module), arrays can be assembled at boot time: with AUTODETECT,
3792  * where specially marked partitions are registered with md_autodetect_dev(),
3793  * or with MD_BOOT, where the devices to be collected are given on the boot
3794  * line with md=.....
3795  * The code for that is here.
3796  */
3797
3798 struct {
3799         int set;
3800         int noautodetect;
3801 } raid_setup_args md__initdata;
3802
3803 /*
3804  * Searches all registered partitions for autorun RAID arrays
3805  * at boot time.
3806  */
3807 static kdev_t detected_devices[128];
3808 static int dev_cnt;
3809
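/*
 * Called from the partition code at boot for partitions marked with the
 * Linux RAID autodetect partition type (0xfd); the devices noted here
 * are imported and assembled later by autostart_arrays().
 */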
3810 void md_autodetect_dev(kdev_t dev)
3811 {
3812         if (dev_cnt >= 0 && dev_cnt < 127)
3813                 detected_devices[dev_cnt++] = dev;
3814 }
3815
3816
3817 static void autostart_arrays(void)
3818 {
3819         mdk_rdev_t *rdev;
3820         int i;
3821
3822         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3823
3824         for (i = 0; i < dev_cnt; i++) {
3825                 kdev_t dev = detected_devices[i];
3826
3827                 if (md_import_device(dev,1)) {
3828                         printk(KERN_ALERT "md: could not import %s!\n",
3829                                 partition_name(dev));
3830                         continue;
3831                 }
3832                 /*
3833                  * Sanity checks:
3834                  */
3835                 rdev = find_rdev_all(dev);
3836                 if (!rdev) {
3837                         MD_BUG();
3838                         continue;
3839                 }
3840                 if (rdev->faulty) {
3841                         MD_BUG();
3842                         continue;
3843                 }
3844                 md_list_add(&rdev->pending, &pending_raid_disks);
3845         }
3846         dev_cnt = 0;
3847
3848         autorun_devices(-1);
3849 }
3850
3851 static struct {
3852         char device_set [MAX_MD_DEVS];
3853         int pers[MAX_MD_DEVS];
3854         int chunk[MAX_MD_DEVS];
3855         char *device_names[MAX_MD_DEVS];
3856 } md_setup_args md__initdata;
3857
3858 /*
3859  * Parse the command-line parameters given to our kernel, but do not
3860  * actually try to invoke the MD device now; that is handled by
3861  * md_setup_drive after the low-level disk drivers have initialised.
3862  *
3863  * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3864  *             assigns the task of parsing integer arguments to the
3865  *             invoked program now).  Added ability to initialise all
3866  *             the MD devices (by specifying multiple "md=" lines)
3867  *             instead of just one.  -- KTK
3868  * 18May2000: Added support for persistent-superblock arrays:
3869  *             md=n,0,factor,fault,device-list   uses RAID0 for device n
3870  *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
3871  *             md=n,device-list      reads a RAID superblock from the devices
3872  *             elements in device-list are read by name_to_kdev_t so can be
3873  *             a hex number or something like /dev/hda1 /dev/sdb
3874  * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
3875  *              Shifted name_to_kdev_t() and related operations to md_set_drive()
3876  *              for later execution. Rewrote section to make devfs compatible.
3877  */
3878 static int md__init md_setup(char *str)
3879 {
3880         int minor, level, factor, fault;
3881         char *pername = "";
3882         char *str1 = str;
3883
3884         if (get_option(&str, &minor) != 2) {    /* MD Number */
3885                 printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
3886                 return 0;
3887         }
3888         if (minor >= MAX_MD_DEVS) {
3889                 printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
3890                 return 0;
3891         } else if (md_setup_args.device_names[minor]) {
3892                 printk(KERN_WARNING "md: md=%d, Specified more than once. "
3893                        "Replacing previous definition.\n", minor);
3894         }
3895         switch (get_option(&str, &level)) {     /* RAID Personality */
3896         case 2: /* could be 0 or -1.. */
3897                 if (level == 0 || level == -1) {
3898                         if (get_option(&str, &factor) != 2 ||   /* Chunk Size */
3899                                         get_option(&str, &fault) != 2) {
3900                                 printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
3901                                 return 0;
3902                         }
3903                         md_setup_args.pers[minor] = level;
3904                         md_setup_args.chunk[minor] = 1 << (factor+12);
3905                         switch(level) {
3906                         case -1:
3907                                 level = LINEAR;
3908                                 pername = "linear";
3909                                 break;
3910                         case 0:
3911                                 level = RAID0;
3912                                 pername = "raid0";
3913                                 break;
3914                         default:
3915                                 printk(KERN_WARNING
3916                                        "md: The kernel has not been configured for raid%d support!\n",
3917                                        level);
3918                                 return 0;
3919                         }
3920                         md_setup_args.pers[minor] = level;
3921                         break;
3922                 }
3923                 /* FALL THROUGH */
3924         case 1: /* the first device is numeric */
3925                 str = str1;
3926                 /* FALL THROUGH */
3927         case 0:
3928                 md_setup_args.pers[minor] = 0;
3929                 pername = "super-block";
3930         }
3931
3932         printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
3933                 minor, pername, str);
3934         md_setup_args.device_names[minor] = str;
3935
3936         return 1;
3937 }
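/*
 * Illustrative boot lines accepted above (device names hypothetical):
 *
 *   md=0,/dev/sda1,/dev/sdb1          assemble md0 from RAID superblocks
 *   md=1,0,0,0,/dev/sdc1,/dev/sdd1    RAID0 md1, chunk 1 << (0+12) = 4k
 *   md=2,-1,0,0,/dev/hda1,/dev/hdb1   LINEAR md2
 */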
3938
3939 extern kdev_t name_to_kdev_t(char *line) md__init;
3940 void md__init md_setup_drive(void)
3941 {
3942         int minor, i;
3943         kdev_t dev;
3944         mddev_t*mddev;
3945         kdev_t devices[MD_SB_DISKS+1];
3946
3947         for (minor = 0; minor < MAX_MD_DEVS; minor++) {
3948                 int err = 0;
3949                 char *devname;
3950                 mdu_disk_info_t dinfo;
3951
3952                 if ((devname = md_setup_args.device_names[minor]) == 0) continue;
3953
3954                 for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
3955
3956                         char *p;
3957                         void *handle;
3958
3959                         p = strchr(devname, ',');
3960                         if (p)
3961                                 *p++ = 0;
3962
3963                         dev = name_to_kdev_t(devname);
3964                         handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
3965                                                         DEVFS_SPECIAL_BLK, 1);
3966                         if (handle != 0) {
3967                                 unsigned major, minor;
3968                                 devfs_get_maj_min(handle, &major, &minor);
3969                                 dev = MKDEV(major, minor);
3970                         }
3971                         if (dev == 0) {
3972                                 printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
3973                                 break;
3974                         }
3975
3976                         devices[i] = dev;
3977                         md_setup_args.device_set[minor] = 1;
3978
3979                         devname = p;
3980                 }
3981                 devices[i] = 0;
3982
3983                 if (md_setup_args.device_set[minor] == 0)
3984                         continue;
3985
3986                 if (mddev_map[minor].mddev) {
3987                         printk(KERN_WARNING
3988                                "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
3989                                minor);
3990                         continue;
3991                 }
3992                 printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
3993
3994                 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3995                 if (!mddev) {
3996                         printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
3997                         continue;
3998                 }
3999                 if (md_setup_args.pers[minor]) {
4000                         /* non-persistent */
4001                         mdu_array_info_t ainfo;
4002                         ainfo.level = pers_to_level(md_setup_args.pers[minor]);
4003                         ainfo.size = 0;
4004                         ainfo.nr_disks = 0;
4005                         ainfo.raid_disks = 0;
4006                         ainfo.md_minor = minor;
4007                         ainfo.not_persistent = 1;
4008
4009                         ainfo.state = (1 << MD_SB_CLEAN);
4010                         ainfo.active_disks = 0;
4011                         ainfo.working_disks = 0;
4012                         ainfo.failed_disks = 0;
4013                         ainfo.spare_disks = 0;
4014                         ainfo.layout = 0;
4015                         ainfo.chunk_size = md_setup_args.chunk[minor];
4016                         err = set_array_info(mddev, &ainfo);
4017                         for (i = 0; !err && (dev = devices[i]); i++) {
4018                                 dinfo.number = i;
4019                                 dinfo.raid_disk = i;
4020                                 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
4021                                 dinfo.major = MAJOR(dev);
4022                                 dinfo.minor = MINOR(dev);
4023                                 mddev->sb->nr_disks++;
4024                                 mddev->sb->raid_disks++;
4025                                 mddev->sb->active_disks++;
4026                                 mddev->sb->working_disks++;
4027                                 err = add_new_disk (mddev, &dinfo);
4028                         }
4029                 } else {
4030                         /* persistent */
4031                         for (i = 0; (dev = devices[i]); i++) {
4032                                 dinfo.major = MAJOR(dev);
4033                                 dinfo.minor = MINOR(dev);
4034                                 add_new_disk (mddev, &dinfo);
4035                         }
4036                 }
4037                 if (!err)
4038                         err = do_md_run(mddev);
4039                 if (err) {
4040                         mddev->sb_dirty = 0;
4041                         do_md_stop(mddev, 0);
4042                         printk(KERN_WARNING "md: starting md%d failed\n", minor);
4043                 }
4044         }
4045 }
4046
4047 static int md__init raid_setup(char *str)
4048 {
4049         int len, pos;
4050
4051         len = strlen(str) + 1;
4052         pos = 0;
4053
4054         while (pos < len) {
4055                 char *comma = strchr(str+pos, ',');
4056                 int wlen;
4057                 if (comma)
4058                         wlen = (comma-str)-pos;
4059                 else    wlen = (len-1)-pos;
4060
4061                 if (strncmp(str+pos, "noautodetect", wlen) == 0)
4062                         raid_setup_args.noautodetect = 1;
4063                 pos += wlen+1;
4064         }
4065         raid_setup_args.set = 1;
4066         return 1;
4067 }
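/*
 * Example: booting with "raid=noautodetect" sets the flag tested in
 * md_run_setup() below, which then skips autostart_arrays().
 */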
4068
4069 int md__init md_run_setup(void)
4070 {
4071         if (raid_setup_args.noautodetect)
4072                 printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
4073         else
4074                 autostart_arrays();
4075         md_setup_drive();
4076         return 0;
4077 }
4078
4079 __setup("raid=", raid_setup);
4080 __setup("md=", md_setup);
4081
4082 __initcall(md_init);
4083 __initcall(md_run_setup);
4084
4085 #else /* It is a MODULE */
4086
4087 int init_module(void)
4088 {
4089         return md_init();
4090 }
4091
4092 static void free_device_names(void)
4093 {
4094         while (device_names.next != &device_names) {
4095                 struct list_head *tmp = device_names.next;
4096                 list_del(tmp);
4097                 kfree(tmp);
4098         }
4099 }
4100
4101
4102 void cleanup_module(void)
4103 {
4104         md_unregister_thread(md_recovery_thread);
4105         devfs_unregister(devfs_handle);
4106
4107         devfs_unregister_blkdev(MAJOR_NR,"md");
4108         unregister_reboot_notifier(&md_notifier);
4109         unregister_sysctl_table(raid_table_header);
4110 #ifdef CONFIG_PROC_FS
4111         remove_proc_entry("mdstat", NULL);
4112 #endif
4113
4114         del_gendisk(&md_gendisk);
4115
4116         blk_dev[MAJOR_NR].queue = NULL;
4117         blksize_size[MAJOR_NR] = NULL;
4118         blk_size[MAJOR_NR] = NULL;
4119         max_readahead[MAJOR_NR] = NULL;
4120         hardsect_size[MAJOR_NR] = NULL;
4121
4122         free_device_names();
4123
4124 }
4125 #endif
4126
4127 MD_EXPORT_SYMBOL(md_size);
4128 MD_EXPORT_SYMBOL(register_md_personality);
4129 MD_EXPORT_SYMBOL(unregister_md_personality);
4130 MD_EXPORT_SYMBOL(partition_name);
4131 MD_EXPORT_SYMBOL(md_error);
4132 MD_EXPORT_SYMBOL(md_do_sync);
4133 MD_EXPORT_SYMBOL(md_sync_acct);
4134 MD_EXPORT_SYMBOL(md_done_sync);
4135 MD_EXPORT_SYMBOL(md_recover_arrays);
4136 MD_EXPORT_SYMBOL(md_register_thread);
4137 MD_EXPORT_SYMBOL(md_unregister_thread);
4138 MD_EXPORT_SYMBOL(md_update_sb);
4139 MD_EXPORT_SYMBOL(md_wakeup_thread);
4140 MD_EXPORT_SYMBOL(md_print_devices);
4141 MD_EXPORT_SYMBOL(find_rdev_nr);
4142 MD_EXPORT_SYMBOL(md_interrupt_thread);
4143 MD_EXPORT_SYMBOL(mddev_map);
4144 MD_EXPORT_SYMBOL(md_check_ordering);
4145 MD_EXPORT_SYMBOL(get_spare);
4146 MODULE_LICENSE("GPL");