/*
 * raid5.c : Multiple Devices driver for Linux
 *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *         Copyright (C) 1999, 2000 Ingo Molnar
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <linux/config.h>
#include <linux/module.h>
#include <linux/locks.h>
#include <linux/slab.h>
#include <linux/raid/raid5.h>
#include <asm/bitops.h>
#include <asm/atomic.h>

static mdk_personality_t raid5_personality;

/*
 * Stripe cache
 */

#define NR_STRIPES              256
#define IO_THRESHOLD            1
#define HASH_PAGES              1
#define HASH_PAGES_ORDER        0
#define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK               (NR_HASH - 1)
#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
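/*
 * Illustrative note (added): with a hypothetical buffer_size of 4096,
 * buffer_size>>9 == 8 sectors per cached stripe, so sector 8200 hashes
 * to bucket (8200/8) & HASH_MASK.  On i386 (4K pages, 4-byte pointers)
 * NR_HASH is 1024, giving bucket 1025 & 1023 == 1; every request that
 * falls inside the same stripe-sized range lands in the same bucket.
 */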

/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG     0
#define RAID5_PARANOIA  1
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
#else
# define CHECK_DEVLOCK()
#endif

#if RAID5_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif

static void print_raid5_conf (raid5_conf_t *conf);

static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
        if (atomic_dec_and_test(&sh->count)) {
                if (!list_empty(&sh->lru))
                        BUG();
                if (atomic_read(&conf->active_stripes)==0)
                        BUG();
                if (test_bit(STRIPE_HANDLE, &sh->state)) {
                        if (test_bit(STRIPE_DELAYED, &sh->state))
                                list_add_tail(&sh->lru, &conf->delayed_list);
                        else
                                list_add_tail(&sh->lru, &conf->handle_list);
                        md_wakeup_thread(conf->thread);
                } else {
                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
                                atomic_dec(&conf->preread_active_stripes);
                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
                                        md_wakeup_thread(conf->thread);
                        }
                        list_add_tail(&sh->lru, &conf->inactive_list);
                        atomic_dec(&conf->active_stripes);
                        if (!conf->inactive_blocked ||
                            atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
                                wake_up(&conf->wait_for_stripe);
                }
        }
}
static void release_stripe(struct stripe_head *sh)
{
        raid5_conf_t *conf = sh->raid_conf;
        unsigned long flags;

        spin_lock_irqsave(&conf->device_lock, flags);
        __release_stripe(conf, sh);
        spin_unlock_irqrestore(&conf->device_lock, flags);
}

static void remove_hash(struct stripe_head *sh)
{
        PRINTK("remove_hash(), stripe %lu\n", sh->sector);

        if (sh->hash_pprev) {
                if (sh->hash_next)
                        sh->hash_next->hash_pprev = sh->hash_pprev;
                *sh->hash_pprev = sh->hash_next;
                sh->hash_pprev = NULL;
        }
}

static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
        struct stripe_head **shp = &stripe_hash(conf, sh->sector);

        PRINTK("insert_hash(), stripe %lu\n",sh->sector);

        CHECK_DEVLOCK();
        if ((sh->hash_next = *shp) != NULL)
                (*shp)->hash_pprev = &sh->hash_next;
        *shp = sh;
        sh->hash_pprev = shp;
}
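/*
 * Note (added for clarity): hash_pprev points back at whatever pointer
 * currently points at this stripe -- either the hash-table slot or the
 * previous entry's hash_next -- so remove_hash() can unlink an entry in
 * O(1) without walking the bucket.  This is the same back-pointer idiom
 * the 2.4 kernel uses elsewhere for its hash chains.
 */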

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
        struct stripe_head *sh = NULL;
        struct list_head *first;

        CHECK_DEVLOCK();
        if (list_empty(&conf->inactive_list))
                goto out;
        first = conf->inactive_list.next;
        sh = list_entry(first, struct stripe_head, lru);
        list_del_init(first);
        remove_hash(sh);
        atomic_inc(&conf->active_stripes);
out:
        return sh;
}

static void shrink_buffers(struct stripe_head *sh, int num)
{
        struct buffer_head *bh;
        int i;

        for (i=0; i<num ; i++) {
                bh = sh->bh_cache[i];
                if (!bh)
                        return;
                sh->bh_cache[i] = NULL;
                free_page((unsigned long) bh->b_data);
                kfree(bh);
        }
}

static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
{
        struct buffer_head *bh;
        int i;

        for (i=0; i<num; i++) {
                struct page *page;
                bh = kmalloc(sizeof(struct buffer_head), priority);
                if (!bh)
                        return 1;
                memset(bh, 0, sizeof (struct buffer_head));
                init_waitqueue_head(&bh->b_wait);
                if ((page = alloc_page(priority)))
                        bh->b_data = page_address(page);
                else {
                        kfree(bh);
                        return 1;
                }
                atomic_set(&bh->b_count, 0);
                bh->b_page = page;
                sh->bh_cache[i] = bh;
        }
        return 0;
}

static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);

static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
{
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks, i;

        if (atomic_read(&sh->count) != 0)
                BUG();
        if (test_bit(STRIPE_HANDLE, &sh->state))
                BUG();

        CHECK_DEVLOCK();
        PRINTK("init_stripe called, stripe %lu\n", sh->sector);

        remove_hash(sh);

        sh->sector = sector;
        sh->size = conf->buffer_size;
        sh->state = 0;

        for (i=disks; i--; ) {
                if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
                    buffer_locked(sh->bh_cache[i])) {
                        printk("sector=%lx i=%d %p %p %p %d\n",
                               sh->sector, i, sh->bh_read[i],
                               sh->bh_write[i], sh->bh_written[i],
                               buffer_locked(sh->bh_cache[i]));
                        BUG();
                }
                clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
                raid5_build_block(sh, i);
        }
        insert_hash(conf, sh);
}

/* the buffer size has changed, so unhash all stripes;
 * as active stripes complete, they will go onto the inactive list
 */
static void shrink_stripe_cache(raid5_conf_t *conf)
{
        int i;
        CHECK_DEVLOCK();
        if (atomic_read(&conf->active_stripes))
                BUG();
        for (i=0; i < NR_HASH; i++) {
                struct stripe_head *sh;
                while ((sh = conf->stripe_hashtbl[i]))
                        remove_hash(sh);
        }
}

static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
{
        struct stripe_head *sh;

        CHECK_DEVLOCK();
        PRINTK("__find_stripe, sector %lu\n", sector);
        for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
                if (sh->sector == sector)
                        return sh;
        PRINTK("__stripe %lu not in cache\n", sector);
        return NULL;
}

static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
{
        struct stripe_head *sh;

        PRINTK("get_stripe, sector %lu\n", sector);

        md_spin_lock_irq(&conf->device_lock);

        do {
                if (conf->buffer_size == 0 ||
                    (size && size != conf->buffer_size)) {
                        /* either the size is being changed (buffer_size==0) or
                         * we need to change it.
                         * If size==0, we can proceed as soon as buffer_size gets set.
                         * If size>0, we can proceed when active_stripes reaches 0, or
                         * when someone else sets the buffer_size to size.
                         * If someone sets the buffer size to something else, we will need
                         * to assert that we want to change it again.
                         */
                        int oldsize = conf->buffer_size;
                        PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
                        if (size==0)
                                wait_event_lock_irq(conf->wait_for_stripe,
                                                    conf->buffer_size,
                                                    conf->device_lock);
                        else {
                                while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
                                        conf->buffer_size = 0;
                                        wait_event_lock_irq(conf->wait_for_stripe,
                                                            atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
                                                            conf->device_lock);
                                        PRINTK("waited and now %ld/%d buffer_size is %d - %d active\n", sector, size,
                                               conf->buffer_size, atomic_read(&conf->active_stripes));
                                }

                                if (conf->buffer_size != size) {
                                        printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
                                        shrink_stripe_cache(conf);
                                        if (size==0) BUG();
                                        conf->buffer_size = size;
                                        PRINTK("size now %d\n", conf->buffer_size);
                                }
                        }
                }
                if (size == 0)
                        sector -= sector & ((conf->buffer_size>>9)-1);

                sh = __find_stripe(conf, sector);
                if (!sh) {
                        if (!conf->inactive_blocked)
                                sh = get_free_stripe(conf);
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
                                conf->inactive_blocked = 1;
                                wait_event_lock_irq(conf->wait_for_stripe,
                                                    !list_empty(&conf->inactive_list) &&
                                                    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
                                                     || !conf->inactive_blocked),
                                                    conf->device_lock);
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector);
                } else {
                        if (atomic_read(&sh->count)) {
                                if (!list_empty(&sh->lru))
                                        BUG();
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
                                if (list_empty(&sh->lru))
                                        BUG();
                                list_del_init(&sh->lru);
                        }
                }
        } while (sh == NULL);

        if (sh)
                atomic_inc(&sh->count);

        md_spin_unlock_irq(&conf->device_lock);
        return sh;
}
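/*
 * Informal sketch of the size-renegotiation protocol above (added
 * note): a caller passing size==0 (resync) accepts the current size and
 * merely waits for buffer_size to be non-zero; a caller with a definite
 * size first publishes its intent by zeroing buffer_size, waits for
 * active_stripes to drain (or for another caller to install the size it
 * wants), and only then sets the new size, after shrink_stripe_cache()
 * has unhashed every stale stripe.
 */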

static int grow_stripes(raid5_conf_t *conf, int num, int priority)
{
        struct stripe_head *sh;

        while (num--) {
                sh = kmalloc(sizeof(struct stripe_head), priority);
                if (!sh)
                        return 1;
                memset(sh, 0, sizeof(*sh));
                sh->raid_conf = conf;
                sh->lock = SPIN_LOCK_UNLOCKED;

                if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
                        shrink_buffers(sh, conf->raid_disks);
                        kfree(sh);
                        return 1;
                }
                /* we just created an active stripe so... */
                atomic_set(&sh->count, 1);
                atomic_inc(&conf->active_stripes);
                INIT_LIST_HEAD(&sh->lru);
                release_stripe(sh);
        }
        return 0;
}

static void shrink_stripes(raid5_conf_t *conf, int num)
{
        struct stripe_head *sh;

        while (num--) {
                spin_lock_irq(&conf->device_lock);
                sh = get_free_stripe(conf);
                spin_unlock_irq(&conf->device_lock);
                if (!sh)
                        break;
                if (atomic_read(&sh->count))
                        BUG();
                shrink_buffers(sh, conf->raid_disks);
                kfree(sh);
                atomic_dec(&conf->active_stripes);
        }
}


static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
{
        struct stripe_head *sh = bh->b_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks, i;
        unsigned long flags;

        for (i=0 ; i<disks; i++)
                if (bh == sh->bh_cache[i])
                        break;

        PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
        if (i == disks) {
                BUG();
                return;
        }

        if (uptodate) {
                struct buffer_head *buffer;
                spin_lock_irqsave(&conf->device_lock, flags);
                /* we can return a buffer if we bypassed the cache or
                 * if the top buffer is not in highmem.  If there are
                 * multiple buffers, leave the extra work to
                 * handle_stripe
                 */
                buffer = sh->bh_read[i];
                if (buffer &&
                    (!PageHighMem(buffer->b_page)
                     || buffer->b_page == bh->b_page )
                        ) {
                        sh->bh_read[i] = buffer->b_reqnext;
                        buffer->b_reqnext = NULL;
                } else
                        buffer = NULL;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                if (sh->bh_page[i]==NULL)
                        set_bit(BH_Uptodate, &bh->b_state);
                if (buffer) {
                        if (buffer->b_page != bh->b_page)
                                memcpy(buffer->b_data, bh->b_data, bh->b_size);
                        buffer->b_end_io(buffer, 1);
                }
        } else {
                md_error(conf->mddev, bh->b_dev);
                clear_bit(BH_Uptodate, &bh->b_state);
        }
        /* must restore b_page before unlocking buffer... */
        if (sh->bh_page[i]) {
                bh->b_page = sh->bh_page[i];
                bh->b_data = page_address(bh->b_page);
                sh->bh_page[i] = NULL;
                clear_bit(BH_Uptodate, &bh->b_state);
        }
        clear_bit(BH_Lock, &bh->b_state);
        set_bit(STRIPE_HANDLE, &sh->state);
        release_stripe(sh);
}

static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
{
        struct stripe_head *sh = bh->b_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks, i;
        unsigned long flags;

        for (i=0 ; i<disks; i++)
                if (bh == sh->bh_cache[i])
                        break;

        PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
        if (i == disks) {
                BUG();
                return;
        }

        md_spin_lock_irqsave(&conf->device_lock, flags);
        if (!uptodate)
                md_error(conf->mddev, bh->b_dev);
        clear_bit(BH_Lock, &bh->b_state);
        set_bit(STRIPE_HANDLE, &sh->state);
        __release_stripe(conf, sh);
        md_spin_unlock_irqrestore(&conf->device_lock, flags);
}



static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
{
        raid5_conf_t *conf = sh->raid_conf;
        struct buffer_head *bh = sh->bh_cache[i];
        unsigned long block = sh->sector / (sh->size >> 9);

        init_buffer(bh, raid5_end_read_request, sh);
        bh->b_dev       = conf->disks[i].dev;
        bh->b_blocknr   = block;

        bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
        bh->b_size      = sh->size;
        bh->b_list      = BUF_LOCKED;
        return bh;
}

static int raid5_error (mddev_t *mddev, kdev_t dev)
{
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        mdp_super_t *sb = mddev->sb;
        struct disk_info *disk;
        int i;

        PRINTK("raid5_error called\n");

        for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
                if (disk->dev == dev) {
                        if (disk->operational) {
                                disk->operational = 0;
                                mark_disk_faulty(sb->disks+disk->number);
                                mark_disk_nonsync(sb->disks+disk->number);
                                mark_disk_inactive(sb->disks+disk->number);
                                sb->active_disks--;
                                sb->working_disks--;
                                sb->failed_disks++;
                                mddev->sb_dirty = 1;
                                conf->working_disks--;
                                conf->failed_disks++;
                                md_wakeup_thread(conf->thread);
                                printk (KERN_ALERT
                                        "raid5: Disk failure on %s, disabling device."
                                        " Operation continuing on %d devices\n",
                                        partition_name (dev), conf->working_disks);
                        }
                        return 0;
                }
        }
        /*
         * handle errors in spares (during reconstruction)
         */
        if (conf->spare) {
                disk = conf->spare;
                if (disk->dev == dev) {
                        printk (KERN_ALERT
                                "raid5: Disk failure on spare %s\n",
                                partition_name (dev));
                        if (!conf->spare->operational) {
                                /* probably a SET_DISK_FAULTY ioctl */
                                return -EIO;
                        }
                        disk->operational = 0;
                        disk->write_only = 0;
                        conf->spare = NULL;
                        mark_disk_faulty(sb->disks+disk->number);
                        mark_disk_nonsync(sb->disks+disk->number);
                        mark_disk_inactive(sb->disks+disk->number);
                        sb->spare_disks--;
                        sb->working_disks--;
                        sb->failed_disks++;

                        mddev->sb_dirty = 1;
                        md_wakeup_thread(conf->thread);

                        return 0;
                }
        }
        MD_BUG();
        return -EIO;
}

/*
 * Input: a 'big' sector number.
 * Output: the index of the data and parity disks, and the sector number in them.
 */
static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
                        unsigned int data_disks, unsigned int * dd_idx,
                        unsigned int * pd_idx, raid5_conf_t *conf)
{
        unsigned long stripe;
        unsigned long chunk_number;
        unsigned int chunk_offset;
        unsigned long new_sector;
        int sectors_per_chunk = conf->chunk_size >> 9;

        /* First compute the information on this sector */

        /*
         * Compute the chunk number and the sector offset inside the chunk
         */
        chunk_number = r_sector / sectors_per_chunk;
        chunk_offset = r_sector % sectors_per_chunk;

        /*
         * Compute the stripe number
         */
        stripe = chunk_number / data_disks;

        /*
         * Compute the data disk and parity disk indexes inside the stripe
         */
        *dd_idx = chunk_number % data_disks;

        /*
         * Select the parity disk based on the user-selected algorithm.
         */
        if (conf->level == 4)
                *pd_idx = data_disks;
        else switch (conf->algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                        *pd_idx = data_disks - stripe % raid_disks;
                        if (*dd_idx >= *pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        *pd_idx = stripe % raid_disks;
                        if (*dd_idx >= *pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                        *pd_idx = data_disks - stripe % raid_disks;
                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
                        *pd_idx = stripe % raid_disks;
                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                default:
                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
        }

        /*
         * Finally, compute the new sector number
         */
        new_sector = stripe * sectors_per_chunk + chunk_offset;
        return new_sector;
}
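/*
 * Worked example (added, illustrative only): 5 disks, RAID-5
 * left-symmetric, 64K chunks (sectors_per_chunk == 128), r_sector 1000:
 * chunk_number = 7, chunk_offset = 104, stripe = 7/4 = 1, and initially
 * dd_idx = 7 % 4 = 3.  Then pd_idx = 4 - (1 % 5) = 3 and
 * dd_idx = (3 + 1 + 3) % 5 = 2, so the request maps to
 * new_sector = 1*128 + 104 = 232 on disk 2, with parity on disk 3.
 */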

#if 0
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
        raid5_conf_t *conf = sh->raid_conf;
        int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
        unsigned long new_sector = sh->sector, check;
        int sectors_per_chunk = conf->chunk_size >> 9;
        unsigned long stripe = new_sector / sectors_per_chunk;
        int chunk_offset = new_sector % sectors_per_chunk;
        int chunk_number, dummy1, dummy2, dd_idx = i;
        unsigned long r_sector, blocknr;

        switch (conf->algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        if (i > sh->pd_idx)
                                i--;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                case ALGORITHM_RIGHT_SYMMETRIC:
                        if (i < sh->pd_idx)
                                i += raid_disks;
                        i -= (sh->pd_idx + 1);
                        break;
                default:
                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
        }

        chunk_number = stripe * data_disks + i;
        r_sector = chunk_number * sectors_per_chunk + chunk_offset;
        blocknr = r_sector / (sh->size >> 9);

        check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
        if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
                printk("compute_blocknr: map not correct\n");
                return 0;
        }
        return blocknr;
}
#endif

#define check_xor()     do {                                    \
                           if (count == MAX_XOR_BLOCKS) {       \
                                xor_block(count, bh_ptr);       \
                                count = 1;                      \
                           }                                    \
                        } while(0)
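/*
 * Note (added): bh_ptr[0] is always the destination block, which is why
 * check_xor() resets count to 1 rather than 0 -- each call to
 * xor_block() folds sources 1..count-1 into bh_ptr[0], and that partial
 * result stays in place for the next batch of MAX_XOR_BLOCKS sources.
 */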

static void compute_block(struct stripe_head *sh, int dd_idx)
{
        raid5_conf_t *conf = sh->raid_conf;
        int i, count, disks = conf->raid_disks;
        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;

        PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);

        memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
        bh_ptr[0] = sh->bh_cache[dd_idx];
        count = 1;
        for (i = disks ; i--; ) {
                if (i == dd_idx)
                        continue;
                bh = sh->bh_cache[i];
                if (buffer_uptodate(bh))
                        bh_ptr[count++] = bh;
                else
                        printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);

                check_xor();
        }
        if (count != 1)
                xor_block(count, bh_ptr);
        set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
}

static void compute_parity(struct stripe_head *sh, int method)
{
        raid5_conf_t *conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
        struct buffer_head *chosen[MD_SB_DISKS];

        PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
        memset(chosen, 0, sizeof(chosen));

        count = 1;
        bh_ptr[0] = sh->bh_cache[pd_idx];
        switch(method) {
        case READ_MODIFY_WRITE:
                if (!buffer_uptodate(sh->bh_cache[pd_idx]))
                        BUG();
                for (i=disks ; i-- ;) {
                        if (i==pd_idx)
                                continue;
                        if (sh->bh_write[i] &&
                            buffer_uptodate(sh->bh_cache[i])) {
                                bh_ptr[count++] = sh->bh_cache[i];
                                chosen[i] = sh->bh_write[i];
                                sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
                                chosen[i]->b_reqnext = sh->bh_written[i];
                                sh->bh_written[i] = chosen[i];
                                check_xor();
                        }
                }
                break;
        case RECONSTRUCT_WRITE:
                memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
                for (i= disks; i-- ;)
                        if (i!=pd_idx && sh->bh_write[i]) {
                                chosen[i] = sh->bh_write[i];
                                sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
                                chosen[i]->b_reqnext = sh->bh_written[i];
                                sh->bh_written[i] = chosen[i];
                        }
                break;
        case CHECK_PARITY:
                break;
        }
        if (count>1) {
                xor_block(count, bh_ptr);
                count = 1;
        }

        for (i = disks; i--;)
                if (chosen[i]) {
                        struct buffer_head *bh = sh->bh_cache[i];
                        char *bdata;
                        bdata = bh_kmap(chosen[i]);
                        memcpy(bh->b_data,
                               bdata,sh->size);
                        bh_kunmap(chosen[i]);
                        set_bit(BH_Lock, &bh->b_state);
                        mark_buffer_uptodate(bh, 1);
                }

        switch(method) {
        case RECONSTRUCT_WRITE:
        case CHECK_PARITY:
                for (i=disks; i--;)
                        if (i != pd_idx) {
                                bh_ptr[count++] = sh->bh_cache[i];
                                check_xor();
                        }
                break;
        case READ_MODIFY_WRITE:
                for (i = disks; i--;)
                        if (chosen[i]) {
                                bh_ptr[count++] = sh->bh_cache[i];
                                check_xor();
                        }
        }
        if (count != 1)
                xor_block(count, bh_ptr);

        if (method != CHECK_PARITY) {
                mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
                set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
        } else
                mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
}
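/*
 * The XOR algebra behind compute_parity(), spelled out (added note):
 * RECONSTRUCT_WRITE computes P = D0 ^ D1 ^ ... over all data blocks, so
 * the parity buffer is zeroed and every non-parity block is XORed in.
 * READ_MODIFY_WRITE relies on P_new = P_old ^ D_old ^ D_new for each
 * modified block: the first xor pass folds the old data into the old
 * parity, the new data is then copied into bh_cache, and the second
 * pass folds the new data in.
 */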

static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
{
        struct buffer_head **bhp;
        raid5_conf_t *conf = sh->raid_conf;

        PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);

        spin_lock(&sh->lock);
        spin_lock_irq(&conf->device_lock);
        bh->b_reqnext = NULL;
        if (rw == READ)
                bhp = &sh->bh_read[dd_idx];
        else
                bhp = &sh->bh_write[dd_idx];
        while (*bhp) {
                printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
                bhp = & (*bhp)->b_reqnext;
        }
        *bhp = bh;
        spin_unlock_irq(&conf->device_lock);
        spin_unlock(&sh->lock);

        PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
}




/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe and then examine the state of various bits
 * to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on disc
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 * Parity calculations are done inside the stripe lock;
 * buffers are taken off read_list or write_list, and bh_cache buffers
 * get BH_Lock set before the stripe lock is released.
 *
 */

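/*
 * Note (added): the action[] array below encodes the pending command for
 * each device as command+1 (READ+1 or WRITE+1) so that 0 can mean "no
 * I/O for this device"; generic_make_request() is eventually called
 * with action[i]-1.
 */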
static void handle_stripe(struct stripe_head *sh)
{
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks;
        struct buffer_head *return_ok= NULL, *return_fail = NULL;
        int action[MD_SB_DISKS];
        int i;
        int syncing;
        int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
        int failed_num=0;
        struct buffer_head *bh;

        PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
        memset(action, 0, sizeof(action));

        spin_lock(&sh->lock);
        clear_bit(STRIPE_HANDLE, &sh->state);
        clear_bit(STRIPE_DELAYED, &sh->state);

        syncing = test_bit(STRIPE_SYNCING, &sh->state);
        /* Now to look around and see what can be done */

        for (i=disks; i--; ) {
                bh = sh->bh_cache[i];
                PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
                /* maybe we can reply to a read */
                if (buffer_uptodate(bh) && sh->bh_read[i]) {
                        struct buffer_head *rbh, *rbh2;
                        PRINTK("Return read for disc %d\n", i);
                        spin_lock_irq(&conf->device_lock);
                        rbh = sh->bh_read[i];
                        sh->bh_read[i] = NULL;
                        spin_unlock_irq(&conf->device_lock);
                        while (rbh) {
                                char *bdata;
                                bdata = bh_kmap(rbh);
                                memcpy(bdata, bh->b_data, bh->b_size);
                                bh_kunmap(rbh);
                                rbh2 = rbh->b_reqnext;
                                rbh->b_reqnext = return_ok;
                                return_ok = rbh;
                                rbh = rbh2;
                        }
                }

                /* now count some things */
                if (buffer_locked(bh)) locked++;
                if (buffer_uptodate(bh)) uptodate++;

                if (sh->bh_read[i]) to_read++;
                if (sh->bh_write[i]) to_write++;
                if (sh->bh_written[i]) written++;
                if (!conf->disks[i].operational) {
                        failed++;
                        failed_num = i;
                }
        }
        PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
               locked, uptodate, to_read, to_write, failed, failed_num);
        /* check if the array has lost two devices and, if so, some requests might
         * need to be failed
         */
        if (failed > 1 && to_read+to_write+written) {
                for (i=disks; i--; ) {
                        /* fail all writes first */
                        if (sh->bh_write[i]) to_write--;
                        while ((bh = sh->bh_write[i])) {
                                sh->bh_write[i] = bh->b_reqnext;
                                bh->b_reqnext = return_fail;
                                return_fail = bh;
                        }
                        /* and fail all 'written' */
                        if (sh->bh_written[i]) written--;
                        while ((bh = sh->bh_written[i])) {
                                sh->bh_written[i] = bh->b_reqnext;
                                bh->b_reqnext = return_fail;
                                return_fail = bh;
                        }

                        /* fail any reads if this device is non-operational */
                        if (!conf->disks[i].operational) {
                                spin_lock_irq(&conf->device_lock);
                                if (sh->bh_read[i]) to_read--;
                                while ((bh = sh->bh_read[i])) {
                                        sh->bh_read[i] = bh->b_reqnext;
                                        bh->b_reqnext = return_fail;
                                        return_fail = bh;
                                }
                                spin_unlock_irq(&conf->device_lock);
                        }
                }
        }
        if (failed > 1 && syncing) {
                md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
                clear_bit(STRIPE_SYNCING, &sh->state);
                syncing = 0;
        }

        /* might be able to return some write requests if the parity block
         * is safe, or on a failed drive
         */
        bh = sh->bh_cache[sh->pd_idx];
        if ( written &&
             ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
               || (failed == 1 && failed_num == sh->pd_idx))
            ) {
            /* any written block on an uptodate or failed drive can be returned */
            for (i=disks; i--; )
                if (sh->bh_written[i]) {
                    bh = sh->bh_cache[i];
                    if (!conf->disks[sh->pd_idx].operational ||
                        (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
                        /* maybe we can return some write requests */
                        struct buffer_head *wbh, *wbh2;
                        PRINTK("Return write for disc %d\n", i);
                        wbh = sh->bh_written[i];
                        sh->bh_written[i] = NULL;
                        while (wbh) {
                            wbh2 = wbh->b_reqnext;
                            wbh->b_reqnext = return_ok;
                            return_ok = wbh;
                            wbh = wbh2;
                        }
                    }
                }
        }

        /* Now we might consider reading some blocks, either to check/generate
         * parity, or to satisfy requests
         */
        if (to_read || (syncing && (uptodate < disks))) {
                for (i=disks; i--;) {
                        bh = sh->bh_cache[i];
                        if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
                            (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
                                /* we would like to get this block, possibly
                                 * by computing it, but we might not be able to
                                 */
                                if (uptodate == disks-1) {
                                        PRINTK("Computing block %d\n", i);
                                        compute_block(sh, i);
                                        uptodate++;
                                } else if (conf->disks[i].operational) {
                                        set_bit(BH_Lock, &bh->b_state);
                                        action[i] = READ+1;
                                        /* if I am just reading this block and we don't have
                                           a failed drive or any pending writes, then sidestep the cache */
                                        if (sh->bh_page[i]) BUG();
                                        if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
                                            ! syncing && !failed && !to_write) {
                                                sh->bh_page[i] = sh->bh_cache[i]->b_page;
                                                sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
                                                sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
                                        }
                                        locked++;
                                        PRINTK("Reading block %d (sync=%d)\n", i, syncing);
                                        if (syncing)
                                                md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
                                }
                        }
                }
                set_bit(STRIPE_HANDLE, &sh->state);
        }

        /* now to consider writing and what else, if anything, should be read */
        if (to_write) {
                int rmw=0, rcw=0;
                for (i=disks ; i--;) {
                        /* would I have to read this buffer for read_modify_write */
                        bh = sh->bh_cache[i];
                        if ((sh->bh_write[i] || i == sh->pd_idx) &&
                            (!buffer_locked(bh) || sh->bh_page[i]) &&
                            !buffer_uptodate(bh)) {
                                if (conf->disks[i].operational
/*                                  && !(conf->resync_parity && i == sh->pd_idx) */
                                        )
                                        rmw++;
                                else rmw += 2*disks;  /* cannot read it */
                        }
                        /* Would I have to read this buffer for reconstruct_write */
                        if (!sh->bh_write[i] && i != sh->pd_idx &&
                            (!buffer_locked(bh) || sh->bh_page[i]) &&
                            !buffer_uptodate(bh)) {
                                if (conf->disks[i].operational) rcw++;
                                else rcw += 2*disks;
                        }
                }
                PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
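                /* Illustrative trade-off (added note): writing one block on a
                 * 5-disk array typically gives rmw == 2 (old data + old
                 * parity to read) against rcw == 3 (the three untouched
                 * data blocks), so read-modify-write wins; a full-stripe
                 * write drives rcw to 0 and flips the choice.  A block that
                 * could only come from a failed disk is charged 2*disks,
                 * effectively vetoing that method.
                 */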
                set_bit(STRIPE_HANDLE, &sh->state);
                if (rmw < rcw && rmw > 0)
                        /* prefer read-modify-write, but need to get some data */
                        for (i=disks; i--;) {
                                bh = sh->bh_cache[i];
                                if ((sh->bh_write[i] || i == sh->pd_idx) &&
                                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
                                    conf->disks[i].operational) {
                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                        {
                                                PRINTK("Read_old block %d for r-m-w\n", i);
                                                set_bit(BH_Lock, &bh->b_state);
                                                action[i] = READ+1;
                                                locked++;
                                        } else {
                                                set_bit(STRIPE_DELAYED, &sh->state);
                                                set_bit(STRIPE_HANDLE, &sh->state);
                                        }
                                }
                        }
                if (rcw <= rmw && rcw > 0)
                        /* want reconstruct write, but need to get some data */
                        for (i=disks; i--;) {
                                bh = sh->bh_cache[i];
                                if (!sh->bh_write[i]  && i != sh->pd_idx &&
                                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
                                    conf->disks[i].operational) {
                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                        {
                                                PRINTK("Read_old block %d for Reconstruct\n", i);
                                                set_bit(BH_Lock, &bh->b_state);
                                                action[i] = READ+1;
                                                locked++;
                                        } else {
                                                set_bit(STRIPE_DELAYED, &sh->state);
                                                set_bit(STRIPE_HANDLE, &sh->state);
                                        }
                                }
                        }
                /* now if nothing is locked, and if we have enough data, we can start a write request */
                if (locked == 0 && (rcw == 0 ||rmw == 0)) {
                        PRINTK("Computing parity...\n");
                        compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
                        /* now every locked buffer is ready to be written */
                        for (i=disks; i--;)
                                if (buffer_locked(sh->bh_cache[i])) {
                                        PRINTK("Writing block %d\n", i);
                                        locked++;
                                        action[i] = WRITE+1;
                                        if (!conf->disks[i].operational
                                            || (i==sh->pd_idx && failed == 0))
                                                set_bit(STRIPE_INSYNC, &sh->state);
                                }
                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
                                atomic_dec(&conf->preread_active_stripes);
                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
                                        md_wakeup_thread(conf->thread);
                        }
                }
        }

        /* maybe we need to check and possibly fix the parity for this stripe
         * Any reads will already have been scheduled, so we just see if enough data
         * is available
         */
        if (syncing && locked == 0 &&
            !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
                set_bit(STRIPE_HANDLE, &sh->state);
                if (failed == 0) {
                        if (uptodate != disks)
                                BUG();
                        compute_parity(sh, CHECK_PARITY);
                        uptodate--;
                        bh = sh->bh_cache[sh->pd_idx];
                        if ((*(u32*)bh->b_data) == 0 &&
                            !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
                                /* parity is correct (on disc, not in buffer any more) */
                                set_bit(STRIPE_INSYNC, &sh->state);
                        }
                }
                if (!test_bit(STRIPE_INSYNC, &sh->state)) {
                        struct disk_info *spare;
                        if (failed==0)
                                failed_num = sh->pd_idx;
                        /* should be able to compute the missing block and write it to spare */
                        if (!buffer_uptodate(sh->bh_cache[failed_num])) {
                                if (uptodate+1 != disks)
                                        BUG();
                                compute_block(sh, failed_num);
                                uptodate++;
                        }
                        if (uptodate != disks)
                                BUG();
                        bh = sh->bh_cache[failed_num];
                        set_bit(BH_Lock, &bh->b_state);
                        action[failed_num] = WRITE+1;
                        locked++;
                        set_bit(STRIPE_INSYNC, &sh->state);
                        if (conf->disks[failed_num].operational)
                                md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
                        else if ((spare=conf->spare))
                                md_sync_acct(spare->dev, bh->b_size>>9);

                }
        }
        if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
                md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
                clear_bit(STRIPE_SYNCING, &sh->state);
        }


        spin_unlock(&sh->lock);

        while ((bh=return_ok)) {
                return_ok = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, 1);
        }
        while ((bh=return_fail)) {
                return_fail = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, 0);
        }
        for (i=disks; i-- ;)
                if (action[i]) {
                        struct buffer_head *bh = sh->bh_cache[i];
                        struct disk_info *spare = conf->spare;
                        int skip = 0;
                        if (action[i] == READ+1)
                                bh->b_end_io = raid5_end_read_request;
                        else
                                bh->b_end_io = raid5_end_write_request;
                        if (conf->disks[i].operational)
                                bh->b_dev = conf->disks[i].dev;
                        else if (spare && action[i] == WRITE+1)
                                bh->b_dev = spare->dev;
                        else skip=1;
                        if (!skip) {
                                PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
                                atomic_inc(&sh->count);
                                bh->b_rdev = bh->b_dev;
                                bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
                                generic_make_request(action[i]-1, bh);
                        } else {
                                PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
                                clear_bit(BH_Lock, &bh->b_state);
                                set_bit(STRIPE_HANDLE, &sh->state);
                        }
                }
}

static inline void raid5_activate_delayed(raid5_conf_t *conf)
{
        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
                while (!list_empty(&conf->delayed_list)) {
                        struct list_head *l = conf->delayed_list.next;
                        struct stripe_head *sh;
                        sh = list_entry(l, struct stripe_head, lru);
                        list_del_init(l);
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
                        list_add_tail(&sh->lru, &conf->handle_list);
                }
        }
}
static void raid5_unplug_device(void *data)
{
        raid5_conf_t *conf = (raid5_conf_t *)data;
        unsigned long flags;

        spin_lock_irqsave(&conf->device_lock, flags);

        raid5_activate_delayed(conf);

        conf->plugged = 0;
        md_wakeup_thread(conf->thread);

        spin_unlock_irqrestore(&conf->device_lock, flags);
}

static inline void raid5_plug_device(raid5_conf_t *conf)
{
        spin_lock_irq(&conf->device_lock);
        if (list_empty(&conf->delayed_list))
                if (!conf->plugged) {
                        conf->plugged = 1;
                        queue_task(&conf->plug_tq, &tq_disk);
                }
        spin_unlock_irq(&conf->device_lock);
}
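/*
 * Plugging, in brief (added, interpretive note): raid5_plug_device()
 * arms a tq_disk task the first time work arrives while the delayed
 * list is empty, letting queued writes accumulate; when the block layer
 * runs tq_disk, raid5_unplug_device() releases the delayed stripes and
 * wakes raid5d, so neighbouring writes get a chance to complete a
 * stripe and avoid read-modify-write cycles.
 */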
1202
1203 static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1204 {
1205         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1206         const unsigned int raid_disks = conf->raid_disks;
1207         const unsigned int data_disks = raid_disks - 1;
1208         unsigned int dd_idx, pd_idx;
1209         unsigned long new_sector;
1210         int read_ahead = 0;
1211
1212         struct stripe_head *sh;
1213
1214         if (rw == READA) {
1215                 rw = READ;
1216                 read_ahead=1;
1217         }
1218
1219         new_sector = raid5_compute_sector(bh->b_rsector,
1220                         raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1221
1222         PRINTK("raid5_make_request, sector %lu\n", new_sector);
1223         sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1224         if (sh) {
1225                 sh->pd_idx = pd_idx;
1226
1227                 add_stripe_bh(sh, bh, dd_idx, rw);
1228
1229                 raid5_plug_device(conf);
1230                 handle_stripe(sh);
1231                 release_stripe(sh);
1232         } else
1233                 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1234         return 0;
1235 }
1236
1237 /*
1238  * Determine correct block size for this device.
1239  */
1240 unsigned int device_bsize (kdev_t dev)
1241 {
1242         unsigned int i, correct_size;
1243
1244         correct_size = BLOCK_SIZE;
1245         if (blksize_size[MAJOR(dev)]) {
1246                 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1247                 if (i)
1248                         correct_size = i;
1249         }
1250
1251         return correct_size;
1252 }
1253
1254 static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
1255 {
1256         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1257         struct stripe_head *sh;
1258         int sectors_per_chunk = conf->chunk_size >> 9;
1259         unsigned long stripe = sector_nr/sectors_per_chunk;
1260         int chunk_offset = sector_nr % sectors_per_chunk;
1261         int dd_idx, pd_idx;
1262         unsigned long first_sector;
1263         int raid_disks = conf->raid_disks;
1264         int data_disks = raid_disks-1;
1265         int redone = 0;
1266         int bufsize;
1267
1268         sh = get_active_stripe(conf, sector_nr, 0, 0);
1269         bufsize = sh->size;
1270         redone = sector_nr - sh->sector;
1271         first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1272                 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1273         sh->pd_idx = pd_idx;
1274         spin_lock(&sh->lock);   
1275         set_bit(STRIPE_SYNCING, &sh->state);
1276         clear_bit(STRIPE_INSYNC, &sh->state);
1277         sh->sync_redone = redone;
1278         spin_unlock(&sh->lock);
1279
1280         handle_stripe(sh);
1281         release_stripe(sh);
1282
	return (bufsize >> 9) - redone;
1284 }
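
/*
 * Worked example (illustrative): chunk_size = 64k gives
 * sectors_per_chunk = 128, so sector_nr = 300 lands in stripe 2
 * (300 / 128) at chunk_offset 44; first_sector is that position mapped
 * back to an array sector.  With sh->size = PAGE_SIZE (4096) the call
 * reports (4096 >> 9) - redone = 8 - redone sectors as completed.
 */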
1285
/*
 * This is our raid5 kernel thread.
 *
 * It drains conf->handle_list of stripes that are ready to be handled.
 * While it runs, newly completed stripes are queued on that list by the
 * interrupt handler, so they do not have to wait for our next wakeup.
 */
1293 static void raid5d (void *data)
1294 {
1295         struct stripe_head *sh;
1296         raid5_conf_t *conf = data;
1297         mddev_t *mddev = conf->mddev;
1298         int handled;
1299
1300         PRINTK("+++ raid5d active\n");
1301
1302         handled = 0;
1303
1304         if (mddev->sb_dirty)
1305                 md_update_sb(mddev);
1306         md_spin_lock_irq(&conf->device_lock);
1307         while (1) {
1308                 struct list_head *first;
1309
1310                 if (list_empty(&conf->handle_list) &&
1311                     atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1312                     !conf->plugged &&
1313                     !list_empty(&conf->delayed_list))
1314                         raid5_activate_delayed(conf);
1315
1316                 if (list_empty(&conf->handle_list))
1317                         break;
1318
1319                 first = conf->handle_list.next;
1320                 sh = list_entry(first, struct stripe_head, lru);
1321
1322                 list_del_init(first);
1323                 atomic_inc(&sh->count);
		if (atomic_read(&sh->count) != 1)
			BUG();
1326                 md_spin_unlock_irq(&conf->device_lock);
1327                 
1328                 handled++;
1329                 handle_stripe(sh);
1330                 release_stripe(sh);
1331
1332                 md_spin_lock_irq(&conf->device_lock);
1333         }
1334         PRINTK("%d stripes handled\n", handled);
1335
1336         md_spin_unlock_irq(&conf->device_lock);
1337
1338         PRINTK("--- raid5d inactive\n");
1339 }
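
/*
 * Wakeup protocol: whoever queues work on conf->handle_list
 * (__release_stripe(), raid5_unplug_device()) also calls
 * md_wakeup_thread(conf->thread); the md thread core then re-enters
 * raid5d(), which drains the list under conf->device_lock and drops
 * the lock around each handle_stripe() call so request completion can
 * proceed concurrently.
 */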
1340
/*
 * Private kernel thread for parity reconstruction after an unclean
 * shutdown.  Reconstruction onto a spare drive after a drive failure
 * is handled by the generic mdsyncd.
 */
1346 static void raid5syncd (void *data)
1347 {
1348         raid5_conf_t *conf = data;
1349         mddev_t *mddev = conf->mddev;
1350
	if (!conf->resync_parity)
		return;
	if (conf->resync_parity == 2)	/* raid5_stop_resync() aborted us */
		return;
1355         down(&mddev->recovery_sem);
	if (md_do_sync(mddev, NULL)) {
1357                 up(&mddev->recovery_sem);
1358                 printk("raid5: resync aborted!\n");
1359                 return;
1360         }
1361         conf->resync_parity = 0;
1362         up(&mddev->recovery_sem);
1363         printk("raid5: resync finished.\n");
1364 }
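
/*
 * conf->resync_parity is a small state machine: 0 -- no parity resync
 * pending, 1 -- raid5syncd should run (or re-run) md_do_sync(), 2 --
 * raid5_stop_resync() requested an abort, so a freshly woken
 * raid5syncd bails out immediately (the second check above).
 */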
1365
1366 static int raid5_run (mddev_t *mddev)
1367 {
1368         raid5_conf_t *conf;
1369         int i, j, raid_disk, memory;
1370         mdp_super_t *sb = mddev->sb;
1371         mdp_disk_t *desc;
1372         mdk_rdev_t *rdev;
1373         struct disk_info *disk;
1374         struct md_list_head *tmp;
1375         int start_recovery = 0;
1376
1377         MOD_INC_USE_COUNT;
1378
	if (sb->level != 5 && sb->level != 4) {
		printk(KERN_ERR "raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1381                 MOD_DEC_USE_COUNT;
1382                 return -EIO;
1383         }
1384
1385         mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1386         if ((conf = mddev->private) == NULL)
1387                 goto abort;
1388         memset (conf, 0, sizeof (*conf));
1389         conf->mddev = mddev;
1390
1391         if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1392                 goto abort;
1393         memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1394
1395         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1396         md_init_waitqueue_head(&conf->wait_for_stripe);
1397         INIT_LIST_HEAD(&conf->handle_list);
1398         INIT_LIST_HEAD(&conf->delayed_list);
1399         INIT_LIST_HEAD(&conf->inactive_list);
1400         atomic_set(&conf->active_stripes, 0);
1401         atomic_set(&conf->preread_active_stripes, 0);
1402         conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
1403
1404         conf->plugged = 0;
1405         conf->plug_tq.sync = 0;
1406         conf->plug_tq.routine = &raid5_unplug_device;
1407         conf->plug_tq.data = conf;
1408
1409         PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1410
1411         ITERATE_RDEV(mddev,rdev,tmp) {
1412                 /*
1413                  * This is important -- we are using the descriptor on
1414                  * the disk only to get a pointer to the descriptor on
1415                  * the main superblock, which might be more recent.
1416                  */
1417                 desc = sb->disks + rdev->desc_nr;
1418                 raid_disk = desc->raid_disk;
1419                 disk = conf->disks + raid_disk;
1420
1421                 if (disk_faulty(desc)) {
1422                         printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1423                         if (!rdev->faulty) {
1424                                 MD_BUG();
1425                                 goto abort;
1426                         }
1427                         disk->number = desc->number;
1428                         disk->raid_disk = raid_disk;
1429                         disk->dev = rdev->dev;
1430
1431                         disk->operational = 0;
1432                         disk->write_only = 0;
1433                         disk->spare = 0;
1434                         disk->used_slot = 1;
1435                         continue;
1436                 }
1437                 if (disk_active(desc)) {
1438                         if (!disk_sync(desc)) {
1439                                 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1440                                 MD_BUG();
1441                                 goto abort;
1442                         }
1443                         if (raid_disk > sb->raid_disks) {
1444                                 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1445                                 continue;
1446                         }
1447                         if (disk->operational) {
1448                                 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1449                                 continue;
1450                         }
1451                         printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1452         
1453                         disk->number = desc->number;
1454                         disk->raid_disk = raid_disk;
1455                         disk->dev = rdev->dev;
1456                         disk->operational = 1;
1457                         disk->used_slot = 1;
1458
1459                         conf->working_disks++;
1460                 } else {
			/*
			 * Must be a spare disk.
			 */
1464                         printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1465                         disk->number = desc->number;
1466                         disk->raid_disk = raid_disk;
1467                         disk->dev = rdev->dev;
1468
1469                         disk->operational = 0;
1470                         disk->write_only = 0;
1471                         disk->spare = 1;
1472                         disk->used_slot = 1;
1473                 }
1474         }
1475
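	/*
	 * Second pass over the superblock: a slot can record a faulty
	 * member whose rdev is gone entirely, so it never showed up in
	 * the ITERATE_RDEV loop above; give such slots a placeholder
	 * disk_info entry (dev = MKDEV(0,0)) so they are accounted for.
	 */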
1476         for (i = 0; i < MD_SB_DISKS; i++) {
1477                 desc = sb->disks + i;
1478                 raid_disk = desc->raid_disk;
1479                 disk = conf->disks + raid_disk;
1480
1481                 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1482                         !conf->disks[raid_disk].used_slot) {
1483
1484                         disk->number = desc->number;
1485                         disk->raid_disk = raid_disk;
1486                         disk->dev = MKDEV(0,0);
1487
1488                         disk->operational = 0;
1489                         disk->write_only = 0;
1490                         disk->spare = 0;
1491                         disk->used_slot = 1;
1492                 }
1493         }
1494
1495         conf->raid_disks = sb->raid_disks;
	/*
	 * failed_disks counts missing members: 0 for a fully
	 * functional array, 1 for a degraded one.
	 */
1499         conf->failed_disks = conf->raid_disks - conf->working_disks;
1500         conf->mddev = mddev;
1501         conf->chunk_size = sb->chunk_size;
1502         conf->level = sb->level;
1503         conf->algorithm = sb->layout;
1504         conf->max_nr_stripes = NR_STRIPES;
1505
1506 #if 0
1507         for (i = 0; i < conf->raid_disks; i++) {
1508                 if (!conf->disks[i].used_slot) {
1509                         MD_BUG();
1510                         goto abort;
1511                 }
1512         }
1513 #endif
1514         if (!conf->chunk_size || conf->chunk_size % 4) {
1515                 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1516                 goto abort;
1517         }
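	/*
	 * The parity layouts defined in <linux/raid/raid5.h> are
	 * ALGORITHM_LEFT_ASYMMETRIC (0), ALGORITHM_RIGHT_ASYMMETRIC (1),
	 * ALGORITHM_LEFT_SYMMETRIC (2) and ALGORITHM_RIGHT_SYMMETRIC (3);
	 * anything beyond that range is rejected below.
	 */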
1518         if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1519                 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1520                 goto abort;
1521         }
1522         if (conf->failed_disks > 1) {
1523                 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1524                 goto abort;
1525         }
1526
1527         if (conf->working_disks != sb->raid_disks) {
1528                 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1529                 start_recovery = 1;
1530         }
1531
1532         {
1533                 const char * name = "raid5d";
1534
1535                 conf->thread = md_register_thread(raid5d, conf, name);
1536                 if (!conf->thread) {
1537                         printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1538                         goto abort;
1539                 }
1540         }
1541
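	/*
	 * Rough worked example (structure sizes are illustrative): with
	 * max_nr_stripes = 256, 8 member disks, PAGE_SIZE = 4096 and a
	 * buffer_head of roughly 100 bytes, memory comes to about
	 * 256 * 8 * 4200 / 1024 ~= 8400kB -- the stripe cache dominates
	 * the driver's memory footprint.
	 */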
	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
		 conf->raid_disks * (sizeof(struct buffer_head) + PAGE_SIZE)) / 1024;
1544         if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1545                 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1546                 shrink_stripes(conf, conf->max_nr_stripes);
1547                 goto abort;
1548         } else
1549                 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1550
1551         /*
1552          * Regenerate the "device is in sync with the raid set" bit for
1553          * each device.
1554          */
1555         for (i = 0; i < MD_SB_DISKS ; i++) {
1556                 mark_disk_nonsync(sb->disks + i);
1557                 for (j = 0; j < sb->raid_disks; j++) {
1558                         if (!conf->disks[j].operational)
1559                                 continue;
1560                         if (sb->disks[i].number == conf->disks[j].number)
1561                                 mark_disk_sync(sb->disks + i);
1562                 }
1563         }
1564         sb->active_disks = conf->working_disks;
1565
	if (sb->active_disks == sb->raid_disks)
		printk(KERN_INFO "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1568         else
1569                 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1570
1571         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1572                 const char * name = "raid5syncd";
1573
1574                 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1575                 if (!conf->resync_thread) {
1576                         printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1577                         goto abort;
1578                 }
1579
1580                 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1581                 conf->resync_parity = 1;
1582                 md_wakeup_thread(conf->resync_thread);
1583         }
1584
1585         print_raid5_conf(conf);
1586         if (start_recovery)
1587                 md_recover_arrays();
1588         print_raid5_conf(conf);
1589
1590         /* Ok, everything is just fine now */
	return 0;
1592 abort:
1593         if (conf) {
1594                 print_raid5_conf(conf);
1595                 if (conf->stripe_hashtbl)
1596                         free_pages((unsigned long) conf->stripe_hashtbl,
1597                                                         HASH_PAGES_ORDER);
1598                 kfree(conf);
1599         }
1600         mddev->private = NULL;
1601         printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1602         MOD_DEC_USE_COUNT;
1603         return -EIO;
1604 }
1605
1606 static int raid5_stop_resync (mddev_t *mddev)
1607 {
1608         raid5_conf_t *conf = mddev_to_conf(mddev);
1609         mdk_thread_t *thread = conf->resync_thread;
1610
1611         if (thread) {
1612                 if (conf->resync_parity) {
1613                         conf->resync_parity = 2;
1614                         md_interrupt_thread(thread);
1615                         printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1616                         return 1;
1617                 }
1618                 return 0;
1619         }
1620         return 0;
1621 }
1622
1623 static int raid5_restart_resync (mddev_t *mddev)
1624 {
1625         raid5_conf_t *conf = mddev_to_conf(mddev);
1626
1627         if (conf->resync_parity) {
1628                 if (!conf->resync_thread) {
1629                         MD_BUG();
1630                         return 0;
1631                 }
		printk("raid5: waking up raid5syncd.\n");
1633                 conf->resync_parity = 1;
1634                 md_wakeup_thread(conf->resync_thread);
1635                 return 1;
1636         } else
1637                 printk("raid5: no restart-resync needed.\n");
1638         return 0;
1639 }
1640
1641
1642 static int raid5_stop (mddev_t *mddev)
1643 {
1644         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1645
1646         if (conf->resync_thread)
1647                 md_unregister_thread(conf->resync_thread);
1648         md_unregister_thread(conf->thread);
1649         shrink_stripes(conf, conf->max_nr_stripes);
1650         free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1651         kfree(conf);
1652         mddev->private = NULL;
1653         MOD_DEC_USE_COUNT;
1654         return 0;
1655 }
1656
1657 #if RAID5_DEBUG
1658 static void print_sh (struct stripe_head *sh)
1659 {
1660         int i;
1661
1662         printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
1663         printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
1664         printk("sh %lu, ", sh->sector);
1665         for (i = 0; i < MD_SB_DISKS; i++) {
1666                 if (sh->bh_cache[i])
1667                         printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
1668         }
1669         printk("\n");
1670 }
1671
1672 static void printall (raid5_conf_t *conf)
1673 {
1674         struct stripe_head *sh;
1675         int i;
1676
1677         md_spin_lock_irq(&conf->device_lock);
1678         for (i = 0; i < NR_HASH; i++) {
1679                 sh = conf->stripe_hashtbl[i];
1680                 for (; sh; sh = sh->hash_next) {
1681                         if (sh->raid_conf != conf)
1682                                 continue;
1683                         print_sh(sh);
1684                 }
1685         }
1686         md_spin_unlock_irq(&conf->device_lock);
1687
	PRINTK("--- printall done\n");
1689 }
1690 #endif
1691
1692 static void raid5_status (struct seq_file *seq, mddev_t *mddev)
1693 {
1694         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1695         mdp_super_t *sb = mddev->sb;
1696         int i;
1697
1698         seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1699         seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1700         for (i = 0; i < conf->raid_disks; i++)
1701                 seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
1702         seq_printf (seq, "]");
1703 #if RAID5_DEBUG
1704 #define D(x) \
1705         seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
1706         printall(conf);
1707 #endif
1708
1709 }
1710
1711 static void print_raid5_conf (raid5_conf_t *conf)
1712 {
1713         int i;
1714         struct disk_info *tmp;
1715
1716         printk("RAID5 conf printout:\n");
1717         if (!conf) {
1718                 printk("(conf==NULL)\n");
1719                 return;
1720         }
1721         printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1722                  conf->working_disks, conf->failed_disks);
1723
1724 #if RAID5_DEBUG
1725         for (i = 0; i < MD_SB_DISKS; i++) {
1726 #else
1727         for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1728 #endif
1729                 tmp = conf->disks + i;
		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
			i, tmp->spare, tmp->operational,
			tmp->number, tmp->raid_disk, tmp->used_slot,
			partition_name(tmp->dev));
1734         }
1735 }
1736
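/*
 * raid5_diskop() is md's entry point for array reconfiguration: *d
 * points at the superblock descriptor being operated on and may be
 * redirected (DISKOP_SPARE_ACTIVE hands back the now-failed
 * descriptor).  The first switch below only locates the relevant
 * slots; the second one actually performs the operation.
 */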
1737 static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
1738 {
1739         int err = 0;
1740         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
1741         raid5_conf_t *conf = mddev->private;
1742         struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
1743         mdp_super_t *sb = mddev->sb;
1744         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
1745         mdk_rdev_t *spare_rdev, *failed_rdev;
1746
1747         print_raid5_conf(conf);
1748         md_spin_lock_irq(&conf->device_lock);
1749         /*
1750          * find the disk ...
1751          */
1752         switch (state) {
1753
1754         case DISKOP_SPARE_ACTIVE:
1755
1756                 /*
1757                  * Find the failed disk within the RAID5 configuration ...
1758                  * (this can only be in the first conf->raid_disks part)
1759                  */
1760                 for (i = 0; i < conf->raid_disks; i++) {
1761                         tmp = conf->disks + i;
1762                         if ((!tmp->operational && !tmp->spare) ||
1763                                         !tmp->used_slot) {
1764                                 failed_disk = i;
1765                                 break;
1766                         }
1767                 }
1768                 /*
1769                  * When we activate a spare disk we _must_ have a disk in
1770                  * the lower (active) part of the array to replace.
1771                  */
1772                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
1773                         MD_BUG();
1774                         err = 1;
1775                         goto abort;
1776                 }
1777                 /* fall through */
1778
1779         case DISKOP_SPARE_WRITE:
1780         case DISKOP_SPARE_INACTIVE:
1781
1782                 /*
1783                  * Find the spare disk ... (can only be in the 'high'
1784                  * area of the array)
1785                  */
1786                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1787                         tmp = conf->disks + i;
1788                         if (tmp->spare && tmp->number == (*d)->number) {
1789                                 spare_disk = i;
1790                                 break;
1791                         }
1792                 }
1793                 if (spare_disk == -1) {
1794                         MD_BUG();
1795                         err = 1;
1796                         goto abort;
1797                 }
1798                 break;
1799
1800         case DISKOP_HOT_REMOVE_DISK:
1801
1802                 for (i = 0; i < MD_SB_DISKS; i++) {
1803                         tmp = conf->disks + i;
1804                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
1805                                 if (tmp->operational) {
1806                                         err = -EBUSY;
1807                                         goto abort;
1808                                 }
1809                                 removed_disk = i;
1810                                 break;
1811                         }
1812                 }
1813                 if (removed_disk == -1) {
1814                         MD_BUG();
1815                         err = 1;
1816                         goto abort;
1817                 }
1818                 break;
1819
1820         case DISKOP_HOT_ADD_DISK:
1821
1822                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1823                         tmp = conf->disks + i;
1824                         if (!tmp->used_slot) {
1825                                 added_disk = i;
1826                                 break;
1827                         }
1828                 }
1829                 if (added_disk == -1) {
1830                         MD_BUG();
1831                         err = 1;
1832                         goto abort;
1833                 }
1834                 break;
1835         }
1836
1837         switch (state) {
1838         /*
1839          * Switch the spare disk to write-only mode:
1840          */
1841         case DISKOP_SPARE_WRITE:
1842                 if (conf->spare) {
1843                         MD_BUG();
1844                         err = 1;
1845                         goto abort;
1846                 }
1847                 sdisk = conf->disks + spare_disk;
1848                 sdisk->operational = 1;
1849                 sdisk->write_only = 1;
1850                 conf->spare = sdisk;
1851                 break;
1852         /*
1853          * Deactivate a spare disk:
1854          */
1855         case DISKOP_SPARE_INACTIVE:
1856                 sdisk = conf->disks + spare_disk;
1857                 sdisk->operational = 0;
1858                 sdisk->write_only = 0;
1859                 /*
1860                  * Was the spare being resynced?
1861                  */
1862                 if (conf->spare == sdisk)
1863                         conf->spare = NULL;
1864                 break;
	/*
	 * Activate (mark read-write) the now in-sync spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk.  (Only the first 'conf->raid_disks'
	 * slots are used for 'real' disks, and we must preserve this
	 * property.)
	 */
1872         case DISKOP_SPARE_ACTIVE:
1873                 if (!conf->spare) {
1874                         MD_BUG();
1875                         err = 1;
1876                         goto abort;
1877                 }
1878                 sdisk = conf->disks + spare_disk;
1879                 fdisk = conf->disks + failed_disk;
1880
1881                 spare_desc = &sb->disks[sdisk->number];
1882                 failed_desc = &sb->disks[fdisk->number];
1883
1884                 if (spare_desc != *d) {
1885                         MD_BUG();
1886                         err = 1;
1887                         goto abort;
1888                 }
1889
1890                 if (spare_desc->raid_disk != sdisk->raid_disk) {
1891                         MD_BUG();
1892                         err = 1;
1893                         goto abort;
1894                 }
1895                         
1896                 if (sdisk->raid_disk != spare_disk) {
1897                         MD_BUG();
1898                         err = 1;
1899                         goto abort;
1900                 }
1901
1902                 if (failed_desc->raid_disk != fdisk->raid_disk) {
1903                         MD_BUG();
1904                         err = 1;
1905                         goto abort;
1906                 }
1907
1908                 if (fdisk->raid_disk != failed_disk) {
1909                         MD_BUG();
1910                         err = 1;
1911                         goto abort;
1912                 }
1913
1914                 /*
1915                  * do the switch finally
1916                  */
1917                 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1918                 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1919
1920                 /* There must be a spare_rdev, but there may not be a
1921                  * failed_rdev.  That slot might be empty...
1922                  */
1923                 spare_rdev->desc_nr = failed_desc->number;
1924                 if (failed_rdev)
1925                         failed_rdev->desc_nr = spare_desc->number;
1926                 
1927                 xchg_values(*spare_desc, *failed_desc);
1928                 xchg_values(*fdisk, *sdisk);
1929
1930                 /*
1931                  * (careful, 'failed' and 'spare' are switched from now on)
1932                  *
1933                  * we want to preserve linear numbering and we want to
1934                  * give the proper raid_disk number to the now activated
1935                  * disk. (this means we switch back these values)
1936                  */
1937         
1938                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1939                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1940                 xchg_values(spare_desc->number, failed_desc->number);
1941                 xchg_values(sdisk->number, fdisk->number);
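
		/*
		 * xchg_values() is the md swap helper; it effectively
		 * does { tmp = x; x = y; y = tmp; }.  The first pair of
		 * swaps above exchanged the whole descriptors/slots;
		 * this second pair swapped the identity fields back so
		 * the numbering stays linear.
		 */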
1942
1943                 *d = failed_desc;
1944
1945                 if (sdisk->dev == MKDEV(0,0))
1946                         sdisk->used_slot = 0;
1947
1948                 /*
1949                  * this really activates the spare.
1950                  */
1951                 fdisk->spare = 0;
1952                 fdisk->write_only = 0;
1953
1954                 /*
1955                  * if we activate a spare, we definitely replace a
1956                  * non-operational disk slot in the 'low' area of
1957                  * the disk array.
1958                  */
1959                 conf->failed_disks--;
1960                 conf->working_disks++;
1961                 conf->spare = NULL;
1962
1963                 break;
1964
1965         case DISKOP_HOT_REMOVE_DISK:
1966                 rdisk = conf->disks + removed_disk;
1967
1968                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1969                         MD_BUG();       
1970                         err = 1;
1971                         goto abort;
1972                 }
1973                 rdisk->dev = MKDEV(0,0);
1974                 rdisk->used_slot = 0;
1975
1976                 break;
1977
1978         case DISKOP_HOT_ADD_DISK:
1979                 adisk = conf->disks + added_disk;
1980                 added_desc = *d;
1981
1982                 if (added_disk != added_desc->number) {
1983                         MD_BUG();       
1984                         err = 1;
1985                         goto abort;
1986                 }
1987
1988                 adisk->number = added_desc->number;
1989                 adisk->raid_disk = added_desc->raid_disk;
1990                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1991
1992                 adisk->operational = 0;
1993                 adisk->write_only = 0;
1994                 adisk->spare = 1;
1995                 adisk->used_slot = 1;
1996
1997
1998                 break;
1999
2000         default:
2001                 MD_BUG();       
2002                 err = 1;
2003                 goto abort;
2004         }
2005 abort:
2006         md_spin_unlock_irq(&conf->device_lock);
2007         print_raid5_conf(conf);
2008         return err;
2009 }
2010
2011 static mdk_personality_t raid5_personality=
2012 {
2013         name:           "raid5",
2014         make_request:   raid5_make_request,
2015         run:            raid5_run,
2016         stop:           raid5_stop,
2017         status:         raid5_status,
2018         error_handler:  raid5_error,
2019         diskop:         raid5_diskop,
2020         stop_resync:    raid5_stop_resync,
2021         restart_resync: raid5_restart_resync,
2022         sync_request:   raid5_sync_request
2023 };
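
/*
 * The table above uses old GNU-style designated initialisers
 * ("field: value"), the prevailing 2.4 idiom.  raid5_init() below
 * registers it so md.c can dispatch array operations to the handlers
 * defined in this file.
 */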
2024
2025 static int md__init raid5_init (void)
2026 {
2027         return register_md_personality (RAID5, &raid5_personality);
2028 }
2029
2030 static void raid5_exit (void)
2031 {
2032         unregister_md_personality (RAID5);
2033 }
2034
2035 module_init(raid5_init);
2036 module_exit(raid5_exit);
2037 MODULE_LICENSE("GPL");