2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
6 * RAID-5 management functions.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
13 * You should have received a copy of the GNU General Public License
14 * (for example /usr/src/linux/COPYING); if not, write to the Free
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 #include <linux/config.h>
20 #include <linux/module.h>
21 #include <linux/locks.h>
22 #include <linux/slab.h>
23 #include <linux/raid/raid5.h>
24 #include <asm/bitops.h>
25 #include <asm/atomic.h>
/* Personality descriptor for this driver; only declared at this point. */
27 static mdk_personality_t raid5_personality;
/* NR_STRIPES: three quarters of this value is the active-stripe level
 * below which waiters on conf->wait_for_stripe are woken. */
33 #define NR_STRIPES 256
/* IO_THRESHOLD: compared against conf->preread_active_stripes before the
 * raid5 thread is woken or delayed stripes are activated. */
34 #define IO_THRESHOLD 1
/* HASH_PAGES_ORDER: page order passed when the stripe hash table is
 * allocated. */
36 #define HASH_PAGES_ORDER 0
/* NR_HASH: number of stripe_head pointers in the hash table.
 * NOTE(review): HASH_PAGES is not defined in the visible lines - confirm. */
37 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
/* HASH_MASK: used by stripe_hash() to reduce an index to a bucket number. */
38 #define HASH_MASK (NR_HASH - 1)
/* stripe_hash: hash bucket for a sector; the index is the sector divided by
 * (buffer_size >> 9), masked with HASH_MASK. */
39 #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
42 * The following can be used to debug the driver
/* Debug helpers.
 * CHECK_DEVLOCK() has two definitions: with RAID5_PARANOIA and CONFIG_SMP it
 * calls BUG() when conf->device_lock is not held (it relies on a variable
 * named `conf` being in scope at the call site); the other definition is
 * empty.
 * PRINTK() likewise has two definitions, one forwarding to printk() and one
 * that compiles to nothing.
 * NOTE(review): the preprocessor lines that choose between the paired
 * definitions are not visible in this listing - confirm against the file. */
45 #define RAID5_PARANOIA 1
46 #if RAID5_PARANOIA && CONFIG_SMP
47 # define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
49 # define CHECK_DEVLOCK()
53 #define PRINTK(x...) printk(x)
57 #define PRINTK(x...) do { } while (0)
/* Forward declaration; the definition is not in the visible lines. */
60 static void print_raid5_conf (raid5_conf_t *conf);
/* __release_stripe - drop one reference to a stripe.
 * @conf: array configuration that owns the lists and counters.
 * @sh:   stripe whose reference count is decremented.
 *
 * Nothing further happens unless sh->count drops to zero.  At zero, the
 * visible code tests that sh->lru is empty and that conf->active_stripes is
 * non-zero.  A stripe with STRIPE_HANDLE set is queued on conf->delayed_list
 * or conf->handle_list (the STRIPE_DELAYED test sits beside both calls) and
 * the raid5 thread is woken.  The remaining visible statements clear
 * STRIPE_PREREAD_ACTIVE (decrementing conf->preread_active_stripes and
 * waking the thread when that count is below IO_THRESHOLD), put the stripe
 * on conf->inactive_list, decrement conf->active_stripes and wake
 * conf->wait_for_stripe when inactive_blocked is clear or fewer than
 * NR_STRIPES*3/4 stripes are active.
 *
 * The callers visible in this file take conf->device_lock around the call.
 * NOTE(review): the else branches and the statements guarded by the two
 * sanity tests are not visible in this listing - confirm the branch shape. */
62 static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
64 if (atomic_dec_and_test(&sh->count)) {
65 if (!list_empty(&sh->lru))
67 if (atomic_read(&conf->active_stripes)==0)
69 if (test_bit(STRIPE_HANDLE, &sh->state)) {
70 if (test_bit(STRIPE_DELAYED, &sh->state))
71 list_add_tail(&sh->lru, &conf->delayed_list);
73 list_add_tail(&sh->lru, &conf->handle_list);
74 md_wakeup_thread(conf->thread);
76 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
77 atomic_dec(&conf->preread_active_stripes);
78 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
79 md_wakeup_thread(conf->thread);
81 list_add_tail(&sh->lru, &conf->inactive_list);
82 atomic_dec(&conf->active_stripes);
83 if (!conf->inactive_blocked ||
84 atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
85 wake_up(&conf->wait_for_stripe);
/* release_stripe - drop one reference to a stripe.
 * @sh: stripe to release; its configuration is taken from sh->raid_conf.
 *
 * Wrapper that calls __release_stripe() with conf->device_lock held and
 * interrupts saved/restored around the call. */
89 static void release_stripe(struct stripe_head *sh)
91 raid5_conf_t *conf = sh->raid_conf;
94 spin_lock_irqsave(&conf->device_lock, flags);
95 __release_stripe(conf, sh);
96 spin_unlock_irqrestore(&conf->device_lock, flags);
/* remove_hash - unlink a stripe from its hash chain.
 * @sh: stripe to unhash.
 *
 * Does nothing when sh->hash_pprev is NULL (stripe not hashed).  Otherwise
 * the successor's back pointer and the predecessor's forward pointer are
 * rewired past sh and sh->hash_pprev is reset to NULL.
 * NOTE(review): the test guarding the hash_next dereference is not visible
 * in this listing - confirm it exists. */
99 static void remove_hash(struct stripe_head *sh)
101 PRINTK("remove_hash(), stripe %lu\n", sh->sector);
103 if (sh->hash_pprev) {
105 sh->hash_next->hash_pprev = sh->hash_pprev;
106 *sh->hash_pprev = sh->hash_next;
107 sh->hash_pprev = NULL;
/* insert_hash - add a stripe at the head of its hash bucket.
 * @conf: array configuration holding the hash table.
 * @sh:   stripe to insert; the bucket is chosen from sh->sector.
 *
 * The previous bucket head (if any) becomes sh->hash_next and has its back
 * pointer redirected to sh; sh->hash_pprev is set to the bucket slot. */
111 static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
113 struct stripe_head **shp = &stripe_hash(conf, sh->sector);
115 PRINTK("insert_hash(), stripe %lu\n",sh->sector);
118 if ((sh->hash_next = *shp) != NULL)
119 (*shp)->hash_pprev = &sh->hash_next;
121 sh->hash_pprev = shp;
125 /* find an idle stripe, make sure it is unhashed, and return it. */
/* get_free_stripe - take the first stripe off the inactive list.
 * @conf: array configuration.
 *
 * The first entry of conf->inactive_list is detached with list_del_init()
 * and conf->active_stripes is incremented.  The result variable starts out
 * as NULL and the list is tested for emptiness first.
 * NOTE(review): the empty-list exit, the unhash step and the return
 * statement are not visible in this listing - confirm against the file. */
126 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
128 struct stripe_head *sh = NULL;
129 struct list_head *first;
132 if (list_empty(&conf->inactive_list))
134 first = conf->inactive_list.next;
135 sh = list_entry(first, struct stripe_head, lru);
136 list_del_init(first);
138 atomic_inc(&conf->active_stripes);
/* shrink_buffers - release the cache buffers of a stripe.
 * @sh:  stripe whose bh_cache[] entries are released.
 * @num: number of entries to walk, starting at index 0.
 *
 * Each visited slot is cleared to NULL and the data page behind the buffer
 * head is freed with free_page().
 * NOTE(review): any NULL-slot check and the freeing of the buffer head
 * itself are not visible in this listing - confirm. */
143 static void shrink_buffers(struct stripe_head *sh, int num)
145 struct buffer_head *bh;
148 for (i=0; i<num ; i++) {
149 bh = sh->bh_cache[i];
152 sh->bh_cache[i] = NULL;
153 free_page((unsigned long) bh->b_data);
/* grow_buffers - allocate the cache buffers of a stripe.
 * @sh:       stripe whose bh_cache[] is filled.
 * @num:      number of buffer heads to allocate.
 * @b_size:   not used by any visible line.
 * @priority: allocation flags passed to kmalloc() and alloc_page().
 *
 * Each buffer head is allocated, zeroed, given an initialised wait queue
 * and one page of data, has b_count set to 0 and is stored in
 * sh->bh_cache[i].
 * NOTE(review): the failure paths and return statements are not visible in
 * this listing; grow_stripes() treats a non-zero result as failure. */
158 static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
160 struct buffer_head *bh;
163 for (i=0; i<num; i++) {
165 bh = kmalloc(sizeof(struct buffer_head), priority);
168 memset(bh, 0, sizeof (struct buffer_head));
169 init_waitqueue_head(&bh->b_wait);
170 if ((page = alloc_page(priority)))
171 bh->b_data = page_address(page);
176 atomic_set(&bh->b_count, 0);
178 sh->bh_cache[i] = bh;
/* Forward declaration: init_stripe() below calls this before its definition. */
184 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
/* init_stripe - prepare a stripe for reuse and hash it.
 * @sh:     stripe to initialise.
 * @sector: sector the stripe is being set up for.
 *
 * The stripe is first checked for a non-zero reference count and for
 * STRIPE_HANDLE being set.  sh->size is taken from conf->buffer_size.  For
 * every disk, a diagnostic line is printed if any read/write/written request
 * is still attached or the cache buffer is locked; then BH_Uptodate is
 * cleared on the cache buffer and raid5_build_block() re-initialises it.
 * Finally the stripe is inserted into the hash table.
 * NOTE(review): the reaction to the sanity checks and the assignments of
 * sh->sector / sh->state are not visible in this listing - confirm. */
186 static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
188 raid5_conf_t *conf = sh->raid_conf;
189 int disks = conf->raid_disks, i;
191 if (atomic_read(&sh->count) != 0)
193 if (test_bit(STRIPE_HANDLE, &sh->state))
197 PRINTK("init_stripe called, stripe %lu\n", sh->sector);
202 sh->size = conf->buffer_size;
205 for (i=disks; i--; ) {
206 if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
207 buffer_locked(sh->bh_cache[i])) {
208 printk("sector=%lx i=%d %p %p %p %d\n",
209 sh->sector, i, sh->bh_read[i],
210 sh->bh_write[i], sh->bh_written[i],
211 buffer_locked(sh->bh_cache[i]));
214 clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
215 raid5_build_block(sh, i);
217 insert_hash(conf, sh);
220 /* the buffer size has changed, so unhash all stripes
221 * as active stripes complete, they will go onto inactive list
/* shrink_stripe_cache - empty every bucket of the stripe hash table.
 * @conf: array configuration.
 *
 * conf->active_stripes is tested first, then each of the NR_HASH buckets is
 * drained by looping while the bucket still has a head entry.
 * NOTE(review): the reaction to a non-zero active count and the body of the
 * drain loop are not visible in this listing - confirm against the file. */
223 static void shrink_stripe_cache(raid5_conf_t *conf)
227 if (atomic_read(&conf->active_stripes))
229 for (i=0; i < NR_HASH; i++) {
230 struct stripe_head *sh;
231 while ((sh = conf->stripe_hashtbl[i]))
/* __find_stripe - look a stripe up in the hash table.
 * @conf:   array configuration.
 * @sector: sector to search for.
 *
 * Walks the hash chain selected by stripe_hash(conf, sector), comparing each
 * entry's sh->sector with @sector.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the match is returned, or NULL after the "not in cache"
 * message - confirm. */
236 static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
238 struct stripe_head *sh;
241 PRINTK("__find_stripe, sector %lu\n", sector);
242 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
243 if (sh->sector == sector)
245 PRINTK("__stripe %lu not in cache\n", sector);
/* get_active_stripe - obtain a referenced stripe for a sector.
 * @conf:    array configuration.
 * @sector:  sector wanted; it is rounded down to a multiple of
 *           (conf->buffer_size >> 9) before the lookup.
 * @size:    requested buffer size, or 0 to accept whatever is current.
 * @noblock: tested together with "no free stripe found".
 *
 * Runs with conf->device_lock taken via md_spin_lock_irq().
 * Step 1, buffer size: when conf->buffer_size is 0, or @size is non-zero and
 * differs from it, the caller waits on conf->wait_for_stripe.  While the
 * size still differs and stripes are active, buffer_size is set to 0 and the
 * caller waits until no stripes are active or buffer_size becomes non-zero.
 * If the size still differs, the switch is logged, shrink_stripe_cache() is
 * called and conf->buffer_size is set to @size.
 * Step 2, lookup: __find_stripe() is tried first.  get_free_stripe() is
 * called when conf->inactive_blocked is clear.  The other visible path sets
 * inactive_blocked, waits until the inactive list is non-empty and either
 * fewer than NR_STRIPES*3/4 stripes are active or inactive_blocked was
 * cleared, then clears inactive_blocked; init_stripe() is called for a
 * freshly obtained stripe.  The visible lines for a stripe found in the
 * hash test its count, lru membership and STRIPE_HANDLE bit, increment
 * conf->active_stripes and remove it from its lru list.
 * The loop repeats while sh is NULL; sh->count is then incremented.
 * NOTE(review): several branch lines (else arms, the noblock exit, the
 * return) are not visible in this listing - confirm against the file. */
249 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
251 struct stripe_head *sh;
253 PRINTK("get_stripe, sector %lu\n", sector);
255 md_spin_lock_irq(&conf->device_lock);
258 if (conf->buffer_size == 0 ||
259 (size && size != conf->buffer_size)) {
260 /* either the size is being changed (buffer_size==0) or
261 * we need to change it.
262 * If size==0, we can proceed as soon as buffer_size gets set.
263 * If size>0, we can proceed when active_stripes reaches 0, or
264 * when someone else sets the buffer_size to size.
265 * If someone sets the buffer size to something else, we will need to
266 * assert that we want to change it again
268 int oldsize = conf->buffer_size;
269 PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
271 wait_event_lock_irq(conf->wait_for_stripe,
275 while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
276 conf->buffer_size = 0;
277 wait_event_lock_irq(conf->wait_for_stripe,
278 atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
280 PRINTK("waited and now %ld/%d buffer_size is %d - %d active\n", sector, size,
281 conf->buffer_size, atomic_read(&conf->active_stripes));
284 if (conf->buffer_size != size) {
285 printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
286 shrink_stripe_cache(conf);
288 conf->buffer_size = size;
289 PRINTK("size now %d\n", conf->buffer_size);
294 sector -= sector & ((conf->buffer_size>>9)-1);
296 sh = __find_stripe(conf, sector);
298 if (!conf->inactive_blocked)
299 sh = get_free_stripe(conf);
300 if (noblock && sh == NULL)
303 conf->inactive_blocked = 1;
304 wait_event_lock_irq(conf->wait_for_stripe,
305 !list_empty(&conf->inactive_list) &&
306 (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
307 || !conf->inactive_blocked),
309 conf->inactive_blocked = 0;
311 init_stripe(sh, sector);
313 if (atomic_read(&sh->count)) {
314 if (!list_empty(&sh->lru))
317 if (!test_bit(STRIPE_HANDLE, &sh->state))
318 atomic_inc(&conf->active_stripes);
319 if (list_empty(&sh->lru))
321 list_del_init(&sh->lru);
324 } while (sh == NULL);
327 atomic_inc(&sh->count);
329 md_spin_unlock_irq(&conf->device_lock);
/* grow_stripes - allocate stripe heads for the cache.
 * @conf:     array configuration the new stripes belong to.
 * @num:      number of stripes requested.
 * @priority: allocation flags for kmalloc() and grow_buffers().
 *
 * A stripe head is allocated, zeroed, linked to @conf and given an unlocked
 * spinlock.  grow_buffers() then allocates conf->raid_disks cache buffers of
 * PAGE_SIZE; on failure the partial buffers are released again with
 * shrink_buffers().  A new stripe is accounted as active (count set to 1,
 * conf->active_stripes incremented) and its lru head is initialised.
 * NOTE(review): the loop over @num, the error returns and the release that
 * moves the stripe onto the inactive list are not visible in this listing -
 * confirm against the file. */
333 static int grow_stripes(raid5_conf_t *conf, int num, int priority)
335 struct stripe_head *sh;
338 sh = kmalloc(sizeof(struct stripe_head), priority);
341 memset(sh, 0, sizeof(*sh));
342 sh->raid_conf = conf;
343 sh->lock = SPIN_LOCK_UNLOCKED;
345 if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
346 shrink_buffers(sh, conf->raid_disks);
350 /* we just created an active stripe so... */
351 atomic_set(&sh->count, 1);
352 atomic_inc(&conf->active_stripes);
353 INIT_LIST_HEAD(&sh->lru);
/* shrink_stripes - release stripe heads from the cache.
 * @conf: array configuration.
 * @num:  number of stripes to release.
 *
 * A stripe is taken with get_free_stripe() while conf->device_lock is held,
 * its reference count is checked, its cache buffers are released with
 * shrink_buffers() and conf->active_stripes is decremented (undoing the
 * increment done by get_free_stripe()).
 * NOTE(review): the loop over @num, the NULL check and the freeing of the
 * stripe head itself are not visible in this listing - confirm. */
359 static void shrink_stripes(raid5_conf_t *conf, int num)
361 struct stripe_head *sh;
364 spin_lock_irq(&conf->device_lock);
365 sh = get_free_stripe(conf);
366 spin_unlock_irq(&conf->device_lock);
369 if (atomic_read(&sh->count))
371 shrink_buffers(sh, conf->raid_disks);
373 atomic_dec(&conf->active_stripes);
/* raid5_end_read_request - completion handler for a stripe-cache read.
 * @bh:       cache buffer head whose I/O finished; bh->b_private is the
 *            owning stripe.
 * @uptodate: completion status supplied by the block layer.
 *
 * The disk index is found by matching @bh against sh->bh_cache[].  Under
 * conf->device_lock the first queued read request for that disk is examined;
 * it is unlinked from sh->bh_read[i] when its page is not in high memory or
 * is the same page the cache buffer currently points at.  BH_Uptodate is
 * set on the cache buffer when no page was swapped in (sh->bh_page[i] is
 * NULL).  An unlinked request is completed with b_end_io(buffer, 1), after a
 * memcpy() from the cache buffer when the pages differ.  The other visible
 * path reports the device through md_error() and clears BH_Uptodate.
 * Before the buffer is unlocked, a swapped page is restored from
 * sh->bh_page[i] (which also clears BH_Uptodate); then BH_Lock is cleared
 * and STRIPE_HANDLE is set on the stripe.
 * NOTE(review): the tests on @uptodate, several else arms and the final
 * stripe release are not visible in this listing - confirm. */
378 static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
380 struct stripe_head *sh = bh->b_private;
381 raid5_conf_t *conf = sh->raid_conf;
382 int disks = conf->raid_disks, i;
385 for (i=0 ; i<disks; i++)
386 if (bh == sh->bh_cache[i])
389 PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
396 struct buffer_head *buffer;
397 spin_lock_irqsave(&conf->device_lock, flags);
398 /* we can return a buffer if we bypassed the cache or
399 * if the top buffer is not in highmem. If there are
400 * multiple buffers, leave the extra work to
403 buffer = sh->bh_read[i];
405 (!PageHighMem(buffer->b_page)
406 || buffer->b_page == bh->b_page )
408 sh->bh_read[i] = buffer->b_reqnext;
409 buffer->b_reqnext = NULL;
412 spin_unlock_irqrestore(&conf->device_lock, flags);
413 if (sh->bh_page[i]==NULL)
414 set_bit(BH_Uptodate, &bh->b_state);
416 if (buffer->b_page != bh->b_page)
417 memcpy(buffer->b_data, bh->b_data, bh->b_size);
418 buffer->b_end_io(buffer, 1);
421 md_error(conf->mddev, bh->b_dev);
422 clear_bit(BH_Uptodate, &bh->b_state);
424 /* must restore b_page before unlocking buffer... */
425 if (sh->bh_page[i]) {
426 bh->b_page = sh->bh_page[i];
427 bh->b_data = page_address(bh->b_page);
428 sh->bh_page[i] = NULL;
429 clear_bit(BH_Uptodate, &bh->b_state);
431 clear_bit(BH_Lock, &bh->b_state);
432 set_bit(STRIPE_HANDLE, &sh->state);
/* raid5_end_write_request - completion handler for a stripe-cache write.
 * @bh:       cache buffer head whose I/O finished; bh->b_private is the
 *            owning stripe.
 * @uptodate: completion status supplied by the block layer.
 *
 * The disk index is found by matching @bh against sh->bh_cache[].  With
 * conf->device_lock held, the device may be reported through md_error(),
 * BH_Lock is cleared on the buffer, STRIPE_HANDLE is set on the stripe and
 * one stripe reference is dropped via __release_stripe().
 * NOTE(review): the condition guarding md_error() is not visible in this
 * listing; presumably it is a failed completion - confirm. */
436 static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
438 struct stripe_head *sh = bh->b_private;
439 raid5_conf_t *conf = sh->raid_conf;
440 int disks = conf->raid_disks, i;
443 for (i=0 ; i<disks; i++)
444 if (bh == sh->bh_cache[i])
447 PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
453 md_spin_lock_irqsave(&conf->device_lock, flags);
455 md_error(conf->mddev, bh->b_dev);
456 clear_bit(BH_Lock, &bh->b_state);
457 set_bit(STRIPE_HANDLE, &sh->state);
458 __release_stripe(conf, sh);
459 md_spin_unlock_irqrestore(&conf->device_lock, flags);
/* raid5_build_block - (re)initialise one cache buffer head of a stripe.
 * @sh: stripe owning the buffer.
 * @i:  disk index within the stripe.
 *
 * Sets up sh->bh_cache[i] with raid5_end_read_request as completion handler
 * and @sh as private data, the device of disk @i, a block number of
 * sh->sector / (sh->size >> 9), state bits BH_Req and BH_Mapped only, a size
 * of sh->size and list type BUF_LOCKED.
 * NOTE(review): the return statement is not visible in this listing;
 * presumably the buffer head is returned - confirm. */
464 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
466 raid5_conf_t *conf = sh->raid_conf;
467 struct buffer_head *bh = sh->bh_cache[i];
468 unsigned long block = sh->sector / (sh->size >> 9);
470 init_buffer(bh, raid5_end_read_request, sh);
471 bh->b_dev = conf->disks[i].dev;
472 bh->b_blocknr = block;
474 bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
475 bh->b_size = sh->size;
476 bh->b_list = BUF_LOCKED;
/* raid5_error - record the failure of a member device.
 * @mddev: the md device; its private data is the raid5 configuration.
 * @dev:   the device that failed.
 *
 * Active disks: conf->disks[0..raid_disks) is searched for @dev.  If the
 * matching disk is still operational it is marked non-operational, its
 * superblock descriptor is marked faulty, non-sync and inactive,
 * working_disks is decremented, failed_disks is incremented, the raid5
 * thread is woken and a "Disk failure" message is printed.
 * Spare: a second match against @dev prints a spare-failure message; the
 * disk is marked non-operational and not write-only, its superblock
 * descriptor is marked faulty, non-sync and inactive, and the raid5 thread
 * is woken.
 * NOTE(review): the return statements, the selection of the spare disk and
 * the body of the "!conf->spare->operational" case are not visible in this
 * listing - confirm against the file. */
480 static int raid5_error (mddev_t *mddev, kdev_t dev)
482 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
483 mdp_super_t *sb = mddev->sb;
484 struct disk_info *disk;
487 PRINTK("raid5_error called\n");
489 for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
490 if (disk->dev == dev) {
491 if (disk->operational) {
492 disk->operational = 0;
493 mark_disk_faulty(sb->disks+disk->number);
494 mark_disk_nonsync(sb->disks+disk->number);
495 mark_disk_inactive(sb->disks+disk->number);
500 conf->working_disks--;
501 conf->failed_disks++;
502 md_wakeup_thread(conf->thread);
504 "raid5: Disk failure on %s, disabling device."
505 " Operation continuing on %d devices\n",
506 partition_name (dev), conf->working_disks);
512 * handle errors in spares (during reconstruction)
516 if (disk->dev == dev) {
518 "raid5: Disk failure on spare %s\n",
519 partition_name (dev));
520 if (!conf->spare->operational) {
521 /* probably a SET_DISK_FAULTY ioctl */
524 disk->operational = 0;
525 disk->write_only = 0;
527 mark_disk_faulty(sb->disks+disk->number);
528 mark_disk_nonsync(sb->disks+disk->number);
529 mark_disk_inactive(sb->disks+disk->number);
535 md_wakeup_thread(conf->thread);
545 * Input: a 'big' sector number,
546 * Output: index of the data and parity disk, and the sector # in them.
/* raid5_compute_sector - map an array sector onto a member device.
 * @r_sector:   sector number within the whole array.
 * @raid_disks: number of devices in the array.
 * @data_disks: number of data devices per stripe.
 * @dd_idx:     out - index of the device holding the data.
 * @pd_idx:     out - index of the device holding the parity.
 * @conf:       supplies chunk_size, level and algorithm.
 *
 * With sectors_per_chunk = chunk_size >> 9:
 *   chunk_number = r_sector / sectors_per_chunk
 *   chunk_offset = r_sector % sectors_per_chunk
 *   stripe       = chunk_number / data_disks
 *   *dd_idx      = chunk_number % data_disks (before layout adjustment)
 * For level 4 the parity index is always data_disks.  Otherwise the parity
 * index rotates with the stripe number, counting down from data_disks for
 * the LEFT layouts and up from 0 for the RIGHT layouts; the SYMMETRIC
 * layouts place data at (*pd_idx + 1 + *dd_idx) % raid_disks.  An unknown
 * algorithm is reported with printk().
 * The per-device sector is stripe * sectors_per_chunk + chunk_offset.
 * NOTE(review): the ASYMMETRIC data-index adjustment, the break statements
 * and the return are not visible in this listing; callers use the result
 * as the per-device sector - confirm. */
548 static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
549 unsigned int data_disks, unsigned int * dd_idx,
550 unsigned int * pd_idx, raid5_conf_t *conf)
552 unsigned long stripe;
553 unsigned long chunk_number;
554 unsigned int chunk_offset;
555 unsigned long new_sector;
556 int sectors_per_chunk = conf->chunk_size >> 9;
558 /* First compute the information on this sector */
561 * Compute the chunk number and the sector offset inside the chunk
563 chunk_number = r_sector / sectors_per_chunk;
564 chunk_offset = r_sector % sectors_per_chunk;
567 * Compute the stripe number
569 stripe = chunk_number / data_disks;
572 * Compute the data disk and parity disk indexes inside the stripe
574 *dd_idx = chunk_number % data_disks;
577 * Select the parity disk based on the user selected algorithm.
579 if (conf->level == 4)
580 *pd_idx = data_disks;
581 else switch (conf->algorithm) {
582 case ALGORITHM_LEFT_ASYMMETRIC:
583 *pd_idx = data_disks - stripe % raid_disks;
584 if (*dd_idx >= *pd_idx)
587 case ALGORITHM_RIGHT_ASYMMETRIC:
588 *pd_idx = stripe % raid_disks;
589 if (*dd_idx >= *pd_idx)
592 case ALGORITHM_LEFT_SYMMETRIC:
593 *pd_idx = data_disks - stripe % raid_disks;
594 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
596 case ALGORITHM_RIGHT_SYMMETRIC:
597 *pd_idx = stripe % raid_disks;
598 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
601 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
605 * Finally, compute the new sector number
607 new_sector = stripe * sectors_per_chunk + chunk_offset;
/* compute_blocknr - map a stripe buffer back to an array block number.
 * @sh: stripe; supplies the per-device sector, pd_idx and buffer size.
 * @i:  device index of the data block within the stripe.
 *
 * Reverses raid5_compute_sector(): the stripe number and chunk offset are
 * derived from sh->sector, @i is converted to a data-chunk index according
 * to the layout algorithm, and
 *   r_sector = (stripe * data_disks + i) * sectors_per_chunk + chunk_offset
 *   blocknr  = r_sector / (sh->size >> 9)
 * The result is cross-checked by feeding r_sector to raid5_compute_sector()
 * and comparing sector, data index and parity index; a mismatch prints
 * "compute_blocknr: map not correct".
 * NOTE(review): the ASYMMETRIC adjustment, part of the SYMMETRIC one and
 * the return statements are not visible in this listing - confirm. */
612 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
614 raid5_conf_t *conf = sh->raid_conf;
615 int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
616 unsigned long new_sector = sh->sector, check;
617 int sectors_per_chunk = conf->chunk_size >> 9;
618 unsigned long stripe = new_sector / sectors_per_chunk;
619 int chunk_offset = new_sector % sectors_per_chunk;
620 int chunk_number, dummy1, dummy2, dd_idx = i;
621 unsigned long r_sector, blocknr;
623 switch (conf->algorithm) {
624 case ALGORITHM_LEFT_ASYMMETRIC:
625 case ALGORITHM_RIGHT_ASYMMETRIC:
629 case ALGORITHM_LEFT_SYMMETRIC:
630 case ALGORITHM_RIGHT_SYMMETRIC:
633 i -= (sh->pd_idx + 1);
636 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
639 chunk_number = stripe * data_disks + i;
640 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
641 blocknr = r_sector / (sh->size >> 9);
643 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
644 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
645 printk("compute_blocknr: map not correct\n");
/* check_xor - flush the pending XOR list once it is full.
 * Expects local variables `count` and `bh_ptr` at the point of use: when
 * count has reached MAX_XOR_BLOCKS, xor_block() is run over bh_ptr[].
 * NOTE(review): the tail of the macro is not visible in this listing;
 * presumably it resets count - confirm. */
652 #define check_xor() do { \
653 if (count == MAX_XOR_BLOCKS) { \
654 xor_block(count, bh_ptr); \
/* compute_block - rebuild one block of a stripe from the others.
 * @sh:     stripe to work on.
 * @dd_idx: index of the block to reconstruct.
 *
 * The target cache buffer is zeroed and placed first in the XOR list.  The
 * cache buffers of the disks are then visited; up-to-date ones are appended
 * to the list and a message is printed for a buffer that is "not present".
 * xor_block() is run over the list and the target buffer is marked
 * BH_Uptodate.
 * NOTE(review): the initialisation of `count`, the skip of @dd_idx inside
 * the loop and the intermediate check_xor() flushes are not visible in this
 * listing - confirm against the file. */
660 static void compute_block(struct stripe_head *sh, int dd_idx)
662 raid5_conf_t *conf = sh->raid_conf;
663 int i, count, disks = conf->raid_disks;
664 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
666 PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
669 memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
670 bh_ptr[0] = sh->bh_cache[dd_idx];
672 for (i = disks ; i--; ) {
675 bh = sh->bh_cache[i];
676 if (buffer_uptodate(bh))
677 bh_ptr[count++] = bh;
679 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
684 xor_block(count, bh_ptr);
685 set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
/* compute_parity - compute or check the parity block of a stripe.
 * @sh:     stripe to work on; the parity buffer is sh->bh_cache[sh->pd_idx].
 * @method: READ_MODIFY_WRITE, RECONSTRUCT_WRITE or CHECK_PARITY.
 *
 * The parity buffer is always the first entry of the XOR list.
 * READ_MODIFY_WRITE: the parity buffer is tested for being up to date; for
 *   each disk with a pending write and an up-to-date cache buffer, the cache
 *   buffer joins the XOR list and the first pending write is moved from
 *   sh->bh_write[i] to the head of sh->bh_written[i] (remembered in
 *   chosen[i]).
 * RECONSTRUCT_WRITE: the parity buffer is zeroed and, for each non-parity
 *   disk with a pending write, the first pending write is moved to
 *   sh->bh_written[i] in the same way.
 * After an xor_block() pass, each chosen request is mapped with bh_kmap()
 * and unmapped again, and the cache buffer is locked and marked up to date.
 * A second switch on @method appends further cache buffers to the XOR list,
 * followed by another xor_block() pass.
 * Finally, for anything but CHECK_PARITY the parity buffer is marked up to
 * date and locked; the remaining visible call marks it not up to date.
 * NOTE(review): the guards inside the loops, the data copy between
 * bh_kmap() and bh_kunmap(), the `count` resets and the else/break lines
 * are not visible in this listing - confirm against the file. */
688 static void compute_parity(struct stripe_head *sh, int method)
690 raid5_conf_t *conf = sh->raid_conf;
691 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
692 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
693 struct buffer_head *chosen[MD_SB_DISKS];
695 PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
696 memset(chosen, 0, sizeof(chosen));
699 bh_ptr[0] = sh->bh_cache[pd_idx];
701 case READ_MODIFY_WRITE:
702 if (!buffer_uptodate(sh->bh_cache[pd_idx]))
704 for (i=disks ; i-- ;) {
707 if (sh->bh_write[i] &&
708 buffer_uptodate(sh->bh_cache[i])) {
709 bh_ptr[count++] = sh->bh_cache[i];
710 chosen[i] = sh->bh_write[i];
711 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
712 chosen[i]->b_reqnext = sh->bh_written[i];
713 sh->bh_written[i] = chosen[i];
718 case RECONSTRUCT_WRITE:
719 memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
720 for (i= disks; i-- ;)
721 if (i!=pd_idx && sh->bh_write[i]) {
722 chosen[i] = sh->bh_write[i];
723 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
724 chosen[i]->b_reqnext = sh->bh_written[i];
725 sh->bh_written[i] = chosen[i];
732 xor_block(count, bh_ptr);
736 for (i = disks; i--;)
738 struct buffer_head *bh = sh->bh_cache[i];
740 bdata = bh_kmap(chosen[i]);
743 bh_kunmap(chosen[i]);
744 set_bit(BH_Lock, &bh->b_state);
745 mark_buffer_uptodate(bh, 1);
749 case RECONSTRUCT_WRITE:
753 bh_ptr[count++] = sh->bh_cache[i];
757 case READ_MODIFY_WRITE:
758 for (i = disks; i--;)
760 bh_ptr[count++] = sh->bh_cache[i];
765 xor_block(count, bh_ptr);
767 if (method != CHECK_PARITY) {
768 mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
769 set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
771 mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
/* add_stripe_bh - attach a request buffer to a stripe.
 * @sh:     stripe receiving the request.
 * @bh:     request buffer head; its b_reqnext is reset to NULL.
 * @dd_idx: index of the data disk the request targets.
 * @rw:     selects between the read list and the write list.
 *
 * With sh->lock and then conf->device_lock held, the request is linked into
 * sh->bh_read[dd_idx] or sh->bh_write[dd_idx].  When the chosen list already
 * has entries a notice ("multiple %d requests") is logged and the list is
 * followed through b_reqnext.
 * NOTE(review): the test on @rw, the list-walk loop and the final store of
 * @bh are not visible in this listing - confirm against the file. */
774 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
776 struct buffer_head **bhp;
777 raid5_conf_t *conf = sh->raid_conf;
779 PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
782 spin_lock(&sh->lock);
783 spin_lock_irq(&conf->device_lock);
784 bh->b_reqnext = NULL;
786 bhp = &sh->bh_read[dd_idx];
788 bhp = &sh->bh_write[dd_idx];
790 printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
791 bhp = & (*bhp)->b_reqnext;
794 spin_unlock_irq(&conf->device_lock);
795 spin_unlock(&sh->lock);
797 PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
805 * handle_stripe - do things to a stripe.
807 * We lock the stripe and then examine the state of various bits
808 * to see what needs to be done.
810 * return some read requests which now have data
811 * return some write requests which are safely on disc
812 * schedule a read on some buffers
813 * schedule a write of some buffers
814 * return confirmation of parity correctness
816 * Parity calculations are done inside the stripe lock
817 * buffers are taken off read_list or write_list, and bh_cache buffers
818 * get BH_Lock set before the stripe lock is released.
/* handle_stripe - examine one stripe and start whatever work is possible.
 * @sh: stripe to process.
 *
 * Everything up to the unlock near the end runs under sh->lock.  Visible
 * phases, in order:
 *  1. STRIPE_HANDLE and STRIPE_DELAYED are cleared and STRIPE_SYNCING is
 *     sampled into `syncing`.
 *  2. Per disk: queued reads whose cache buffer is up to date are detached
 *     (under conf->device_lock), filled by memcpy() from the cache buffer
 *     and chained onto return_ok.  Counters locked, uptodate, to_read,
 *     to_write and written are gathered and non-operational disks are
 *     inspected (failed / failed_num).
 *  3. With more than one failed device, pending writes and written requests
 *     are moved to return_fail, as are reads aimed at non-operational disks;
 *     a running sync is ended with md_done_sync(..., 0).
 *  4. Written requests are moved to return_ok once the parity block is safe
 *     or sits on the single failed drive.
 *  5. Reads are scheduled for blocks that are wanted; a block is computed
 *     with compute_block() instead when all but one buffer is up to date.
 *     A lone read with no sync, failure or pending write bypasses the cache
 *     by temporarily pointing the cache buffer at the requester's page
 *     (the original page is saved in sh->bh_page[i]).
 *  6. For writes, the number of extra reads needed for read-modify-write
 *     (rmw) and reconstruct-write (rcw) is estimated; the cheaper variant's
 *     missing blocks are locked for reading, with STRIPE_PREREAD_ACTIVE
 *     tested first and STRIPE_DELAYED set on the visible alternative path.
 *     When nothing is locked and one variant needs no reads,
 *     compute_parity() runs and locked buffers get a WRITE action.
 *  7. During sync, parity is verified with compute_parity(CHECK_PARITY) and
 *     a test that the parity buffer is all zero; if the stripe is not in
 *     sync a block is recomputed and/or scheduled for writing, with
 *     md_sync_acct() accounting.  md_done_sync(..., 1) ends the sync step.
 * After sh->lock is dropped, return_ok requests complete with status 1 and
 * return_fail requests with status 0.  Each recorded action is then issued
 * with generic_make_request() (taking one stripe reference per request and
 * redirecting writes for a non-operational disk to the spare); on the
 * visible skip path BH_Lock is cleared and STRIPE_HANDLE is set again.
 * NOTE(review): many short lines (else arms, counter updates, action[]
 * assignments for reads, loop headers) are not visible in this listing, so
 * the exact nesting above must be confirmed against the full file. */
822 static void handle_stripe(struct stripe_head *sh)
824 raid5_conf_t *conf = sh->raid_conf;
825 int disks = conf->raid_disks;
826 struct buffer_head *return_ok= NULL, *return_fail = NULL;
827 int action[MD_SB_DISKS];
830 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
832 struct buffer_head *bh;
834 PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
835 memset(action, 0, sizeof(action));
837 spin_lock(&sh->lock);
838 clear_bit(STRIPE_HANDLE, &sh->state);
839 clear_bit(STRIPE_DELAYED, &sh->state);
841 syncing = test_bit(STRIPE_SYNCING, &sh->state);
842 /* Now to look around and see what can be done */
844 for (i=disks; i--; ) {
845 bh = sh->bh_cache[i];
846 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
847 /* maybe we can reply to a read */
848 if (buffer_uptodate(bh) && sh->bh_read[i]) {
849 struct buffer_head *rbh, *rbh2;
850 PRINTK("Return read for disc %d\n", i);
851 spin_lock_irq(&conf->device_lock);
852 rbh = sh->bh_read[i];
853 sh->bh_read[i] = NULL;
854 spin_unlock_irq(&conf->device_lock);
857 bdata = bh_kmap(rbh);
858 memcpy(bdata, bh->b_data, bh->b_size);
860 rbh2 = rbh->b_reqnext;
861 rbh->b_reqnext = return_ok;
867 /* now count some things */
868 if (buffer_locked(bh)) locked++;
869 if (buffer_uptodate(bh)) uptodate++;
872 if (sh->bh_read[i]) to_read++;
873 if (sh->bh_write[i]) to_write++;
874 if (sh->bh_written[i]) written++;
875 if (!conf->disks[i].operational) {
880 PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
881 locked, uptodate, to_read, to_write, failed, failed_num);
882 /* check if the array has lost two devices and, if so, some requests might
885 if (failed > 1 && to_read+to_write+written) {
886 for (i=disks; i--; ) {
887 /* fail all writes first */
888 if (sh->bh_write[i]) to_write--;
889 while ((bh = sh->bh_write[i])) {
890 sh->bh_write[i] = bh->b_reqnext;
891 bh->b_reqnext = return_fail;
894 /* and fail all 'written' */
895 if (sh->bh_written[i]) written--;
896 while ((bh = sh->bh_written[i])) {
897 sh->bh_written[i] = bh->b_reqnext;
898 bh->b_reqnext = return_fail;
902 /* fail any reads if this device is non-operational */
903 if (!conf->disks[i].operational) {
904 spin_lock_irq(&conf->device_lock);
905 if (sh->bh_read[i]) to_read--;
906 while ((bh = sh->bh_read[i])) {
907 sh->bh_read[i] = bh->b_reqnext;
908 bh->b_reqnext = return_fail;
911 spin_unlock_irq(&conf->device_lock);
915 if (failed > 1 && syncing) {
916 md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
917 clear_bit(STRIPE_SYNCING, &sh->state);
921 /* might be able to return some write requests if the parity block
922 * is safe, or on a failed drive
924 bh = sh->bh_cache[sh->pd_idx];
926 ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
927 || (failed == 1 && failed_num == sh->pd_idx))
929 /* any written block on a uptodate or failed drive can be returned */
931 if (sh->bh_written[i]) {
932 bh = sh->bh_cache[i];
933 if (!conf->disks[sh->pd_idx].operational ||
934 (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
935 /* maybe we can return some write requests */
936 struct buffer_head *wbh, *wbh2;
937 PRINTK("Return write for disc %d\n", i);
938 wbh = sh->bh_written[i];
939 sh->bh_written[i] = NULL;
941 wbh2 = wbh->b_reqnext;
942 wbh->b_reqnext = return_ok;
950 /* Now we might consider reading some blocks, either to check/generate
951 * parity, or to satisfy requests
953 if (to_read || (syncing && (uptodate < disks))) {
954 for (i=disks; i--;) {
955 bh = sh->bh_cache[i];
956 if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
957 (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
958 /* we would like to get this block, possibly
959 * by computing it, but we might not be able to
961 if (uptodate == disks-1) {
962 PRINTK("Computing block %d\n", i);
963 compute_block(sh, i);
965 } else if (conf->disks[i].operational) {
966 set_bit(BH_Lock, &bh->b_state);
968 /* if I am just reading this block and we don't have
969 a failed drive, or any pending writes then sidestep the cache */
970 if (sh->bh_page[i]) BUG();
971 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
972 ! syncing && !failed && !to_write) {
973 sh->bh_page[i] = sh->bh_cache[i]->b_page;
974 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
975 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
978 PRINTK("Reading block %d (sync=%d)\n", i, syncing);
980 md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
984 set_bit(STRIPE_HANDLE, &sh->state);
987 /* now to consider writing and what else, if anything should be read */
990 for (i=disks ; i--;) {
991 /* would I have to read this buffer for read_modify_write */
992 bh = sh->bh_cache[i];
993 if ((sh->bh_write[i] || i == sh->pd_idx) &&
994 (!buffer_locked(bh) || sh->bh_page[i]) &&
995 !buffer_uptodate(bh)) {
996 if (conf->disks[i].operational
997 /* && !(conf->resync_parity && i == sh->pd_idx) */
1000 else rmw += 2*disks; /* cannot read it */
1002 /* Would I have to read this buffer for reconstruct_write */
1003 if (!sh->bh_write[i] && i != sh->pd_idx &&
1004 (!buffer_locked(bh) || sh->bh_page[i]) &&
1005 !buffer_uptodate(bh)) {
1006 if (conf->disks[i].operational) rcw++;
1007 else rcw += 2*disks;
1010 PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
1011 set_bit(STRIPE_HANDLE, &sh->state);
1012 if (rmw < rcw && rmw > 0)
1013 /* prefer read-modify-write, but need to get some data */
1014 for (i=disks; i--;) {
1015 bh = sh->bh_cache[i];
1016 if ((sh->bh_write[i] || i == sh->pd_idx) &&
1017 !buffer_locked(bh) && !buffer_uptodate(bh) &&
1018 conf->disks[i].operational) {
1019 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1021 PRINTK("Read_old block %d for r-m-w\n", i);
1022 set_bit(BH_Lock, &bh->b_state);
1026 set_bit(STRIPE_DELAYED, &sh->state);
1027 set_bit(STRIPE_HANDLE, &sh->state);
1031 if (rcw <= rmw && rcw > 0)
1032 /* want reconstruct write, but need to get some data */
1033 for (i=disks; i--;) {
1034 bh = sh->bh_cache[i];
1035 if (!sh->bh_write[i] && i != sh->pd_idx &&
1036 !buffer_locked(bh) && !buffer_uptodate(bh) &&
1037 conf->disks[i].operational) {
1038 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1040 PRINTK("Read_old block %d for Reconstruct\n", i);
1041 set_bit(BH_Lock, &bh->b_state);
1045 set_bit(STRIPE_DELAYED, &sh->state);
1046 set_bit(STRIPE_HANDLE, &sh->state);
1050 /* now if nothing is locked, and if we have enough data, we can start a write request */
1051 if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1052 PRINTK("Computing parity...\n");
1053 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1054 /* now every locked buffer is ready to be written */
1056 if (buffer_locked(sh->bh_cache[i])) {
1057 PRINTK("Writing block %d\n", i);
1059 action[i] = WRITE+1;
1060 if (!conf->disks[i].operational
1061 || (i==sh->pd_idx && failed == 0))
1062 set_bit(STRIPE_INSYNC, &sh->state);
1064 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1065 atomic_dec(&conf->preread_active_stripes);
1066 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1067 md_wakeup_thread(conf->thread);
1072 /* maybe we need to check and possibly fix the parity for this stripe
1073 * Any reads will already have been scheduled, so we just see if enough data
1076 if (syncing && locked == 0 &&
1077 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1078 set_bit(STRIPE_HANDLE, &sh->state);
1080 if (uptodate != disks)
1082 compute_parity(sh, CHECK_PARITY);
1084 bh = sh->bh_cache[sh->pd_idx];
1085 if ((*(u32*)bh->b_data) == 0 &&
1086 !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
1087 /* parity is correct (on disc, not in buffer any more) */
1088 set_bit(STRIPE_INSYNC, &sh->state);
1091 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1092 struct disk_info *spare;
1094 failed_num = sh->pd_idx;
1095 /* should be able to compute the missing block and write it to spare */
1096 if (!buffer_uptodate(sh->bh_cache[failed_num])) {
1097 if (uptodate+1 != disks)
1099 compute_block(sh, failed_num);
1102 if (uptodate != disks)
1104 bh = sh->bh_cache[failed_num];
1105 set_bit(BH_Lock, &bh->b_state);
1106 action[failed_num] = WRITE+1;
1108 set_bit(STRIPE_INSYNC, &sh->state);
1109 if (conf->disks[failed_num].operational)
1110 md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
1111 else if ((spare=conf->spare))
1112 md_sync_acct(spare->dev, bh->b_size>>9);
1116 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1117 md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
1118 clear_bit(STRIPE_SYNCING, &sh->state);
1122 spin_unlock(&sh->lock);
1124 while ((bh=return_ok)) {
1125 return_ok = bh->b_reqnext;
1126 bh->b_reqnext = NULL;
1127 bh->b_end_io(bh, 1);
1129 while ((bh=return_fail)) {
1130 return_fail = bh->b_reqnext;
1131 bh->b_reqnext = NULL;
1132 bh->b_end_io(bh, 0);
1134 for (i=disks; i-- ;)
1136 struct buffer_head *bh = sh->bh_cache[i];
1137 struct disk_info *spare = conf->spare;
1139 if (action[i] == READ+1)
1140 bh->b_end_io = raid5_end_read_request;
1142 bh->b_end_io = raid5_end_write_request;
1143 if (conf->disks[i].operational)
1144 bh->b_dev = conf->disks[i].dev;
1145 else if (spare && action[i] == WRITE+1)
1146 bh->b_dev = spare->dev;
1149 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
1150 atomic_inc(&sh->count);
1151 bh->b_rdev = bh->b_dev;
1152 bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
1153 generic_make_request(action[i]-1, bh);
1155 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
1156 clear_bit(BH_Lock, &bh->b_state);
1157 set_bit(STRIPE_HANDLE, &sh->state);
/* raid5_activate_delayed - move delayed stripes to the handle list.
 * @conf: array configuration.
 *
 * Only acts while conf->preread_active_stripes is below IO_THRESHOLD.  Each
 * stripe on conf->delayed_list has STRIPE_DELAYED cleared, gets
 * STRIPE_PREREAD_ACTIVE set (incrementing conf->preread_active_stripes if
 * the bit was not already set) and is appended to conf->handle_list.
 * NOTE(review): the removal of the stripe from the delayed list is not
 * visible in this listing - confirm against the file. */
1162 static inline void raid5_activate_delayed(raid5_conf_t *conf)
1164 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1165 while (!list_empty(&conf->delayed_list)) {
1166 struct list_head *l = conf->delayed_list.next;
1167 struct stripe_head *sh;
1168 sh = list_entry(l, struct stripe_head, lru);
1170 clear_bit(STRIPE_DELAYED, &sh->state);
1171 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1172 atomic_inc(&conf->preread_active_stripes);
1173 list_add_tail(&sh->lru, &conf->handle_list);
/* raid5_unplug_device - unplug callback for the array.
 * @data: the raid5 configuration, passed as an untyped pointer.
 *
 * With conf->device_lock held (interrupt state saved), delayed stripes are
 * activated and the raid5 thread is woken.
 * NOTE(review): the update of conf->plugged is not visible in this
 * listing - confirm against the file. */
1177 static void raid5_unplug_device(void *data)
1179 raid5_conf_t *conf = (raid5_conf_t *)data;
1180 unsigned long flags;
1182 spin_lock_irqsave(&conf->device_lock, flags);
1184 raid5_activate_delayed(conf);
1187 md_wakeup_thread(conf->thread);
1189 spin_unlock_irqrestore(&conf->device_lock, flags);
/* raid5_plug_device - plug the array so stripes can be batched.
 * @conf: array configuration.
 *
 * With conf->device_lock held: when the delayed list is empty and the array
 * is not already plugged, conf->plug_tq is queued on tq_disk.
 * NOTE(review): the assignment that marks the array as plugged is not
 * visible in this listing - confirm against the file. */
1192 static inline void raid5_plug_device(raid5_conf_t *conf)
1194 spin_lock_irq(&conf->device_lock);
1195 if (list_empty(&conf->delayed_list))
1196 if (!conf->plugged) {
1198 queue_task(&conf->plug_tq, &tq_disk);
1200 spin_unlock_irq(&conf->device_lock);
/* raid5_make_request - entry point for I/O submitted to the array.
 * @mddev: the md device; its private data is the raid5 configuration.
 * @rw:    request direction, forwarded to add_stripe_bh().
 * @bh:    request buffer head; b_rsector and b_size describe the request.
 *
 * The array sector is translated with raid5_compute_sector() (one parity
 * disk, so data_disks = raid_disks - 1).  A stripe is obtained with
 * get_active_stripe(), passing `read_ahead` as the noblock argument; its
 * pd_idx is recorded, the request is attached with add_stripe_bh() and the
 * array is plugged.  The remaining visible statement completes @bh directly
 * with its current BH_Uptodate state.
 * NOTE(review): the definition of `read_ahead`, the test on the stripe
 * pointer, the handle/release calls and the return value are not visible in
 * this listing - confirm against the file. */
1203 static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1205 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1206 const unsigned int raid_disks = conf->raid_disks;
1207 const unsigned int data_disks = raid_disks - 1;
1208 unsigned int dd_idx, pd_idx;
1209 unsigned long new_sector;
1212 struct stripe_head *sh;
1219 new_sector = raid5_compute_sector(bh->b_rsector,
1220 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1222 PRINTK("raid5_make_request, sector %lu\n", new_sector);
1223 sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1225 sh->pd_idx = pd_idx;
1227 add_stripe_bh(sh, bh, dd_idx, rw);
1229 raid5_plug_device(conf);
1233 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1238 * Determine correct block size for this device.
/* device_bsize - block size to use for a device.
 * @dev: device to query.
 *
 * Starts from BLOCK_SIZE and, when a blksize_size[] table exists for the
 * device's major number, reads the entry for its minor number.  Returns
 * correct_size.  Note that this function is not declared static.
 * NOTE(review): the lines that apply the table entry to correct_size are
 * not visible in this listing - confirm against the file. */
1240 unsigned int device_bsize (kdev_t dev)
1242 unsigned int i, correct_size;
1244 correct_size = BLOCK_SIZE;
1245 if (blksize_size[MAJOR(dev)]) {
1246 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1251 return correct_size;
/* raid5_sync_request - start resynchronising the stripe at a sector.
 * @mddev:     the md device; its private data is the raid5 configuration.
 * @sector_nr: per-device sector at which to continue the sync.
 *
 * A stripe is obtained with get_active_stripe(conf, sector_nr, 0, 0), i.e.
 * with no buffer-size preference and with blocking allowed.  `redone` is the
 * distance of @sector_nr from the start of that stripe.  The parity index is
 * obtained by running raid5_compute_sector() on the first array sector that
 * maps to this position.  Under sh->lock, STRIPE_SYNCING is set,
 * STRIPE_INSYNC is cleared and sh->sync_redone is recorded.
 * Returns (bufsize >> 9) - redone.
 * NOTE(review): the assignment of `bufsize` and the handle/release calls
 * are not visible in this listing - confirm against the file. */
1254 static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
1256 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1257 struct stripe_head *sh;
1258 int sectors_per_chunk = conf->chunk_size >> 9;
1259 unsigned long stripe = sector_nr/sectors_per_chunk;
1260 int chunk_offset = sector_nr % sectors_per_chunk;
1262 unsigned long first_sector;
1263 int raid_disks = conf->raid_disks;
1264 int data_disks = raid_disks-1;
1268 sh = get_active_stripe(conf, sector_nr, 0, 0);
1270 redone = sector_nr - sh->sector;
1271 first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1272 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1273 sh->pd_idx = pd_idx;
1274 spin_lock(&sh->lock);
1275 set_bit(STRIPE_SYNCING, &sh->state);
1276 clear_bit(STRIPE_INSYNC, &sh->state);
1277 sh->sync_redone = redone;
1278 spin_unlock(&sh->lock);
1283 return (bufsize>>9)-redone;
1287 * This is our raid5 kernel thread.
1289 * We scan the hash table for stripes which can be handled now.
1290 * During the scan, completed stripes are saved for us by the interrupt
1291 * handler, so that they will not have to wait for our next wakeup.
/* raid5d - body of the raid5 kernel thread.
 * @data: the raid5 configuration, passed as an untyped pointer.
 *
 * The superblock is written out first if mddev->sb_dirty is set.  Then,
 * with conf->device_lock held: when the handle list is empty, fewer than
 * IO_THRESHOLD pre-reads are active and the delayed list is non-empty,
 * delayed stripes are activated.  The first stripe on conf->handle_list is
 * unlinked and its reference count incremented (and tested against 1); the
 * lock is dropped and later re-taken around the per-stripe work.  The
 * number of stripes handled is reported through PRINTK at the end.
 * NOTE(review): the loop construct, one further activation condition, the
 * loop exit and the calls made while the lock is dropped are not visible in
 * this listing - confirm against the file. */
1293 static void raid5d (void *data)
1295 struct stripe_head *sh;
1296 raid5_conf_t *conf = data;
1297 mddev_t *mddev = conf->mddev;
1300 PRINTK("+++ raid5d active\n");
1304 if (mddev->sb_dirty)
1305 md_update_sb(mddev);
1306 md_spin_lock_irq(&conf->device_lock);
1308 struct list_head *first;
1310 if (list_empty(&conf->handle_list) &&
1311 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1313 !list_empty(&conf->delayed_list))
1314 raid5_activate_delayed(conf);
1316 if (list_empty(&conf->handle_list))
1319 first = conf->handle_list.next;
1320 sh = list_entry(first, struct stripe_head, lru);
1322 list_del_init(first);
1323 atomic_inc(&sh->count);
1324 if (atomic_read(&sh->count)!= 1)
1326 md_spin_unlock_irq(&conf->device_lock);
1332 md_spin_lock_irq(&conf->device_lock);
1334 PRINTK("%d stripes handled\n", handled);
1336 md_spin_unlock_irq(&conf->device_lock);
1338 PRINTK("--- raid5d inactive\n");
1342 * Private kernel thread for parity reconstruction after an unclean
1343 * shutdown. Reconstruction on spare drives in case of a failed drive
1344 * is done by the generic mdsyncd.
/*
 * raid5syncd - parity resync thread body (registered in raid5_run under
 * the name "raid5syncd").
 *
 * @data: the raid5_conf_t for the array.
 *
 * Runs md_do_sync() with mddev->recovery_sem held.  On a zero return it
 * clears conf->resync_parity and logs completion; on a non-zero return it
 * logs that the resync was aborted and, in the visible lines, leaves
 * resync_parity untouched.
 */
1346 static void raid5syncd (void *data)
1348 raid5_conf_t *conf = data;
1349 mddev_t *mddev = conf->mddev;
/* No resync requested (action not visible; presumably return). */
1351 if (!conf->resync_parity)
/* 2 is the value raid5_stop_resync() stores (action not visible here). */
1353 if (conf->resync_parity == 2)
1355 down(&mddev->recovery_sem);
1356 if (md_do_sync(mddev,NULL)) {
/* Failure: release the semaphore before reporting. */
1357 up(&mddev->recovery_sem);
1358 printk("raid5: resync aborted!\n");
/* Success: mark the resync as done. */
1361 conf->resync_parity = 0;
1362 up(&mddev->recovery_sem);
1363 printk("raid5: resync finished.\n");
/*
 * raid5_run - set up the raid5 personality state for @mddev.
 *
 * Allocates and zeroes the raid5_conf_t, fills conf->disks[] from the
 * superblock descriptors, validates the geometry, registers the raid5d
 * thread, grows the stripe cache, refreshes the per-disk sync bits in the
 * superblock and, for an array whose superblock is not marked clean,
 * starts the raid5syncd thread.
 *
 * NOTE(review): return statements, error-path jumps and several
 * declarations (conf, rdev, desc) are not visible in this listing; the
 * code after the "everything is just fine" comment looks like the common
 * failure path.
 */
1366 static int raid5_run (mddev_t *mddev)
1369 int i, j, raid_disk, memory;
1370 mdp_super_t *sb = mddev->sb;
1373 struct disk_info *disk;
1374 struct md_list_head *tmp;
1375 int start_recovery = 0;
/* Only levels 4 and 5 are accepted. */
1379 if (sb->level != 5 && sb->level != 4) {
1380 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
/* Allocate the private configuration and clear it. */
1385 mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1386 if ((conf = mddev->private) == NULL)
1388 memset (conf, 0, sizeof (*conf));
1389 conf->mddev = mddev;
/* Stripe hash table: 2^HASH_PAGES_ORDER pages, zeroed. */
1391 if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1393 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
/* Lock, wait queue, stripe lists and counters start out empty/zero. */
1395 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1396 md_init_waitqueue_head(&conf->wait_for_stripe);
1397 INIT_LIST_HEAD(&conf->handle_list);
1398 INIT_LIST_HEAD(&conf->delayed_list);
1399 INIT_LIST_HEAD(&conf->inactive_list);
1400 atomic_set(&conf->active_stripes, 0);
1401 atomic_set(&conf->preread_active_stripes, 0);
1402 conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
/* Task-queue entry that will run raid5_unplug_device(conf). */
1405 conf->plug_tq.sync = 0;
1406 conf->plug_tq.routine = &raid5_unplug_device;
1407 conf->plug_tq.data = conf;
1409 PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
/* First pass: classify every attached rdev as faulty, active or spare. */
1411 ITERATE_RDEV(mddev,rdev,tmp) {
1413 * This is important -- we are using the descriptor on
1414 * the disk only to get a pointer to the descriptor on
1415 * the main superblock, which might be more recent.
1417 desc = sb->disks + rdev->desc_nr;
1418 raid_disk = desc->raid_disk;
1419 disk = conf->disks + raid_disk;
/* Faulty descriptor: record the slot as used but not operational. */
1421 if (disk_faulty(desc)) {
1422 printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1423 if (!rdev->faulty) {
1427 disk->number = desc->number;
1428 disk->raid_disk = raid_disk;
1429 disk->dev = rdev->dev;
1431 disk->operational = 0;
1432 disk->write_only = 0;
1434 disk->used_slot = 1;
/* Active descriptor: must be in sync, in range and not a duplicate. */
1437 if (disk_active(desc)) {
1438 if (!disk_sync(desc)) {
1439 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
/*
 * NOTE(review): `>` lets raid_disk == sb->raid_disks through, whereas the
 * faulty-descriptor pass further down tests raid_disk < sb->raid_disks;
 * confirm whether >= was intended.
 */
1443 if (raid_disk > sb->raid_disks) {
1444 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1447 if (disk->operational) {
1448 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1451 printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1453 disk->number = desc->number;
1454 disk->raid_disk = raid_disk;
1455 disk->dev = rdev->dev;
1456 disk->operational = 1;
1457 disk->used_slot = 1;
1459 conf->working_disks++;
1462 * Must be a spare disk ..
1464 printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1465 disk->number = desc->number;
1466 disk->raid_disk = raid_disk;
1467 disk->dev = rdev->dev;
1469 disk->operational = 0;
1470 disk->write_only = 0;
1472 disk->used_slot = 1;
/*
 * Second pass over all superblock descriptors: a faulty descriptor inside
 * the active range whose slot is still unused gets a placeholder entry
 * with device MKDEV(0,0).
 */
1476 for (i = 0; i < MD_SB_DISKS; i++) {
1477 desc = sb->disks + i;
1478 raid_disk = desc->raid_disk;
1479 disk = conf->disks + raid_disk;
1481 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1482 !conf->disks[raid_disk].used_slot) {
1484 disk->number = desc->number;
1485 disk->raid_disk = raid_disk;
1486 disk->dev = MKDEV(0,0);
1488 disk->operational = 0;
1489 disk->write_only = 0;
1491 disk->used_slot = 1;
/* Copy the array geometry from the superblock into conf. */
1495 conf->raid_disks = sb->raid_disks;
1497 * 0 for a fully functional array, 1 for a degraded array.
1499 conf->failed_disks = conf->raid_disks - conf->working_disks;
1500 conf->mddev = mddev;
1501 conf->chunk_size = sb->chunk_size;
1502 conf->level = sb->level;
1503 conf->algorithm = sb->layout;
1504 conf->max_nr_stripes = NR_STRIPES;
/* Every slot in the active range must have been filled in above. */
1507 for (i = 0; i < conf->raid_disks; i++) {
1508 if (!conf->disks[i].used_slot) {
/* Sanity checks: chunk size, parity layout, number of failed disks. */
1514 if (!conf->chunk_size || conf->chunk_size % 4) {
1515 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1518 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1519 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1522 if (conf->failed_disks > 1) {
1523 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1527 if (conf->working_disks != sb->raid_disks) {
1528 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
/* Register the stripe-handling thread. */
1533 const char * name = "raid5d";
1535 conf->thread = md_register_thread(raid5d, conf, name);
1536 if (!conf->thread) {
1537 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
/*
 * Size estimate in kB used for the log messages: one stripe_head plus a
 * buffer_head and a page per disk, for each of max_nr_stripes stripes.
 */
1542 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1543 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1544 if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1545 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1546 shrink_stripes(conf, conf->max_nr_stripes);
1549 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1552 * Regenerate the "device is in sync with the raid set" bit for
/*
 * Each descriptor is first marked non-sync, then marked sync again when its
 * number matches an operational disk in the active range.
 */
1555 for (i = 0; i < MD_SB_DISKS ; i++) {
1556 mark_disk_nonsync(sb->disks + i);
1557 for (j = 0; j < sb->raid_disks; j++) {
1558 if (!conf->disks[j].operational)
1560 if (sb->disks[i].number == conf->disks[j].number)
1561 mark_disk_sync(sb->disks + i);
1564 sb->active_disks = conf->working_disks;
/* Same message either way; presumably the second is the else branch. */
1566 if (sb->active_disks == sb->raid_disks)
1567 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1569 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
/* Superblock not marked clean and no recovery pending: resync parity. */
1571 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1572 const char * name = "raid5syncd";
1574 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1575 if (!conf->resync_thread) {
1576 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1580 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1581 conf->resync_parity = 1;
1582 md_wakeup_thread(conf->resync_thread);
1585 print_raid5_conf(conf);
1587 md_recover_arrays();
1588 print_raid5_conf(conf);
1590 /* Ok, everything is just fine now */
/*
 * Presumably the failure path from here on: dump the configuration, free
 * the hash table pages, detach conf from mddev and report the failure.
 */
1594 print_raid5_conf(conf);
1595 if (conf->stripe_hashtbl)
1596 free_pages((unsigned long) conf->stripe_hashtbl,
1600 mddev->private = NULL;
1601 printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
/*
 * raid5_stop_resync - ask a pending/running parity resync to stop.
 *
 * When conf->resync_parity is set, it is changed to 2, the resync thread
 * is interrupted and a notice is logged.
 * NOTE(review): the return values are on lines not visible here.
 */
1606 static int raid5_stop_resync (mddev_t *mddev)
1608 raid5_conf_t *conf = mddev_to_conf(mddev);
1609 mdk_thread_t *thread = conf->resync_thread;
1612 if (conf->resync_parity) {
/* 2 is the value raid5syncd tests for explicitly. */
1613 conf->resync_parity = 2;
1614 md_interrupt_thread(thread);
1615 printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
/*
 * raid5_restart_resync - wake the resync thread when a resync is pending.
 *
 * With conf->resync_parity set, it is reset to 1 and conf->resync_thread is
 * woken; the final message presumably belongs to the else branch.
 * NOTE(review): the handling of a missing resync thread and the return
 * values are on lines not visible here.
 */
1623 static int raid5_restart_resync (mddev_t *mddev)
1625 raid5_conf_t *conf = mddev_to_conf(mddev);
1627 if (conf->resync_parity) {
1628 if (!conf->resync_thread) {
1632 printk("raid5: waking up raid5resync.\n");
1633 conf->resync_parity = 1;
1634 md_wakeup_thread(conf->resync_thread);
1637 printk("raid5: no restart-resync needed.\n");
/*
 * raid5_stop - tear down the state that raid5_run attached to @mddev.
 *
 * Unregisters the resync thread (when one exists) and the raid5d thread,
 * calls shrink_stripes() for max_nr_stripes, frees the hash table pages
 * and clears mddev->private.
 * NOTE(review): freeing of conf itself and the return value are on lines
 * not visible here.
 */
1642 static int raid5_stop (mddev_t *mddev)
1644 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1646 if (conf->resync_thread)
1647 md_unregister_thread(conf->resync_thread);
1648 md_unregister_thread(conf->thread);
1649 shrink_stripes(conf, conf->max_nr_stripes);
/* Same order (HASH_PAGES_ORDER) as the allocation in raid5_run. */
1650 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1652 mddev->private = NULL;
/*
 * print_sh - debug dump of one stripe_head via printk.
 *
 * Prints sector, size, pd_idx, state and reference count, then one entry
 * (index, pointer, b_state) for every non-NULL sh->bh_cache[] slot among
 * the first MD_SB_DISKS.
 */
1658 static void print_sh (struct stripe_head *sh)
1662 printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
1663 printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
1664 printk("sh %lu, ", sh->sector);
1665 for (i = 0; i < MD_SB_DISKS; i++) {
1666 if (sh->bh_cache[i])
1667 printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
/*
 * printall - walk every chain of conf->stripe_hashtbl with
 * conf->device_lock held.
 *
 * Entries whose raid_conf is not @conf are singled out; what is done with
 * them, and with the remaining entries, is on lines not visible here
 * (presumably skip and print_sh() respectively).
 * NOTE(review): the trailing trace message says "raid5d inactive", which
 * looks copied from raid5d -- confirm.
 */
1672 static void printall (raid5_conf_t *conf)
1674 struct stripe_head *sh;
1677 md_spin_lock_irq(&conf->device_lock);
1678 for (i = 0; i < NR_HASH; i++) {
1679 sh = conf->stripe_hashtbl[i];
/* Follow the hash chain through hash_next until it ends. */
1680 for (; sh; sh = sh->hash_next) {
1681 if (sh->raid_conf != conf)
1686 md_spin_unlock_irq(&conf->device_lock);
1688 PRINTK("--- raid5d inactive\n");
/*
 * raid5_status - write the personality's status text into @seq.
 *
 * Emits level, chunk size in kB (chunk_size >> 10) and layout from the
 * superblock, then "[raid_disks/working_disks]" followed by one "U" or "_"
 * per disk in the active range according to its operational flag.
 */
1692 static void raid5_status (struct seq_file *seq, mddev_t *mddev)
1694 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1695 mdp_super_t *sb = mddev->sb;
1698 seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1699 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1700 for (i = 0; i < conf->raid_disks; i++)
1701 seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
1702 seq_printf (seq, "]");
/*
 * NOTE(review): the #x stringification suggests this line is the body of a
 * helper macro that prints an atomic conf field by name; the #define line
 * and its users are not visible here.
 */
1705 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
/*
 * print_raid5_conf - debug dump of a raid5_conf_t via printk.
 *
 * Prints the raid/working/failed disk counts and then one line per
 * conf->disks[] entry (spare, operational, number, raid_disk, used_slot
 * and device name).  A "(conf==NULL)" message exists for a NULL @conf; the
 * test guarding it is on a line not visible here.
 */
1711 static void print_raid5_conf (raid5_conf_t *conf)
1714 struct disk_info *tmp;
1716 printk("RAID5 conf printout:\n");
1718 printk("(conf==NULL)\n");
1721 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1722 conf->working_disks, conf->failed_disks);
/*
 * Two alternative loop headers appear back to back; presumably only one is
 * compiled, selected by a preprocessor conditional not visible here.
 */
1725 for (i = 0; i < MD_SB_DISKS; i++) {
1727 for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1729 tmp = conf->disks + i;
1730 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
1731 i, tmp->spare,tmp->operational,
1732 tmp->number,tmp->raid_disk,tmp->used_slot,
1733 partition_name(tmp->dev));
/*
 * raid5_diskop - apply a disk operation (@state is one of the DISKOP_*
 * codes) to the raid5 configuration of @mddev.
 *
 * @d: points at the superblock descriptor pointer for the disk concerned;
 *     the lookups compare (*d)->number against conf->disks[].number.
 *
 * The operation codes appear twice: first to locate the slot(s) involved
 * (failed_disk, spare_disk, removed_disk, added_disk), then to act on
 * them.  All of it runs with conf->device_lock held, and the configuration
 * is dumped before and after.
 *
 * NOTE(review): the enclosing switch statements, the error exits taken
 * when a lookup or consistency check fails, the assignment of added_desc
 * and the return value are on lines not visible in this listing.
 */
1737 static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
1740 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
1741 raid5_conf_t *conf = mddev->private;
1742 struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
1743 mdp_super_t *sb = mddev->sb;
1744 mdp_disk_t *failed_desc, *spare_desc, *added_desc;
1745 mdk_rdev_t *spare_rdev, *failed_rdev;
1747 print_raid5_conf(conf);
1748 md_spin_lock_irq(&conf->device_lock);
/* Lookup phase: find the slot index each operation will work on. */
1754 case DISKOP_SPARE_ACTIVE:
1757 * Find the failed disk within the RAID5 configuration ...
1758 * (this can only be in the first conf->raid_disks part)
1760 for (i = 0; i < conf->raid_disks; i++) {
1761 tmp = conf->disks + i;
1762 if ((!tmp->operational && !tmp->spare) ||
1769 * When we activate a spare disk we _must_ have a disk in
1770 * the lower (active) part of the array to replace.
1772 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
1779 case DISKOP_SPARE_WRITE:
1780 case DISKOP_SPARE_INACTIVE:
1783 * Find the spare disk ... (can only be in the 'high'
1784 * area of the array)
1786 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1787 tmp = conf->disks + i;
1788 if (tmp->spare && tmp->number == (*d)->number) {
1793 if (spare_disk == -1) {
/* Removal searches every slot, not only the spare area. */
1800 case DISKOP_HOT_REMOVE_DISK:
1802 for (i = 0; i < MD_SB_DISKS; i++) {
1803 tmp = conf->disks + i;
1804 if (tmp->used_slot && (tmp->number == (*d)->number)) {
1805 if (tmp->operational) {
1813 if (removed_disk == -1) {
/* Hot-add looks for an unused slot above the active range. */
1820 case DISKOP_HOT_ADD_DISK:
1822 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1823 tmp = conf->disks + i;
1824 if (!tmp->used_slot) {
1829 if (added_disk == -1) {
/* Action phase: the same operation codes are handled a second time. */
1839 * Switch the spare disk to write-only mode:
1841 case DISKOP_SPARE_WRITE:
1847 sdisk = conf->disks + spare_disk;
1848 sdisk->operational = 1;
1849 sdisk->write_only = 1;
1850 conf->spare = sdisk;
1853 * Deactivate a spare disk:
1855 case DISKOP_SPARE_INACTIVE:
1856 sdisk = conf->disks + spare_disk;
1857 sdisk->operational = 0;
1858 sdisk->write_only = 0;
1860 * Was the spare being resynced?
1862 if (conf->spare == sdisk)
1866 * Activate (mark read-write) the (now sync) spare disk,
1867 * which means we switch it's 'raid position' (->raid_disk)
1868 * with the failed disk. (only the first 'conf->raid_disks'
1869 * slots are used for 'real' disks and we must preserve this
1872 case DISKOP_SPARE_ACTIVE:
1878 sdisk = conf->disks + spare_disk;
1879 fdisk = conf->disks + failed_disk;
/* Superblock descriptors are looked up by each disk's ->number. */
1881 spare_desc = &sb->disks[sdisk->number];
1882 failed_desc = &sb->disks[fdisk->number];
/*
 * Consistency checks between the caller's descriptor, the superblock
 * descriptors and the disk_info slots (failure handling not visible).
 */
1884 if (spare_desc != *d) {
1890 if (spare_desc->raid_disk != sdisk->raid_disk) {
1896 if (sdisk->raid_disk != spare_disk) {
1902 if (failed_desc->raid_disk != fdisk->raid_disk) {
1908 if (fdisk->raid_disk != failed_disk) {
1915 * do the switch finally
1917 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1918 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1920 /* There must be a spare_rdev, but there may not be a
1921 * failed_rdev. That slot might be empty...
1923 spare_rdev->desc_nr = failed_desc->number;
1925 failed_rdev->desc_nr = spare_desc->number;
1927 xchg_values(*spare_desc, *failed_desc);
1928 xchg_values(*fdisk, *sdisk);
1931 * (careful, 'failed' and 'spare' are switched from now on)
1933 * we want to preserve linear numbering and we want to
1934 * give the proper raid_disk number to the now activated
1935 * disk. (this means we switch back these values)
1938 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1939 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1940 xchg_values(spare_desc->number, failed_desc->number);
1941 xchg_values(sdisk->number, fdisk->number);
1945 if (sdisk->dev == MKDEV(0,0))
1946 sdisk->used_slot = 0;
1949 * this really activates the spare.
1952 fdisk->write_only = 0;
1955 * if we activate a spare, we definitely replace a
1956 * non-operational disk slot in the 'low' area of
1959 conf->failed_disks--;
1960 conf->working_disks++;
1965 case DISKOP_HOT_REMOVE_DISK:
1966 rdisk = conf->disks + removed_disk;
1968 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1973 rdisk->dev = MKDEV(0,0);
1974 rdisk->used_slot = 0;
1978 case DISKOP_HOT_ADD_DISK:
1979 adisk = conf->disks + added_disk;
1982 if (added_disk != added_desc->number) {
1988 adisk->number = added_desc->number;
1989 adisk->raid_disk = added_desc->raid_disk;
1990 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1992 adisk->operational = 0;
1993 adisk->write_only = 0;
1995 adisk->used_slot = 1;
2006 md_spin_unlock_irq(&conf->device_lock);
2007 print_raid5_conf(conf);
/*
 * Personality descriptor handed to register_md_personality() by
 * raid5_init.  It binds the md callbacks to the raid5 handlers in this
 * file, using the GNU "field: value" initializer form.
 * NOTE(review): some fields (presumably name, run and stop) are on lines
 * not visible in this listing.
 */
2011 static mdk_personality_t raid5_personality=
2014 make_request: raid5_make_request,
2017 status: raid5_status,
2018 error_handler: raid5_error,
2019 diskop: raid5_diskop,
2020 stop_resync: raid5_stop_resync,
2021 restart_resync: raid5_restart_resync,
2022 sync_request: raid5_sync_request
/*
 * raid5_init - module entry point.
 *
 * Registers raid5_personality under the RAID5 personality number and
 * returns whatever register_md_personality() returns.
 */
2025 static int md__init raid5_init (void)
2027 return register_md_personality (RAID5, &raid5_personality);
/*
 * raid5_exit - module exit point; unregisters the RAID5 personality that
 * raid5_init registered.
 */
2030 static void raid5_exit (void)
2032 unregister_md_personality (RAID5);
/* Module hooks: raid5_init on load, raid5_exit on unload; GPL licensed. */
2035 module_init(raid5_init);
2036 module_exit(raid5_exit);
2037 MODULE_LICENSE("GPL");