[linux-2.4.git] / drivers / md / raid1.c
1 /*
2  * raid1.c : Multiple Devices driver for Linux
3  *
4  * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5  *
6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7  *
8  * RAID-1 management functions.
9  *
10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11  *
12  * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2, or (at your option)
18  * any later version.
19  *
20  * You should have received a copy of the GNU General Public License
21  * (for example /usr/src/linux/COPYING); if not, write to the Free
22  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24
25 #include <linux/module.h>
26 #include <linux/config.h>
27 #include <linux/slab.h>
28 #include <linux/raid/raid1.h>
29 #include <asm/atomic.h>
30
31 #define MAJOR_NR MD_MAJOR
32 #define MD_DRIVER
33 #define MD_PERSONALITY
34
35 #define MAX_WORK_PER_DISK 128
36
37 #define NR_RESERVED_BUFS        32
38
39
40 /*
41  * The following can be used to debug the driver
42  */
43 #define RAID1_DEBUG     0
44
45 #if RAID1_DEBUG
46 #define PRINTK(x...)   printk(x)
47 #define inline
48 #define __inline__
49 #else
50 #define PRINTK(x...)  do { } while (0)
51 #endif
52
53
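/*
 * Failed requests are queued on this global retry list (protected by
 * retry_list_lock) and are picked up later by the raid1d thread.
 */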
54 static mdk_personality_t raid1_personality;
55 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
56 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
57
58 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
59 {
60         /* return a linked list of "cnt" struct buffer_heads.
61          * don't take any off the free list unless we know we can
62          * get all we need, otherwise we could deadlock
63          */
64         struct buffer_head *bh=NULL;
65
66         while(cnt) {
67                 struct buffer_head *t;
68                 md_spin_lock_irq(&conf->device_lock);
69                 if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
70                         while (cnt) {
71                                 t = conf->freebh;
72                                 conf->freebh = t->b_next;
73                                 t->b_next = bh;
74                                 bh = t;
75                                 t->b_state = 0;
76                                 conf->freebh_cnt--;
77                                 cnt--;
78                         }
79                 md_spin_unlock_irq(&conf->device_lock);
80                 if (cnt == 0)
81                         break;
82                 t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
83                 if (t) {
84                         t->b_next = bh;
85                         bh = t;
86                         cnt--;
87                 } else {
88                         PRINTK("raid1: waiting for %d bh\n", cnt);
89                         conf->freebh_blocked = 1;
90                         wait_disk_event(conf->wait_buffer,
91                                         !conf->freebh_blocked ||
92                                         conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
93                         conf->freebh_blocked = 0;
94                 }
95         }
96         return bh;
97 }
98
99 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
100 {
101         unsigned long flags;
102         spin_lock_irqsave(&conf->device_lock, flags);
103         while (bh) {
104                 struct buffer_head *t = bh;
105                 bh=bh->b_next;
106                 if (t->b_pprev == NULL)
107                         kmem_cache_free(bh_cachep, t);
108                 else {
109                         t->b_next= conf->freebh;
110                         conf->freebh = t;
111                         conf->freebh_cnt++;
112                 }
113         }
114         spin_unlock_irqrestore(&conf->device_lock, flags);
115         wake_up(&conf->wait_buffer);
116 }
117
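/*
 * Pre-allocate "cnt" buffer_heads onto conf->freebh.  These reserved heads
 * get b_pprev pointed at the free list, which is how raid1_free_bh() tells
 * them apart from on-demand allocations (those are freed back to the slab
 * cache instead of being re-queued).
 */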
118 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
119 {
120         /* allocate cnt buffer_heads, possibly less if kmalloc fails */
121         int i = 0;
122
123         while (i < cnt) {
124                 struct buffer_head *bh;
125                 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
126                 if (!bh) break;
127
128                 md_spin_lock_irq(&conf->device_lock);
129                 bh->b_pprev = &conf->freebh;
130                 bh->b_next = conf->freebh;
131                 conf->freebh = bh;
132                 conf->freebh_cnt++;
133                 md_spin_unlock_irq(&conf->device_lock);
134
135                 i++;
136         }
137         return i;
138 }
139
140 static void raid1_shrink_bh(raid1_conf_t *conf)
141 {
142         /* discard all buffer_heads */
143
144         md_spin_lock_irq(&conf->device_lock);
145         while (conf->freebh) {
146                 struct buffer_head *bh = conf->freebh;
147                 conf->freebh = bh->b_next;
148                 kmem_cache_free(bh_cachep, bh);
149                 conf->freebh_cnt--;
150         }
151         md_spin_unlock_irq(&conf->device_lock);
152 }
153                 
154
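/*
 * Get a raid1_bh for a new request: prefer the pre-allocated pool, fall
 * back to kmalloc(GFP_NOIO), and as a last resort wait for other requests
 * to free theirs.
 */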
155 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
156 {
157         struct raid1_bh *r1_bh = NULL;
158
159         do {
160                 md_spin_lock_irq(&conf->device_lock);
161                 if (!conf->freer1_blocked && conf->freer1) {
162                         r1_bh = conf->freer1;
163                         conf->freer1 = r1_bh->next_r1;
164                         conf->freer1_cnt--;
165                         r1_bh->next_r1 = NULL;
166                         r1_bh->state = (1 << R1BH_PreAlloc);
167                         r1_bh->bh_req.b_state = 0;
168                 }
169                 md_spin_unlock_irq(&conf->device_lock);
170                 if (r1_bh)
171                         return r1_bh;
172                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
173                 if (r1_bh) {
174                         memset(r1_bh, 0, sizeof(*r1_bh));
175                         return r1_bh;
176                 }
177                 conf->freer1_blocked = 1;
178                 wait_disk_event(conf->wait_buffer,
179                                 !conf->freer1_blocked ||
180                                 conf->freer1_cnt > NR_RESERVED_BUFS/2
181                         );
182                 conf->freer1_blocked = 0;
183         } while (1);
184 }
185
186 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
187 {
188         struct buffer_head *bh = r1_bh->mirror_bh_list;
189         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
190
191         r1_bh->mirror_bh_list = NULL;
192
193         if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
194                 unsigned long flags;
195                 spin_lock_irqsave(&conf->device_lock, flags);
196                 r1_bh->next_r1 = conf->freer1;
197                 conf->freer1 = r1_bh;
198                 conf->freer1_cnt++;
199                 spin_unlock_irqrestore(&conf->device_lock, flags);
200                 /* don't need to wake up wait_buffer because
201                  *  raid1_free_bh below will do that
202                  */
203         } else {
204                 kfree(r1_bh);
205         }
206         raid1_free_bh(conf, bh);
207 }
208
209 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
210 {
211         int i = 0;
212
213         while (i < cnt) {
214                 struct raid1_bh *r1_bh;
215                 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
216                 if (!r1_bh)
217                         break;
218                 memset(r1_bh, 0, sizeof(*r1_bh));
219                 set_bit(R1BH_PreAlloc, &r1_bh->state);
220                 r1_bh->mddev = conf->mddev;
221
222                 raid1_free_r1bh(r1_bh);
223                 i++;
224         }
225         return i;
226 }
227
228 static void raid1_shrink_r1bh(raid1_conf_t *conf)
229 {
230         md_spin_lock_irq(&conf->device_lock);
231         while (conf->freer1) {
232                 struct raid1_bh *r1_bh = conf->freer1;
233                 conf->freer1 = r1_bh->next_r1;
234                 conf->freer1_cnt--;
235                 kfree(r1_bh);
236         }
237         md_spin_unlock_irq(&conf->device_lock);
238 }
239
240
241
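/*
 * The freebuf pool holds raid1_bh structures that carry their own page.
 * It is filled by raid1_grow_buffers() at the start of a resync and
 * drained by raid1_alloc_buf() for sync requests.
 */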
242 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
243 {
244         unsigned long flags;
245         struct buffer_head *bh = r1_bh->mirror_bh_list;
246         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
247         r1_bh->mirror_bh_list = NULL;
248         
249         spin_lock_irqsave(&conf->device_lock, flags);
250         r1_bh->next_r1 = conf->freebuf;
251         conf->freebuf = r1_bh;
252         spin_unlock_irqrestore(&conf->device_lock, flags);
253         raid1_free_bh(conf, bh);
254 }
255
256 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
257 {
258         struct raid1_bh *r1_bh;
259
260         md_spin_lock_irq(&conf->device_lock);
261         wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
262         r1_bh = conf->freebuf;
263         conf->freebuf = r1_bh->next_r1;
264         r1_bh->next_r1= NULL;
265         md_spin_unlock_irq(&conf->device_lock);
266
267         return r1_bh;
268 }
269
270 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
271 {
272         int i = 0;
273         struct raid1_bh *head = NULL, **tail;
274         tail = &head;
275
276         while (i < cnt) {
277                 struct raid1_bh *r1_bh;
278                 struct page *page;
279
280                 page = alloc_page(GFP_KERNEL);
281                 if (!page)
282                         break;
283
284                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
285                 if (!r1_bh) {
286                         __free_page(page);
287                         break;
288                 }
289                 memset(r1_bh, 0, sizeof(*r1_bh));
290                 r1_bh->bh_req.b_page = page;
291                 r1_bh->bh_req.b_data = page_address(page);
292                 *tail = r1_bh;
293                 r1_bh->next_r1 = NULL;
294                 tail = & r1_bh->next_r1;
295                 i++;
296         }
297         /* this lock probably isn't needed, as at the time when
298          * we are allocating buffers, nobody else will be touching the
299          * freebuf list.  But it doesn't hurt....
300          */
301         md_spin_lock_irq(&conf->device_lock);
302         *tail = conf->freebuf;
303         conf->freebuf = head;
304         md_spin_unlock_irq(&conf->device_lock);
305         return i;
306 }
307
308 static void raid1_shrink_buffers (raid1_conf_t *conf)
309 {
310         struct raid1_bh *head;
311         md_spin_lock_irq(&conf->device_lock);
312         head = conf->freebuf;
313         conf->freebuf = NULL;
314         md_spin_unlock_irq(&conf->device_lock);
315
316         while (head) {
317                 struct raid1_bh *r1_bh = head;
318                 head = r1_bh->next_r1;
319                 __free_page(r1_bh->bh_req.b_page);
320                 kfree(r1_bh);
321         }
322 }
323
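/*
 * Pick an operational mirror for a request; currently simply the first
 * one found.  raid1d uses this to redirect failed reads.
 */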
324 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
325 {
326         raid1_conf_t *conf = mddev_to_conf(mddev);
327         int i, disks = MD_SB_DISKS;
328         unsigned long flags;
329
330         /*
331          * Later we do read balancing on the read side;
332          * for now we use the first operational disk.
333          */
334
335         md_spin_lock_irqsave(&conf->device_lock, flags);
336         for (i = 0; i < disks; i++) {
337                 if (conf->mirrors[i].operational) {
338                         *rdev = conf->mirrors[i].dev;
339                         md_spin_unlock_irqrestore(&conf->device_lock, flags);
340                         return (0);
341                 }
342         }
343         md_spin_unlock_irqrestore(&conf->device_lock, flags);
344
345         printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
346         return (-1);
347 }
348
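/*
 * Put a failed request on the global retry list and wake up raid1d
 * to handle it.
 */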
349 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
350 {
351         unsigned long flags;
352         mddev_t *mddev = r1_bh->mddev;
353         raid1_conf_t *conf = mddev_to_conf(mddev);
354
355         md_spin_lock_irqsave(&retry_list_lock, flags);
356         if (raid1_retry_list == NULL)
357                 raid1_retry_tail = &raid1_retry_list;
358         *raid1_retry_tail = r1_bh;
359         raid1_retry_tail = &r1_bh->next_r1;
360         r1_bh->next_r1 = NULL;
361         md_spin_unlock_irqrestore(&retry_list_lock, flags);
362         md_wakeup_thread(conf->thread);
363 }
364
365
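/*
 * Completion accounting for normal I/O against the resync window:
 * decrement the counter of the segment (DONE, FUTURE or PENDING) this
 * request fell into and wake up anyone waiting for PENDING to drain.
 */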
366 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
367 {
368         unsigned long flags;
369         spin_lock_irqsave(&conf->segment_lock, flags);
370         if (sector < conf->start_active)
371                 conf->cnt_done--;
372         else if (sector >= conf->start_future && conf->phase == phase)
373                 conf->cnt_future--;
374         else if (!--conf->cnt_pending)
375                 wake_up(&conf->wait_ready);
376
377         spin_unlock_irqrestore(&conf->segment_lock, flags);
378 }
379
380 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
381 {
382         unsigned long flags;
383         spin_lock_irqsave(&conf->segment_lock, flags);
384         if (sector >= conf->start_ready)
385                 --conf->cnt_ready;
386         else if (sector >= conf->start_active) {
387                 if (!--conf->cnt_active) {
388                         conf->start_active = conf->start_ready;
389                         wake_up(&conf->wait_done);
390                 }
391         }
392         spin_unlock_irqrestore(&conf->segment_lock, flags);
393 }
394
395 /*
396  * raid1_end_bh_io() is called when we have finished servicing a mirrored
397  * operation and are ready to return a success/failure code to the buffer
398  * cache layer.
399  */
400 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
401 {
402         struct buffer_head *bh = r1_bh->master_bh;
403
404         io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
405                         test_bit(R1BH_SyncPhase, &r1_bh->state));
406
407         bh->b_end_io(bh, uptodate);
408         raid1_free_r1bh(r1_bh);
409 }
410 void raid1_end_request (struct buffer_head *bh, int uptodate)
411 {
412         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
413
414         /*
415          * this branch is our 'one mirror IO has finished' event handler:
416          */
417         if (!uptodate)
418                 md_error (r1_bh->mddev, bh->b_dev);
419         else
420                 /*
421                  * Set R1BH_Uptodate in our master buffer_head, so that
422                  * we will return a good error code to the higher
423                  * levels even if IO on some other mirrored buffer fails.
424                  *
425                  * The 'master' represents the complex operation to the
426                  * user side. So if something waits for IO, then it will
427                  * wait for the 'master' buffer_head.
428                  */
429                 set_bit (R1BH_Uptodate, &r1_bh->state);
430
431         /*
432          * We split up the read and write side, imho they are 
433          * conceptually different.
434          */
435
436         if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
437                 /*
438                  * we have only one buffer_head on the read side
439                  */
440                 
441                 if (uptodate) {
442                         raid1_end_bh_io(r1_bh, uptodate);
443                         return;
444                 }
445                 /*
446                  * oops, read error:
447                  */
448                 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 
449                          partition_name(bh->b_dev), bh->b_blocknr);
450                 raid1_reschedule_retry(r1_bh);
451                 return;
452         }
453
454         /*
455          * WRITE:
456          *
457          * Let's see if all mirrored write operations have finished 
458          * already.
459          */
460
461         if (atomic_dec_and_test(&r1_bh->remaining))
462                 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
463 }
464
465 /*
466  * This routine returns the disk from which the requested read should
467  * be done. It keeps track of the last read position for every disk
468  * in the array and, when a new read request comes in, chooses the
469  * disk whose last position is nearest to the request.
470  *
471  * TODO: right now, if there are 2 mirrors on the same 2 devices,
472  * performance degrades dramatically because the position is tracked per
473  * mirror, not per device. This should be changed to be device based.
474  * Also, atomic sequential reads should be balanced somehow.
475  */
476
477 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
478 {
479         int new_disk = conf->last_used;
480         const int sectors = bh->b_size >> 9;
481         const unsigned long this_sector = bh->b_rsector;
482         int disk = new_disk;
483         unsigned long new_distance;
484         unsigned long current_distance;
485         
486         /*
487          * Check if it is sane at all to balance
488          */
489         
490         if (conf->resync_mirrors)
491                 goto rb_out;
492         
493
494 #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
495                               ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
496         /* Work around a compiler bug in older gcc */
497         new_disk = *(volatile int *)&new_disk;
498 #endif
499
500         /* make sure that disk is operational */
501         while( !conf->mirrors[new_disk].operational) {
502                 if (new_disk <= 0) new_disk = conf->raid_disks;
503                 new_disk--;
504                 if (new_disk == disk) {
505                         /*
506                          * This means no working disk was found.
507                          * Nothing much to do, let's not change anything
508                          * and hope for the best...
509                          */
510                         
511                         new_disk = conf->last_used;
512
513                         goto rb_out;
514                 }
515         }
516         disk = new_disk;
517         /* now disk == new_disk == starting point for search */
518         
519         /*
520          * Don't touch anything for sequential reads.
521          */
522
523         if (this_sector == conf->mirrors[new_disk].head_position)
524                 goto rb_out;
525         
526         /*
527          * If reads have been done only on a single disk
528          * for a while, let's give another disk a chance.
529          * This is for kicking those idling disks so that
530          * they find work near some hotspot.
531          */
532         
533         if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
534                 conf->sect_count = 0;
535
536 #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
537                 /* Work around a compiler bug in egcs-2.92.11 19980921 */
538                 new_disk = *(volatile int *)&new_disk;
539 #endif
540                 do {
541                         if (new_disk<=0)
542                                 new_disk = conf->raid_disks;
543                         new_disk--;
544                         if (new_disk == disk)
545                                 break;
546                 } while ((conf->mirrors[new_disk].write_only) ||
547                          (!conf->mirrors[new_disk].operational));
548
549                 goto rb_out;
550         }
551         
552         current_distance = abs(this_sector -
553                                 conf->mirrors[disk].head_position);
554         
555         /* Find the disk which is closest */
556         
557 #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
558                               ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
559         /* Work around a compiler bug in older gcc */
560         disk = *(volatile int *)&disk;
561 #endif
562         do {
563                 if (disk <= 0)
564                         disk = conf->raid_disks;
565                 disk--;
566                 
567                 if ((conf->mirrors[disk].write_only) ||
568                                 (!conf->mirrors[disk].operational))
569                         continue;
570                 
571                 new_distance = abs(this_sector -
572                                         conf->mirrors[disk].head_position);
573                 
574                 if (new_distance < current_distance) {
575                         conf->sect_count = 0;
576                         current_distance = new_distance;
577                         new_disk = disk;
578                 }
579         } while (disk != conf->last_used);
580
581 rb_out:
582         conf->mirrors[new_disk].head_position = this_sector + sectors;
583
584         conf->last_used = new_disk;
585         conf->sect_count += sectors;
586
587         return new_disk;
588 }
589
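/*
 * Main request entry point: a READ (READA is downgraded to READ) goes to
 * a single mirror chosen by raid1_read_balance(), a WRITE is cloned and
 * sent to every operational mirror.
 */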
590 static int raid1_make_request (mddev_t *mddev, int rw,
591                                struct buffer_head * bh)
592 {
593         raid1_conf_t *conf = mddev_to_conf(mddev);
594         struct buffer_head *bh_req, *bhl;
595         struct raid1_bh * r1_bh;
596         int disks = MD_SB_DISKS;
597         int i, sum_bhs = 0;
598         struct mirror_info *mirror;
599         kdev_t dev;
600
601         if (!buffer_locked(bh))
602                 BUG();
603         
604 /*
605  * make_request() can abort the operation when READA is being
606  * used and no empty request is available.
607  *
608  * Currently, just replace the command with READ/WRITE.
609  */
610         if (rw == READA)
611                 rw = READ;
612
613         r1_bh = raid1_alloc_r1bh (conf);
614
615         spin_lock_irq(&conf->segment_lock);
616         wait_event_lock_irq(conf->wait_done,
617                         bh->b_rsector < conf->start_active ||
618                         bh->b_rsector >= conf->start_future,
619                         conf->segment_lock);
620         if (bh->b_rsector < conf->start_active) 
621                 conf->cnt_done++;
622         else {
623                 conf->cnt_future++;
624                 if (conf->phase)
625                         set_bit(R1BH_SyncPhase, &r1_bh->state);
626         }
627         spin_unlock_irq(&conf->segment_lock);
628         
629         /*
630          * I think the read and write branch should be separated completely,
631          * since we want to do read balancing on the read side for example.
632          * Alternative implementations? :) --mingo
633          */
634
635         r1_bh->master_bh = bh;
636         r1_bh->mddev = mddev;
637         r1_bh->cmd = rw;
638
639         if (rw == READ) {
640                 /*
641                  * read balancing logic:
642                  */
643                 spin_lock_irq(&conf->device_lock);
644                 mirror = conf->mirrors + raid1_read_balance(conf, bh);
645                 dev = mirror->dev;
646                 spin_unlock_irq(&conf->device_lock);
647
648                 bh_req = &r1_bh->bh_req;
649                 memcpy(bh_req, bh, sizeof(*bh));
650                 bh_req->b_blocknr = bh->b_rsector;
651                 bh_req->b_dev = dev;
652                 bh_req->b_rdev = dev;
653         /*      bh_req->b_rsector = bh->n_rsector; */
654                 bh_req->b_end_io = raid1_end_request;
655                 bh_req->b_private = r1_bh;
656                 generic_make_request (rw, bh_req);
657                 return 0;
658         }
659
660         /*
661          * WRITE:
662          */
663
664         bhl = raid1_alloc_bh(conf, conf->raid_disks);
665         spin_lock_irq(&conf->device_lock);
666         for (i = 0; i < disks; i++) {
667                 struct buffer_head *mbh;
668                 if (!conf->mirrors[i].operational) 
669                         continue;
670  
671         /*
672          * We should use a private pool (size depending on NR_REQUEST),
673          * to avoid writes filling up the memory with bhs
674          *
675          * Such pools are much faster than kmalloc anyway (so we waste
676          * almost nothing by not using the master bh when writing and
677          * win a lot of cleanness) but for now we are cool enough. --mingo
678          *
679          * It's safe to sleep here, buffer heads cannot be used in a shared
680          * manner in the write branch. Look how we lock the buffer at the
681          * beginning of this function to grok the difference ;)
682          */
683                 mbh = bhl;
684                 if (mbh == NULL) {
685                         MD_BUG();
686                         break;
687                 }
688                 bhl = mbh->b_next;
689                 mbh->b_next = NULL;
690                 mbh->b_this_page = (struct buffer_head *)1;
691                 
692         /*
693          * prepare mirrored mbh (fields ordered for max mem throughput):
694          */
695                 mbh->b_blocknr    = bh->b_rsector;
696                 mbh->b_dev        = conf->mirrors[i].dev;
697                 mbh->b_rdev       = conf->mirrors[i].dev;
698                 mbh->b_rsector    = bh->b_rsector;
699                 mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
700                                                 (1<<BH_Mapped) | (1<<BH_Lock);
701
702                 atomic_set(&mbh->b_count, 1);
703                 mbh->b_size       = bh->b_size;
704                 mbh->b_page       = bh->b_page;
705                 mbh->b_data       = bh->b_data;
706                 mbh->b_list       = BUF_LOCKED;
707                 mbh->b_end_io     = raid1_end_request;
708                 mbh->b_private    = r1_bh;
709
710                 mbh->b_next = r1_bh->mirror_bh_list;
711                 r1_bh->mirror_bh_list = mbh;
712                 sum_bhs++;
713         }
714         spin_unlock_irq(&conf->device_lock);
715         if (bhl) raid1_free_bh(conf,bhl);
716         if (!sum_bhs) {
717                 /* Gag - all mirrors non-operational.. */
718                 raid1_end_bh_io(r1_bh, 0);
719                 return 0;
720         }
721         md_atomic_set(&r1_bh->remaining, sum_bhs);
722
723         /*
724          * We have to be a bit careful about the semaphore above, that's
725          * why we start the requests separately. Since kmalloc() could
726          * fail or sleep, and make_request() can sleep too, this is the
727          * safer solution. Imagine end_request decreasing the semaphore
728          * before we could have set it up ... We could play tricks with
729          * the semaphore (presetting it and correcting at the end if
730          * sum_bhs is not 'n', but then we have to do end_request by hand
731          * if all requests finish before we had a chance to set up the
732          * semaphore correctly ... lots of races).
733          */
734         bh = r1_bh->mirror_bh_list;
735         while(bh) {
736                 struct buffer_head *bh2 = bh;
737                 bh = bh->b_next;
738                 generic_make_request(rw, bh2);
739         }
740         return (0);
741 }
742
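/*
 * Status output for the md core: raid disks vs. working disks, then one
 * 'U' (up) or '_' (failed) flag per mirror.
 */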
743 static void raid1_status(struct seq_file *seq, mddev_t *mddev)
744 {
745         raid1_conf_t *conf = mddev_to_conf(mddev);
746         int i;
747         
748         seq_printf(seq, " [%d/%d] [", conf->raid_disks,
749                                                  conf->working_disks);
750         for (i = 0; i < conf->raid_disks; i++)
751                 seq_printf(seq, "%s",
752                         conf->mirrors[i].operational ? "U" : "_");
753         seq_printf(seq, "]");
754 }
755
756 #define LAST_DISK KERN_ALERT \
757 "raid1: only one disk left and IO error.\n"
758
759 #define NO_SPARE_DISK KERN_ALERT \
760 "raid1: no spare disk left, degrading mirror level by one.\n"
761
762 #define DISK_FAILED KERN_ALERT \
763 "raid1: Disk failure on %s, disabling device. \n" \
764 "       Operation continuing on %d devices\n"
765
766 #define START_SYNCING KERN_ALERT \
767 "raid1: start syncing spare disk.\n"
768
769 #define ALREADY_SYNCING KERN_INFO \
770 "raid1: syncing already in progress.\n"
771
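/*
 * Mark a mirror faulty in both the in-core mirror table and the superblock
 * counters, flag the superblock dirty and wake raid1d so it gets written
 * out.
 */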
772 static void mark_disk_bad (mddev_t *mddev, int failed)
773 {
774         raid1_conf_t *conf = mddev_to_conf(mddev);
775         struct mirror_info *mirror = conf->mirrors+failed;
776         mdp_super_t *sb = mddev->sb;
777
778         mirror->operational = 0;
779         mark_disk_faulty(sb->disks+mirror->number);
780         mark_disk_nonsync(sb->disks+mirror->number);
781         mark_disk_inactive(sb->disks+mirror->number);
782         if (!mirror->write_only)
783                 sb->active_disks--;
784         else
785                 sb->spare_disks--;
786         sb->working_disks--;
787         sb->failed_disks++;
788         mddev->sb_dirty = 1;
789         md_wakeup_thread(conf->thread);
790         if (!mirror->write_only)
791                 conf->working_disks--;
792         printk (DISK_FAILED, partition_name (mirror->dev),
793                                  conf->working_disks);
794 }
795
796 static int raid1_error (mddev_t *mddev, kdev_t dev)
797 {
798         raid1_conf_t *conf = mddev_to_conf(mddev);
799         struct mirror_info * mirrors = conf->mirrors;
800         int disks = MD_SB_DISKS;
801         int i;
802         unsigned long flags;
803
804         /* Find the drive.
805          * If it is not operational, then we have already marked it as dead;
806          * else if it is the last working disk, ignore the error and let the
807          * next level up know;
808          * else mark the drive as failed.
809          */
810
811         for (i = 0; i < disks; i++)
812                 if (mirrors[i].dev==dev && mirrors[i].operational)
813                         break;
814         if (i == disks)
815                 return 0;
816
817         if (i < conf->raid_disks && conf->working_disks == 1) {
818                 /* Don't fail the drive, act as though we were just a
819                  * normal single drive
820                  */
821
822                 return 1;
823         }
824         md_spin_lock_irqsave(&conf->device_lock, flags);
825         mark_disk_bad(mddev, i);
826         md_spin_unlock_irqrestore(&conf->device_lock, flags);
827         return 0;
828 }
829
830 #undef LAST_DISK
831 #undef NO_SPARE_DISK
832 #undef DISK_FAILED
833 #undef START_SYNCING
834
835
836 static void print_raid1_conf (raid1_conf_t *conf)
837 {
838         int i;
839         struct mirror_info *tmp;
840
841         printk("RAID1 conf printout:\n");
842         if (!conf) {
843                 printk("(conf==NULL)\n");
844                 return;
845         }
846         printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
847                          conf->raid_disks, conf->nr_disks);
848
849         for (i = 0; i < MD_SB_DISKS; i++) {
850                 tmp = conf->mirrors + i;
851                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
852                         i, tmp->spare,tmp->operational,
853                         tmp->number,tmp->raid_disk,tmp->used_slot,
854                         partition_name(tmp->dev));
855         }
856 }
857
858 static void close_sync(raid1_conf_t *conf)
859 {
860         mddev_t *mddev = conf->mddev;
861         /* If reconstruction was interrupted, we need to close the "active" and "pending"
862          * holes.
863          * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0.
864          */
865         /* this is really needed when recovery stops too... */
866         spin_lock_irq(&conf->segment_lock);
867         conf->start_active = conf->start_pending;
868         conf->start_ready = conf->start_pending;
869         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
870         conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
871         conf->start_future = (mddev->sb->size<<1)+1;
872         conf->cnt_pending = conf->cnt_future;
873         conf->cnt_future = 0;
874         conf->phase = conf->phase ^1;
875         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
876         conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
877         conf->phase = 0;
878         conf->cnt_future = conf->cnt_done;
879         conf->cnt_done = 0;
880         spin_unlock_irq(&conf->segment_lock);
881         wake_up(&conf->wait_done);
882 }
883
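/*
 * Disk operations requested by the md core: activate or deactivate a
 * spare, or hot-add/hot-remove a disk, keeping the superblock descriptors
 * and conf->mirrors consistent with each other.
 */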
884 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
885 {
886         int err = 0;
887         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
888         raid1_conf_t *conf = mddev->private;
889         struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
890         mdp_super_t *sb = mddev->sb;
891         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
892         mdk_rdev_t *spare_rdev, *failed_rdev;
893
894         if (conf->resync_mirrors)
895                 return 1; /* Cannot do any diskops during a resync */
896
897         switch (state) {
898         case DISKOP_SPARE_ACTIVE:
899         case DISKOP_SPARE_INACTIVE:
900                 /* need to wait for pending sync io before locking device */
901                 close_sync(conf);
902         }
903
904         md_spin_lock_irq(&conf->device_lock);
905         /*
906          * Need the conf lock when printing out state else we get BUG()s
907          */
908         print_raid1_conf(conf);
909         /*
910          * find the disk ...
911          */
912         switch (state) {
913
914         case DISKOP_SPARE_ACTIVE:
915
916                 /*
917                  * Find the failed disk within the RAID1 configuration ...
918                  * (this can only be in the first conf->working_disks part)
919                  */
920                 for (i = 0; i < conf->raid_disks; i++) {
921                         tmp = conf->mirrors + i;
922                         if ((!tmp->operational && !tmp->spare) ||
923                                         !tmp->used_slot) {
924                                 failed_disk = i;
925                                 break;
926                         }
927                 }
928                 /*
929                  * When we activate a spare disk we _must_ have a disk in
930                  * the lower (active) part of the array to replace. 
931                  */
932                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
933                         MD_BUG();
934                         err = 1;
935                         goto abort;
936                 }
937                 /* fall through */
938
939         case DISKOP_SPARE_WRITE:
940         case DISKOP_SPARE_INACTIVE:
941
942                 /*
943                  * Find the spare disk ... (can only be in the 'high'
944                  * area of the array)
945                  */
946                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
947                         tmp = conf->mirrors + i;
948                         if (tmp->spare && tmp->number == (*d)->number) {
949                                 spare_disk = i;
950                                 break;
951                         }
952                 }
953                 if (spare_disk == -1) {
954                         MD_BUG();
955                         err = 1;
956                         goto abort;
957                 }
958                 break;
959
960         case DISKOP_HOT_REMOVE_DISK:
961
962                 for (i = 0; i < MD_SB_DISKS; i++) {
963                         tmp = conf->mirrors + i;
964                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
965                                 if (tmp->operational) {
966                                         err = -EBUSY;
967                                         goto abort;
968                                 }
969                                 removed_disk = i;
970                                 break;
971                         }
972                 }
973                 if (removed_disk == -1) {
974                         MD_BUG();
975                         err = 1;
976                         goto abort;
977                 }
978                 break;
979
980         case DISKOP_HOT_ADD_DISK:
981
982                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
983                         tmp = conf->mirrors + i;
984                         if (!tmp->used_slot) {
985                                 added_disk = i;
986                                 break;
987                         }
988                 }
989                 if (added_disk == -1) {
990                         MD_BUG();
991                         err = 1;
992                         goto abort;
993                 }
994                 break;
995         }
996
997         switch (state) {
998         /*
999          * Switch the spare disk to write-only mode:
1000          */
1001         case DISKOP_SPARE_WRITE:
1002                 sdisk = conf->mirrors + spare_disk;
1003                 sdisk->operational = 1;
1004                 sdisk->write_only = 1;
1005                 break;
1006         /*
1007          * Deactivate a spare disk:
1008          */
1009         case DISKOP_SPARE_INACTIVE:
1010                 if (conf->start_future > 0) {
1011                         MD_BUG();
1012                         err = -EBUSY;
1013                         break;
1014                 }
1015                 sdisk = conf->mirrors + spare_disk;
1016                 sdisk->operational = 0;
1017                 sdisk->write_only = 0;
1018                 break;
1019         /*
1020          * Activate (mark read-write) the (now sync) spare disk,
1021          * which means we switch its 'raid position' (->raid_disk)
1022          * with the failed disk. (only the first 'conf->nr_disks'
1023          * slots are used for 'real' disks and we must preserve this
1024          * property)
1025          */
1026         case DISKOP_SPARE_ACTIVE:
1027                 if (conf->start_future > 0) {
1028                         MD_BUG();
1029                         err = -EBUSY;
1030                         break;
1031                 }
1032                 sdisk = conf->mirrors + spare_disk;
1033                 fdisk = conf->mirrors + failed_disk;
1034
1035                 spare_desc = &sb->disks[sdisk->number];
1036                 failed_desc = &sb->disks[fdisk->number];
1037
1038                 if (spare_desc != *d) {
1039                         MD_BUG();
1040                         err = 1;
1041                         goto abort;
1042                 }
1043
1044                 if (spare_desc->raid_disk != sdisk->raid_disk) {
1045                         MD_BUG();
1046                         err = 1;
1047                         goto abort;
1048                 }
1049                         
1050                 if (sdisk->raid_disk != spare_disk) {
1051                         MD_BUG();
1052                         err = 1;
1053                         goto abort;
1054                 }
1055
1056                 if (failed_desc->raid_disk != fdisk->raid_disk) {
1057                         MD_BUG();
1058                         err = 1;
1059                         goto abort;
1060                 }
1061
1062                 if (fdisk->raid_disk != failed_disk) {
1063                         MD_BUG();
1064                         err = 1;
1065                         goto abort;
1066                 }
1067
1068                 /*
1069                  * do the switch finally
1070                  */
1071                 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1072                 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1073
1074                 /* There must be a spare_rdev, but there may not be a
1075                  * failed_rdev.  That slot might be empty...
1076                  */
1077                 spare_rdev->desc_nr = failed_desc->number;
1078                 if (failed_rdev)
1079                         failed_rdev->desc_nr = spare_desc->number;
1080                 
1081                 xchg_values(*spare_desc, *failed_desc);
1082                 xchg_values(*fdisk, *sdisk);
1083
1084                 /*
1085                  * (careful, 'failed' and 'spare' are switched from now on)
1086                  *
1087                  * we want to preserve linear numbering and we want to
1088                  * give the proper raid_disk number to the now activated
1089                  * disk. (this means we switch back these values)
1090                  */
1091         
1092                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1093                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1094                 xchg_values(spare_desc->number, failed_desc->number);
1095                 xchg_values(sdisk->number, fdisk->number);
1096
1097                 *d = failed_desc;
1098
1099                 if (sdisk->dev == MKDEV(0,0))
1100                         sdisk->used_slot = 0;
1101                 /*
1102                  * this really activates the spare.
1103                  */
1104                 fdisk->spare = 0;
1105                 fdisk->write_only = 0;
1106
1107                 /*
1108                  * if we activate a spare, we definitely replace a
1109                  * non-operational disk slot in the 'low' area of
1110                  * the disk array.
1111                  */
1112
1113                 conf->working_disks++;
1114
1115                 break;
1116
1117         case DISKOP_HOT_REMOVE_DISK:
1118                 rdisk = conf->mirrors + removed_disk;
1119
1120                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1121                         MD_BUG();       
1122                         err = 1;
1123                         goto abort;
1124                 }
1125                 rdisk->dev = MKDEV(0,0);
1126                 rdisk->used_slot = 0;
1127                 conf->nr_disks--;
1128                 break;
1129
1130         case DISKOP_HOT_ADD_DISK:
1131                 adisk = conf->mirrors + added_disk;
1132                 added_desc = *d;
1133
1134                 if (added_disk != added_desc->number) {
1135                         MD_BUG();       
1136                         err = 1;
1137                         goto abort;
1138                 }
1139
1140                 adisk->number = added_desc->number;
1141                 adisk->raid_disk = added_desc->raid_disk;
1142                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1143
1144                 adisk->operational = 0;
1145                 adisk->write_only = 0;
1146                 adisk->spare = 1;
1147                 adisk->used_slot = 1;
1148                 adisk->head_position = 0;
1149                 conf->nr_disks++;
1150
1151                 break;
1152
1153         default:
1154                 MD_BUG();       
1155                 err = 1;
1156                 goto abort;
1157         }
1158 abort:
1159         print_raid1_conf(conf);
1160         md_spin_unlock_irq(&conf->device_lock);
1161         if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1162                 /* should move to "END_REBUILD" when such exists */
1163                 raid1_shrink_buffers(conf);
1164
1165         return err;
1166 }
1167
1168
1169 #define IO_ERROR KERN_ALERT \
1170 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1171
1172 #define REDIRECT_SECTOR KERN_ERR \
1173 "raid1: %s: redirecting sector %lu to another mirror\n"
1174
1175 /*
1176  * This is a kernel thread which:
1177  *
1178  *      1.      Retries failed read operations on working mirrors.
1179  *      2.      Updates the raid superblock when problems are encountered.
1180  *      3.      Performs writes following reads for array synchronising.
1181  */
1182 static void end_sync_write(struct buffer_head *bh, int uptodate);
1183 static void end_sync_read(struct buffer_head *bh, int uptodate);
1184
1185 static void raid1d (void *data)
1186 {
1187         struct raid1_bh *r1_bh;
1188         struct buffer_head *bh;
1189         unsigned long flags;
1190         raid1_conf_t *conf = data;
1191         mddev_t *mddev = conf->mddev;
1192         kdev_t dev;
1193
1194         if (mddev->sb_dirty)
1195                 md_update_sb(mddev);
1196
1197         for (;;) {
1198                 md_spin_lock_irqsave(&retry_list_lock, flags);
1199                 r1_bh = raid1_retry_list;
1200                 if (!r1_bh)
1201                         break;
1202                 raid1_retry_list = r1_bh->next_r1;
1203                 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1204
1205                 mddev = r1_bh->mddev;
1206                 bh = &r1_bh->bh_req;
1207                 switch(r1_bh->cmd) {
1208                 case SPECIAL:
1209                         /* have to allocate lots of bh structures and
1210                          * schedule writes
1211                          */
1212                         if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1213                                 int i, sum_bhs = 0;
1214                                 int disks = MD_SB_DISKS;
1215                                 struct buffer_head *bhl, *mbh;
1216                                 
1217                                 conf = mddev_to_conf(mddev);
1218                                 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1219                                 spin_lock_irq(&conf->device_lock);
1220                                 for (i = 0; i < disks ; i++) {
1221                                         if (!conf->mirrors[i].operational)
1222                                                 continue;
1223                                         if (i==conf->last_used)
1224                                                 /* we read from here, no need to write */
1225                                                 continue;
1226                                         if (i < conf->raid_disks
1227                                             && !conf->resync_mirrors)
1228                                                 /* don't need to write this,
1229                                                  * we are just rebuilding */
1230                                                 continue;
1231                                         mbh = bhl;
1232                                         if (!mbh) {
1233                                                 MD_BUG();
1234                                                 break;
1235                                         }
1236                                         bhl = mbh->b_next;
1237                                         mbh->b_this_page = (struct buffer_head *)1;
1238
1239                                                 
1240                                 /*
1241                                  * prepare mirrored bh (fields ordered for max mem throughput):
1242                                  */
1243                                         mbh->b_blocknr    = bh->b_blocknr;
1244                                         mbh->b_dev        = conf->mirrors[i].dev;
1245                                         mbh->b_rdev       = conf->mirrors[i].dev;
1246                                         mbh->b_rsector    = bh->b_blocknr;
1247                                         mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1248                                                 (1<<BH_Mapped) | (1<<BH_Lock);
1249                                         atomic_set(&mbh->b_count, 1);
1250                                         mbh->b_size       = bh->b_size;
1251                                         mbh->b_page       = bh->b_page;
1252                                         mbh->b_data       = bh->b_data;
1253                                         mbh->b_list       = BUF_LOCKED;
1254                                         mbh->b_end_io     = end_sync_write;
1255                                         mbh->b_private    = r1_bh;
1256
1257                                         mbh->b_next = r1_bh->mirror_bh_list;
1258                                         r1_bh->mirror_bh_list = mbh;
1259
1260                                         sum_bhs++;
1261                                 }
1262                                 spin_unlock_irq(&conf->device_lock);
1263                                 md_atomic_set(&r1_bh->remaining, sum_bhs);
1264                                 if (bhl) raid1_free_bh(conf, bhl);
1265                                 mbh = r1_bh->mirror_bh_list;
1266
1267                                 if (!sum_bhs) {
1268                                         /* nowhere to write this to... I guess we
1269                                          * must be done
1270                                          */
1271                                         sync_request_done(bh->b_blocknr, conf);
1272                                         md_done_sync(mddev, bh->b_size>>9, 0);
1273                                         raid1_free_buf(r1_bh);
1274                                 } else
1275                                 while (mbh) {
1276                                         struct buffer_head *bh1 = mbh;
1277                                         mbh = mbh->b_next;
1278                                         generic_make_request(WRITE, bh1);
1279                                         md_sync_acct(bh1->b_dev, bh1->b_size/512);
1280                                 }
1281                         } else {
1282                                 /* There is no point trying a read-for-reconstruct
1283                                  * as reconstruct is about to be aborted
1284                                  */
1285
1286                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1287                                 md_done_sync(mddev, bh->b_size>>9, 0);
1288                         }
1289
1290                         break;
1291                 case READ:
1292                 case READA:
1293                         dev = bh->b_dev;
1294                         raid1_map (mddev, &bh->b_dev);
1295                         if (bh->b_dev == dev) {
1296                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1297                                 raid1_end_bh_io(r1_bh, 0);
1298                         } else {
1299                                 printk (REDIRECT_SECTOR,
1300                                         partition_name(bh->b_dev), bh->b_blocknr);
1301                                 bh->b_rdev = bh->b_dev;
1302                                 bh->b_rsector = bh->b_blocknr;
1303                                 generic_make_request (r1_bh->cmd, bh);
1304                         }
1305                         break;
1306                 }
1307         }
1308         md_spin_unlock_irqrestore(&retry_list_lock, flags);
1309 }
1310 #undef IO_ERROR
1311 #undef REDIRECT_SECTOR
1312
1313 /*
1314  * Private kernel thread to reconstruct mirrors after an unclean
1315  * shutdown.
1316  */
1317 static void raid1syncd (void *data)
1318 {
1319         raid1_conf_t *conf = data;
1320         mddev_t *mddev = conf->mddev;
1321
1322         if (!conf->resync_mirrors)
1323                 return;
1324         if (conf->resync_mirrors == 2)
1325                 return;
1326         down(&mddev->recovery_sem);
1327         if (!md_do_sync(mddev, NULL)) {
1328                 /*
1329                  * Only if everything went Ok.
1330                  */
1331                 conf->resync_mirrors = 0;
1332         }
1333
1334         close_sync(conf);
1335
1336         up(&mddev->recovery_sem);
1337         raid1_shrink_buffers(conf);
1338
1339         md_recover_arrays(); /* in case we are degraded and a spare is available */
1340 }
1341
1342 /*
1343  * perform a "sync" on one "block"
1344  *
1345  * We need to make sure that no normal I/O request - particularly write
1346  * requests - conflict with active sync requests.
1347  * This is achieved by conceptually dividing the device space into a
1348  * number of sections:
1349  *  DONE: 0 .. a-1     These blocks are in-sync
1350  *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1351  *                     no normal IO requests
1352  *  READY: b .. c-1    These blocks have no normal IO requests - sync
1353  *                     request may be happening
1354  *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1355  *                     ones will be added
1356  *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1357  *                     be happening, but not sync
1358  *
1359  * We keep a
1360  *   phase    which flips (0 or 1) each time d moves and
1361  * a count of:
1362  *   z =  active io requests in FUTURE since d moved - marked with
1363  *        current phase
1364  *   y =  active io requests in FUTURE before d moved, or PENDING -
1365  *        marked with previous phase
1366  *   x =  active sync requests in READY
1367  *   w =  active sync requests in ACTIVE
1368  *   v =  active io requests in DONE
1369  *
1370  * Normally, a=b=c=d=0 and z= active io requests
1371  *   or a=b=c=d=END and v= active io requests
1372  * Allowed changes to a,b,c,d:
1373  * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1374  * B:  y==0 -> c=d
1375  * C:   b=c, w+=x, x=0
1376  * D:  w==0 -> a=b
1377  * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1378  *
1379  * At start of sync we apply A.
1380  * When y reaches 0, we apply B then A, then begin sync requests.
1381  * When the sync point reaches c-1, we wait for y==0 and w==0, and
1382  * then apply B then A then D then C.
1383  * Finally, we apply E
1384  *
1385  * The sync request simply issues a "read" against a working drive
1386  * This is marked so that on completion the raid1d thread is woken to
1387  * issue suitable write requests
1388  */
1389
1390 static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1391 {
1392         raid1_conf_t *conf = mddev_to_conf(mddev);
1393         struct mirror_info *mirror;
1394         struct raid1_bh *r1_bh;
1395         struct buffer_head *bh;
1396         int bsize;
1397         int disk;
1398         int block_nr;
1399         int buffs;
1400         kdev_t dev;
1401
1402         if (!sector_nr) {
1403                 /* we want enough buffers to hold twice the window of 128 */
1404                 buffs = 128 *2 / (PAGE_SIZE>>9);
1405                 buffs = raid1_grow_buffers(conf, buffs);
1406                 if (buffs < 2)
1407                         goto nomem;
1408                 conf->window = buffs*(PAGE_SIZE>>9)/2;
1409         }
1410         spin_lock_irq(&conf->segment_lock);
1411         if (!sector_nr) {
1412                 /* initialize ...*/
1413                 conf->start_active = 0;
1414                 conf->start_ready = 0;
1415                 conf->start_pending = 0;
1416                 conf->start_future = 0;
1417                 conf->phase = 0;
1418                 
1419                 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1420                 conf->cnt_done = conf->cnt_pending = 0;
1421                 if (conf->cnt_ready || conf->cnt_active)
1422                         MD_BUG();
1423         }
1424         while (sector_nr >= conf->start_pending) {
1425                 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1426                         sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1427                         conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1428                 wait_event_lock_irq(conf->wait_done,
1429                                         !conf->cnt_active,
1430                                         conf->segment_lock);
1431                 wait_event_lock_irq(conf->wait_ready,
1432                                         !conf->cnt_pending,
1433                                         conf->segment_lock);
1434                 conf->start_active = conf->start_ready;
1435                 conf->start_ready = conf->start_pending;
1436                 conf->start_pending = conf->start_future;
1437                 conf->start_future = conf->start_future+conf->window;
1438                 /* Note: falling off the end is not a problem */
1439                 conf->phase = conf->phase ^1;
1440                 conf->cnt_active = conf->cnt_ready;
1441                 conf->cnt_ready = 0;
1442                 conf->cnt_pending = conf->cnt_future;
1443                 conf->cnt_future = 0;
1444                 wake_up(&conf->wait_done);
1445         }
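        /* count this sync request against the READY segment (x above) */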
1446         conf->cnt_ready++;
1447         spin_unlock_irq(&conf->segment_lock);
1448                 
1449
1450         /* If reconstructing and there is more than one working disk,
1451          * we could dedicate one to the rebuild and the others to
1452          * servicing read requests ..
1453          */
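        /* pick an operational mirror to read from, starting the search
         * at the most recently used disk
         */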
1454         spin_lock_irq(&conf->device_lock);
1455         disk = conf->last_used;
1456         /* make sure disk is operational */
1457         while (!conf->mirrors[disk].operational) {
1458                 if (disk <= 0) disk = conf->raid_disks;
1459                 disk--;
1460                 if (disk == conf->last_used)
1461                         break;
1462         }
1463         conf->last_used = disk;
1464         
1465         mirror = conf->mirrors+conf->last_used;
1466         dev = mirror->dev;
1467         spin_unlock_irq(&conf->device_lock);
1468         
1469         r1_bh = raid1_alloc_buf (conf);
1470         r1_bh->master_bh = NULL;
1471         r1_bh->mddev = mddev;
1472         r1_bh->cmd = SPECIAL;
1473         bh = &r1_bh->bh_req;
1474
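        /*
         * Grow the transfer: keep doubling bsize, up to one page, while
         * the sector stays aligned to the larger size and the bigger
         * block still fits inside the device, so each sync read covers
         * as many sectors as possible.
         */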
1475         block_nr = sector_nr;
1476         bsize = 512;
1477         while (!(block_nr & 1) && bsize < PAGE_SIZE
1478                         && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
1479                 block_nr >>= 1;
1480                 bsize <<= 1;
1481         }
1482         bh->b_size = bsize;
1483         bh->b_list = BUF_LOCKED;
1484         bh->b_dev = dev;
1485         bh->b_rdev = dev;
1486         bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1487         if (!bh->b_page)
1488                 BUG();
1489         if (!bh->b_data)
1490                 BUG();
1491         if (bh->b_data != page_address(bh->b_page))
1492                 BUG();
1493         bh->b_end_io = end_sync_read;
1494         bh->b_private = r1_bh;
1495         bh->b_blocknr = sector_nr;
1496         bh->b_rsector = sector_nr;
1497         init_waitqueue_head(&bh->b_wait);
1498
1499         generic_make_request(READ, bh);
1500         md_sync_acct(bh->b_dev, bh->b_size/512);
1501
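        /* tell md how many sectors this sync request covers */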
1502         return (bsize >> 9);
1503
1504 nomem:
1505         raid1_shrink_buffers(conf);
1506         return -ENOMEM;
1507 }
1508
1509 static void end_sync_read(struct buffer_head *bh, int uptodate)
1510 {
1511         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1512
1513         /* we have read a block, now it needs to be re-written,
1514          * or re-read if the read failed.
1515          * We don't do much here, just schedule handling by raid1d
1516          */
1517         if (!uptodate)
1518                 md_error (r1_bh->mddev, bh->b_dev);
1519         else
1520                 set_bit(R1BH_Uptodate, &r1_bh->state);
1521         raid1_reschedule_retry(r1_bh);
1522 }
1523
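/*
 * completion handler for the mirror writes that raid1d issues for a
 * sync block: once the last write finishes, release the r1_bh, record
 * the sector as done (sync_request_done) and report progress to md.
 */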
1524 static void end_sync_write(struct buffer_head *bh, int uptodate)
1525 {
1526         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1527         
1528         if (!uptodate)
1529                 md_error (r1_bh->mddev, bh->b_dev);
1530         if (atomic_dec_and_test(&r1_bh->remaining)) {
1531                 mddev_t *mddev = r1_bh->mddev;
1532                 unsigned long sect = bh->b_blocknr;
1533                 int size = bh->b_size;
1534                 raid1_free_buf(r1_bh);
1535                 sync_request_done(sect, mddev_to_conf(mddev));
1536                 md_done_sync(mddev,size>>9, uptodate);
1537         }
1538 }
1539
1540 #define INVALID_LEVEL KERN_WARNING \
1541 "raid1: md%d: raid level not set to mirroring (%d)\n"
1542
1543 #define NO_SB KERN_ERR \
1544 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1545
1546 #define ERRORS KERN_ERR \
1547 "raid1: disabled mirror %s (errors detected)\n"
1548
1549 #define NOT_IN_SYNC KERN_ERR \
1550 "raid1: disabled mirror %s (not in sync)\n"
1551
1552 #define INCONSISTENT KERN_ERR \
1553 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1554
1555 #define ALREADY_RUNNING KERN_ERR \
1556 "raid1: disabled mirror %s (mirror %d already operational)\n"
1557
1558 #define OPERATIONAL KERN_INFO \
1559 "raid1: device %s operational as mirror %d\n"
1560
1561 #define MEM_ERROR KERN_ERR \
1562 "raid1: couldn't allocate memory for md%d\n"
1563
1564 #define SPARE KERN_INFO \
1565 "raid1: spare disk %s\n"
1566
1567 #define NONE_OPERATIONAL KERN_ERR \
1568 "raid1: no operational mirrors for md%d\n"
1569
1570 #define ARRAY_IS_ACTIVE KERN_INFO \
1571 "raid1: raid set md%d active with %d out of %d mirrors\n"
1572
1573 #define THREAD_ERROR KERN_ERR \
1574 "raid1: couldn't allocate thread for md%d\n"
1575
1576 #define START_RESYNC KERN_WARNING \
1577 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1578
1579 static int raid1_run (mddev_t *mddev)
1580 {
1581         raid1_conf_t *conf;
1582         int i, j, disk_idx;
1583         struct mirror_info *disk;
1584         mdp_super_t *sb = mddev->sb;
1585         mdp_disk_t *descriptor;
1586         mdk_rdev_t *rdev;
1587         struct md_list_head *tmp;
1588         int start_recovery = 0;
1589
1590         MOD_INC_USE_COUNT;
1591
1592         if (sb->level != 1) {
1593                 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1594                 goto out;
1595         }
1596         /*
1597          * copy the already verified devices into our private RAID1
1598          * bookkeeping area. [whatever we allocate in raid1_run()
1599          * should be freed in raid1_stop()]
1600          */
1601
1602         conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1603         mddev->private = conf;
1604         if (!conf) {
1605                 printk(MEM_ERROR, mdidx(mddev));
1606                 goto out;
1607         }
1608         memset(conf, 0, sizeof(*conf));
1609
1610         ITERATE_RDEV(mddev,rdev,tmp) {
1611                 if (rdev->faulty) {
1612                         printk(ERRORS, partition_name(rdev->dev));
1613                 } else {
1614                         if (!rdev->sb) {
1615                                 MD_BUG();
1616                                 continue;
1617                         }
1618                 }
1619                 if (rdev->desc_nr == -1) {
1620                         MD_BUG();
1621                         continue;
1622                 }
1623                 descriptor = &sb->disks[rdev->desc_nr];
1624                 disk_idx = descriptor->raid_disk;
1625                 disk = conf->mirrors + disk_idx;
1626
1627                 if (disk_faulty(descriptor)) {
1628                         disk->number = descriptor->number;
1629                         disk->raid_disk = disk_idx;
1630                         disk->dev = rdev->dev;
1631                         disk->sect_limit = MAX_WORK_PER_DISK;
1632                         disk->operational = 0;
1633                         disk->write_only = 0;
1634                         disk->spare = 0;
1635                         disk->used_slot = 1;
1636                         disk->head_position = 0;
1637                         continue;
1638                 }
1639                 if (disk_active(descriptor)) {
1640                         if (!disk_sync(descriptor)) {
1641                                 printk(NOT_IN_SYNC,
1642                                         partition_name(rdev->dev));
1643                                 continue;
1644                         }
1645                         if ((descriptor->number >= MD_SB_DISKS) ||
1646                                          (disk_idx >= sb->raid_disks)) {
1647
1648                                 printk(INCONSISTENT,
1649                                         partition_name(rdev->dev));
1650                                 continue;
1651                         }
1652                         if (disk->operational) {
1653                                 printk(ALREADY_RUNNING,
1654                                         partition_name(rdev->dev),
1655                                         disk_idx);
1656                                 continue;
1657                         }
1658                         printk(OPERATIONAL, partition_name(rdev->dev),
1659                                         disk_idx);
1660                         disk->number = descriptor->number;
1661                         disk->raid_disk = disk_idx;
1662                         disk->dev = rdev->dev;
1663                         disk->sect_limit = MAX_WORK_PER_DISK;
1664                         disk->operational = 1;
1665                         disk->write_only = 0;
1666                         disk->spare = 0;
1667                         disk->used_slot = 1;
1668                         disk->head_position = 0;
1669                         conf->working_disks++;
1670                 } else {
1671                         /*
1672                          * Must be a spare disk ..
1673                          */
1674                         printk(SPARE, partition_name(rdev->dev));
1675                         disk->number = descriptor->number;
1676                         disk->raid_disk = disk_idx;
1677                         disk->dev = rdev->dev;
1678                         disk->sect_limit = MAX_WORK_PER_DISK;
1679                         disk->operational = 0;
1680                         disk->write_only = 0;
1681                         disk->spare = 1;
1682                         disk->used_slot = 1;
1683                         disk->head_position = 0;
1684                 }
1685         }
1686         conf->raid_disks = sb->raid_disks;
1687         conf->nr_disks = sb->nr_disks;
1688         conf->mddev = mddev;
1689         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1690
1691         conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1692         init_waitqueue_head(&conf->wait_buffer);
1693         init_waitqueue_head(&conf->wait_done);
1694         init_waitqueue_head(&conf->wait_ready);
1695
1696         if (!conf->working_disks) {
1697                 printk(NONE_OPERATIONAL, mdidx(mddev));
1698                 goto out_free_conf;
1699         }
1700
1701
1702         /* pre-allocate some buffer_head structures.
1703          * As a minimum, 1 r1bh and raid_disks buffer_heads
1704          * would probably get us by in tight memory situations,
1705          * but a few more is probably a good idea.
1706          * For now, try NR_RESERVED_BUFS r1bh and
1707          * NR_RESERVED_BUFS*raid_disks bufferheads
1708          * This will allow at least NR_RESERVED_BUFS concurrent
1709          * reads or writes even if kmalloc starts failing
1710          */
1711         if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1712             raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1713                               < NR_RESERVED_BUFS*conf->raid_disks) {
1714                 printk(MEM_ERROR, mdidx(mddev));
1715                 goto out_free_conf;
1716         }
1717
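        /*
         * Also mark the slots of failed disks that no longer have an
         * rdev attached: used but not operational, with no device.
         */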
1718         for (i = 0; i < MD_SB_DISKS; i++) {
1719                 
1720                 descriptor = sb->disks+i;
1721                 disk_idx = descriptor->raid_disk;
1722                 disk = conf->mirrors + disk_idx;
1723
1724                 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1725                                 !disk->used_slot) {
1726
1727                         disk->number = descriptor->number;
1728                         disk->raid_disk = disk_idx;
1729                         disk->dev = MKDEV(0,0);
1730
1731                         disk->operational = 0;
1732                         disk->write_only = 0;
1733                         disk->spare = 0;
1734                         disk->used_slot = 1;
1735                         disk->head_position = 0;
1736                 }
1737         }
1738
1739         /*
1740          * find the first working one and use it as a starting point
1741          * for read balancing.
1742          */
1743         for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1744                 /* nothing */;
1745         conf->last_used = j;
1746
1747
1748
1749         {
1750                 const char * name = "raid1d";
1751
1752                 conf->thread = md_register_thread(raid1d, conf, name);
1753                 if (!conf->thread) {
1754                         printk(THREAD_ERROR, mdidx(mddev));
1755                         goto out_free_conf;
1756                 }
1757         }
1758
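        /*
         * If the array was not shut down cleanly and at least two disks
         * work, kick off a full mirror resync via raid1syncd; otherwise,
         * if some mirrors are missing, ask the md core to start recovery.
         */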
1759         if (!(sb->state & (1 << MD_SB_CLEAN)) &&
1760             (conf->working_disks > 1)) {
1761                 const char * name = "raid1syncd";
1762
1763                 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1764                 if (!conf->resync_thread) {
1765                         printk(THREAD_ERROR, mdidx(mddev));
1766                         goto out_free_conf;
1767                 }
1768
1769                 printk(START_RESYNC, mdidx(mddev));
1770                 conf->resync_mirrors = 1;
1771                 md_wakeup_thread(conf->resync_thread);
1772         } else if (conf->working_disks != sb->raid_disks) {
1773                 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1774                 start_recovery = 1;
1775         }
1776
1777         /*
1778          * Regenerate the "device is in sync with the raid set" bit for
1779          * each device.
1780          */
1781         for (i = 0; i < MD_SB_DISKS; i++) {
1782                 mark_disk_nonsync(sb->disks+i);
1783                 for (j = 0; j < sb->raid_disks; j++) {
1784                         if (!conf->mirrors[j].operational)
1785                                 continue;
1786                         if (sb->disks[i].number == conf->mirrors[j].number)
1787                                 mark_disk_sync(sb->disks+i);
1788                 }
1789         }
1790         sb->active_disks = conf->working_disks;
1791
1792         if (start_recovery)
1793                 md_recover_arrays();
1794
1795
1796         printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1797         /*
1798          * Ok, everything is just fine now
1799          */
1800         return 0;
1801
1802 out_free_conf:
1803         raid1_shrink_r1bh(conf);
1804         raid1_shrink_bh(conf);
1805         raid1_shrink_buffers(conf);
1806         kfree(conf);
1807         mddev->private = NULL;
1808 out:
1809         MOD_DEC_USE_COUNT;
1810         return -EIO;
1811 }
1812
1813 #undef INVALID_LEVEL
1814 #undef NO_SB
1815 #undef ERRORS
1816 #undef NOT_IN_SYNC
1817 #undef INCONSISTENT
1818 #undef ALREADY_RUNNING
1819 #undef OPERATIONAL
1820 #undef SPARE
1821 #undef NONE_OPERATIONAL
1822 #undef ARRAY_IS_ACTIVE
1823
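/*
 * resync_mirrors acts as a small state flag here: raid1_run sets it to 1
 * when raid1syncd is started, raid1_stop_resync sets it to 2 when the
 * resync is interrupted so it can be restarted later, and
 * raid1_restart_resync puts it back to 1 and wakes the thread.
 */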
1824 static int raid1_stop_resync (mddev_t *mddev)
1825 {
1826         raid1_conf_t *conf = mddev_to_conf(mddev);
1827
1828         if (conf->resync_thread) {
1829                 if (conf->resync_mirrors) {
1830                         conf->resync_mirrors = 2;
1831                         md_interrupt_thread(conf->resync_thread);
1832
1833                         printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1834                         return 1;
1835                 }
1836                 return 0;
1837         }
1838         return 0;
1839 }
1840
1841 static int raid1_restart_resync (mddev_t *mddev)
1842 {
1843         raid1_conf_t *conf = mddev_to_conf(mddev);
1844
1845         if (conf->resync_mirrors) {
1846                 if (!conf->resync_thread) {
1847                         MD_BUG();
1848                         return 0;
1849                 }
1850                 conf->resync_mirrors = 1;
1851                 md_wakeup_thread(conf->resync_thread);
1852                 return 1;
1853         }
1854         return 0;
1855 }
1856
1857 static int raid1_stop (mddev_t *mddev)
1858 {
1859         raid1_conf_t *conf = mddev_to_conf(mddev);
1860
1861         md_unregister_thread(conf->thread);
1862         if (conf->resync_thread)
1863                 md_unregister_thread(conf->resync_thread);
1864         raid1_shrink_r1bh(conf);
1865         raid1_shrink_bh(conf);
1866         raid1_shrink_buffers(conf);
1867         kfree(conf);
1868         mddev->private = NULL;
1869         MOD_DEC_USE_COUNT;
1870         return 0;
1871 }
1872
1873 static mdk_personality_t raid1_personality=
1874 {
1875         name:           "raid1",
1876         make_request:   raid1_make_request,
1877         run:            raid1_run,
1878         stop:           raid1_stop,
1879         status:         raid1_status,
1880         error_handler:  raid1_error,
1881         diskop:         raid1_diskop,
1882         stop_resync:    raid1_stop_resync,
1883         restart_resync: raid1_restart_resync,
1884         sync_request:   raid1_sync_request
1885 };
1886
1887 static int md__init raid1_init (void)
1888 {
1889         return register_md_personality (RAID1, &raid1_personality);
1890 }
1891
1892 static void raid1_exit (void)
1893 {
1894         unregister_md_personality (RAID1);
1895 }
1896
1897 module_init(raid1_init);
1898 module_exit(raid1_exit);
1899 MODULE_LICENSE("GPL");