2 hptraid.c Copyright (C) 2001 Red Hat, Inc. All rights reserved.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 You should have received a copy of the GNU General Public License
10 (for example /usr/src/linux/COPYING); if not, write to the Free
11 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13 Authors: Arjan van de Ven <arjanv@redhat.com>
16 Copyleft (C) 2001 by Wilfried Weissmann <wweissmann@gmx.at>
17 Copyright (C) 1994-96 Marc ZYNGIER <zyngier@ufr-info-p7.ibp.fr>
18 Based on work done by Søren Schmidt for FreeBSD
21 19.08.2003 v0.03 wweissmann@gmx.at
22 * register the raid volume only if all disks are available
23 * print a warning that raid-(0+)1 failover is not supported
25 15.06.2003 v0.02 wweissmann@gmx.at
26 * correct values of raid-1 superblock
27 * re-add check for availability of all disks
28 * fix offset bug in raid-1 (introduced in raid 0+1 implementation)
30 14.06.2003 wweissmann@gmx.at
31 * superblock has wrong "disks" value on raid-1
32 * fixup for raid-1 disknumbering
33 * do _NOT_ align size to 255*63 boundary
34 I WILL NOT USE FDISK TO DETERMINE THE VOLUME SIZE.
35 I WILL NOT USE FDISK TO DETERMINE THE VOLUME SIZE.
36 I WILL NOT USE FDISK TO DETERMINE THE VOLUME SIZE.
39 13.06.2003 wweissmann@gmx.at
41 * check if all disks of an array are available
44 29.05.2003 wweissmann@gmx.at
45 * release no more devices than available on unload
46 * remove static variables in raid-1 read path
50 #include <linux/module.h>
51 #include <linux/init.h>
52 #include <linux/sched.h>
53 #include <linux/smp_lock.h>
54 #include <linux/kernel.h>
55 #include <linux/blkdev.h>
56 #include <linux/blkpg.h>
57 #include <linux/genhd.h>
58 #include <linux/ioctl.h>
60 #include <linux/ide.h>
61 #include <asm/uaccess.h>
/*
 * Forward declarations for the block-device entry points registered in
 * the raid_device_operations tables below: open/release/ioctl plus one
 * make_request handler per RAID personality (SPAN, 0, 1, 0+1).
 */
67 static int hptraid_open(struct inode * inode, struct file * filp);
68 static int hptraid_release(struct inode * inode, struct file * filp);
69 static int hptraid_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg);
70 static int hptraidspan_make_request (request_queue_t *q, int rw, struct buffer_head * bh);
71 static int hptraid0_make_request (request_queue_t *q, int rw, struct buffer_head * bh);
72 static int hptraid1_make_request (request_queue_t *q, int rw, struct buffer_head * bh);
73 static int hptraid01_make_request (request_queue_t *q, int rw, struct buffer_head * bh);
/*
 * Members of struct hptdisk (per member disk) — the struct headers are
 * elided in this listing; grouping inferred from the field names used in
 * the code below (raid->disk[i].device/.sectors/.bdev/.last_pos).
 */
78 kdev_t device; /* disk-ID/raid 0+1 volume-ID */
79 unsigned long sectors;
80 struct block_device *bdev;
81 unsigned long last_pos;
/*
 * Members of struct hptraid (per RAID volume).  cutoff[]/cutoff_disks[]
 * implement the unequal-disk-size striping scheme documented in
 * hptraid0_compute_request().
 */
85 unsigned int stride; /* stripesize */
86 unsigned int disks; /* number of disks in array */
87 unsigned long sectors; /* disksize in sectors */
92 int previous; /* most recently accessed disk in mirror */
93 struct hptdisk disk[8];
94 unsigned long cutoff[8]; /* raid 0 cutoff */
95 unsigned int cutoff_disks[8];
96 struct hptraid * raid01; /* sub arrays for raid 0+1 */
/*
 * Candidate IDE drives to probe: {major, minor, device}.  device == -1
 * marks the slot as not yet claimed by any array (see the "disk is
 * occupied?" check in probedisk()).  Only the minor-64 (second unit)
 * entries are visible in this listing; the minor-0 entries appear to be
 * elided -- TODO confirm against the full source.
 */
105 static struct hptraid_dev devlist[]=
109 {IDE0_MAJOR, 64, -1},
111 {IDE1_MAJOR, 64, -1},
113 {IDE2_MAJOR, 64, -1},
115 {IDE3_MAJOR, 64, -1},
117 {IDE4_MAJOR, 64, -1},
119 {IDE5_MAJOR, 64, -1},
/*
 * One operations vector per RAID personality.  All four share the same
 * open/release/ioctl handlers and differ only in the make_request hook.
 * (Old GNU C "label:" initializer syntax; the open: members are elided
 * in this listing.)
 */
124 static struct raid_device_operations hptraidspan_ops = {
126 release: hptraid_release,
127 ioctl: hptraid_ioctl,
128 make_request: hptraidspan_make_request
131 static struct raid_device_operations hptraid0_ops = {
133 release: hptraid_release,
134 ioctl: hptraid_ioctl,
135 make_request: hptraid0_make_request
138 static struct raid_device_operations hptraid1_ops = {
140 release: hptraid_release,
141 ioctl: hptraid_ioctl,
142 make_request: hptraid1_make_request
146 static struct raid_device_operations hptraid01_ops = {
148 release: hptraid_release,
149 ioctl: hptraid_ioctl,
150 make_request: hptraid01_make_request
/*
 * Probe table consumed by hptraid_init(): pairs each ops vector with the
 * HighPoint superblock type id and a human-readable label.  __initdata:
 * discarded after module init.
 */
153 static __initdata struct {
154 struct raid_device_operations *op;
158 {&hptraid0_ops, HPT_T_RAID_0, "RAID 0"},
159 {&hptraid1_ops, HPT_T_RAID_1, "RAID 1"},
160 {&hptraidspan_ops, HPT_T_SPAN, "SPAN"},
161 {&hptraid01_ops, HPT_T_RAID_01_RAID_0, "RAID 0+1"},
/* State for up to 14 assembled RAID volumes, indexed by ataraid device. */
165 static struct hptraid raid[14];
/*
 * hptraid_ioctl() - ioctl entry point for hptraid block devices.
 *
 * Visible cases: BLKGETSIZE returns the size in sectors (partition size
 * for a partition minor, whole-volume size otherwise); HDIO_GETGEO and
 * HDIO_GETGEO_BIG report a synthetic geometry with cylinders derived as
 * sectors/63/255 (i.e. a faked 255-head/63-sector translation).  Anything
 * unhandled falls through to the generic blk_ioctl().
 * NOTE(review): several lines (braces, the switch header, `val`
 * assignments) are elided in this listing.
 */
167 static int hptraid_ioctl(struct inode *inode, struct file *file,
168 unsigned int cmd, unsigned long arg)
172 unsigned long sectors;
174 if (!inode || !inode->i_rdev)
/* device index within raid[]: strip the partition bits of the minor */
177 minor = MINOR(inode->i_rdev)>>SHIFT;
180 case BLKGETSIZE: /* Return device size */
181 if (!arg) return -EINVAL;
182 sectors = ataraid_gendisk.part[MINOR(inode->i_rdev)].nr_sects;
/* low 4 minor bits != 0 => a partition, not the whole volume */
183 if (MINOR(inode->i_rdev)&15)
184 return put_user(sectors, (unsigned long *) arg);
185 return put_user(raid[minor].sectors , (unsigned long *) arg);
191 struct hd_geometry *loc = (struct hd_geometry *) arg;
192 unsigned short bios_cyl;
194 if (!loc) return -EINVAL;
196 if (put_user(val, (byte *) &loc->heads)) return -EFAULT;
198 if (put_user(val, (byte *) &loc->sectors)) return -EFAULT;
199 bios_cyl = raid[minor].sectors/63/255;
200 if (put_user(bios_cyl, (unsigned short *) &loc->cylinders)) return -EFAULT;
201 if (put_user((unsigned)ataraid_gendisk.part[MINOR(inode->i_rdev)].start_sect,
202 (unsigned long *) &loc->start)) return -EFAULT;
206 case HDIO_GETGEO_BIG:
/* same as HDIO_GETGEO but with a 32-bit cylinders field */
208 struct hd_big_geometry *loc = (struct hd_big_geometry *) arg;
209 unsigned int bios_cyl;
210 if (!loc) return -EINVAL;
212 if (put_user(val, (byte *) &loc->heads)) return -EFAULT;
214 if (put_user(val, (byte *) &loc->sectors)) return -EFAULT;
215 bios_cyl = raid[minor].sectors/63/255;
216 if (put_user(bios_cyl, (unsigned int *) &loc->cylinders)) return -EFAULT;
217 if (put_user((unsigned)ataraid_gendisk.part[MINOR(inode->i_rdev)].start_sect,
218 (unsigned long *) &loc->start)) return -EFAULT;
/* everything else: delegate to the generic block ioctl handler */
223 return blk_ioctl(inode->i_rdev, cmd, arg);
/*
 * hptraidspan_make_request() - I/O routing for SPAN (linear concat)
 * volumes: walk the member disks subtracting each disk's usable size
 * from the requested sector until the target disk is found, then
 * redirect the buffer_head to that disk.  A request that would cross a
 * disk boundary must be split by the caller.
 */
230 static int hptraidspan_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
235 struct hptraid *thisraid;
237 rsect = bh->b_rsector;
/* volume index is encoded in the high bits of the minor */
239 device = (bh->b_rdev >> SHIFT)&MAJOR_MASK;
240 thisraid = &raid[device];
243 * Partitions need adding of the start sector of the partition to the
247 rsect += ataraid_gendisk.part[MINOR(bh->b_rdev)].start_sect;
249 for (disk=0;disk<thisraid->disks;disk++) {
252 // the "on next disk" condition check is a bit odd
253 if (thisraid->disk[disk].sectors > rsect+1)
/* NOTE(review): usable size differs by 11 vs 1 sectors depending on
 * disk position -- presumably superblock reservation; confirm against
 * the HighPoint on-disk format */
255 rsect-=thisraid->disk[disk].sectors-(disk?11:1);
258 // request spans over 2 disks => request must be split
259 if(rsect+bh->b_size/512 >= thisraid->disk[disk].sectors)
263 * The new BH_Lock semantics in ll_rw_blk.c guarantee that this
264 * is the only IO operation happening on this bh.
267 bh->b_rdev = thisraid->disk[disk].device;
268 bh->b_rsector = rsect;
271 * Let the main block layer submit the IO and resolve recursion:
/*
 * hptraid0_compute_request() - translate a volume-relative sector into a
 * (member disk, disk-relative sector) pair for a RAID-0 stripe set,
 * rewriting bh->b_rdev and bh->b_rsector in place.  Handles members of
 * unequal size via the precomputed cutoff[]/cutoff_disks[] tables (see
 * fill_cutoff()).  Shared by raid 0 and the raid 0+1 sub-arrays.
 * A request crossing a stride boundary must be split by the caller
 * (the visible code detects that case; the return statement is elided
 * in this listing, but callers treat a return != 1 as "split needed").
 */
276 static int hptraid0_compute_request (struct hptraid *thisraid,
278 int rw, struct buffer_head * bh)
280 unsigned long rsect_left,rsect_accum = 0;
282 unsigned int disk=0,real_disk=0;
285 /* Ok. We need to modify this sector number to a new disk + new sector
287 * If there are disks of different sizes, this gets tricky.
288 * Example with 3 disks (1Gb, 4Gb and 5 GB):
289 * The first 3 Gb of the "RAID" are evenly spread over the 3 disks.
290 * Then things get interesting. The next 2Gb (RAID view) are spread
291 * across disk 2 and 3 and the last 1Gb is disk 3 only.
293 * the way this is solved is like this: We have a list of "cutoff"
294 * points where everytime a disk falls out of the "higher" count, we
295 * mark the max sector. So once we pass a cutoff point, we have to
296 * divide by one less.
/* a zero stride would divide by zero below */
299 if (thisraid->stride==0)
303 * Woops we need to split the request to avoid crossing a stride
/* first and last sector of the request fall in different stripes? */
306 if ((bh->b_rsector/thisraid->stride) !=
307 ((bh->b_rsector+(bh->b_size/512)-1)/thisraid->stride)) {
311 rsect_left = bh->b_rsector;;
314 if (thisraid->cutoff_disks[i]==0)
316 if (bh->b_rsector > thisraid->cutoff[i]) {
317 /* we're in the wrong area so far */
318 rsect_left -= thisraid->cutoff[i];
319 rsect_accum += thisraid->cutoff[i] /
320 thisraid->cutoff_disks[i];
/* stripe arithmetic within the zone that contains the sector */
322 block = rsect_left / thisraid->stride;
323 disk = block % thisraid->cutoff_disks[i];
324 block = (block / thisraid->cutoff_disks[i]) *
326 bh->b_rsector = rsect_accum +
327 (rsect_left % thisraid->stride) + block;
/* map the logical stripe-member index back to a physical disk slot */
333 if ((disk==0) && (thisraid->disk[i].sectors > rsect_accum)) {
337 if ((disk>0) && (thisraid->disk[i].sectors >= rsect_accum)) {
344 /* All but the first disk have a 10 sector offset */
350 * The new BH_Lock semantics in ll_rw_blk.c guarantee that this
351 * is the only IO operation happening on this bh.
354 bh->b_rdev = thisraid->disk[disk].device;
357 * Let the main block layer submit the IO and resolve recursion:
/*
 * hptraid0_make_request() - RAID-0 entry point: apply the partition
 * offset, then delegate the disk/sector translation to
 * hptraid0_compute_request().  On a "must split" result the original
 * sector is restored so the block layer can retry with a smaller bh.
 */
362 static int hptraid0_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
368 * save the sector, it must be restored before a request-split
371 rsect = bh->b_rsector;
374 * Partitions need adding of the start sector of the partition to the
378 bh->b_rsector += ataraid_gendisk.part[MINOR(bh->b_rdev)].start_sect;
380 device = (bh->b_rdev >> SHIFT)&MAJOR_MASK;
381 if( hptraid0_compute_request(raid+device, q, rw, bh) != 1 ) {
382 /* request must be split => restore sector */
383 bh->b_rsector = rsect;
/*
 * hptraid1_read_request() - RAID-1 read path: pick the mirror whose last
 * known head position (disk[i].last_pos) is closest to the requested
 * sector; ties are round-robined via raid[device].previous.  Redirects
 * the bh to the chosen disk and records the new head position.
 */
390 static int hptraid1_read_request (request_queue_t *q, int rw, struct buffer_head * bh)
394 int bestsofar,bestdist,i;
396 /* Reads are simple in principle. Pick a disk and go.
397 Initially I cheat by just picking the one which the last known
398 head position is closest by.
399 Later on, online/offline checking and performance needs adding */
401 device = (bh->b_rdev >> SHIFT)&MAJOR_MASK;
/* account for the partition offset up front */
402 bh->b_rsector += ataraid_gendisk.part[MINOR(bh->b_rdev)].start_sect;
405 bestdist = raid[device].disk[0].last_pos - bh->b_rsector;
411 for (i=1 ; i<raid[device].disks; i++) {
412 dist = raid[device].disk[i].last_pos - bh->b_rsector;
418 /* it's a tie; try to do some read balancing */
419 if (bestdist==dist) {
420 if ( (raid[device].previous>bestsofar) &&
421 (raid[device].previous<=i) )
423 raid[device].previous =
424 (raid[device].previous + 1) %
426 } else if (bestdist>dist) {
433 bh->b_rdev = raid[device].disk[bestsofar].device;
/* remember where the head will end up after this transfer */
434 raid[device].disk[bestsofar].last_pos = bh->b_rsector+(bh->b_size>>9);
437 * Let the main block layer submit the IO and resolve recursion:
/*
 * hptraid1_write_request() - RAID-1 (and 0+1) write path: clone the
 * buffer_head once per member, point each clone at one mirror, and
 * submit all of them.  A shared ataraid_bh_private with a refcount of
 * `disks` lets ataraid_end_request() complete the parent bh only after
 * every mirror write finished.  For raid 0+1, each clone is additionally
 * run through hptraid0_compute_request() against the sub-array.
 */
443 static int hptraid1_write_request(request_queue_t *q, int rw, struct buffer_head * bh)
445 struct buffer_head *bh1;
446 struct ataraid_bh_private *private;
450 device = (bh->b_rdev >> SHIFT)&MAJOR_MASK;
451 private = ataraid_get_private();
455 private->parent = bh;
/* one completion per mirror must arrive before the parent ends */
457 atomic_set(&private->count,raid[device].disks);
460 for (i = 0; i< raid[device].disks; i++) {
461 bh1=ataraid_get_bhead();
462 /* If this ever fails we're doomed */
467 * dupe the bufferhead and update the parts that need to be
470 memcpy(bh1, bh, sizeof(*bh));
472 bh1->b_end_io = ataraid_end_request;
473 bh1->b_private = private;
474 bh1->b_rsector += ataraid_gendisk.part[MINOR(bh->b_rdev)].start_sect; /* partition offset */
475 bh1->b_rdev = raid[device].disk[i].device;
477 /* update the last known head position for the drive */
478 raid[device].disk[i].last_pos = bh1->b_rsector+(bh1->b_size>>9);
/* raid 0+1: translate the clone through the striped sub-array */
480 if( raid[device].raid01 ) {
481 if( hptraid0_compute_request(
482 raid[device].raid01 +
486 * If a split is requested then it is requested
487 * in the first iteration. This is true because
488 * of the cutoff is not used in raid 0+1.
499 generic_make_request(rw,bh1);
/*
 * hptraid1_make_request() - RAID-1 entry point: dispatch to the
 * read-balancing path or the mirror-fanout write path.
 */
504 static int hptraid1_make_request (request_queue_t *q, int rw, struct buffer_head * bh) {
506 * Read and Write are totally different cases; split them totally
513 return hptraid1_read_request(q,rw,bh);
515 return hptraid1_write_request(q,rw,bh);
/*
 * hptraid01_read_request() - RAID 0+1 read path: first let the raid-1
 * balancer pick a mirror (which selects the stripe sub-volume via
 * bh->b_rdev), then run the raid-0 translation against that sub-array.
 * On a "must split" result the original sector is restored.
 */
518 static int hptraid01_read_request (request_queue_t *q, int rw, struct buffer_head * bh)
520 int rsector=bh->b_rsector;
523 /* select mirror volume */
524 hptraid1_read_request(q, rw, bh);
526 /* stripe volume is selected by "bh->b_rdev" */
527 if( hptraid0_compute_request(
528 raid[(bh->b_rdev >> SHIFT)&MAJOR_MASK].
529 raid01 + (bh->b_rdev-1) ,
532 /* request must be split => restore sector and device */
533 bh->b_rsector = rsector;
/*
 * hptraid01_make_request() - RAID 0+1 entry point: reads go through the
 * 0+1 mirror/stripe path, writes reuse the raid-1 fanout (which handles
 * the 0+1 sub-arrays itself).
 */
542 static int hptraid01_make_request (request_queue_t *q, int rw, struct buffer_head * bh) {
544 * Read and Write are totally different cases; split them totally
551 return hptraid01_read_request(q,rw,bh);
553 return hptraid1_write_request(q,rw,bh);
/*
 * read_disk_sb() - read the HighPoint metadata block (4096 bytes at
 * device block 1, blocksize 4096) from (major,minor) into `buffer`.
 * Returns non-zero on error (exact return values elided in this listing).
 */
556 static int read_disk_sb (int major, int minor, unsigned char *buffer,int bufsize)
559 struct buffer_head *bh = NULL;
560 kdev_t dev = MKDEV(major,minor);
562 if (blksize_size[major]==NULL) /* device doesn't exist */
566 /* Superblock is at 4096+412 bytes */
567 set_blocksize (dev, 4096);
568 bh = bread (dev, 1, 4096);
572 memcpy (buffer, bh->b_data, bufsize);
574 printk(KERN_ERR "hptraid: Error reading superblock.\n");
/*
 * maxsectors() - return the usable capacity in sectors of the IDE drive
 * at (major,minor), taken from the IDE layer's drive info; 0 when the
 * drive does not exist or its geometry is invalid (head/sect == 0).
 */
584 static unsigned long maxsectors (int major,int minor)
586 unsigned long lba = 0;
588 ide_drive_t *ideinfo;
590 dev = MKDEV(major,minor);
591 ideinfo = ide_info_ptr (dev, 0);
596 /* first sector of the last cluster */
597 if (ideinfo->head==0)
599 if (ideinfo->sect==0)
601 lba = (ideinfo->capacity);
/*
 * writeentry() - record a probed member disk at raid->disk[index]:
 * claim the underlying block device (blkdev_get), zero its partition
 * sizes in the gendisk table so nothing else mounts it, and copy the
 * array geometry (stride, disk count, total size, magics) out of the
 * on-disk HighPoint superblock `prom`.
 */
606 static void writeentry(struct hptraid * raid, struct hptraid_dev * disk,
607 int index, struct highpoint_raid_conf * prom) {
611 struct block_device *bdev;
613 bdev = bdget(MKDEV(disk->major,disk->minor));
614 if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) {
615 raid->disk[index].bdev = bdev;
617 * This is supposed to prevent others from stealing our
618 * underlying disks now blank the /proc/partitions table for
619 * the wrong partition table, so that scripts don't
620 * accidentally mount it and crash the kernel
622 /* XXX: the 0 is an utter hack --hch */
623 gd=get_gendisk(MKDEV(disk->major, 0));
/* wipe every partition entry of this unit (skip the whole-disk slot) */
625 if (gd->major==disk->major)
626 for (j=1+(disk->minor<<gd->minor_shift);
627 j<((disk->minor+1)<<gd->minor_shift);
628 j++) gd->part[j].nr_sects=0;
631 raid->disk[index].device = MKDEV(disk->major,disk->minor);
632 raid->disk[index].sectors = maxsectors(disk->major,disk->minor);
633 raid->stride = (1<<prom->raid0_shift);
634 raid->disks = prom->raid_disks;
635 raid->sectors = prom->total_secs;
/* round the volume size up to an even sector count */
636 raid->sectors += raid->sectors&1?1:0;
637 raid->magic_0=prom->magic_0;
638 raid->magic_1=prom->magic_1;
/*
 * probedisk() - probe one devlist entry for membership of the raid
 * volume `device` of personality `type`: read and validate the
 * HighPoint superblock (magic 0x5a7816f0), check that the disk's type
 * and array magic match the volume being assembled, then record it via
 * writeentry().  RAID 0+1 members are placed into one of two
 * dynamically allocated striped sub-arrays (raid[device].raid01),
 * matched by magic_0.
 */
642 static int probedisk(struct hptraid_dev *disk, int device, u_int8_t type)
645 struct highpoint_raid_conf *prom;
646 static unsigned char block[4096];
648 if (disk->device != -1) /* disk is occupied? */
/* drive absent or without usable geometry */
651 if (maxsectors(disk->major,disk->minor)==0)
654 if (read_disk_sb(disk->major,disk->minor,(unsigned char*)&block,sizeof(block)))
/* superblock lives at offset 512 within the 4K block read above */
657 prom = (struct highpoint_raid_conf*)&block[512];
659 if (prom->magic!= 0x5a7816f0)
661 switch (prom->type) {
665 case HPT_T_RAID_01_RAID_0:
/* disk belongs to a different personality than we are assembling */
666 if(prom->type != type)
670 printk(KERN_INFO "hptraid: unknown raid level-id %i\n",
675 /* disk from another array? */
676 if (raid[device].disks) { /* only check if raid is not empty */
677 if (type == HPT_T_RAID_01_RAID_0 ) {
678 if( prom->magic_1 != raid[device].magic_1) {
682 else if (prom->magic_0 != raid[device].magic_0) {
687 i = prom->disk_number;
693 if ( type == HPT_T_RAID_01_RAID_0 ) {
695 /* allocate helper raid devices for level 0+1 */
696 if (raid[device].raid01 == NULL ) {
699 kmalloc(2 * sizeof(struct hptraid),GFP_KERNEL);
700 if ( raid[device].raid01 == NULL ) {
701 printk(KERN_ERR "hptraid: out of memory\n");
/* mark the volume unusable on allocation failure */
702 raid[device].disks=-1;
705 memset(raid[device].raid01, 0,
706 2 * sizeof(struct hptraid));
709 /* find free sub-stucture */
710 for (j=0; j<2; j++) {
711 if ( raid[device].raid01[j].disks == 0 ||
712 raid[device].raid01[j].magic_0 == prom->magic_0 )
714 writeentry(raid[device].raid01+j, disk,
/* mirror the sub-array geometry into the top-level 0+1 volume */
724 raid[device].stride=raid[device].raid01[j].stride;
725 raid[device].disks=j+1;
726 raid[device].sectors=raid[device].raid01[j].sectors;
727 raid[device].disk[j].sectors=raid[device].raid01[j].sectors;
728 raid[device].magic_1=prom->magic_1;
/* plain raid 0/1/span: record the member directly */
731 writeentry(raid+device, disk, i, prom);
/*
 * fill_cutoff() - precompute the cutoff[]/cutoff_disks[] tables used by
 * hptraid0_compute_request() for unequal-size members: for each distinct
 * member size (ascending), record how many disks still participate past
 * that point and the volume-relative sector where the disk count drops.
 */
739 static void fill_cutoff(struct hptraid * device)
742 unsigned long smallest;
/* next-larger distinct member size above the previous bar */
750 if ((device->disk[j].sectors < smallest) && (device->disk[j].sectors>bar))
751 smallest = device->disk[j].sectors;
/* count the members at least that large */
754 if (device->disk[j].sectors >= smallest)
757 smallest = smallest * count;
759 device->cutoff[i] = smallest;
760 device->cutoff_disks[i] = count;
/*
 * count_disks() - count the populated member slots (device != 0) of a
 * volume, logging each drive's size in MB; used to verify that every
 * disk announced by the superblock was actually found.
 */
765 static int count_disks(struct hptraid * raid) {
768 if (raid->disk[i].device!=0) {
769 printk(KERN_INFO "Drive %i is %li Mb \n",
770 i,raid->disk[i].sectors/2048);
/*
 * raid1_fixup() - compact the disk[] array of a raid-1 volume: the
 * superblock's disk numbering is bogus (see changelog), so populated
 * entries are moved down to consecutive slots and vacated slots zeroed.
 */
777 static void raid1_fixup(struct hptraid * raid) {
780 /* disknumbers and total disks values are bogus */
781 if (raid->disk[i].device!=0) {
782 raid->disk[count]=raid->disk[i];
784 memset(raid->disk+i, 0, sizeof(struct hptdisk));
/*
 * hptraid_init_one() - assemble one volume of the given personality:
 * probe every devlist entry, run the per-level post-processing
 * (fill_cutoff for striped levels, raid1_fixup for mirrors, sub-array
 * setup for 0+1), refuse to register unless all superblock-announced
 * disks were found, then register the gendisk.  Returns -ENODEV when no
 * (further) volume of this type exists.
 */
792 static int hptraid_init_one(int device, u_int8_t type, const char * label)
795 memset(raid+device, 0, sizeof(struct hptraid));
796 for (i=0; i < 14; i++) {
797 if( probedisk(devlist+i, device, type) < 0 )
801 /* Initialize raid levels */
804 fill_cutoff(raid+device);
808 raid1_fixup(raid+device);
811 case HPT_T_RAID_01_RAID_0:
/* prepare both striped sub-volumes and give them pseudo device ids */
812 for(i=0; i < 2 && raid[device].raid01 &&
813 raid[device].raid01[i].disks; i++) {
814 fill_cutoff(raid[device].raid01+i);
815 /* initialize raid 0+1 volumes */
816 raid[device].disk[i].device=i+1;
821 /* Verify that we have all disks */
823 count=count_disks(raid+device);
/* incomplete array: registering it would risk data corruption */
825 if (count != raid[device].disks) {
826 printk(KERN_INFO "%s consists of %i drives but found %i drives\n",
827 label, raid[device].disks, count);
831 printk(KERN_INFO "%s consists of %i drives.\n",
833 if (type == HPT_T_RAID_01_RAID_0 ) {
834 for(i=0;i<raid[device].disks;i++) {
835 count=count_disks(raid[device].raid01+i);
836 if(count == raid[device].raid01[i].disks) {
837 printk(KERN_ERR "Sub-Raid %i array consists of %i drives.\n",
841 printk(KERN_ERR "Sub-Raid %i array consists of %i drives but found %i disk members.\n",
842 i, raid[device].raid01[i].disks,
847 printk(KERN_WARNING "ataraid%i: raid-0+1 disk failover is not implemented!\n",
850 else if (type == HPT_T_RAID_1) {
851 printk(KERN_WARNING "ataraid%i: raid-1 disk failover is not implemented!\n",
854 /* Initialize the gendisk structure */
856 ataraid_register_disk(device,raid[device].sectors);
861 return -ENODEV; /* No more raid volumes */
/*
 * hptraid_init() - module init: for each personality in oplist, allocate
 * an ataraid device slot and try to assemble volumes of that type,
 * releasing the slot again when none is found.  Succeeds if at least one
 * volume was registered.
 */
864 static int hptraid_init(void)
867 int device,i,count=0;
869 printk(KERN_INFO "Highpoint HPT370 Softwareraid driver for linux version 0.03\n");
871 for(i=0; oplist[i].op; i++) {
874 device=ataraid_get_device(oplist[i].op);
/* out of ataraid slots: succeed if anything was registered so far */
876 return (count?0:-ENODEV);
877 retval = hptraid_init_one(device, oplist[i].type,
880 ataraid_release_device(device);
885 return (count?0:retval);
/*
 * hptraid_exit() - module unload: drop every claimed member block device
 * (blkdev_put), release each registered ataraid device, and free the
 * raid 0+1 sub-array allocations.
 */
888 static void __exit hptraid_exit (void)
891 for (device = 0; device<14; device++) {
893 struct block_device *bdev = raid[device].disk[device].bdev;
894 raid[device].disk[device].bdev = NULL;
896 blkdev_put(bdev, BDEV_RAW);
898 if (raid[device].sectors) {
899 ataraid_release_device(device);
900 if( raid[device].raid01 ) {
901 kfree(raid[device].raid01);
/*
 * Open/release handlers for the ataraid device nodes (bodies elided in
 * this listing; registered in the raid_device_operations tables above).
 */
907 static int hptraid_open(struct inode * inode, struct file * filp)
912 static int hptraid_release(struct inode * inode, struct file * filp)
/* Module entry points and license. */
918 module_init(hptraid_init);
919 module_exit(hptraid_exit);
920 MODULE_LICENSE("GPL");