/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
 *
 * This handles all read/write requests to block devices
 */
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/completion.h>
#include <linux/bootmem.h>

#include <asm/system.h>

#include <linux/blk.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/module.h>
/*
 * MAC Floppy IWM hooks
 */

#ifdef CONFIG_MAC_FLOPPY_IWM
extern int mac_floppy_init(void);
#endif
/*
 * For the allocated request tables
 */
static kmem_cache_t *request_cachep;

/*
 * The "disk" task queue is used to start the actual requests
 */
DECLARE_TASK_QUEUE(tq_disk);
/*
 * Protect the request list against multiple users..
 *
 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 * from the IRQ event side, and almost 100% SMP threaded from the syscall
 * side (we still have to protect against block device array operations, and
 * the do_request() side is casually still unsafe. The kernel lock protects
 * this part currently.).
 *
 * There is a fair chance that things will work just OK if these functions
 * are called with no global kernel lock held ...
 */
spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
/* This specifies how many sectors to read ahead on the disk. */
int read_ahead[MAX_BLKDEV];

struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
/*
 * blk_size contains the size of all block-devices in units of
 * 1024-byte blocks:
 *
 * blk_size[MAJOR][MINOR]
 *
 * if (!blk_size[MAJOR]) then no minor size checking is done.
 */
int * blk_size[MAX_BLKDEV];
/*
 * blksize_size contains the size of all block-devices:
 *
 * blksize_size[MAJOR][MINOR]
 *
 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
 */
int * blksize_size[MAX_BLKDEV];
/*
 * hardsect_size contains the size of the hardware sector of a device.
 *
 * hardsect_size[MAJOR][MINOR]
 *
 * if (!hardsect_size[MAJOR])
 *		then 512 bytes is assumed.
 * else
 *		sector_size is hardsect_size[MAJOR][MINOR]
 *
 * This is currently set by some scsi devices and read by the msdos fs driver.
 * Other uses may appear later.
 */
int * hardsect_size[MAX_BLKDEV];
/*
 * The following tunes the read-ahead algorithm in mm/filemap.c
 */
int * max_readahead[MAX_BLKDEV];

/*
 * Max number of sectors per request
 */
int * max_sectors[MAX_BLKDEV];
unsigned long blk_max_low_pfn, blk_max_pfn;
int blk_nohighio = 0;
static inline int get_max_sectors(kdev_t dev)
{
	if (!max_sectors[MAJOR(dev)])
		return MAX_SECTORS;
	return max_sectors[MAJOR(dev)][MINOR(dev)];
}
inline request_queue_t *blk_get_queue(kdev_t dev)
{
	struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
	if (bdev->queue)
		return bdev->queue(dev);
	else
		return &blk_dev[MAJOR(dev)].request_queue;
}
static int __blk_cleanup_queue(struct request_list *list)
{
	struct list_head *head = &list->free;
	struct request *rq;
	int i = 0;

	while (!list_empty(head)) {
		rq = list_entry(head->next, struct request, queue);
		list_del(&rq->queue);
		kmem_cache_free(request_cachep, rq);
		i++;
	}
	if (i != list->count)
		printk("request list leak!\n");
	return i;
}
/**
 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
 * @q: the request queue to be released
 *
 * blk_cleanup_queue is the pair to blk_init_queue().  It should
 * be called when a request queue is being released; typically
 * when a block device is being de-registered.  Currently, its
 * primary task is to free all the &struct request structures that
 * were allocated to the queue.
 *
 * Hopefully the low level driver will have finished any
 * outstanding requests first...
 **/
void blk_cleanup_queue(request_queue_t * q)
{
	int count = q->nr_requests;

	count -= __blk_cleanup_queue(&q->rq[READ]);
	count -= __blk_cleanup_queue(&q->rq[WRITE]);
	if (count)
		printk("blk_cleanup_queue: leaked requests (%d)\n", count);

	memset(q, 0, sizeof(*q));
}
/**
 * blk_queue_headactive - indicate whether head of request queue may be active
 * @q: The queue which this applies to.
 * @active: A flag indicating whether the head of the queue is active.
 *
 * The driver for a block device may choose to leave the currently active
 * request on the request queue, removing it only when it has completed.
 * The queue handling routines assume this by default for safety reasons
 * and will not involve the head of the request queue in any merging or
 * reordering of requests when the queue is unplugged (and thus may be
 * working on this particular request).
 *
 * If a driver removes requests from the queue before processing them, then
 * it may indicate that it does so, thereby allowing the head of the queue
 * to be involved in merging and reordering.  This is done by calling
 * blk_queue_headactive() with an @active flag of %0.
 *
 * If a driver processes several requests at once, it must remove them (or
 * at least all but one of them) from the request queue.
 *
 * When a queue is plugged the head will be assumed to be inactive.
 **/
void blk_queue_headactive(request_queue_t * q, int active)
{
	q->head_active = active;
}
/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q: the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * The normal way for &struct buffer_heads to be passed to a device
 * driver is for them to be collected into requests on a request
 * queue, and then to allow the device driver to select requests
 * off that queue when it is ready.  This works well for many block
 * devices. However some block devices (typically virtual devices
 * such as md or lvm) do not benefit from the processing on the
 * request queue, and are served best by having the requests passed
 * directly to them.  This can be achieved by providing a function
 * to blk_queue_make_request().
 *
 * The driver that does this *must* be able to deal appropriately
 * with buffers in "highmemory", either by calling bh_kmap() to get
 * a kernel mapping, or by calling create_bounce() to create a
 * buffer in normal memory.
 **/
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
	q->make_request_fn = mfn;
}
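
/*
 * Illustrative sketch (not part of the original file): how a stacking
 * driver such as md or lvm might use blk_queue_make_request().  The
 * function and variable names below are hypothetical, and the remapping
 * rule is invented for the example.
 */
#if 0
static int stackdev_make_request(request_queue_t *q, int rw,
				 struct buffer_head *bh)
{
	/* redirect the buffer to the underlying device */
	bh->b_rdev = stackdev_real_dev;		/* hypothetical kdev_t */
	bh->b_rsector += stackdev_start_sector;	/* hypothetical offset */

	/* returning 1 asks generic_make_request() to resubmit the buffer
	 * head to the queue of the device now named in b_rdev */
	return 1;
}

	/* during driver initialisation: */
	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), stackdev_make_request);
#endif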
/**
 * blk_queue_bounce_limit - set bounce buffer limit for queue
 * @q: the request queue for the device
 * @dma_addr: bus address limit
 *
 * Different hardware can have different requirements as to what pages
 * it can do I/O directly to. A low level driver can call
 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
 * buffers for doing I/O to pages residing above @dma_addr. By default
 * the block layer sets this to the highest numbered "low" memory page.
 **/
void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
{
	unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
	unsigned long mb = dma_addr >> 20;
	static request_queue_t *old_q;

	/*
	 * keep this for debugging for now...
	 */
	if (dma_addr != BLK_BOUNCE_HIGH && q != old_q) {
		old_q = q;
		printk("blk: queue %p, ", q);
		if (dma_addr == BLK_BOUNCE_ANY)
			printk("no I/O memory limit\n");
		else
			printk("I/O limit %luMb (mask 0x%Lx)\n", mb,
			       (long long) dma_addr);
	}

	q->bounce_pfn = bounce_pfn;
}
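
/*
 * Illustrative sketch (not part of the original file): a driver for a
 * controller that can only address the first 16MB of memory could ask for
 * bounce buffers below that mark.  The queue lookup and the 16MB mask are
 * assumptions made for the example.
 */
#if 0
	request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR);

	/* any buffer above 16MB is bounced into low memory before I/O */
	blk_queue_bounce_limit(q, 0x00ffffffULL);
#endif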
/* can we merge the two segments, or do we need to start a new one? */
inline int blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt)
{
	/* if bh and nxt are contiguous and don't cross a 4g boundary, it's ok */
	if (BH_CONTIG(bh, nxt) && BH_PHYS_4G(bh, nxt))
		return 1;
	return 0;
}
static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
{
	if (req->nr_segments < max_segments) {
		req->nr_segments++;
		return 1;
	}
	return 0;
}

static int ll_back_merge_fn(request_queue_t *q, struct request *req,
			    struct buffer_head *bh, int max_segments)
{
	if (blk_seg_merge_ok(req->bhtail, bh))
		return 1;
	return ll_new_segment(q, req, max_segments);
}

static int ll_front_merge_fn(request_queue_t *q, struct request *req,
			     struct buffer_head *bh, int max_segments)
{
	if (blk_seg_merge_ok(bh, req->bh))
		return 1;
	return ll_new_segment(q, req, max_segments);
}

static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
				struct request *next, int max_segments)
{
	int total_segments = req->nr_segments + next->nr_segments;

	if (blk_seg_merge_ok(req->bhtail, next->bh))
		total_segments--;
	if (total_segments > max_segments)
		return 0;
	req->nr_segments = total_segments;
	return 1;
}
337 * "plug" the device if there are no outstanding requests: this will
338 * force the transfer to start only after we have put all the requests
341 * This is called with interrupts off and no requests on the queue.
342 * (and with the request spinlock acquired)
344 static void generic_plug_device(request_queue_t *q, kdev_t dev)
347 * no need to replug device
349 if (!list_empty(&q->queue_head) || q->plugged)
353 queue_task(&q->plug_tq, &tq_disk);
357 * remove the plug and let it rip..
359 static inline void __generic_unplug_device(request_queue_t *q)
363 if (!list_empty(&q->queue_head))
368 void generic_unplug_device(void *data)
370 request_queue_t *q = (request_queue_t *) data;
373 spin_lock_irqsave(&io_request_lock, flags);
374 __generic_unplug_device(q);
375 spin_unlock_irqrestore(&io_request_lock, flags);
/** blk_grow_request_list
 * @q: The &request_queue_t
 * @nr_requests: how many requests are desired
 *
 * More free requests are added to the queue's free lists, bringing
 * the total number of requests to @nr_requests.
 *
 * The requests are added equally to the request queue's read
 * and write freelists.
 *
 * This function can sleep.
 *
 * Returns the (new) number of requests which the queue has available.
 */
int blk_grow_request_list(request_queue_t *q, int nr_requests)
{
	/* Several broken drivers assume that this function doesn't sleep;
	 * this causes system hangs during boot.
	 * As a temporary fix, make the function non-blocking.
	 */
	spin_lock_irqsave(&io_request_lock, flags);
	while (q->nr_requests < nr_requests) {
		rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC);
		memset(rq, 0, sizeof(*rq));
		rq->rq_status = RQ_INACTIVE;
		rw = q->nr_requests & 1;
		list_add(&rq->queue, &q->rq[rw].free);
	}
	q->batch_requests = q->nr_requests / 4;
	if (q->batch_requests > 32)
		q->batch_requests = 32;
	spin_unlock_irqrestore(&io_request_lock, flags);
	return q->nr_requests;
}
static void blk_init_free_list(request_queue_t *q)
{
	int megs;		/* Total memory, in megabytes */

	INIT_LIST_HEAD(&q->rq[READ].free);
	INIT_LIST_HEAD(&q->rq[WRITE].free);
	q->rq[READ].count = 0;
	q->rq[WRITE].count = 0;

	megs = si.totalram >> (20 - PAGE_SHIFT);

	blk_grow_request_list(q, nr_requests);

	init_waitqueue_head(&q->wait_for_requests[0]);
	init_waitqueue_head(&q->wait_for_requests[1]);
	spin_lock_init(&q->queue_lock);
}

static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
/**
 * blk_init_queue - prepare a request queue for use with a block device
 * @q: The &request_queue_t to be initialised
 * @rfn: The function to be called to process requests that have been
 *       placed on the queue.
 *
 * If a block device wishes to use the standard request handling procedures,
 * which sorts requests and coalesces adjacent requests, then it must
 * call blk_init_queue().  The function @rfn will be called when there
 * are requests on the queue that need to be processed.  If the device
 * supports plugging, then @rfn may not be called immediately when requests
 * are available on the queue, but may be called at some time later instead.
 * Plugged queues are generally unplugged when a buffer belonging to one
 * of the requests on the queue is needed, or due to memory pressure.
 *
 * @rfn is not required, or even expected, to remove all requests off the
 * queue, but only as many as it can handle at a time.  If it does leave
 * requests on the queue, it is responsible for arranging that the requests
 * get dealt with eventually.
 *
 * A global spin lock, io_request_lock, must be held while manipulating the
 * requests on the request queue.
 *
 * The request on the head of the queue is by default assumed to be
 * potentially active, and it is not considered for re-ordering or merging
 * whenever the given queue is unplugged. This behaviour can be changed with
 * blk_queue_headactive().
 *
 * blk_init_queue() must be paired with a blk_cleanup_queue() call
 * when the block device is deactivated (such as at module unload).
 **/
void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
{
	INIT_LIST_HEAD(&q->queue_head);
	elevator_init(&q->elevator, ELEVATOR_LINUS);
	blk_init_free_list(q);

	q->back_merge_fn	= ll_back_merge_fn;
	q->front_merge_fn	= ll_front_merge_fn;
	q->merge_requests_fn	= ll_merge_requests_fn;
	q->make_request_fn	= __make_request;
	q->plug_tq.routine	= &generic_unplug_device;

	/*
	 * These booleans describe the queue properties.  We set the
	 * default (and most common) values here.  Other drivers can
	 * use the appropriate functions to alter the queue properties.
	 */
	q->plug_device_fn	= generic_plug_device;

	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
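
/*
 * Illustrative sketch (not part of the original file): minimal setup of a
 * request-driven driver around blk_init_queue().  "mydev", its request
 * function and the immediate in-line completion are hypothetical; a real
 * driver would also register the major number, gendisk, blk_size[], etc.
 */
#if 0
static void mydev_request_fn(request_queue_t *q)
{
	struct request *req;

	while (!list_empty(&q->queue_head)) {
		req = blkdev_entry_next_request(&q->queue_head);
		blkdev_dequeue_request(req);	/* we process requests off-queue */

		/* ... the actual transfer described by req would go here ... */

		while (end_that_request_first(req, 1, "mydev"))
			;			/* complete every buffer_head */
		end_that_request_last(req);
	}
}

static int __init mydev_init(void)
{
	request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR);

	blk_init_queue(q, mydev_request_fn);
	blk_queue_headactive(q, 0);	/* requests are dequeued before use */
	return 0;
}
#endif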
#define blkdev_free_rq(list) list_entry((list)->next, struct request, queue);

/*
 * Get a free request. io_request_lock must be held and interrupts
 * disabled on the way in. Returns NULL if there are no free requests.
 */
static struct request *get_request(request_queue_t *q, int rw)
{
	struct request *rq = NULL;
	struct request_list *rl = q->rq + rw;

	if (!list_empty(&rl->free)) {
		rq = blkdev_free_rq(&rl->free);
		list_del(&rq->queue);
		rq->rq_status = RQ_ACTIVE;
/*
 * Here's the request allocation design:
 *
 * 1: Blocking on request exhaustion is a key part of I/O throttling.
 *
 * 2: We want to be `fair' to all requesters.  We must avoid starvation, and
 *    attempt to ensure that all requesters sleep for a similar duration.  Hence
 *    no stealing requests when there are other processes waiting.
 *
 * 3: We also wish to support `batching' of requests.  So when a process is
 *    woken, we want to allow it to allocate a decent number of requests
 *    before it blocks again, so they can be nicely merged (this only really
 *    matters if the process happens to be adding requests near the head of
 *    the queue).
 *
 * 4: We want to avoid scheduling storms.  This isn't really important, because
 *    the system will be I/O bound anyway.  But it's easy.
 *
 * There is tension between requirements 2 and 3.  Once a task has woken,
 * we don't want to allow it to sleep as soon as it takes its second request.
 * But we don't want currently-running tasks to steal all the requests
 * from the sleepers.  We handle this with wakeup hysteresis around
 * 0 .. batch_requests and with the assumption that request taking is much,
 * much faster than request freeing.
 *
 * So here's what we do:
 *
 *  a) A READA requester fails if free_requests < batch_requests
 *
 *     We don't want READA requests to prevent sleepers from ever
 *     waking.  Note that READA is used extremely rarely - a few
 *     filesystems use it for directory readahead.
 *
 * When a process wants a new request:
 *
 *  b) If free_requests == 0, the requester sleeps in FIFO manner.
 *
 *  c) If 0 < free_requests < batch_requests and there are waiters,
 *     we still take a request non-blockingly.  This provides batching.
 *
 *  d) If free_requests >= batch_requests, the caller is immediately
 *     granted a new request.
 *
 * When a request is released:
 *
 *  e) If free_requests < batch_requests, do nothing.
 *
 *  f) If free_requests >= batch_requests, wake up a single waiter.
 *
 * The net effect is that when a process is woken at the batch_requests level,
 * it will be able to take approximately (batch_requests) requests before
 * blocking again (at the tail of the queue).
 *
 * This all assumes that the rate of taking requests is much, much higher
 * than the rate of releasing them.  Which is very true.
 */
static struct request *__get_request_wait(request_queue_t *q, int rw)
{
	register struct request *rq;
	DECLARE_WAITQUEUE(wait, current);

	generic_unplug_device(q);
	add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
	do {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (q->rq[rw].count == 0)
			schedule();
		spin_lock_irq(&io_request_lock);
		rq = get_request(q, rw);
		spin_unlock_irq(&io_request_lock);
	} while (rq == NULL);
	remove_wait_queue(&q->wait_for_requests[rw], &wait);
	current->state = TASK_RUNNING;
	return rq;
}
/* RO fail safe mechanism */

static long ro_bits[MAX_BLKDEV][8];

int is_read_only(kdev_t dev)
{
	int major = MAJOR(dev), minor = MINOR(dev);

	if (major < 0 || major >= MAX_BLKDEV) return 0;
	return ro_bits[major][minor >> 5] & (1 << (minor & 31));
}

void set_device_ro(kdev_t dev, int flag)
{
	int major = MAJOR(dev), minor = MINOR(dev);

	if (major < 0 || major >= MAX_BLKDEV) return;
	if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
	else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
}
inline void drive_stat_acct (kdev_t dev, int rw,
				unsigned long nr_sectors, int new_io)
{
	unsigned int major = MAJOR(dev);
	unsigned int index;

	index = disk_index(dev);
	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
		return;

	kstat.dk_drive[major][index] += new_io;
	if (rw == READ) {
		kstat.dk_drive_rio[major][index] += new_io;
		kstat.dk_drive_rblk[major][index] += nr_sectors;
	} else if (rw == WRITE) {
		kstat.dk_drive_wio[major][index] += new_io;
		kstat.dk_drive_wblk[major][index] += nr_sectors;
	} else
		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}
#ifdef CONFIG_BLK_STATS
/*
 * Return up to two hd_structs on which to do IO accounting for a given
 * request.
 *
 * On a partitioned device, we want to account both against the partition
 * and against the whole disk.
 */
static void locate_hd_struct(struct request *req,
			     struct hd_struct **hd1,
			     struct hd_struct **hd2)
{
	gd = get_gendisk(req->rq_dev);
	if (gd && gd->part) {
		/* Mask out the partition bits: account for the entire disk */
		int devnr = MINOR(req->rq_dev) >> gd->minor_shift;
		int whole_minor = devnr << gd->minor_shift;

		*hd1 = &gd->part[whole_minor];
		if (whole_minor != MINOR(req->rq_dev))
			*hd2 = &gd->part[MINOR(req->rq_dev)];
	}
}
/*
 * Round off the performance stats on an hd_struct.
 *
 * The average IO queue length and utilisation statistics are maintained
 * by observing the current state of the queue length and the amount of
 * time it has been in this state for.
 * Normally, that accounting is done on IO completion, but that can result
 * in more than a second's worth of IO being accounted for within any one
 * second, leading to >100% utilisation.  To deal with that, we do a
 * round-off before returning the results when reading /proc/partitions,
 * accounting immediately for all queue usage up to the current jiffies and
 * restarting the counters again.
 */
void disk_round_stats(struct hd_struct *hd)
{
	unsigned long now = jiffies;

	hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change));
	hd->last_queue_change = now;

	if (hd->ios_in_flight)
		hd->io_ticks += (now - hd->last_idle_time);
	hd->last_idle_time = now;
}
static inline void down_ios(struct hd_struct *hd)
{
	disk_round_stats(hd);
	--hd->ios_in_flight;
}

static inline void up_ios(struct hd_struct *hd)
{
	disk_round_stats(hd);
	++hd->ios_in_flight;
}
static void account_io_start(struct hd_struct *hd, struct request *req,
			     int merge, int sectors)
{
	hd->rd_sectors += sectors;
	hd->wr_sectors += sectors;

static void account_io_end(struct hd_struct *hd, struct request *req)
{
	unsigned long duration = jiffies - req->start_time;

	hd->rd_ticks += duration;
	hd->wr_ticks += duration;
void req_new_io(struct request *req, int merge, int sectors)
{
	struct hd_struct *hd1, *hd2;

	locate_hd_struct(req, &hd1, &hd2);
	if (hd1)
		account_io_start(hd1, req, merge, sectors);
	if (hd2)
		account_io_start(hd2, req, merge, sectors);
}

void req_merged_io(struct request *req)
{
	struct hd_struct *hd1, *hd2;

	locate_hd_struct(req, &hd1, &hd2);

void req_finished_io(struct request *req)
{
	struct hd_struct *hd1, *hd2;

	locate_hd_struct(req, &hd1, &hd2);
	if (hd1)
		account_io_end(hd1, req);
	if (hd2)
		account_io_end(hd2, req);
}

EXPORT_SYMBOL(req_finished_io);
#endif /* CONFIG_BLK_STATS */
/*
 * add-request adds a request to the linked list.
 * io_request_lock is held and interrupts disabled, as we muck with the
 * request queue list.
 *
 * By this point, req->cmd is always either READ/WRITE, never READA,
 * which is important for drive_stat_acct() above.
 */
static inline void add_request(request_queue_t * q, struct request * req,
			       struct list_head *insert_here)
{
	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);

	if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
		spin_unlock_irq(&io_request_lock);
		BUG();
	}

	/*
	 * elevator indicated where it wants this request to be
	 * inserted at elevator_merge time
	 */
	list_add(&req->queue, insert_here);
}
/*
 * Must be called with io_request_lock held and interrupts disabled
 */
void blkdev_release_request(struct request *req)
{
	request_queue_t *q = req->q;
	int rw = req->cmd;

	req->rq_status = RQ_INACTIVE;

	/*
	 * Request may not have originated from ll_rw_blk. If not,
	 * assume it has free buffers and check waiters
	 */
	if (q) {
		list_add(&req->queue, &q->rq[rw].free);
		if (++q->rq[rw].count >= q->batch_requests &&
		    waitqueue_active(&q->wait_for_requests[rw]))
			wake_up(&q->wait_for_requests[rw]);
	}
}
/*
 * Has to be called with the request spinlock acquired
 */
static void attempt_merge(request_queue_t * q,
			  struct request *req,
			  int max_sectors,
			  int max_segments)
{
	struct request *next;

	next = blkdev_next_request(req);
	if (req->sector + req->nr_sectors != next->sector)
		return;
	if (req->cmd != next->cmd
	    || req->rq_dev != next->rq_dev
	    || req->nr_sectors + next->nr_sectors > max_sectors
	    || next->waiting)
		return;
	/*
	 * If we are not allowed to merge these requests, then
	 * return.  If we are allowed to merge, then the count
	 * will have been updated to the appropriate number,
	 * and we shouldn't do it here too.
	 */
	if (!q->merge_requests_fn(q, req, next, max_segments))
		return;

	q->elevator.elevator_merge_req_fn(req, next);
	req->bhtail->b_reqnext = next->bh;
	req->bhtail = next->bhtail;
	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
	list_del(&next->queue);

	/* One last thing: we have removed a request, so we now have one
	   less expected IO to complete for accounting purposes. */
	blkdev_release_request(next);
}

static inline void attempt_back_merge(request_queue_t * q,
				      struct request *req,
				      int max_sectors,
				      int max_segments)
{
	if (&req->queue == q->queue_head.prev)
		return;
	attempt_merge(q, req, max_sectors, max_segments);
}

static inline void attempt_front_merge(request_queue_t * q,
				       struct list_head * head,
				       struct request *req,
				       int max_sectors,
				       int max_segments)
{
	struct list_head * prev;

	prev = req->queue.prev;
	if (prev == head)
		return;
	attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
}
static int __make_request(request_queue_t * q, int rw,
			  struct buffer_head * bh)
{
	unsigned int sector, count;
	int max_segments = MAX_SEGMENTS;
	struct request * req, *freereq = NULL;
	int rw_ahead, max_sectors, el_ret;
	struct list_head *head, *insert_here;
	elevator_t *elevator = &q->elevator;

	count = bh->b_size >> 9;
	sector = bh->b_rsector;

	rw_ahead = 0;	/* normal case; gets changed below for READA */
#if 0	/* bread() misinterprets failed READA attempts as IO errors on SMP */
	rw = READ;	/* drop into READ */
	latency = elevator_request_latency(elevator, rw);

	/* We'd better have a real physical mapping!
	   Check this bit only if the buffer was dirty and just locked
	   down by us so at this point flushpage will block and
	   won't clear the mapped bit under us. */
	if (!buffer_mapped(bh))

	/*
	 * Temporary solution - in 2.5 this will be done by the lowlevel
	 * driver. Create a bounce buffer if the buffer data points into
	 * high memory - keep the original buffer otherwise.
	 */
	bh = blk_queue_bounce(q, rw, bh);
	/* look for a free request. */
	/*
	 * Try to coalesce the new request with old requests
	 */
	max_sectors = get_max_sectors(bh->b_rdev);

	head = &q->queue_head;
	/*
	 * Now we acquire the request spinlock, we have to be mega careful
	 * not to schedule or do something nonatomic
	 */
	spin_lock_irq(&io_request_lock);

	insert_here = head->prev;
	if (list_empty(head)) {
		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
	} else if (q->head_active && !q->plugged)

	el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw, max_sectors);
	case ELEVATOR_BACK_MERGE:
		if (!q->back_merge_fn(q, req, bh, max_segments)) {
			insert_here = &req->queue;
		req->bhtail->b_reqnext = bh;
		req->nr_sectors = req->hard_nr_sectors += count;
		blk_started_io(count);
		drive_stat_acct(req->rq_dev, req->cmd, count, 0);
		req_new_io(req, 1, count);
		attempt_back_merge(q, req, max_sectors, max_segments);

	case ELEVATOR_FRONT_MERGE:
		if (!q->front_merge_fn(q, req, bh, max_segments)) {
			insert_here = req->queue.prev;
		bh->b_reqnext = req->bh;
		/*
		 * may not be valid, but queues not having bounce
		 * enabled for highmem pages must not look at it anyway
		 */
		req->buffer = bh->b_data;
		req->current_nr_sectors = req->hard_cur_sectors = count;
		req->sector = req->hard_sector = sector;
		req->nr_sectors = req->hard_nr_sectors += count;
		blk_started_io(count);
		drive_stat_acct(req->rq_dev, req->cmd, count, 0);
		req_new_io(req, 1, count);
		attempt_front_merge(q, head, req, max_sectors, max_segments);

	/*
	 * elevator says don't/can't merge. get new request
	 */
	case ELEVATOR_NO_MERGE:
		/*
		 * use elevator hints as to where to insert the
		 * request. if no hints, just add it to the back
		 * of the queue
		 */
		insert_here = &req->queue;

	printk("elevator returned crap (%d)\n", el_ret);
	/*
	 * See description above __get_request_wait()
	 */
	if (q->rq[rw].count < q->batch_requests) {
		spin_unlock_irq(&io_request_lock);
		req = get_request(q, rw);
		req = get_request(q, rw);
		spin_unlock_irq(&io_request_lock);
		freereq = __get_request_wait(q, rw);

	/* fill up the request-info, and add it to the queue */
	req->elevator_sequence = latency;
	req->hard_sector = req->sector = sector;
	req->hard_nr_sectors = req->nr_sectors = count;
	req->current_nr_sectors = req->hard_cur_sectors = count;
	req->nr_segments = 1;	/* Always 1 for a new request. */
	req->nr_hw_segments = 1;	/* Always 1 for a new request. */
	req->buffer = bh->b_data;
	req->waiting = NULL;
	req->rq_dev = bh->b_rdev;
	req->start_time = jiffies;
	req_new_io(req, 0, count);
	blk_started_io(count);
	add_request(q, req, insert_here);
	blkdev_release_request(freereq);
	spin_unlock_irq(&io_request_lock);
	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
/**
 * generic_make_request: hand a buffer head to its device driver for I/O
 * @rw: READ, WRITE, or READA - what sort of I/O is desired.
 * @bh: The buffer head describing the location in memory and on the device.
 *
 * generic_make_request() is used to make I/O requests of block
 * devices. It is passed a &struct buffer_head and a &rw value.  The
 * %READ and %WRITE options are (hopefully) obvious in meaning.  The
 * %READA value means that a read is required, but that the driver is
 * free to fail the request if, for example, it cannot get needed
 * resources immediately.
 *
 * generic_make_request() does not return any status.  The
 * success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the bh->b_end_io
 * function described (one day) elsewhere.
 *
 * The caller of generic_make_request must make sure that b_page,
 * b_data, b_size are set to describe the memory buffer, that b_rdev
 * and b_rsector are set to describe the device address, and the
 * b_end_io and optionally b_private are set to describe how
 * completion notification should be signaled.  BH_Mapped should also
 * be set (to confirm that b_dev and b_blocknr are valid).
 *
 * generic_make_request and the drivers it calls may use b_reqnext,
 * and may change b_rdev and b_rsector.  So the values of these fields
 * should NOT be depended on after the call to generic_make_request.
 * Because of this, the caller should record the device address
 * information in b_dev and b_blocknr.
 *
 * Apart from those fields mentioned above, no other fields, and in
 * particular, no other flags, are changed by generic_make_request or
 * any lower level drivers.
 **/
void generic_make_request (int rw, struct buffer_head * bh)
{
	int major = MAJOR(bh->b_rdev);
	int minorsize = 0;
	request_queue_t *q;

	/* Test device size, when known. */
	if (blk_size[major])
		minorsize = blk_size[major][MINOR(bh->b_rdev)];

	unsigned long maxsector = (minorsize << 1) + 1;
	unsigned long sector = bh->b_rsector;
	unsigned int count = bh->b_size >> 9;

	if (maxsector < count || maxsector - count < sector) {
		bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);

		/* This may well happen - the kernel calls bread()
		   without checking the size of the device, e.g.,
		   when mounting a device. */
		printk(KERN_INFO
		       "attempt to access beyond end of device\n");
		printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
		       kdevname(bh->b_rdev), rw,
		       (sector + count)>>1, minorsize);

		bh->b_end_io(bh, 0);
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 */
	/* NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	do {
		q = blk_get_queue(bh->b_rdev);
		if (!q) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access "
			       "nonexistent block-device %s (%ld)\n",
			       kdevname(bh->b_rdev), bh->b_rsector);
			buffer_IO_error(bh);
			break;
		}
	} while (q->make_request_fn(q, rw, bh));
}
/**
 * submit_bh: submit a buffer_head to the block device layer for I/O
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
 * @bh: The &struct buffer_head which describes the I/O
 *
 * submit_bh() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work.
 *
 * The extra functionality provided by submit_bh is to determine
 * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
 * This is appropriate for IO requests that come from the buffer
 * cache and page cache which (currently) always use aligned blocks.
 **/
void submit_bh(int rw, struct buffer_head * bh)
{
	int count = bh->b_size >> 9;

	if (!test_bit(BH_Lock, &bh->b_state))
		BUG();

	set_bit(BH_Req, &bh->b_state);
	set_bit(BH_Launder, &bh->b_state);

	/*
	 * First step, 'identity mapping' - RAID or LVM might
	 * further remap this.
	 */
	bh->b_rdev = bh->b_dev;
	bh->b_rsector = bh->b_blocknr * count;

	generic_make_request(rw, bh);

	switch (rw) {
	case WRITE:
		kstat.pgpgout += count;
		break;
	default:
		kstat.pgpgin += count;
		break;
	}
}
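
/*
 * Illustrative sketch (not part of the original file): submitting one
 * locked, mapped buffer_head with a private completion routine instead of
 * going through ll_rw_block().  The handler and helper names are
 * hypothetical.
 */
#if 0
static void example_end_io(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}

static void example_read_bh(struct buffer_head *bh)
{
	/* the caller has already filled in b_dev, b_blocknr, b_size,
	 * b_data/b_page and set BH_Mapped */
	lock_buffer(bh);
	bh->b_end_io = example_end_io;
	submit_bh(READ, bh);
}
#endif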
/**
 * ll_rw_block: low-level access to block devices
 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
 * and requests an I/O operation on them, either a %READ or a %WRITE.
 * The third %READA option is described in the documentation for
 * generic_make_request() which ll_rw_block() calls.
 *
 * This function provides extra functionality that is not in
 * generic_make_request() that is relevant to buffers in the buffer
 * cache or page cache.  In particular it drops any buffer that it
 * cannot get a lock on (with the BH_Lock state bit), any buffer that
 * appears to be clean when doing a write request, and any buffer that
 * appears to be up-to-date when doing a read request.  Further it marks
 * as clean buffers that are processed for writing (the buffer cache
 * won't assume that they are actually clean until the buffer gets
 * unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.  A client that needs a more interesting completion
 * routine should call submit_bh() (or generic_make_request())
 * directly.
 *
 * All of the buffers must be for the same device, and must also be
 * of the current approved size for the device. */
void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
{
	major = MAJOR(bhs[0]->b_dev);

	/* Determine correct block size for this device. */
	correct_size = get_hardsect_size(bhs[0]->b_dev);

	/* Verify requested block sizes. */
	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];
		if (bh->b_size % correct_size) {
			printk(KERN_NOTICE "ll_rw_block: device %s: "
			       "only %d-char blocks implemented (%u)\n",
			       kdevname(bhs[0]->b_dev),
			       correct_size, bh->b_size);

	if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
		printk(KERN_NOTICE "Can't write to read-only device %s\n",
		       kdevname(bhs[0]->b_dev));

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		/* Only one thread can actually submit the I/O. */
		if (test_and_set_bit(BH_Lock, &bh->b_state))

		/* We have the buffer lock */
		atomic_inc(&bh->b_count);
		bh->b_end_io = end_buffer_io_sync;

		if (!atomic_set_buffer_clean(bh))
			/* Hmmph! Nothing to write */
		__mark_buffer_clean(bh);

		if (buffer_uptodate(bh))
			/* Hmmph! Already have it */

		bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));

	/* Make sure we don't get infinite dirty retries.. */
	for (i = 0; i < nr; i++)
		mark_buffer_clean(bhs[i]);
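
/*
 * Illustrative sketch (not part of the original file): the classic
 * synchronous read pattern built on ll_rw_block(), as filesystems get via
 * bread().  "dev", "block" and "blocksize" are placeholders.
 */
#if 0
	struct buffer_head *bh = getblk(dev, block, blocksize);

	if (!buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);	/* sleep until b_end_io unlocks it */
	}
#endif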
#ifdef CONFIG_STRAM_SWAP
extern int stram_device_init (void);
#endif
/**
 * end_that_request_first - end I/O on one buffer.
 * @req: the request being processed
 * @uptodate: 0 for I/O error
 * @name: the name printed for an I/O error
 *
 * Ends I/O on the first buffer attached to @req, and sets it up
 * for the next buffer_head (if any) in the cluster.
 *
 * 0 - we are done with this request, call end_that_request_last()
 * 1 - still buffers pending for this request
 *
 * Drivers implementing their own end_request handling must call
 * blk_finished_io() appropriately.
 **/
int end_that_request_first (struct request *req, int uptodate, char *name)
{
	struct buffer_head * bh;
	int nsect;

	if (!uptodate)
		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
			kdevname(req->rq_dev), name, req->sector);

	if ((bh = req->bh) != NULL) {
		nsect = bh->b_size >> 9;
		blk_finished_io(nsect);
		req->bh = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, uptodate);
		if ((bh = req->bh) != NULL) {
			req->hard_sector += nsect;
			req->hard_nr_sectors -= nsect;
			req->sector = req->hard_sector;
			req->nr_sectors = req->hard_nr_sectors;

			req->current_nr_sectors = bh->b_size >> 9;
			req->hard_cur_sectors = req->current_nr_sectors;
			if (req->nr_sectors < req->current_nr_sectors) {
				req->nr_sectors = req->current_nr_sectors;
				printk("end_request: buffer-list destroyed\n");
			}
			req->buffer = bh->b_data;
			return 1;
		}
	}
	return 0;
}
void end_that_request_last(struct request *req)
{
	if (req->waiting != NULL)
		complete(req->waiting);
	req_finished_io(req);
	blkdev_release_request(req);
}
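
/*
 * Illustrative sketch (not part of the original file): the usual completion
 * pattern in a driver's interrupt handler, finishing the current request
 * one buffer_head at a time.  "mydev" and the way the request is located
 * are assumptions made for the example.
 */
#if 0
	struct request *req = blkdev_entry_next_request(&q->queue_head);
	int uptodate = 1;			/* 0 on I/O error */

	if (!end_that_request_first(req, uptodate, "mydev")) {
		/* all buffer_heads done: take it off the queue and free it */
		blkdev_dequeue_request(req);
		end_that_request_last(req);
	}
#endif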
int __init blk_dev_init(void)
{
	struct blk_dev_struct *dev;

	request_cachep = kmem_cache_create("blkdev_requests",
					   sizeof(struct request),
					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!request_cachep)
		panic("Can't create request pool slab cache\n");

	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
		dev->queue = NULL;

	memset(ro_bits, 0, sizeof(ro_bits));
	memset(max_readahead, 0, sizeof(max_readahead));
	memset(max_sectors, 0, sizeof(max_sectors));

	blk_max_low_pfn = max_low_pfn - 1;
	blk_max_pfn = max_pfn - 1;
#ifdef CONFIG_AMIGA_Z2RAM
#ifdef CONFIG_STRAM_SWAP
	stram_device_init();
#ifdef CONFIG_ISP16_CDI
#ifdef CONFIG_BLK_DEV_PS2
#ifdef CONFIG_BLK_DEV_XD
#ifdef CONFIG_BLK_DEV_MFM
#ifdef CONFIG_PARIDE
	{ extern void paride_init(void); paride_init(); };
#ifdef CONFIG_MAC_FLOPPY
#ifdef CONFIG_BLK_DEV_SWIM_IOP
#ifdef CONFIG_AMIGA_FLOPPY
	amiga_floppy_init();
#ifdef CONFIG_ATARI_FLOPPY
	atari_floppy_init();
#ifdef CONFIG_BLK_DEV_FD
#if defined(__i386__)	/* Do we even need this? */
#ifdef CONFIG_CDU31A
#ifdef CONFIG_ATARI_ACSI
#ifdef CONFIG_CDU535
#ifdef CONFIG_APBLOCK
#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_BLOCK)
#ifdef CONFIG_BLK_DEV_XPRAM
#ifdef CONFIG_SUN_JSFLASH
EXPORT_SYMBOL(io_request_lock);
EXPORT_SYMBOL(end_that_request_first);
EXPORT_SYMBOL(end_that_request_last);
EXPORT_SYMBOL(blk_grow_request_list);
EXPORT_SYMBOL(blk_init_queue);
EXPORT_SYMBOL(blk_get_queue);
EXPORT_SYMBOL(blk_cleanup_queue);
EXPORT_SYMBOL(blk_queue_headactive);
EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);
EXPORT_SYMBOL(generic_unplug_device);
EXPORT_SYMBOL(blk_queue_bounce_limit);
EXPORT_SYMBOL(blk_max_low_pfn);
EXPORT_SYMBOL(blk_max_pfn);
EXPORT_SYMBOL(blk_seg_merge_ok);
EXPORT_SYMBOL(blk_nohighio);