4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/module.h>
49 #include <linux/completion.h>
51 #include <asm/uaccess.h>
53 #include <asm/bitops.h>
54 #include <asm/mmu_context.h>
56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
58 number of unused buffer heads */
60 /* Anti-deadlock ordering:
61 * lru_list_lock > hash_table_lock > unused_list_lock
64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
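/*
 * Illustrative sketch (not part of the original source): how a caller takes
 * all three locks while respecting the anti-deadlock ordering documented
 * above.  This mirrors what try_to_free_buffers() does further down.
 */
#if 0
	spin_lock(&lru_list_lock);		/* outermost */
	write_lock(&hash_table_lock);		/* nests inside lru_list_lock */
	spin_lock(&unused_list_lock);		/* innermost */
	/* ... detach the page's buffer heads ... */
	spin_unlock(&unused_list_lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
#endif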
69 static unsigned int bh_hash_mask;
70 static unsigned int bh_hash_shift;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
74 static struct buffer_head *lru_list[NR_LIST];
76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77 #define lru_list_lock lru_list_lock_cacheline.lock
79 static int nr_buffers_type[NR_LIST];
80 static unsigned long size_buffers_type[NR_LIST];
82 static struct buffer_head * unused_list;
83 static int nr_unused_buffer_heads;
84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
88 static int osync_buffers_list(struct list_head *);
89 static void __refile_buffer(struct buffer_head *);
92 * A global sysctl-controlled flag which puts the machine into "laptop mode"
96 static DECLARE_WAIT_QUEUE_HEAD(kupdate_wait);
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
101 /* Here is the parameter block for the bdflush process. If you add or
102 * remove any of the parameters, make sure to update kernel/sysctl.c
103 * and the documentation at linux/Documentation/sysctl/vm.txt.
108 /* The dummy values in this structure are left in there for compatibility
109 * with old programs that play with the /proc entries.
111 union bdflush_param {
113 int nfract; /* Percentage of buffer cache dirty to
115 int ndirty; /* Maximum number of dirty blocks to write out per
117 int dummy2; /* old "nrefill" */
118 int dummy3; /* unused */
119 int interval; /* jiffies delay between kupdate flushes */
120 int age_buffer; /* Time for normal buffer to age before we flush it */
121 int nfract_sync;/* Percentage of buffer cache dirty to
122 activate bdflush synchronously */
123 int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
124 int dummy5; /* unused */
126 unsigned int data[N_PARAM];
127 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = { 0, 1, 0, 0, 0, 1*HZ, 0, 0, 0};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
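/*
 * Worked example (illustrative, not from the original source): with the
 * default nfract = 30 and nfract_sync = 60 above, and assuming
 * nr_free_buffer_pages() reports 10000 pages, balance_dirty_state() below
 * starts background writeout once dirty buffer data exceeds roughly 30% of
 * those pages, and throttles the dirtying process synchronously above
 * roughly 60%.  Plugging the numbers into the limit calculation:
 */
#if 0
	/* with tot = 10000 pages and the defaults above: */
	soft_dirty_limit = tot * bdf_prm.b_un.nfract;		/* 10000 * 30 = 300000 */
	hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;	/* 10000 * 60 = 600000 */
	/* compared against the dirty page count scaled by 100, i.e. 30% and 60% of tot */
#endif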
133 static inline int write_buffer_delay(struct buffer_head *bh)
135 struct page *page = bh->b_page;
137 if (!TryLockPage(page)) {
138 spin_unlock(&lru_list_lock);
140 page->mapping->a_ops->writepage(page);
147 static inline void write_buffer(struct buffer_head *bh)
149 if (buffer_delay(bh)) {
150 struct page *page = bh->b_page;
153 if (buffer_delay(bh)) {
154 page->mapping->a_ops->writepage(page);
160 ll_rw_block(WRITE, 1, &bh);
163 void fastcall unlock_buffer(struct buffer_head *bh)
165 clear_bit(BH_Wait_IO, &bh->b_state);
166 clear_bit(BH_Launder, &bh->b_state);
168 * When a locked buffer is visible to the I/O layer BH_Launder
169 * is set. This means before unlocking we must clear BH_Launder,
170 * do an mb() (needed on alpha) and then clear BH_Lock, so that no reader
171 * can see BH_Launder set on an unlocked buffer and risk a deadlock.
173 smp_mb__after_clear_bit();
174 clear_bit(BH_Lock, &bh->b_state);
175 smp_mb__after_clear_bit();
176 if (waitqueue_active(&bh->b_wait))
177 wake_up(&bh->b_wait);
181 * Note that the real wait_on_buffer() is an inline function that checks
182 * that the buffer is locked before calling this, so that unnecessary disk
183 * unplugging does not occur.
185 void __wait_on_buffer(struct buffer_head * bh)
187 struct task_struct *tsk = current;
188 DECLARE_WAITQUEUE(wait, tsk);
191 add_wait_queue(&bh->b_wait, &wait);
193 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
194 if (!buffer_locked(bh))
197 * We must read tq_disk in TQ_ACTIVE after the
198 * add_wait_queue effect is visible to other cpus.
199 * We could unplug a few lines above and it wouldn't matter,
200 * but we can't do that right after add_wait_queue
201 * without an smp_mb() in between, because spin_unlock
202 * has inclusive semantics.
203 * Doing it here is the most efficient place, so we
204 * don't do a spurious unplug if we get a racy
205 * wakeup that makes buffer_locked() return 0; and
206 * doing it here avoids an explicit smp_mb() since we
207 * rely on the implicit one in set_task_state.
209 run_task_queue(&tq_disk);
211 } while (buffer_locked(bh));
212 tsk->state = TASK_RUNNING;
213 remove_wait_queue(&bh->b_wait, &wait);
218 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
219 * unlock the buffer. This is what ll_rw_block uses too.
221 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
223 mark_buffer_uptodate(bh, uptodate);
229 * The buffers have been marked clean and locked. Just submit the dang
232 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
235 struct buffer_head * bh = *array++;
236 bh->b_end_io = end_buffer_io_sync;
237 submit_bh(WRITE, bh);
242 * Write some buffers from the head of the dirty queue.
244 * This must be called with the LRU lock held, and will
248 static int write_some_buffers(kdev_t dev)
250 struct buffer_head *next;
251 struct buffer_head *array[NRSYNC];
255 next = lru_list[BUF_DIRTY];
256 nr = nr_buffers_type[BUF_DIRTY];
258 while (next && --nr >= 0) {
259 struct buffer_head * bh = next;
260 next = bh->b_next_free;
262 if (dev != NODEV && bh->b_dev != dev)
264 if (test_and_set_bit(BH_Lock, &bh->b_state))
266 if (buffer_delay(bh)) {
267 if (write_buffer_delay(bh)) {
269 write_locked_buffers(array, count);
272 } else if (atomic_set_buffer_clean(bh)) {
279 spin_unlock(&lru_list_lock);
280 write_locked_buffers(array, count);
286 spin_unlock(&lru_list_lock);
289 write_locked_buffers(array, count);
294 * Write out all buffers on the dirty list.
296 static void write_unlocked_buffers(kdev_t dev)
299 spin_lock(&lru_list_lock);
300 while (write_some_buffers(dev));
304 * Wait for a buffer on the proper list.
306 * This must be called with the LRU lock held, and
307 * will return with it released.
309 static int wait_for_buffers(kdev_t dev, int index, int refile)
311 struct buffer_head * next;
314 next = lru_list[index];
315 nr = nr_buffers_type[index];
316 while (next && --nr >= 0) {
317 struct buffer_head *bh = next;
318 next = bh->b_next_free;
320 if (!buffer_locked(bh)) {
325 if (dev != NODEV && bh->b_dev != dev)
329 spin_unlock(&lru_list_lock);
334 spin_unlock(&lru_list_lock);
338 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
341 spin_lock(&lru_list_lock);
342 } while (wait_for_buffers(dev, index, refile));
346 /* Call sync_buffers with wait!=0 to ensure that the call does not
347 * return until all buffer writes have completed. Sync() may return
348 * before the writes have finished; fsync() may not.
351 /* Godamity-damn. Some buffers (bitmaps for filesystems)
352 * spontaneously dirty themselves without ever brelse being called.
353 * We will ultimately want to put these in a separate list, but for
354 * now we search all of the lists for dirty buffers.
356 int sync_buffers(kdev_t dev, int wait)
360 /* One pass for no-wait, three for wait:
361 * 0) write out all dirty, unlocked buffers;
362 * 1) wait for all dirty locked buffers;
363 * 2) write out all dirty, unlocked buffers;
364 * 3) wait for completion by waiting for all buffers to unlock.
366 write_unlocked_buffers(dev);
368 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
369 write_unlocked_buffers(dev);
370 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
374 EXPORT_SYMBOL(sync_buffers);
376 int fsync_super(struct super_block *sb)
378 kdev_t dev = sb->s_dev;
379 sync_buffers(dev, 0);
385 if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
386 sb->s_op->write_super(sb);
388 if (sb->s_op && sb->s_op->sync_fs)
389 sb->s_op->sync_fs(sb);
392 return sync_buffers(dev, 1);
395 int fsync_no_super(kdev_t dev)
397 sync_buffers(dev, 0);
398 return sync_buffers(dev, 1);
401 int fsync_dev(kdev_t dev)
403 sync_buffers(dev, 0);
411 return sync_buffers(dev, 1);
415 * There's no real reason to pretend we should
416 * ever do anything differently
418 void sync_dev(kdev_t dev)
423 asmlinkage long sys_sync(void)
430 * filp may be NULL if called via the msync of a vma.
433 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
435 struct inode * inode = dentry->d_inode;
436 struct super_block * sb;
441 /* sync the inode to buffers */
442 write_inode_now(inode, 0);
444 /* sync the superblock to buffers */
447 if (sb->s_op && sb->s_op->write_super)
448 sb->s_op->write_super(sb);
451 /* .. finally sync the buffers to disk */
453 ret = sync_buffers(dev, 1);
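/*
 * Sketch (illustrative, not from the original source): a simple block-based
 * filesystem can point its ->fsync method straight at the generic
 * file_fsync() above.  "myfs_file_operations" is a hypothetical name.
 */
#if 0
static struct file_operations myfs_file_operations = {
	read:		generic_file_read,
	write:		generic_file_write,
	mmap:		generic_file_mmap,
	fsync:		file_fsync,	/* writes the inode, superblock and dirty buffers */
};
#endif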
458 asmlinkage long sys_fsync(unsigned int fd)
461 struct dentry * dentry;
462 struct inode * inode;
470 dentry = file->f_dentry;
471 inode = dentry->d_inode;
474 if (!file->f_op || !file->f_op->fsync) {
475 /* Why? We can still call filemap_fdatasync */
479 /* We need to protect against concurrent writers.. */
481 ret = filemap_fdatasync(inode->i_mapping);
482 err = file->f_op->fsync(file, dentry, 0);
485 err = filemap_fdatawait(inode->i_mapping);
496 int do_fdatasync(struct file *file)
499 struct dentry *dentry;
502 if (unlikely(!file->f_op || !file->f_op->fsync))
505 dentry = file->f_dentry;
506 inode = dentry->d_inode;
508 ret = filemap_fdatasync(inode->i_mapping);
509 err = file->f_op->fsync(file, dentry, 1);
512 err = filemap_fdatawait(inode->i_mapping);
518 asmlinkage long sys_fdatasync(unsigned int fd)
529 inode = file->f_dentry->d_inode;
531 ret = do_fdatasync(file);
539 /* After several hours of tedious analysis, the following hash
540 * function won. Do not mess with it... -DaveM
542 #define _hashfn(dev,block) \
543 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
544 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
545 ((block) << (bh_hash_shift - 12))))
546 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
548 static inline void __insert_into_hash_list(struct buffer_head *bh)
550 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
551 struct buffer_head *next = *head;
557 next->b_pprev = &bh->b_next;
560 static __inline__ void __hash_unlink(struct buffer_head *bh)
562 struct buffer_head **pprev = bh->b_pprev;
564 struct buffer_head *next = bh->b_next;
566 next->b_pprev = pprev;
572 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
574 struct buffer_head **bhp = &lru_list[blist];
576 if (bh->b_prev_free || bh->b_next_free) BUG();
580 bh->b_prev_free = bh;
582 bh->b_next_free = *bhp;
583 bh->b_prev_free = (*bhp)->b_prev_free;
584 (*bhp)->b_prev_free->b_next_free = bh;
585 (*bhp)->b_prev_free = bh;
586 nr_buffers_type[blist]++;
587 size_buffers_type[blist] += bh->b_size >> 9;
590 static void __remove_from_lru_list(struct buffer_head * bh)
592 struct buffer_head *next = bh->b_next_free;
594 struct buffer_head *prev = bh->b_prev_free;
595 int blist = bh->b_list;
597 prev->b_next_free = next;
598 next->b_prev_free = prev;
599 if (lru_list[blist] == bh) {
602 lru_list[blist] = next;
604 bh->b_next_free = NULL;
605 bh->b_prev_free = NULL;
606 nr_buffers_type[blist]--;
607 size_buffers_type[blist] -= bh->b_size >> 9;
611 /* must be called with both the hash_table_lock and the lru_list_lock
613 static void __remove_from_queues(struct buffer_head *bh)
616 __remove_from_lru_list(bh);
619 static void remove_from_queues(struct buffer_head *bh)
621 spin_lock(&lru_list_lock);
622 write_lock(&hash_table_lock);
623 __remove_from_queues(bh);
624 write_unlock(&hash_table_lock);
625 spin_unlock(&lru_list_lock);
628 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
630 struct buffer_head *bh, **p = &hash(dev, block);
632 read_lock(&hash_table_lock);
639 if (bh->b_blocknr != block)
641 if (bh->b_size != size)
643 if (bh->b_dev != dev)
649 read_unlock(&hash_table_lock);
653 void fastcall buffer_insert_list(struct buffer_head *bh, struct list_head *list)
655 spin_lock(&lru_list_lock);
656 if (buffer_attached(bh))
657 list_del(&bh->b_inode_buffers);
658 set_buffer_attached(bh);
659 list_add_tail(&bh->b_inode_buffers, list);
660 spin_unlock(&lru_list_lock);
664 * The caller must have the lru_list lock before calling the
665 * remove_inode_queue functions.
667 static void __remove_inode_queue(struct buffer_head *bh)
669 list_del(&bh->b_inode_buffers);
670 clear_buffer_attached(bh);
673 static inline void remove_inode_queue(struct buffer_head *bh)
675 if (buffer_attached(bh))
676 __remove_inode_queue(bh);
679 int inode_has_buffers(struct inode *inode)
683 spin_lock(&lru_list_lock);
684 ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
685 spin_unlock(&lru_list_lock);
690 /* If invalidate_buffers() will trash dirty buffers, it means some kind
691 of fs corruption is going on. Trashing dirty data always implies losing
692 information that was supposed to be just stored on the physical layer
695 Thus invalidate_buffers in general usage is not allowed to trash
696 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
697 be preserved. These buffers are simply skipped.
699 We also skip buffers which are still in use. For example this can
700 happen if a userspace program is reading the block device.
702 NOTE: In the case where the user removed a removable-media disk even though
703 there's still dirty data not synced to disk (due to a bug in the device driver
704 or to a user error), by not destroying the dirty buffers we could
705 also generate corruption on the next media inserted; thus a parameter is
706 necessary to handle this case in the safest way possible (trying
707 not to corrupt the newly inserted disk with data belonging to
708 the old, now corrupted, disk). Also, for the ramdisk the natural thing
709 to do in order to release the ramdisk memory is to destroy dirty buffers.
711 These are two special cases. Normal usage implies that the device driver
712 issues a sync on the device (without waiting for I/O completion) and
713 then an invalidate_buffers call that doesn't trash dirty buffers.
715 For handling cache coherency with the blkdev pagecache the 'update' case
716 has been introduced. It is needed to re-read from disk any pinned
717 buffer. NOTE: re-reading from disk is destructive so we can do it only
718 when we assume nobody is changing the buffercache under our I/O and when
719 we think the disk contains more recent information than the buffercache.
720 The update == 1 pass marks the buffers we need to update, the update == 2
721 pass does the actual I/O. */
722 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
725 struct buffer_head * bh, * bh_next;
726 kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */
730 spin_lock(&lru_list_lock);
731 for(nlist = 0; nlist < NR_LIST; nlist++) {
732 bh = lru_list[nlist];
735 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
736 bh_next = bh->b_next_free;
738 /* Another device? */
739 if (bh->b_dev != dev)
744 if (buffer_locked(bh)) {
746 spin_unlock(&lru_list_lock);
749 spin_lock(&lru_list_lock);
753 write_lock(&hash_table_lock);
754 /* All buffers in the lru lists are mapped */
755 if (!buffer_mapped(bh))
757 if (buffer_dirty(bh) && destroy_dirty_buffers)
758 printk("invalidate: dirty buffer\n");
759 if (!atomic_read(&bh->b_count)) {
760 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
761 remove_inode_queue(bh);
763 } else if (!bdev->bd_openers)
764 printk("invalidate: busy buffer\n");
766 write_unlock(&hash_table_lock);
772 spin_unlock(&lru_list_lock);
776 /* Get rid of the page cache */
777 invalidate_inode_pages(bdev->bd_inode);
780 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
782 struct block_device *bdev = bdget(dev);
784 invalidate_bdev(bdev, destroy_dirty_buffers);
789 static void free_more_memory(void)
793 try_to_free_pages(GFP_NOIO);
794 run_task_queue(&tq_disk);
798 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
800 bh->b_list = BUF_CLEAN;
801 bh->b_end_io = handler;
802 bh->b_private = private;
805 void end_buffer_io_async(struct buffer_head * bh, int uptodate)
807 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
809 struct buffer_head *tmp;
813 mark_buffer_uptodate(bh, uptodate);
815 /* This is a temporary buffer used for page I/O. */
822 * Be _very_ careful from here on. Bad things can happen if
823 * two buffer heads end IO at almost the same time and both
824 * decide that the page is now completely done.
826 * Async buffer_heads are here only as labels for IO, and get
827 * thrown away once the IO for this page is complete. IO is
828 * deemed complete once all buffers have been visited
829 * (b_count==0) and are now unlocked. We must make sure that
830 * only the _last_ buffer that decrements its count is the one
831 * that unlocks the page..
833 spin_lock_irqsave(&page_uptodate_lock, flags);
834 mark_buffer_async(bh, 0);
836 tmp = bh->b_this_page;
838 if (buffer_locked(tmp)) {
839 if (buffer_async(tmp))
841 } else if (!buffer_uptodate(tmp))
843 tmp = tmp->b_this_page;
846 /* OK, the async IO on this page is complete. */
847 spin_unlock_irqrestore(&page_uptodate_lock, flags);
850 * If none of the buffers had errors and all were uptodate
851 * then we can set the page uptodate:
853 if (fullup && !PageError(page))
854 SetPageUptodate(page);
861 spin_unlock_irqrestore(&page_uptodate_lock, flags);
865 inline void set_buffer_async_io(struct buffer_head *bh)
867 bh->b_end_io = end_buffer_io_async;
868 mark_buffer_async(bh, 1);
872 * Synchronise all the inode's dirty buffers to the disk.
874 * We have conflicting pressures: we want to make sure that all
875 * initially dirty buffers get waited on, but that any subsequently
876 * dirtied buffers don't. After all, we don't want fsync to last
877 * forever if somebody is actively writing to the file.
879 * Do this in two main stages: first we copy dirty buffers to a
880 * temporary inode list, queueing the writes as we go. Then we clean
881 * up, waiting for those writes to complete.
883 * During this second stage, any subsequent updates to the file may end
884 * up refiling the buffer on the original inode's dirty list again, so
885 * there is a chance we will end up with a buffer queued for write but
886 * not yet completed on that list. So, as a final cleanup we go through
887 * the osync code to catch these locked, dirty buffers without requeuing
888 * any newly dirty buffers for write.
890 int fsync_buffers_list(struct list_head *list)
892 struct buffer_head *bh;
893 struct list_head tmp;
896 INIT_LIST_HEAD(&tmp);
898 spin_lock(&lru_list_lock);
900 while (!list_empty(list)) {
901 bh = BH_ENTRY(list->next);
902 list_del(&bh->b_inode_buffers);
903 if (!buffer_dirty(bh) && !buffer_locked(bh))
904 clear_buffer_attached(bh);
906 set_buffer_attached(bh);
907 list_add(&bh->b_inode_buffers, &tmp);
908 if (buffer_dirty(bh)) {
910 spin_unlock(&lru_list_lock);
912 * Wait for I/O completion before submitting
913 * the buffer, to be sure the write will
914 * be effective on the latest data in
915 * the buffer. (otherwise - if there's old
916 * I/O in flight - write_buffer would become
922 spin_lock(&lru_list_lock);
927 while (!list_empty(&tmp)) {
928 bh = BH_ENTRY(tmp.prev);
929 remove_inode_queue(bh);
931 spin_unlock(&lru_list_lock);
933 if (!buffer_uptodate(bh))
936 spin_lock(&lru_list_lock);
939 spin_unlock(&lru_list_lock);
940 err2 = osync_buffers_list(list);
949 * osync is designed to support O_SYNC io. It waits synchronously for
950 * all already-submitted IO to complete, but does not queue any new
951 * writes to the disk.
953 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
954 * you dirty the buffers, and then use osync_buffers_list to wait for
955 * completion. Any other dirty buffers which are not yet queued for
956 * write will not be flushed to disk by the osync.
958 static int osync_buffers_list(struct list_head *list)
960 struct buffer_head *bh;
964 spin_lock(&lru_list_lock);
967 list_for_each_prev(p, list) {
969 if (buffer_locked(bh)) {
971 spin_unlock(&lru_list_lock);
973 if (!buffer_uptodate(bh))
976 spin_lock(&lru_list_lock);
981 spin_unlock(&lru_list_lock);
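/*
 * Sketch of the O_SYNC pattern described above (illustrative, not from the
 * original source; "nr" and "bhs[]" are hypothetical):
 */
#if 0
	ll_rw_block(WRITE, nr, bhs);			/* queue the already-dirtied buffers */
	err = osync_buffers_list(&inode->i_dirty_buffers); /* wait only for the submitted I/O */
#endif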
986 * Invalidate any and all dirty buffers on a given inode. We are
987 * probably unmounting the fs, but that doesn't mean we have already
988 * done a sync(). Just drop the buffers from the inode list.
990 void invalidate_inode_buffers(struct inode *inode)
992 struct list_head * entry;
994 spin_lock(&lru_list_lock);
995 while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
996 remove_inode_queue(BH_ENTRY(entry));
997 while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
998 remove_inode_queue(BH_ENTRY(entry));
999 spin_unlock(&lru_list_lock);
1004 * Ok, this is getblk, and it isn't very clear, again to hinder
1005 * race-conditions. Most of the code is seldom used, (ie repeating),
1006 * so it should be much more efficient than it looks.
1008 * The algorithm is changed: hopefully better, and an elusive bug removed.
1010 * 14.02.92: changed it to sync dirty buffers a bit: better performance
1011 * when the filesystem starts to get full of dirty blocks (I hope).
1013 struct buffer_head * getblk(kdev_t dev, int block, int size)
1016 struct buffer_head * bh;
1018 bh = get_hash_table(dev, block, size);
1024 if (!grow_buffers(dev, block, size))
1029 /* -1 -> no need to flush
1031 1 -> sync flush (wait for I/O completion) */
1032 static int balance_dirty_state(void)
1034 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1036 dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1037 tot = nr_free_buffer_pages();
1040 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1041 hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1043 /* First, check for the "real" dirty limit. */
1044 if (dirty > soft_dirty_limit) {
1045 if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1053 static int bdflush_stop(void)
1055 unsigned long dirty, tot, dirty_limit;
1057 dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1058 tot = nr_free_buffer_pages();
1061 dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1063 if (!laptop_mode && dirty > dirty_limit)
1069 * if a new dirty buffer is created we need to balance bdflush.
1071 * in the future we might want to make bdflush aware of different
1072 * pressures on different devices - thus the (currently unused)
1075 void balance_dirty(void)
1077 int state = balance_dirty_state();
1085 * And if we're _really_ out of balance, wait for
1086 * some of the dirty/locked buffers ourselves.
1087 * This will throttle heavy writers.
1090 spin_lock(&lru_list_lock);
1091 write_some_buffers(NODEV);
1094 EXPORT_SYMBOL(balance_dirty);
1096 inline void fastcall __mark_dirty(struct buffer_head *bh)
1098 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1102 /* atomic version, the user must call balance_dirty() by hand
1103 as soon as it becomes possible to block */
1104 void fastcall __mark_buffer_dirty(struct buffer_head *bh)
1106 if (!atomic_set_buffer_dirty(bh))
1110 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1112 if (!atomic_set_buffer_dirty(bh)) {
1114 printk("%s: dirtied buffer\n", current->comm);
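/*
 * Sketch (illustrative, not from the original source): the atomic variant
 * above is used when the caller cannot block, with the balancing deferred
 * until sleeping is allowed again.
 */
#if 0
	__mark_buffer_dirty(bh);	/* safe in atomic context, no throttling */
	/* ... leave the atomic region ... */
	balance_dirty();		/* throttle against bdflush once we may block */
#endif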
1120 void set_buffer_flushtime(struct buffer_head *bh)
1122 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1124 EXPORT_SYMBOL(set_buffer_flushtime);
1126 int get_buffer_flushtime(void)
1128 return bdf_prm.b_un.interval;
1130 EXPORT_SYMBOL(get_buffer_flushtime);
1133 * A buffer may need to be moved from one buffer list to another
1134 * (e.g. in case it is not shared any more). Handle this.
1136 static void __refile_buffer(struct buffer_head *bh)
1138 int dispose = BUF_CLEAN;
1139 if (buffer_locked(bh))
1140 dispose = BUF_LOCKED;
1141 if (buffer_dirty(bh))
1142 dispose = BUF_DIRTY;
1143 if (dispose != bh->b_list) {
1144 __remove_from_lru_list(bh);
1145 bh->b_list = dispose;
1146 if (dispose == BUF_CLEAN)
1147 remove_inode_queue(bh);
1148 __insert_into_lru_list(bh, dispose);
1152 void refile_buffer(struct buffer_head *bh)
1154 spin_lock(&lru_list_lock);
1155 __refile_buffer(bh);
1156 spin_unlock(&lru_list_lock);
1160 * Release a buffer head
1162 void __brelse(struct buffer_head * buf)
1164 if (atomic_read(&buf->b_count)) {
1168 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1172 * bforget() is like brelse(), except it discards any
1173 * potentially dirty data.
1175 void __bforget(struct buffer_head * buf)
1177 mark_buffer_clean(buf);
1182 * bread() - reads a specified block and returns the bh
1183 * @block: number of block
1184 * @size: size (in bytes) to read
1186 * Reads a specified block, and returns buffer head that
1187 * contains it. It returns NULL if the block was unreadable.
1189 struct buffer_head * bread(kdev_t dev, int block, int size)
1191 struct buffer_head * bh;
1193 bh = getblk(dev, block, size);
1194 if (buffer_uptodate(bh))
1196 set_bit(BH_Sync, &bh->b_state);
1197 ll_rw_block(READ, 1, &bh);
1199 if (buffer_uptodate(bh))
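/*
 * Illustrative use of bread()/brelse() (a sketch, not from the original
 * source): a filesystem reads one metadata block and drops its reference
 * when done.  "myfs_check_super", "sb_block" and "blocksize" are hypothetical.
 */
#if 0
static int myfs_check_super(kdev_t dev, int sb_block, int blocksize)
{
	struct buffer_head *bh;

	bh = bread(dev, sb_block, blocksize);	/* blocks until up to date, or NULL */
	if (!bh)
		return -EIO;
	/* ... examine bh->b_data ... */
	brelse(bh);				/* drop the b_count reference */
	return 0;
}
#endif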
1206 * Note: the caller should wake up the buffer_wait list if needed.
1208 static void __put_unused_buffer_head(struct buffer_head * bh)
1210 if (unlikely(buffer_attached(bh)))
1212 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1213 kmem_cache_free(bh_cachep, bh);
1217 bh->b_this_page = NULL;
1219 nr_unused_buffer_heads++;
1220 bh->b_next_free = unused_list;
1225 void put_unused_buffer_head(struct buffer_head *bh)
1227 spin_lock(&unused_list_lock);
1228 __put_unused_buffer_head(bh);
1229 spin_unlock(&unused_list_lock);
1231 EXPORT_SYMBOL(put_unused_buffer_head);
1234 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1235 * no-buffer-head deadlock. Return NULL on failure; waiting for
1236 * buffer heads is now handled in create_buffers().
1238 struct buffer_head * get_unused_buffer_head(int async)
1240 struct buffer_head * bh;
1242 spin_lock(&unused_list_lock);
1243 if (nr_unused_buffer_heads > NR_RESERVED) {
1245 unused_list = bh->b_next_free;
1246 nr_unused_buffer_heads--;
1247 spin_unlock(&unused_list_lock);
1250 spin_unlock(&unused_list_lock);
1252 /* This is critical. We can't call out to the FS
1253 * to get more buffer heads, because the FS may need
1254 * more buffer-heads itself. Thus SLAB_NOFS.
1256 if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1258 bh->b_this_page = NULL;
1263 * If we need an async buffer, use the reserved buffer heads.
1264 * Non-PF_MEMALLOC tasks can just loop in create_buffers().
1266 if (async && (current->flags & PF_MEMALLOC)) {
1267 spin_lock(&unused_list_lock);
1270 unused_list = bh->b_next_free;
1271 nr_unused_buffer_heads--;
1272 spin_unlock(&unused_list_lock);
1275 spin_unlock(&unused_list_lock);
1280 EXPORT_SYMBOL(get_unused_buffer_head);
1282 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1284 if (offset >= PAGE_SIZE)
1287 if (PageHighMem(page)) {
1288 bh->b_data = (char *)offset;
1290 bh->b_data = page_address(page) + offset;
1294 EXPORT_SYMBOL(set_bh_page);
1297 * Create the appropriate buffers when given a page for data area and
1298 * the size of each buffer.. Use the bh->b_this_page linked list to
1299 * follow the buffers created. Return NULL if unable to create more
1301 * The async flag is used to differentiate async IO (paging, swapping)
1302 * from ordinary buffer allocations, and only async requests are allowed
1303 * to sleep waiting for buffer heads.
1305 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1307 struct buffer_head *bh, *head;
1313 while ((offset -= size) >= 0) {
1314 bh = get_unused_buffer_head(async);
1319 bh->b_this_page = head;
1323 bh->b_next_free = NULL;
1325 atomic_set(&bh->b_count, 0);
1328 set_bh_page(bh, page, offset);
1330 bh->b_list = BUF_CLEAN;
1331 bh->b_end_io = NULL;
1335 * In case anything failed, we just free everything we got.
1339 spin_lock(&unused_list_lock);
1342 head = head->b_this_page;
1343 __put_unused_buffer_head(bh);
1345 spin_unlock(&unused_list_lock);
1347 /* Wake up any waiters ... */
1348 wake_up(&buffer_wait);
1352 * Return failure for non-async IO requests. Async IO requests
1353 * are not allowed to fail, so we have to wait until buffer heads
1354 * become available. But we don't want tasks sleeping with
1355 * partially complete buffers, so all were released above.
1360 /* We're _really_ low on memory. Now we just
1361 * wait for old buffer heads to become free due to
1362 * finishing IO. Since this is an async request and
1363 * the reserve list is empty, we're sure there are
1364 * async buffer heads in use.
1366 run_task_queue(&tq_disk);
1373 * Called when truncating a buffer on a page completely.
1375 static void discard_buffer(struct buffer_head * bh)
1377 if (buffer_mapped(bh) || buffer_delay(bh)) {
1378 mark_buffer_clean(bh);
1380 clear_bit(BH_Uptodate, &bh->b_state);
1381 clear_bit(BH_Mapped, &bh->b_state);
1382 clear_bit(BH_Req, &bh->b_state);
1383 clear_bit(BH_New, &bh->b_state);
1384 clear_bit(BH_Delay, &bh->b_state);
1385 remove_from_queues(bh);
1391 * try_to_release_page - release old fs-specific metadata on a page
1395 int try_to_release_page(struct page * page, int gfp_mask)
1397 if (!PageLocked(page))
1402 if (!page->mapping->a_ops->releasepage)
1404 if (page->mapping->a_ops->releasepage(page, gfp_mask))
1407 * We couldn't release buffer metadata; don't even bother trying
1408 * to release buffers.
1412 return try_to_free_buffers(page, gfp_mask);
1416 * We don't have to release all buffers here, but
1417 * we have to be sure that no dirty buffer is left
1418 * and no IO is going on (no buffer is locked), because
1419 * we have truncated the file and are going to free the
1422 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1424 struct buffer_head *head, *bh, *next;
1425 unsigned int curr_off = 0;
1427 if (!PageLocked(page))
1432 head = page->buffers;
1435 unsigned int next_off = curr_off + bh->b_size;
1436 next = bh->b_this_page;
1439 * is this block fully flushed?
1441 if (offset <= curr_off)
1443 curr_off = next_off;
1445 } while (bh != head);
1448 * subtle. We release buffer-heads only if this is
1449 * the 'final' flushpage. We have invalidated the get_block
1450 * cached value unconditionally, so real IO is not
1453 * If the free doesn't work out, the buffers can be
1454 * left around - they just turn into anonymous buffers
1458 if (!try_to_release_page(page, 0))
1465 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1467 struct buffer_head *bh, *head, *tail;
1469 /* FIXME: create_buffers should fail if there's not enough memory */
1470 head = create_buffers(page, blocksize, 1);
1478 bh->b_end_io = NULL;
1480 bh = bh->b_this_page;
1482 tail->b_this_page = head;
1483 page->buffers = head;
1484 page_cache_get(page);
1486 EXPORT_SYMBOL(create_empty_buffers);
1489 * We are taking a block for data and we don't want any output from any
1490 * buffer-cache aliases starting from return from that function and
1491 * until the moment when something will explicitly mark the buffer
1492 * dirty (hopefully that will not happen until we free that block ;-)
1493 * We don't even need to mark it not-uptodate - nobody can expect
1494 * anything from a newly allocated buffer anyway. We used to use
1495 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1496 * don't want to mark the alias unmapped, for example - it would confuse
1497 * anyone who might pick it with bread() afterwards...
1500 static void unmap_underlying_metadata(struct buffer_head * bh)
1502 struct buffer_head *old_bh;
1504 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1506 mark_buffer_clean(old_bh);
1507 wait_on_buffer(old_bh);
1508 clear_bit(BH_Req, &old_bh->b_state);
1514 * NOTE! All mapped/uptodate combinations are valid:
1516 *   Mapped    Uptodate      Meaning
1518 *   No        No            "unknown" - must do get_block()
1519 *   No        Yes           "hole" - zero-filled
1520 *   Yes       No            "allocated" - allocated on disk, not read in
1521 *   Yes       Yes           "valid" - allocated and up-to-date in memory.
1523 * "Dirty" is valid only with the last case (mapped+uptodate).
1527 * block_write_full_page() is SMP threaded - the kernel lock is not held.
1529 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1532 unsigned long block;
1533 struct buffer_head *bh, *head;
1536 if (!PageLocked(page))
1540 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1541 head = page->buffers;
1543 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1548 /* Stage 1: make sure we have all the buffers mapped! */
1551 * If the buffer isn't up-to-date, we can't be sure
1552 * that the buffer has been initialized with the proper
1553 * block number information etc..
1555 * Leave it to the low-level FS to make all those
1556 * decisions (block #0 may actually be a valid block)
1558 if (!buffer_mapped(bh)) {
1559 err = get_block(inode, block, bh, 1);
1563 unmap_underlying_metadata(bh);
1565 bh = bh->b_this_page;
1567 } while (bh != head);
1569 /* Stage 2: lock the buffers, mark them clean */
1572 set_buffer_async_io(bh);
1573 set_bit(BH_Uptodate, &bh->b_state);
1574 clear_bit(BH_Dirty, &bh->b_state);
1575 bh = bh->b_this_page;
1576 } while (bh != head);
1578 /* Stage 3: submit the IO */
1580 struct buffer_head *next = bh->b_this_page;
1581 submit_bh(WRITE, bh);
1583 } while (bh != head);
1585 /* Done - end_buffer_io_async will unlock */
1586 SetPageUptodate(page);
1588 wakeup_page_waiters(page);
1594 * ENOSPC, or some other error. We may already have added some
1595 * blocks to the file, so we need to write these out to avoid
1596 * exposing stale data.
1598 ClearPageUptodate(page);
1601 /* Recovery: lock and submit the mapped buffers */
1603 if (buffer_mapped(bh)) {
1605 set_buffer_async_io(bh);
1608 bh = bh->b_this_page;
1609 } while (bh != head);
1611 struct buffer_head *next = bh->b_this_page;
1612 if (buffer_mapped(bh)) {
1613 set_bit(BH_Uptodate, &bh->b_state);
1614 clear_bit(BH_Dirty, &bh->b_state);
1615 submit_bh(WRITE, bh);
1618 } while (bh != head);
1621 wakeup_page_waiters(page);
1625 static int __block_prepare_write(struct inode *inode, struct page *page,
1626 unsigned from, unsigned to, get_block_t *get_block)
1628 unsigned block_start, block_end;
1629 unsigned long block;
1631 unsigned blocksize, bbits;
1632 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1633 char *kaddr = kmap(page);
1635 blocksize = 1 << inode->i_blkbits;
1637 create_empty_buffers(page, inode->i_dev, blocksize);
1638 head = page->buffers;
1640 bbits = inode->i_blkbits;
1641 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1643 for(bh = head, block_start = 0; bh != head || !block_start;
1644 block++, block_start=block_end, bh = bh->b_this_page) {
1647 block_end = block_start+blocksize;
1648 if (block_end <= from)
1650 if (block_start >= to)
1652 clear_bit(BH_New, &bh->b_state);
1653 if (!buffer_mapped(bh)) {
1654 err = get_block(inode, block, bh, 1);
1657 if (buffer_new(bh)) {
1658 unmap_underlying_metadata(bh);
1659 if (Page_Uptodate(page)) {
1660 set_bit(BH_Uptodate, &bh->b_state);
1664 memset(kaddr+to, 0, block_end-to);
1665 if (block_start < from)
1666 memset(kaddr+block_start, 0, from-block_start);
1667 if (block_end > to || block_start < from)
1668 flush_dcache_page(page);
1672 if (Page_Uptodate(page)) {
1673 set_bit(BH_Uptodate, &bh->b_state);
1676 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1677 (block_start < from || block_end > to)) {
1678 ll_rw_block(READ, 1, &bh);
1683 * If we issued read requests - let them complete.
1685 while(wait_bh > wait) {
1686 wait_on_buffer(*--wait_bh);
1687 if (!buffer_uptodate(*wait_bh))
1693 * Zero out any newly allocated blocks to avoid exposing stale
1694 * data. If BH_New is set, we know that the block was newly
1695 * allocated in the above loop.
1697 * Details: the buffer can be new and uptodate because:
1698 * 1) hole in uptodate page, get_block(create) allocate the block,
1699 * so the buffer is new and additionally we also mark it uptodate
1700 * 2) The buffer is not mapped and uptodate due to a previous partial read.
1702 * We can always ignore uptodate buffers here, if you mark a buffer
1703 * uptodate you must make sure it contains the right data first.
1705 * We must stop the "undo/clear" fixup pass not at the caller's "to"
1706 * but at the last block that we successfully reached in the main loop.
1709 to = block_start; /* stop at the last successfully handled block */
1712 block_end = block_start+blocksize;
1713 if (block_end <= from)
1715 if (block_start >= to)
1717 if (buffer_new(bh) && !buffer_uptodate(bh)) {
1718 memset(kaddr+block_start, 0, bh->b_size);
1719 flush_dcache_page(page);
1720 set_bit(BH_Uptodate, &bh->b_state);
1721 mark_buffer_dirty(bh);
1724 block_start = block_end;
1725 bh = bh->b_this_page;
1726 } while (bh != head);
1730 static int __block_commit_write(struct inode *inode, struct page *page,
1731 unsigned from, unsigned to)
1733 unsigned block_start, block_end;
1734 int partial = 0, need_balance_dirty = 0;
1736 struct buffer_head *bh, *head;
1738 blocksize = 1 << inode->i_blkbits;
1740 for(bh = head = page->buffers, block_start = 0;
1741 bh != head || !block_start;
1742 block_start=block_end, bh = bh->b_this_page) {
1743 block_end = block_start + blocksize;
1744 if (block_end <= from || block_start >= to) {
1745 if (!buffer_uptodate(bh))
1748 set_bit(BH_Uptodate, &bh->b_state);
1749 if (!atomic_set_buffer_dirty(bh)) {
1751 buffer_insert_inode_data_queue(bh, inode);
1752 need_balance_dirty = 1;
1757 if (need_balance_dirty)
1760 * Is this a partial write that happened to make all buffers
1761 * uptodate? Then we can optimize away a bogus readpage() for
1762 * the next read(). Here we 'discover' whether the page went
1763 * uptodate as a result of this (potentially partial) write.
1766 SetPageUptodate(page);
1771 * Generic "read page" function for block devices that have the normal
1772 * get_block functionality. This is most of the block device filesystems.
1773 * Reads the page asynchronously --- the unlock_buffer() and
1774 * mark_buffer_uptodate() functions propagate buffer state into the
1775 * page struct once IO has completed.
1777 int block_read_full_page(struct page *page, get_block_t *get_block)
1779 struct inode *inode = page->mapping->host;
1780 unsigned long iblock, lblock;
1781 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1782 unsigned int blocksize, blocks;
1785 if (!PageLocked(page))
1787 blocksize = 1 << inode->i_blkbits;
1789 create_empty_buffers(page, inode->i_dev, blocksize);
1790 head = page->buffers;
1792 blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1793 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1800 if (buffer_uptodate(bh))
1803 if (!buffer_mapped(bh)) {
1804 if (iblock < lblock) {
1805 if (get_block(inode, iblock, bh, 0))
1808 if (!buffer_mapped(bh)) {
1809 memset(kmap(page) + i*blocksize, 0, blocksize);
1810 flush_dcache_page(page);
1812 set_bit(BH_Uptodate, &bh->b_state);
1815 /* get_block() might have updated the buffer synchronously */
1816 if (buffer_uptodate(bh))
1822 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1826 * All buffers are uptodate - we can set the page uptodate
1827 * as well. But not if get_block() returned an error.
1829 if (!PageError(page))
1830 SetPageUptodate(page);
1835 /* Stage two: lock the buffers */
1836 for (i = 0; i < nr; i++) {
1837 struct buffer_head * bh = arr[i];
1839 set_buffer_async_io(bh);
1842 /* Stage 3: start the IO */
1843 for (i = 0; i < nr; i++) {
1844 struct buffer_head * bh = arr[i];
1845 if (buffer_uptodate(bh))
1846 end_buffer_io_async(bh, 1);
1848 submit_bh(READ, bh);
1851 wakeup_page_waiters(page);
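/*
 * Sketch (illustrative, not from the original source): the usual way a
 * filesystem wires the generic readpage above into its address_space
 * operations.  "myfs_readpage", "myfs_get_block" and "myfs_aops" are
 * hypothetical names.
 */
#if 0
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}

static struct address_space_operations myfs_aops = {
	readpage:	myfs_readpage,
	commit_write:	generic_commit_write,	/* defined later in this file */
};
#endif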
1856 /* utility function for filesystems that need to do work on expanding
1857 * truncates. Uses prepare/commit_write to allow the filesystem to
1858 * deal with the hole.
1860 int generic_cont_expand(struct inode *inode, loff_t size)
1862 struct address_space *mapping = inode->i_mapping;
1864 unsigned long index, offset, limit;
1868 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1869 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1870 send_sig(SIGXFSZ, current, 0);
1873 if (size > inode->i_sb->s_maxbytes)
1876 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1878 /* ugh. in prepare/commit_write, if from==to==start of block, we
1879 ** skip the prepare. make sure we never send an offset for the start
1882 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1885 index = size >> PAGE_CACHE_SHIFT;
1887 page = grab_cache_page(mapping, index);
1890 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1892 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1895 page_cache_release(page);
1903 * For moronic filesystems that do not allow holes in files.
1904 * We may have to extend the file.
1907 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1909 struct address_space *mapping = page->mapping;
1910 struct inode *inode = mapping->host;
1911 struct page *new_page;
1912 unsigned long pgpos;
1915 unsigned blocksize = 1 << inode->i_blkbits;
1918 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1920 new_page = grab_cache_page(mapping, pgpos);
1923 /* we might sleep */
1924 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1925 UnlockPage(new_page);
1926 page_cache_release(new_page);
1929 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1930 if (zerofrom & (blocksize-1)) {
1931 *bytes |= (blocksize-1);
1934 status = __block_prepare_write(inode, new_page, zerofrom,
1935 PAGE_CACHE_SIZE, get_block);
1938 kaddr = page_address(new_page);
1939 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1940 flush_dcache_page(new_page);
1941 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1943 UnlockPage(new_page);
1944 page_cache_release(new_page);
1947 if (page->index < pgpos) {
1948 /* completely inside the area */
1951 /* page covers the boundary, find the boundary offset */
1952 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1954 /* if we are expanding the thing, the last block will be filled */
1955 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1956 *bytes |= (blocksize-1);
1960 /* starting below the boundary? Nothing to zero out */
1961 if (offset <= zerofrom)
1964 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1967 kaddr = page_address(page);
1968 if (zerofrom < offset) {
1969 memset(kaddr+zerofrom, 0, offset-zerofrom);
1970 flush_dcache_page(page);
1971 __block_commit_write(inode, page, zerofrom, offset);
1975 ClearPageUptodate(page);
1980 ClearPageUptodate(new_page);
1982 UnlockPage(new_page);
1983 page_cache_release(new_page);
1988 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1989 get_block_t *get_block)
1991 struct inode *inode = page->mapping->host;
1992 int err = __block_prepare_write(inode, page, from, to, get_block);
1994 ClearPageUptodate(page);
2000 int block_commit_write(struct page *page, unsigned from, unsigned to)
2002 struct inode *inode = page->mapping->host;
2003 __block_commit_write(inode,page,from,to);
2008 int generic_commit_write(struct file *file, struct page *page,
2009 unsigned from, unsigned to)
2011 struct inode *inode = page->mapping->host;
2012 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2013 __block_commit_write(inode,page,from,to);
2015 if (pos > inode->i_size) {
2016 inode->i_size = pos;
2017 mark_inode_dirty(inode);
2022 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2024 unsigned long index = from >> PAGE_CACHE_SHIFT;
2025 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2026 unsigned blocksize, iblock, length, pos;
2027 struct inode *inode = mapping->host;
2029 struct buffer_head *bh;
2032 blocksize = 1 << inode->i_blkbits;
2033 length = offset & (blocksize - 1);
2035 /* Block boundary? Nothing to do */
2039 length = blocksize - length;
2040 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2042 page = grab_cache_page(mapping, index);
2048 create_empty_buffers(page, inode->i_dev, blocksize);
2050 /* Find the buffer that contains "offset" */
2053 while (offset >= pos) {
2054 bh = bh->b_this_page;
2060 if (!buffer_mapped(bh)) {
2061 /* Hole? Nothing to do */
2062 if (buffer_uptodate(bh))
2064 get_block(inode, iblock, bh, 0);
2065 /* Still unmapped? Nothing to do */
2066 if (!buffer_mapped(bh))
2070 /* Ok, it's mapped. Make sure it's up-to-date */
2071 if (Page_Uptodate(page))
2072 set_bit(BH_Uptodate, &bh->b_state);
2074 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2076 ll_rw_block(READ, 1, &bh);
2078 /* Uhhuh. Read error. Complain and punt. */
2079 if (!buffer_uptodate(bh))
2083 memset(kmap(page) + offset, 0, length);
2084 flush_dcache_page(page);
2087 if (!atomic_set_buffer_dirty(bh)) {
2089 buffer_insert_inode_data_queue(bh, inode);
2097 page_cache_release(page);
2102 int block_write_full_page(struct page *page, get_block_t *get_block)
2104 struct inode *inode = page->mapping->host;
2105 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2110 if (page->index < end_index)
2111 return __block_write_full_page(inode, page, get_block);
2113 /* things got complicated... */
2114 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2115 /* OK, are we completely out? */
2116 if (page->index >= end_index+1 || !offset) {
2121 /* Sigh... will have to work, then... */
2122 err = __block_prepare_write(inode, page, 0, offset, get_block);
2124 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2125 flush_dcache_page(page);
2126 __block_commit_write(inode,page,0,offset);
2132 ClearPageUptodate(page);
2137 * Commence writeout of all the buffers against a page. The
2138 * page must be locked. Returns zero on success or a negative
2141 int writeout_one_page(struct page *page)
2143 struct buffer_head *bh, *head = page->buffers;
2145 if (!PageLocked(page))
2149 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2152 bh->b_flushtime = jiffies;
2153 ll_rw_block(WRITE, 1, &bh);
2154 } while ((bh = bh->b_this_page) != head);
2157 EXPORT_SYMBOL(writeout_one_page);
2160 * Wait for completion of I/O of all buffers against a page. The page
2161 * must be locked. Returns zero on success or a negative errno.
2163 int waitfor_one_page(struct page *page)
2166 struct buffer_head *bh, *head = page->buffers;
2171 if (buffer_req(bh) && !buffer_uptodate(bh))
2173 } while ((bh = bh->b_this_page) != head);
2176 EXPORT_SYMBOL(waitfor_one_page);
2178 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2180 struct buffer_head tmp;
2181 struct inode *inode = mapping->host;
2184 get_block(inode, block, &tmp, 0);
2185 return tmp.b_blocknr;
2188 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2190 int i, nr_blocks, retval;
2191 unsigned long * blocks = iobuf->blocks;
2195 length = iobuf->length;
2196 nr_blocks = length / blocksize;
2197 /* build the blocklist */
2198 for (i = 0; i < nr_blocks; i++, blocknr++) {
2199 struct buffer_head bh;
2202 bh.b_dev = inode->i_dev;
2203 bh.b_size = blocksize;
2206 if (((loff_t) blocknr) * blocksize >= inode->i_size)
2209 /* Only allow get_block to create new blocks if we are safely
2210 beyond EOF. O_DIRECT is unsafe inside sparse files. */
2211 retval = get_block(inode, blocknr, &bh,
2212 ((rw != READ) && beyond_eof));
2216 /* report error to userspace */
2219 /* do short I/O until 'i' */
2224 if (buffer_new(&bh))
2226 if (!buffer_mapped(&bh)) {
2227 /* there was a hole in the filesystem */
2232 if (buffer_new(&bh))
2233 unmap_underlying_metadata(&bh);
2234 if (!buffer_mapped(&bh))
2235 /* upper layers need to pass the error on or
2236 * fall back to buffered IO. */
2239 blocks[i] = bh.b_blocknr;
2242 /* patch length to handle short I/O */
2243 iobuf->length = i * blocksize;
2246 retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2248 down(&inode->i_sem);
2249 /* restore orig length */
2250 iobuf->length = length;
2257 * IO completion routine for a buffer_head being used for kiobuf IO: we
2258 * can't dispatch the kiobuf callback until io_count reaches 0.
2261 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2263 struct kiobuf *kiobuf;
2265 mark_buffer_uptodate(bh, uptodate);
2267 kiobuf = bh->b_private;
2268 end_kio_request(kiobuf, uptodate);
2273 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2274 * for them to complete. Clean up the buffer_heads afterwards.
2277 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2281 struct buffer_head *tmp;
2286 for (i = nr; --i >= 0; ) {
2289 wait_on_buffer(tmp);
2291 if (!buffer_uptodate(tmp)) {
2292 /* We are traversing bh'es in reverse order so
2293 clearing iosize on error calculates the
2294 amount of IO before the first error. */
2306 * Start I/O on a physical range of kernel memory, defined by a vector
2307 * of kiobuf structs (much like a user-space iovec list).
2309 * The kiobuf must already be locked for IO. IO is submitted
2310 * asynchronously: you need to check page->locked and page->uptodate.
2312 * It is up to the caller to make sure that there are enough blocks
2313 * passed in to completely map the iobufs to disk.
2316 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2317 kdev_t dev, unsigned long b[], int size)
2327 unsigned long blocknr;
2328 struct kiobuf * iobuf = NULL;
2330 struct buffer_head *tmp, **bhs = NULL;
2336 * First, do some alignment and validity checks
2338 for (i = 0; i < nr; i++) {
2340 if ((iobuf->offset & (size-1)) ||
2341 (iobuf->length & (size-1)))
2343 if (!iobuf->nr_pages)
2344 panic("brw_kiovec: iobuf not initialised");
2348 * OK to walk down the iovec doing page IO on each page we find.
2350 bufind = bhind = transferred = err = 0;
2351 for (i = 0; i < nr; i++) {
2353 offset = iobuf->offset;
2354 length = iobuf->length;
2359 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2360 map = iobuf->maplist[pageind];
2366 while (length > 0) {
2367 blocknr = b[bufind++];
2368 if (blocknr == -1UL) {
2370 /* there was a hole in the filesystem */
2371 memset(kmap(map) + offset, 0, size);
2372 flush_dcache_page(map);
2375 transferred += size;
2383 set_bh_page(tmp, map, offset);
2384 tmp->b_this_page = tmp;
2386 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2388 tmp->b_blocknr = blocknr;
2389 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2392 set_bit(BH_Uptodate, &tmp->b_state);
2393 clear_bit(BH_Dirty, &tmp->b_state);
2395 set_bit(BH_Uptodate, &tmp->b_state);
2397 atomic_inc(&iobuf->io_count);
2400 * Wait for IO if we have got too much
2402 if (bhind >= KIO_MAX_SECTORS) {
2403 kiobuf_wait_for_io(iobuf); /* wake-one */
2404 err = wait_kio(rw, bhind, bhs, size);
2416 if (offset >= PAGE_SIZE) {
2420 } /* End of block loop */
2421 } /* End of page loop */
2422 } /* End of iovec loop */
2424 /* Is there any IO still left to submit? */
2426 kiobuf_wait_for_io(iobuf); /* wake-one */
2427 err = wait_kio(rw, bhind, bhs, size);
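/*
 * Sketch of the brw_kiovec() calling convention documented above
 * (illustrative, not from the original source; the caller is assumed to
 * have locked and mapped "iobuf" and filled "blocks[]", e.g. via
 * generic_direct_IO()/get_block()):
 */
#if 0
	err = brw_kiovec(READ, 1, &iobuf, dev, blocks, blocksize);
	if (err < 0)
		/* I/O error */ ;
	else
		/* err is the number of bytes transferred */ ;
#endif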
2441 * Start I/O on a page.
2442 * This function expects the page to be locked and may return
2443 * before I/O is complete. You then have to check page->locked
2444 * and page->uptodate.
2446 * brw_page() is SMP-safe, although it's being called with the
2447 * kernel lock held - but the code is ready.
2449 * FIXME: we need a swapper_inode->get_block function to remove
2450 * some of the bmap kludges and interface ugliness here.
2452 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2454 struct buffer_head *head, *bh;
2456 if (!PageLocked(page))
2457 panic("brw_page: page not locked for I/O");
2460 create_empty_buffers(page, dev, size);
2461 head = bh = page->buffers;
2463 /* Stage 1: lock all the buffers */
2466 bh->b_blocknr = *(b++);
2467 set_bit(BH_Mapped, &bh->b_state);
2468 set_buffer_async_io(bh);
2469 bh = bh->b_this_page;
2470 } while (bh != head);
2472 /* Stage 2: start the IO */
2474 struct buffer_head *next = bh->b_this_page;
2477 } while (bh != head);
2478 wakeup_page_waiters(page);
2482 int block_symlink(struct inode *inode, const char *symname, int len)
2484 struct address_space *mapping = inode->i_mapping;
2485 struct page *page = grab_cache_page(mapping, 0);
2491 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2494 kaddr = page_address(page);
2495 memcpy(kaddr, symname, len-1);
2496 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2498 * Notice that we are _not_ going to block here - end of page is
2499 * unmapped, so this will only try to map the rest of page, see
2500 * that it is unmapped (typically even will not look into inode -
2501 * ->i_size will be enough for everything) and zero it out.
2502 * OTOH it's obviously correct and should make the page up-to-date.
2504 err = mapping->a_ops->readpage(NULL, page);
2506 page_cache_release(page);
2509 mark_inode_dirty(inode);
2513 page_cache_release(page);
2518 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2520 struct buffer_head *bh, *tail;
2525 bh = bh->b_this_page;
2527 tail->b_this_page = head;
2528 page->buffers = head;
2529 page_cache_get(page);
2533 * Create the page-cache page that contains the requested block
2535 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2538 struct buffer_head *bh;
2540 page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2544 if (!PageLocked(page))
2549 if (bh->b_size == size)
2551 if (!try_to_free_buffers(page, GFP_NOFS))
2555 bh = create_buffers(page, size, 0);
2558 link_dev_buffers(page, bh);
2563 page_cache_release(page);
2567 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2569 struct buffer_head *head = page->buffers;
2570 struct buffer_head *bh = head;
2571 unsigned int uptodate;
2573 uptodate = 1 << BH_Mapped;
2574 if (Page_Uptodate(page))
2575 uptodate |= 1 << BH_Uptodate;
2577 write_lock(&hash_table_lock);
2579 if (!(bh->b_state & (1 << BH_Mapped))) {
2580 init_buffer(bh, NULL, NULL);
2582 bh->b_blocknr = block;
2583 bh->b_state = uptodate;
2586 /* Insert the buffer into the hash lists if necessary */
2588 __insert_into_hash_list(bh);
2591 bh = bh->b_this_page;
2592 } while (bh != head);
2593 write_unlock(&hash_table_lock);
2597 * Try to increase the number of buffers available: the size argument
2598 * is used to determine what kind of buffers we want.
2600 static int grow_buffers(kdev_t dev, unsigned long block, int size)
2603 struct block_device *bdev;
2604 unsigned long index;
2607 /* Size must be a multiple of the hard sector size */
2608 if (size & (get_hardsect_size(dev)-1))
2610 /* Size must be between 512 bytes and PAGE_SIZE */
2611 if (size < 512 || size > PAGE_SIZE)
2617 } while ((size << sizebits) < PAGE_SIZE);
2619 index = block >> sizebits;
2620 block = index << sizebits;
2622 bdev = bdget(kdev_t_to_nr(dev));
2624 printk("No block device for %s\n", kdevname(dev));
2628 /* Create a page with the proper size buffers.. */
2629 page = grow_dev_page(bdev, index, size);
2631 /* This is "wrong" - talk to Al Viro */
2632 atomic_dec(&bdev->bd_count);
2636 /* Hash in the buffers on the hash list */
2637 hash_page_buffers(page, dev, block, size);
2639 page_cache_release(page);
2641 /* We hashed up this page, so increment buffermem */
2642 atomic_inc(&buffermem_pages);
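/*
 * Worked example, shown only for illustration (assumes PAGE_SIZE == 4096):
 *
 *	size  = 1024  =>  sizebits = 2          (four blocks per page)
 *	block = 4097  =>  index = 4097 >> 2 = 1024
 *	                  block = 1024 << 2 = 4096
 *
 * so grow_dev_page() creates the page at index 1024 covering blocks
 * 4096..4099, and the caller's block 4097 ends up as the second buffer
 * on that page.
 */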
2647 * The first time the VM inspects a page which has locked buffers, it
2648 * will just mark it as needing to be waited on during the next scan of the page LRU.
2649 * BH_Wait_IO is used for this.
2651 * The second time the VM visits the page, if it still has locked
2652 * buffers, it is time to start writing them out. (BH_Wait_IO was set).
2654 * The third time the VM visits the page, if the I/O hasn't completed
2655 * then it's time to wait upon writeout. BH_Lock and BH_Launder are used for this.
2658 * There is also the case of buffers which were locked by someone else
2659 * - write(2) callers, bdflush, etc. There can be a huge number of these
2660 * and we don't want to just skip them all and fail the page allocation.
2661 * We want to be able to wait on these buffers as well.
2663 * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2664 * underway against the buffer; it doesn't matter who started it - we know
2665 * that the buffer will eventually come unlocked, and so it's safe to wait on it.
2668 * The caller holds the page lock and the caller will free this page
2669 * into current->local_page, so by waiting on the page's buffers the
2670 * caller is guaranteed to obtain this page.
2672 * sync_page_buffers() will sort-of return true if all the buffers
2673 * against this page are freeable, so try_to_free_buffers() should
2674 * try to free the page's buffers a second time. This is a bit
2675 * broken for blocksize < PAGE_CACHE_SIZE, but not in an important way.
2677 static int sync_page_buffers(struct buffer_head *head)
2679 struct buffer_head * bh = head;
2683 if (!buffer_dirty(bh) && !buffer_locked(bh))
2686 /* Don't start IO first time around.. */
2687 if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2692 /* Second time through we start actively writing out.. */
2693 if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2694 if (unlikely(!buffer_launder(bh))) {
2703 if (!atomic_set_buffer_clean(bh)) {
2708 __mark_buffer_clean(bh);
2710 bh->b_end_io = end_buffer_io_sync;
2711 submit_bh(WRITE, bh);
2713 } while ((bh = bh->b_this_page) != head);
2719 * Can the buffer be thrown out?
2721 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
2722 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2725 * try_to_free_buffers() checks if all the buffers on this particular page
2726 * are unused, and frees the page if so.
2728 * Wake up bdflush() if this fails - if we're running low on memory due
2729 * to dirty buffers, we need to flush them out as quickly as possible.
2731 * NOTE: There are quite a number of ways that threads of control can
2732 * obtain a reference to a buffer head within a page. So we must
2733 * lock out all of these paths to cleanly toss the page.
2735 int fastcall try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2737 struct buffer_head * tmp, * bh = page->buffers;
2739 cleaned_buffers_try_again:
2740 spin_lock(&lru_list_lock);
2741 write_lock(&hash_table_lock);
2744 if (buffer_busy(tmp))
2745 goto busy_buffer_page;
2746 tmp = tmp->b_this_page;
2747 } while (tmp != bh);
2749 spin_lock(&unused_list_lock);
2752 /* if this buffer was hashed, this page counts as buffermem */
2754 atomic_dec(&buffermem_pages);
2756 struct buffer_head * p = tmp;
2757 tmp = tmp->b_this_page;
2759 if (p->b_dev == B_FREE) BUG();
2761 remove_inode_queue(p);
2762 __remove_from_queues(p);
2763 __put_unused_buffer_head(p);
2764 } while (tmp != bh);
2765 spin_unlock(&unused_list_lock);
2767 /* Wake up anyone waiting for buffer heads */
2768 wake_up(&buffer_wait);
2770 /* And free the page */
2771 page->buffers = NULL;
2772 page_cache_release(page);
2773 write_unlock(&hash_table_lock);
2774 spin_unlock(&lru_list_lock);
2778 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2779 write_unlock(&hash_table_lock);
2780 spin_unlock(&lru_list_lock);
2781 gfp_mask = pf_gfp_mask(gfp_mask);
2782 if (gfp_mask & __GFP_IO) {
2783 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2784 if (sync_page_buffers(bh)) {
2785 /* no IO or waiting next time */
2787 goto cleaned_buffers_try_again;
2791 if (balance_dirty_state() >= 0)
2795 EXPORT_SYMBOL(try_to_free_buffers);
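/*
 * Illustrative sketch (hypothetical helper, not part of buffer.c): how a
 * reclaim-style caller uses try_to_free_buffers().  In 2.4 the VM calls it
 * with the page locked; the gfp_mask decides whether the function may start
 * or wait for I/O on dirty/locked buffers.
 */
static int example_strip_buffers(struct page *page, unsigned int gfp_mask)
{
	if (!PageLocked(page))
		BUG();
	if (!page->buffers)
		return 1;		/* nothing to strip */
	/*
	 * Returns 1 (and clears page->buffers) when every buffer head was
	 * idle; returns 0, possibly after kicking writeback, otherwise.
	 */
	return try_to_free_buffers(page, gfp_mask);
}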
2797 /* ================== Debugging =================== */
2799 void show_buffers(void)
2802 struct buffer_head * bh;
2803 int delalloc = 0, found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2805 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2808 printk("Buffer memory: %6dkB\n",
2809 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2811 printk("Cache memory: %6ldkB\n",
2812 (page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2814 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2815 if (!spin_trylock(&lru_list_lock))
2817 for(nlist = 0; nlist < NR_LIST; nlist++) {
2818 delalloc = found = locked = dirty = used = lastused = 0;
2819 bh = lru_list[nlist];
2824 if (buffer_locked(bh))
2826 if (buffer_dirty(bh))
2828 if (buffer_delay(bh))
2830 if (atomic_read(&bh->b_count))
2831 used++, lastused = found;
2832 bh = bh->b_next_free;
2833 } while (bh != lru_list[nlist]);
2835 int tmp = nr_buffers_type[nlist];
2837 printk("%9s: BUG -> found %d, reported %d\n",
2838 buf_types[nlist], found, tmp);
2840 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2841 "%d locked, %d dirty, %d delay\n",
2842 buf_types[nlist], found, size_buffers_type[nlist]>>(10-9),
2843 used, lastused, locked, dirty, delalloc);
2845 spin_unlock(&lru_list_lock);
2849 /* ===================== Init ======================= */
2852 * allocate the hash table and init the free list
2853 * Use gfp() for the hash table to decrease TLB misses, use
2854 * SLAB cache for buffer heads.
2856 void __init buffer_init(unsigned long mempages)
2859 unsigned int nr_hash;
2861 /* The buffer cache hash table is less important these days, trim it a bit. */
2866 mempages *= sizeof(struct buffer_head *);
2868 for (order = 0; (1 << order) < mempages; order++)
2871 /* try to allocate something until we get it or we're asking
2872 for something that is really too small */
2877 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2878 bh_hash_mask = (nr_hash - 1);
2882 while((tmp >>= 1UL) != 0UL)
2885 hash_table = (struct buffer_head **)
2886 __get_free_pages(GFP_ATOMIC, order);
2887 } while (hash_table == NULL && --order > 0);
2888 printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2889 nr_hash, order, (PAGE_SIZE << order));
2892 panic("Failed to allocate buffer hash table\n");
2894 /* Setup hash chains. */
2895 for(i = 0; i < nr_hash; i++)
2896 hash_table[i] = NULL;
2898 /* Setup lru lists. */
2899 for(i = 0; i < NR_LIST; i++)
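/*
 * Worked example, shown only for illustration (assumes a 32-bit machine with
 * PAGE_SIZE == 4096 and the allocation above succeeding at order 3):
 *
 *	nr_hash       = (4096 << 3) / sizeof(struct buffer_head *) = 8192
 *	bh_hash_mask  = 8191
 *	bh_hash_shift = 13	(8192 == 1 << 13)
 *
 * i.e. a block number is folded into one of 8192 hash chains using that
 * shift and mask.
 */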
2905 /* ====================== bdflush support =================== */
2907 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2908 * response to dirty buffers. Once this process is activated, we write back
2909 * a limited number of buffers to the disks and then go back to sleep again.
2912 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2914 void wakeup_bdflush(void)
2916 wake_up_interruptible(&bdflush_wait);
2919 void wakeup_kupdate(void)
2921 if (waitqueue_active(&kupdate_wait))
2922 wake_up(&kupdate_wait);
2926 * Here we attempt to write back old buffers. We also try to flush inodes
2927 * and supers, since this function is essentially "update", and
2928 * otherwise there would be no way of ensuring that these quantities ever
2929 * get written back. Ideally, we would have a timestamp on the inodes
2930 * and superblocks so that we could write back only the old ones.
2933 static int sync_old_buffers(void)
2936 sync_unlocked_inodes();
2941 struct buffer_head *bh;
2943 spin_lock(&lru_list_lock);
2944 bh = lru_list[BUF_DIRTY];
2947 if (time_before(jiffies, bh->b_flushtime) && !laptop_mode)
2949 if (write_some_buffers(NODEV))
2953 spin_unlock(&lru_list_lock);
2957 int block_sync_page(struct page *page)
2959 run_task_queue(&tq_disk);
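/*
 * Added note: block_sync_page() is meant to be plugged straight into a
 * filesystem's address_space_operations as the sync_page method, the way
 * ext2 wires it up in 2.4, so that anyone waiting on a locked page kicks the
 * disk task queue.  A minimal sketch; the fs-specific readpage/writepage/etc.
 * entries are deliberately omitted here:
 */
static struct address_space_operations example_aops_sync_only = {
	sync_page:	block_sync_page,
};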
2963 /* This is the interface to bdflush. As we get more sophisticated, we can
2964 * pass tuning parameters to this "process", to adjust how it behaves.
2965 * We would want to verify each parameter, however, to make sure that it is reasonable.
2968 asmlinkage long sys_bdflush(int func, long data)
2970 if (!capable(CAP_SYS_ADMIN))
2974 /* do_exit directly and let kupdate do its work alone. */
2976 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2977 a syscall that doesn't care about the current mm context. */
2979 struct mm_struct *user_mm;
2982 * bdflush will spend all of its time in kernel-space,
2983 * without touching user-space, so we can switch it into
2984 * 'lazy TLB mode' to reduce the cost of context-switches
2985 * to and from bdflush.
2987 user_mm = start_lazy_tlb();
2988 error = sync_old_buffers();
2989 end_lazy_tlb(user_mm);
2994 /* Basically func 2*i+2 reads param i, func 2*i+3 writes it (i >= 0) */
2996 int i = (func-2) >> 1;
2997 if (i >= 0 && i < N_PARAM) {
2998 if ((func & 1) == 0)
2999 return put_user(bdf_prm.data[i], (int*)data);
3001 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
3002 bdf_prm.data[i] = data;
3009 /* Func 0 used to launch the actual bdflush and then never
3010 * return (unless explicitly killed). We return zero here to
3011 * remain semi-compatible with present update(8) programs.
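/*
 * Illustrative user-space sketch (hypothetical program, not part of this
 * file): driving sys_bdflush() with the encoding above, assuming a libc
 * that exposes SYS_bdflush in <sys/syscall.h>.  Both reads and writes
 * require CAP_SYS_ADMIN.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int value;

	/* func = 2*i + 2 reads parameter i into the int pointed to by data */
	if (syscall(SYS_bdflush, 2, &value) == 0)
		printf("bdflush param 0 = %d\n", value);

	/* func = 2*i + 3 writes parameter i; the value must lie between
	 * bdflush_min[i] and bdflush_max[i] or -EINVAL comes back */
	if (syscall(SYS_bdflush, 3, 40L) != 0)
		perror("bdflush write");
	return 0;
}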
3017 * This is the actual bdflush daemon itself. It used to be started from
3018 * the syscall above, but now we launch it ourselves internally with
3019 * kernel_thread(...) directly after the first thread in init/main.c
3021 int bdflush(void *startup)
3023 struct task_struct *tsk = current;
3026 * We have a bare-bones task_struct, and really should fill
3027 * in a few more things so "top" and /proc/2/{exe,root,cwd}
3028 * display semi-sane things. Not really crucial though...
3033 strcpy(tsk->comm, "bdflush");
3035 /* avoid getting signals */
3036 spin_lock_irq(&tsk->sigmask_lock);
3038 sigfillset(&tsk->blocked);
3039 recalc_sigpending(tsk);
3040 spin_unlock_irq(&tsk->sigmask_lock);
3042 complete((struct completion *)startup);
3045 * FIXME: The ndirty logic here is wrong. It's supposed to
3046 * send bdflush back to sleep after writing ndirty buffers.
3047 * In fact the test is wrong, so bdflush will actually
3048 * sleep when bdflush_stop() returns true.
3050 * FIXME: If it proves useful to implement ndirty properly,
3051 * then perhaps the value of ndirty should be scaled by the
3052 * amount of memory in the machine.
3055 int ndirty = bdf_prm.b_un.ndirty;
3057 CHECK_EMERGENCY_SYNC
3059 while (ndirty > 0) {
3060 spin_lock(&lru_list_lock);
3061 if (!write_some_buffers(NODEV))
3065 if (ndirty > 0 || bdflush_stop())
3066 interruptible_sleep_on(&bdflush_wait);
3071 * This is the kernel update daemon. It used to live in userspace,
3072 * but since it needs to run safely we want it to be unkillable by mistake.
3073 * You don't need to change your userspace configuration, since
3074 * the userspace `update` will do_exit(0) at the first sys_bdflush().
3076 int kupdate(void *startup)
3078 struct task_struct * tsk = current;
3083 strcpy(tsk->comm, "kupdated");
3085 /* sigstop and sigcont will stop and wakeup kupdate */
3086 spin_lock_irq(&tsk->sigmask_lock);
3087 sigfillset(&tsk->blocked);
3088 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
3089 recalc_sigpending(tsk);
3090 spin_unlock_irq(&tsk->sigmask_lock);
3092 complete((struct completion *)startup);
3095 DECLARE_WAITQUEUE(wait, tsk);
3097 add_wait_queue(&kupdate_wait, &wait);
3099 /* update interval */
3100 interval = bdf_prm.b_un.interval;
3102 tsk->state = TASK_INTERRUPTIBLE;
3103 schedule_timeout(interval);
3105 tsk->state = TASK_STOPPED;
3106 schedule(); /* wait for SIGCONT */
3108 remove_wait_queue(&kupdate_wait, &wait);
3109 /* check for sigstop */
3110 if (signal_pending(tsk)) {
3111 int sig, stopped = 0;
3112 struct siginfo info;
3114 spin_lock_irq(&tsk->sigmask_lock);
3115 sig = dequeue_signal(&current->blocked, &info);
3118 spin_unlock_irq(&tsk->sigmask_lock);
3120 tsk->state = TASK_STOPPED;
3121 schedule(); /* wait for SIGCONT */
3125 printk(KERN_DEBUG "kupdate() activated...\n");
3130 run_task_queue(&tq_disk);
3134 static int __init bdflush_init(void)
3136 static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3138 kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3139 wait_for_completion(&startup);
3140 kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3141 wait_for_completion(&startup);
3145 module_init(bdflush_init)