1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6
7 /*
8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
9  * been avoided by NEVER letting an interrupt change a buffer (except for the
10  * data, of course), but instead letting the caller do it.
11  */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17  */
18
19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
20  * hash table, use SLAB cache for buffer heads. -DaveM
21  */
22
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24  * - RMK
25  */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/module.h>
49 #include <linux/completion.h>
50
51 #include <asm/uaccess.h>
52 #include <asm/io.h>
53 #include <asm/bitops.h>
54 #include <asm/mmu_context.h>
55
56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
58                                              number of unused buffer heads */
59
60 /* Anti-deadlock ordering:
61  *      lru_list_lock > hash_table_lock > unused_list_lock
62  */
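/*
 * Illustrative nesting under the ordering above (a sketch, not a
 * verbatim excerpt): take lru_list_lock first, then hash_table_lock,
 * and drop them in reverse order:
 *
 *      spin_lock(&lru_list_lock);
 *      write_lock(&hash_table_lock);
 *      ... hash/lru manipulation ...
 *      write_unlock(&hash_table_lock);
 *      spin_unlock(&lru_list_lock);
 *
 * remove_from_queues() further down follows exactly this pattern.
 */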
63
64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
65
66 /*
67  * Hash table gook..
68  */
69 static unsigned int bh_hash_mask;
70 static unsigned int bh_hash_shift;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
73
74 static struct buffer_head *lru_list[NR_LIST];
75
76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77 #define lru_list_lock  lru_list_lock_cacheline.lock
78
79 static int nr_buffers_type[NR_LIST];
80 static unsigned long size_buffers_type[NR_LIST];
81
82 static struct buffer_head * unused_list;
83 static int nr_unused_buffer_heads;
84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
86
87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
88 static int osync_buffers_list(struct list_head *);
89 static void __refile_buffer(struct buffer_head *);
90
91 /*
92  * A global sysctl-controlled flag which puts the machine into "laptop mode"
93  */
94 int laptop_mode;
95
96 static DECLARE_WAIT_QUEUE_HEAD(kupdate_wait);
97
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
100
101 /* Here is the parameter block for the bdflush process. If you add or
102  * remove any of the parameters, make sure to update kernel/sysctl.c
103  * and the documentation at linux/Documentation/sysctl/vm.txt.
104  */
105
106 #define N_PARAM 9
107
108 /* The dummy values in this structure are left in there for compatibility
109  * with old programs that play with the /proc entries.
110  */
111 union bdflush_param {
112         struct {
113                 int nfract;     /* Percentage of buffer cache dirty to 
114                                    activate bdflush */
115                 int ndirty;     /* Maximum number of dirty blocks to write out per
116                                    wake-cycle */
117                 int dummy2;     /* old "nrefill" */
118                 int dummy3;     /* unused */
119                 int interval;   /* jiffies delay between kupdate flushes */
120                 int age_buffer; /* Time for normal buffer to age before we flush it */
121                 int nfract_sync;/* Percentage of buffer cache dirty to 
122                                    activate bdflush synchronously */
123                 int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
124                 int dummy5;     /* unused */
125         } b_un;
126         unsigned int data[N_PARAM];
127 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
128
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
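/*
 * A worked example with the defaults above (numbers are illustrative):
 * nfract = 30 and nfract_sync = 60 mean bdflush is woken asynchronously
 * once roughly 30% of buffer-cache pages are dirty and writers are
 * throttled synchronously at roughly 60%; interval = 5*HZ makes kupdate
 * run every 5 seconds.  The nine values are exported in this order
 * through the bdflush sysctl (see kernel/sysctl.c, as noted above).
 */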
132
133 static inline int write_buffer_delay(struct buffer_head *bh)
134 {
135         struct page *page = bh->b_page;
136
137         if (!TryLockPage(page)) {
138                 spin_unlock(&lru_list_lock);
139                 unlock_buffer(bh);
140                 page->mapping->a_ops->writepage(page);
141                 return 1;
142         }
143
144         return 0;
145 }
146
147 static inline void write_buffer(struct buffer_head *bh)
148 {
149         if (buffer_delay(bh)) {
150                 struct page *page = bh->b_page;
151
152                 lock_page(page);
153                 if (buffer_delay(bh)) {
154                         page->mapping->a_ops->writepage(page);
155                         return;
156                 }
157                 unlock_page(page);
158         }
159
160         ll_rw_block(WRITE, 1, &bh);
161 }
162
163 void fastcall unlock_buffer(struct buffer_head *bh)
164 {
165         clear_bit(BH_Wait_IO, &bh->b_state);
166         clear_bit(BH_Launder, &bh->b_state);
167         /*
168          * When a locked buffer is visible to the I/O layer BH_Launder
169          * is set. This means before unlocking we must clear BH_Launder,
170          * mb() on alpha and then clear BH_Lock, so no reader can see
171          * BH_Launder set on an unlocked buffer and then risk a deadlock.
172          */
173         smp_mb__after_clear_bit();
174         clear_bit(BH_Lock, &bh->b_state);
175         smp_mb__after_clear_bit();
176         if (waitqueue_active(&bh->b_wait))
177                 wake_up(&bh->b_wait);
178 }
179
180 /*
181  * Note that the real wait_on_buffer() is an inline function that checks
182  * that the buffer is locked before calling this, so that unnecessary disk
183  * unplugging does not occur.
184  */
185 void __wait_on_buffer(struct buffer_head * bh)
186 {
187         struct task_struct *tsk = current;
188         DECLARE_WAITQUEUE(wait, tsk);
189
190         get_bh(bh);
191         add_wait_queue(&bh->b_wait, &wait);
192         do {
193                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
194                 if (!buffer_locked(bh))
195                         break;
196                 /*
197                  * We must read tq_disk in TQ_ACTIVE after the
198                  * add_wait_queue effect is visible to other cpus.
199                  * We could unplug some lines above and it wouldn't
200                  * matter, but we can't do that right after
201                  * add_wait_queue without an smp_mb() in between,
202                  * because spin_unlock has inclusive semantics.
203                  * Doing it here is the most efficient place, so we
204                  * don't do a spurious unplug if we get a racy
205                  * wakeup that makes buffer_locked() return 0, and
206                  * doing it here avoids an explicit smp_mb(); we
207                  * rely on the implicit one in set_task_state.
208                  */
209                 run_task_queue(&tq_disk);
210                 schedule();
211         } while (buffer_locked(bh));
212         tsk->state = TASK_RUNNING;
213         remove_wait_queue(&bh->b_wait, &wait);
214         put_bh(bh);
215 }
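/*
 * The wait_on_buffer() wrapper mentioned above is roughly (a sketch
 * from memory of the 2.4 <linux/locks.h> definition; see that header
 * for the authoritative version):
 *
 *      static inline void wait_on_buffer(struct buffer_head *bh)
 *      {
 *              if (test_bit(BH_Lock, &bh->b_state))
 *                      __wait_on_buffer(bh);
 *      }
 */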
216
217 /*
218  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
219  * unlock the buffer. This is what ll_rw_block uses too.
220  */
221 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
222 {
223         mark_buffer_uptodate(bh, uptodate);
224         unlock_buffer(bh);
225         put_bh(bh);
226 }
227
228 /*
229  * The buffers have been marked clean and locked.  Just submit the dang
230  * things.. 
231  */
232 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
233 {
234         do {
235                 struct buffer_head * bh = *array++;
236                 bh->b_end_io = end_buffer_io_sync;
237                 submit_bh(WRITE, bh);
238         } while (--count);
239 }
240
241 /*
242  * Write some buffers from the head of the dirty queue.
243  *
244  * This must be called with the LRU lock held, and will
245  * return without it!
246  */
247 #define NRSYNC (32)
248 static int write_some_buffers(kdev_t dev)
249 {
250         struct buffer_head *next;
251         struct buffer_head *array[NRSYNC];
252         unsigned int count;
253         int nr;
254
255         next = lru_list[BUF_DIRTY];
256         nr = nr_buffers_type[BUF_DIRTY];
257         count = 0;
258         while (next && --nr >= 0) {
259                 struct buffer_head * bh = next;
260                 next = bh->b_next_free;
261
262                 if (dev != NODEV && bh->b_dev != dev)
263                         continue;
264                 if (test_and_set_bit(BH_Lock, &bh->b_state))
265                         continue;
266                 if (buffer_delay(bh)) {
267                         if (write_buffer_delay(bh)) {
268                                 if (count)
269                                         write_locked_buffers(array, count);
270                                 return -EAGAIN;
271                         }
272                 } else if (atomic_set_buffer_clean(bh)) {
273                         __refile_buffer(bh);
274                         get_bh(bh);
275                         array[count++] = bh;
276                         if (count < NRSYNC)
277                                 continue;
278
279                         spin_unlock(&lru_list_lock);
280                         write_locked_buffers(array, count);
281                         return -EAGAIN;
282                 }
283                 unlock_buffer(bh);
284                 __refile_buffer(bh);
285         }
286         spin_unlock(&lru_list_lock);
287
288         if (count)
289                 write_locked_buffers(array, count);
290         return 0;
291 }
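/*
 * In short: the loop above batches up to NRSYNC (32) dirty buffers,
 * locking and cleaning them under lru_list_lock, then drops the lock
 * and submits the batch via write_locked_buffers().  A return value of
 * -EAGAIN means the lock was dropped and more work may remain; 0 means
 * the dirty list was swept completely.
 */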
292
293 /*
294  * Write out all buffers on the dirty list.
295  */
296 static void write_unlocked_buffers(kdev_t dev)
297 {
298         do
299                 spin_lock(&lru_list_lock);
300         while (write_some_buffers(dev));
301 }
302
303 /*
304  * Wait for a buffer on the proper list.
305  *
306  * This must be called with the LRU lock held, and
307  * will return with it released.
308  */
309 static int wait_for_buffers(kdev_t dev, int index, int refile)
310 {
311         struct buffer_head * next;
312         int nr;
313
314         next = lru_list[index];
315         nr = nr_buffers_type[index];
316         while (next && --nr >= 0) {
317                 struct buffer_head *bh = next;
318                 next = bh->b_next_free;
319
320                 if (!buffer_locked(bh)) {
321                         if (refile)
322                                 __refile_buffer(bh);
323                         continue;
324                 }
325                 if (dev != NODEV && bh->b_dev != dev)
326                         continue;
327
328                 get_bh(bh);
329                 spin_unlock(&lru_list_lock);
330                 wait_on_buffer (bh);
331                 put_bh(bh);
332                 return -EAGAIN;
333         }
334         spin_unlock(&lru_list_lock);
335         return 0;
336 }
337
338 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
339 {
340         do {
341                 spin_lock(&lru_list_lock);
342         } while (wait_for_buffers(dev, index, refile));
343         return 0;
344 }
345
346 /* Call sync_buffers with wait!=0 to ensure that the call does not
347  * return until all buffer writes have completed.  Sync() may return
348  * before the writes have finished; fsync() may not.
349  */
350
351 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
352  * spontaneously dirty themselves without ever brelse being called.
353  * We will ultimately want to put these in a separate list, but for
354  * now we search all of the lists for dirty buffers.
355  */
356 int sync_buffers(kdev_t dev, int wait)
357 {
358         int err = 0;
359
360         /* One pass for no-wait, three for wait:
361          * 0) write out all dirty, unlocked buffers;
362          * 1) wait for all dirty locked buffers;
363          * 2) write out all dirty, unlocked buffers;
364          * 3) wait for completion by waiting for all buffers to unlock.
365          */
366         write_unlocked_buffers(dev);
367         if (wait) {
368                 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
369                 write_unlocked_buffers(dev);
370                 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
371         }
372         return err;
373 }
374 EXPORT_SYMBOL(sync_buffers);
375
376 int fsync_super(struct super_block *sb)
377 {
378         kdev_t dev = sb->s_dev;
379         sync_buffers(dev, 0);
380
381         lock_kernel();
382         sync_inodes_sb(sb);
383         DQUOT_SYNC_SB(sb);
384         lock_super(sb);
385         if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
386                 sb->s_op->write_super(sb);
387         unlock_super(sb);
388         if (sb->s_op && sb->s_op->sync_fs)
389                 sb->s_op->sync_fs(sb);
390         unlock_kernel();
391
392         return sync_buffers(dev, 1);
393 }
394
395 int fsync_no_super(kdev_t dev)
396 {
397         sync_buffers(dev, 0);
398         return sync_buffers(dev, 1);
399 }
400
401 int fsync_dev(kdev_t dev)
402 {
403         sync_buffers(dev, 0);
404
405         lock_kernel();
406         sync_inodes(dev);
407         DQUOT_SYNC_DEV(dev);
408         sync_supers(dev, 1);
409         unlock_kernel();
410
411         return sync_buffers(dev, 1);
412 }
413
414 /*
415  * There's no real reason to pretend we should
416  * ever do anything differently
417  */
418 void sync_dev(kdev_t dev)
419 {
420         fsync_dev(dev);
421 }
422
423 asmlinkage long sys_sync(void)
424 {
425         fsync_dev(0);
426         return 0;
427 }
428
429 /*
430  *      filp may be NULL if called via the msync of a vma.
431  */
432  
433 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
434 {
435         struct inode * inode = dentry->d_inode;
436         struct super_block * sb;
437         kdev_t dev;
438         int ret;
439
440         lock_kernel();
441         /* sync the inode to buffers */
442         write_inode_now(inode, 0);
443
444         /* sync the superblock to buffers */
445         sb = inode->i_sb;
446         lock_super(sb);
447         if (sb->s_op && sb->s_op->write_super)
448                 sb->s_op->write_super(sb);
449         unlock_super(sb);
450
451         /* .. finally sync the buffers to disk */
452         dev = inode->i_dev;
453         ret = sync_buffers(dev, 1);
454         unlock_kernel();
455         return ret;
456 }
457
458 asmlinkage long sys_fsync(unsigned int fd)
459 {
460         struct file * file;
461         struct dentry * dentry;
462         struct inode * inode;
463         int ret, err;
464
465         ret = -EBADF;
466         file = fget(fd);
467         if (!file)
468                 goto out;
469
470         dentry = file->f_dentry;
471         inode = dentry->d_inode;
472
473         ret = -EINVAL;
474         if (!file->f_op || !file->f_op->fsync) {
475                 /* Why?  We can still call filemap_fdatasync */
476                 goto out_putf;
477         }
478
479         /* We need to protect against concurrent writers.. */
480         down(&inode->i_sem);
481         ret = filemap_fdatasync(inode->i_mapping);
482         err = file->f_op->fsync(file, dentry, 0);
483         if (err && !ret)
484                 ret = err;
485         err = filemap_fdatawait(inode->i_mapping);
486         if (err && !ret)
487                 ret = err;
488         up(&inode->i_sem);
489
490 out_putf:
491         fput(file);
492 out:
493         return ret;
494 }
495
496 int do_fdatasync(struct file *file)
497 {
498         int ret, err;
499         struct dentry *dentry;
500         struct inode *inode;
501
502         if (unlikely(!file->f_op || !file->f_op->fsync))
503                 return -EINVAL;
504         
505         dentry = file->f_dentry;
506         inode = dentry->d_inode;
507
508         ret = filemap_fdatasync(inode->i_mapping);
509         err = file->f_op->fsync(file, dentry, 1);
510         if (err && !ret)
511                 ret = err;
512         err = filemap_fdatawait(inode->i_mapping);
513         if (err && !ret)
514                 ret = err;
515         return ret;
516 }
517
518 asmlinkage long sys_fdatasync(unsigned int fd)
519 {
520         struct file * file;
521         struct inode *inode;
522         int ret;
523
524         ret = -EBADF;
525         file = fget(fd);
526         if (!file)
527                 goto out;
528
529         inode = file->f_dentry->d_inode;
530         down(&inode->i_sem);
531         ret = do_fdatasync(file);
532         up(&inode->i_sem);
533
534         fput(file);
535 out:
536         return ret;
537 }
538
539 /* After several hours of tedious analysis, the following hash
540  * function won.  Do not mess with it... -DaveM
541  */
542 #define _hashfn(dev,block)      \
543         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
544          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
545           ((block) << (bh_hash_shift - 12))))
546 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
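/*
 * Usage sketch (illustrative): a lookup picks its bucket head with
 *
 *      struct buffer_head **head = &hash(dev, block);
 *
 * bh_hash_shift and bh_hash_mask are sized at boot from available
 * memory, so the constants folded into _hashfn above are offsets from
 * that shift rather than absolute bit positions.
 */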
547
548 static inline void __insert_into_hash_list(struct buffer_head *bh)
549 {
550         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
551         struct buffer_head *next = *head;
552
553         *head = bh;
554         bh->b_pprev = head;
555         bh->b_next = next;
556         if (next != NULL)
557                 next->b_pprev = &bh->b_next;
558 }
559
560 static __inline__ void __hash_unlink(struct buffer_head *bh)
561 {
562         struct buffer_head **pprev = bh->b_pprev;
563         if (pprev) {
564                 struct buffer_head *next = bh->b_next;
565                 if (next)
566                         next->b_pprev = pprev;
567                 *pprev = next;
568                 bh->b_pprev = NULL;
569         }
570 }
571
572 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
573 {
574         struct buffer_head **bhp = &lru_list[blist];
575
576         if (bh->b_prev_free || bh->b_next_free) BUG();
577
578         if(!*bhp) {
579                 *bhp = bh;
580                 bh->b_prev_free = bh;
581         }
582         bh->b_next_free = *bhp;
583         bh->b_prev_free = (*bhp)->b_prev_free;
584         (*bhp)->b_prev_free->b_next_free = bh;
585         (*bhp)->b_prev_free = bh;
586         nr_buffers_type[blist]++;
587         size_buffers_type[blist] += bh->b_size >> 9;
588 }
589
590 static void __remove_from_lru_list(struct buffer_head * bh)
591 {
592         struct buffer_head *next = bh->b_next_free;
593         if (next) {
594                 struct buffer_head *prev = bh->b_prev_free;
595                 int blist = bh->b_list;
596
597                 prev->b_next_free = next;
598                 next->b_prev_free = prev;
599                 if (lru_list[blist] == bh) {
600                         if (next == bh)
601                                 next = NULL;
602                         lru_list[blist] = next;
603                 }
604                 bh->b_next_free = NULL;
605                 bh->b_prev_free = NULL;
606                 nr_buffers_type[blist]--;
607                 size_buffers_type[blist] -= bh->b_size >> 9;
608         }
609 }
610
611 /* must be called with both the hash_table_lock and the lru_list_lock
612    held */
613 static void __remove_from_queues(struct buffer_head *bh)
614 {
615         __hash_unlink(bh);
616         __remove_from_lru_list(bh);
617 }
618
619 static void remove_from_queues(struct buffer_head *bh)
620 {
621         spin_lock(&lru_list_lock);
622         write_lock(&hash_table_lock);
623         __remove_from_queues(bh);
624         write_unlock(&hash_table_lock); 
625         spin_unlock(&lru_list_lock);
626 }
627
628 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
629 {
630         struct buffer_head *bh, **p = &hash(dev, block);
631
632         read_lock(&hash_table_lock);
633
634         for (;;) {
635                 bh = *p;
636                 if (!bh)
637                         break;
638                 p = &bh->b_next;
639                 if (bh->b_blocknr != block)
640                         continue;
641                 if (bh->b_size != size)
642                         continue;
643                 if (bh->b_dev != dev)
644                         continue;
645                 get_bh(bh);
646                 break;
647         }
648
649         read_unlock(&hash_table_lock);
650         return bh;
651 }
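/*
 * Note for callers (illustrative): a successful lookup returns the
 * buffer with b_count elevated by get_bh(), so it must be balanced
 * with a brelse()/put_bh() when done, e.g.:
 *
 *      struct buffer_head *bh = get_hash_table(dev, block, size);
 *      if (bh) {
 *              ... inspect bh ...
 *              brelse(bh);
 *      }
 */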
652
653 void fastcall buffer_insert_list(struct buffer_head *bh, struct list_head *list)
654 {
655         spin_lock(&lru_list_lock);
656         if (buffer_attached(bh))
657                 list_del(&bh->b_inode_buffers);
658         set_buffer_attached(bh);
659         list_add_tail(&bh->b_inode_buffers, list);
660         spin_unlock(&lru_list_lock);
661 }
662
663 /*
664  * The caller must have the lru_list lock before calling the 
665  * remove_inode_queue functions.
666  */
667 static void __remove_inode_queue(struct buffer_head *bh)
668 {
669         list_del(&bh->b_inode_buffers);
670         clear_buffer_attached(bh);
671 }
672
673 static inline void remove_inode_queue(struct buffer_head *bh)
674 {
675         if (buffer_attached(bh))
676                 __remove_inode_queue(bh);
677 }
678
679 int inode_has_buffers(struct inode *inode)
680 {
681         int ret;
682         
683         spin_lock(&lru_list_lock);
684         ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
685         spin_unlock(&lru_list_lock);
686         
687         return ret;
688 }
689
690 /* If invalidate_buffers() trashes dirty buffers, it means some kind
691    of fs corruption is going on. Trashing dirty data always implies losing
692    information that was supposed to be just stored on the physical layer
693    by the user.
694
695    Thus invalidate_buffers in general usage is not allowed to trash
696    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
697    be preserved.  These buffers are simply skipped.
698   
699    We also skip buffers which are still in use.  For example this can
700    happen if a userspace program is reading the block device.
701
702    NOTE: if the user removed a removable-media disk while there was still
703    dirty data not synced to disk (due to a bug in the device driver or to
704    a user error), then by not destroying the dirty buffers we could also
705    corrupt the next media inserted; thus a parameter is necessary to
706    handle this case in the safest way possible (trying not to corrupt
707    the newly inserted disk with data belonging to the old, now
708    corrupted, disk). Also, for the ramdisk the natural thing to do in
709    order to release the ramdisk memory is to destroy its dirty buffers.
710
711    These are two special cases. Normal usage implies that the device driver
712    issues a sync on the device (without waiting for I/O completion) and
713    then calls invalidate_buffers in a way that doesn't trash dirty buffers.
714
715    For handling cache coherency with the blkdev pagecache the 'update' case
716    has been introduced. It is needed to re-read from disk any pinned
717    buffer. NOTE: re-reading from disk is destructive, so we can do it only
718    when we assume nobody is changing the buffercache under our I/O and when
719    we think the disk contains more recent information than the buffercache.
720    The update == 1 pass marks the buffers we need to update, the update == 2
721    pass does the actual I/O. */
722 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
723 {
724         int i, nlist, slept;
725         struct buffer_head * bh, * bh_next;
726         kdev_t dev = to_kdev_t(bdev->bd_dev);   /* will become bdev */
727
728  retry:
729         slept = 0;
730         spin_lock(&lru_list_lock);
731         for(nlist = 0; nlist < NR_LIST; nlist++) {
732                 bh = lru_list[nlist];
733                 if (!bh)
734                         continue;
735                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
736                         bh_next = bh->b_next_free;
737
738                         /* Another device? */
739                         if (bh->b_dev != dev)
740                                 continue;
741                         /* Not hashed? */
742                         if (!bh->b_pprev)
743                                 continue;
744                         if (buffer_locked(bh)) {
745                                 get_bh(bh);
746                                 spin_unlock(&lru_list_lock);
747                                 wait_on_buffer(bh);
748                                 slept = 1;
749                                 spin_lock(&lru_list_lock);
750                                 put_bh(bh);
751                         }
752
753                         write_lock(&hash_table_lock);
754                         /* All buffers in the lru lists are mapped */
755                         if (!buffer_mapped(bh))
756                                 BUG();
757                         if (buffer_dirty(bh) && destroy_dirty_buffers)
758                                 printk("invalidate: dirty buffer\n");
759                         if (!atomic_read(&bh->b_count)) {
760                                 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
761                                         remove_inode_queue(bh);
762                                 }
763                         } else if (!bdev->bd_openers)
764                                 printk("invalidate: busy buffer\n");
765
766                         write_unlock(&hash_table_lock);
767                         if (slept)
768                                 goto out;
769                 }
770         }
771 out:
772         spin_unlock(&lru_list_lock);
773         if (slept)
774                 goto retry;
775
776         /* Get rid of the page cache */
777         invalidate_inode_pages(bdev->bd_inode);
778 }
779
780 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
781 {
782         struct block_device *bdev = bdget(dev);
783         if (bdev) {
784                 invalidate_bdev(bdev, destroy_dirty_buffers);
785                 bdput(bdev);
786         }
787 }
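/*
 * Sketch of the "normal usage" described in the big comment above
 * (illustrative, not a list of actual callers):
 *
 *      sync_buffers(dev, 0);              start writeback, don't wait
 *      __invalidate_buffers(dev, 0);      then drop only clean, unused buffers
 *
 * The destroy_dirty_buffers == 1 variant is reserved for the special
 * cases (removable media gone bad, ramdisk teardown) discussed above.
 */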
788
789 static void free_more_memory(void)
790 {
791         balance_dirty();
792         wakeup_bdflush();
793         try_to_free_pages(GFP_NOIO);
794         run_task_queue(&tq_disk);
795         yield();
796 }
797
798 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
799 {
800         bh->b_list = BUF_CLEAN;
801         bh->b_end_io = handler;
802         bh->b_private = private;
803 }
804
805 void end_buffer_io_async(struct buffer_head * bh, int uptodate)
806 {
807         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
808         unsigned long flags;
809         struct buffer_head *tmp;
810         struct page *page;
811         int fullup = 1;
812
813         mark_buffer_uptodate(bh, uptodate);
814
815         /* This is a temporary buffer used for page I/O. */
816         page = bh->b_page;
817
818         if (!uptodate)
819                 SetPageError(page);
820
821         /*
822          * Be _very_ careful from here on. Bad things can happen if
823          * two buffer heads end IO at almost the same time and both
824          * decide that the page is now completely done.
825          *
826          * Async buffer_heads are here only as labels for IO, and get
827          * thrown away once the IO for this page is complete.  IO is
828          * deemed complete once all buffers have been visited
829          * (b_count==0) and are now unlocked. We must make sure that
830          * only the _last_ buffer that decrements its count is the one
831          * that unlocks the page.
832          */
833         spin_lock_irqsave(&page_uptodate_lock, flags);
834         mark_buffer_async(bh, 0);
835         unlock_buffer(bh);
836         tmp = bh->b_this_page;
837         while (tmp != bh) {
838                 if (buffer_locked(tmp)) {
839                         if (buffer_async(tmp))
840                                 goto still_busy;
841                 } else if (!buffer_uptodate(tmp))
842                         fullup = 0;
843                 tmp = tmp->b_this_page;
844         }
845
846         /* OK, the async IO on this page is complete. */
847         spin_unlock_irqrestore(&page_uptodate_lock, flags);
848
849         /*
850          * If none of the buffers had errors and all were uptodate
851          * then we can set the page uptodate:
852          */
853         if (fullup && !PageError(page))
854                 SetPageUptodate(page);
855
856         UnlockPage(page);
857
858         return;
859
860 still_busy:
861         spin_unlock_irqrestore(&page_uptodate_lock, flags);
862         return;
863 }
864
865 inline void set_buffer_async_io(struct buffer_head *bh)
866 {
867         bh->b_end_io = end_buffer_io_async;
868         mark_buffer_async(bh, 1);
869 }
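/*
 * Callers pair this with lock_buffer() before submitting the I/O, as
 * __block_write_full_page() below does:
 *
 *      lock_buffer(bh);
 *      set_buffer_async_io(bh);
 *      ...
 *      submit_bh(WRITE, bh);
 */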
870
871 /*
872  * Synchronise all the inode's dirty buffers to the disk.
873  *
874  * We have conflicting pressures: we want to make sure that all
875  * initially dirty buffers get waited on, but that any subsequently
876  * dirtied buffers don't.  After all, we don't want fsync to last
877  * forever if somebody is actively writing to the file.
878  *
879  * Do this in two main stages: first we copy dirty buffers to a
880  * temporary inode list, queueing the writes as we go.  Then we clean
881  * up, waiting for those writes to complete.
882  * 
883  * During this second stage, any subsequent updates to the file may end
884  * up refiling the buffer on the original inode's dirty list again, so
885  * there is a chance we will end up with a buffer queued for write but
886  * not yet completed on that list.  So, as a final cleanup we go through
887  * the osync code to catch these locked, dirty buffers without requeuing
888  * any newly dirty buffers for write.
889  */
890 int fsync_buffers_list(struct list_head *list)
891 {
892         struct buffer_head *bh;
893         struct list_head tmp;
894         int err = 0, err2;
895         
896         INIT_LIST_HEAD(&tmp);
897         
898         spin_lock(&lru_list_lock);
899
900         while (!list_empty(list)) {
901                 bh = BH_ENTRY(list->next);
902                 list_del(&bh->b_inode_buffers);
903                 if (!buffer_dirty(bh) && !buffer_locked(bh))
904                         clear_buffer_attached(bh);
905                 else {
906                         set_buffer_attached(bh);
907                         list_add(&bh->b_inode_buffers, &tmp);
908                         if (buffer_dirty(bh)) {
909                                 get_bh(bh);
910                                 spin_unlock(&lru_list_lock);
911                         /*
912                          * Wait for I/O completion before submitting
913                          * the buffer, to be sure the write will
914                          * be effective on the latest data in
915                          * the buffer. (otherwise - if there's old
916                          * I/O in flight - write_buffer would become
917                          * a noop)
918                          */
919                                 wait_on_buffer(bh);
920                                 write_buffer(bh);
921                                 brelse(bh);
922                                 spin_lock(&lru_list_lock);
923                         }
924                 }
925         }
926
927         while (!list_empty(&tmp)) {
928                 bh = BH_ENTRY(tmp.prev);
929                 remove_inode_queue(bh);
930                 get_bh(bh);
931                 spin_unlock(&lru_list_lock);
932                 wait_on_buffer(bh);
933                 if (!buffer_uptodate(bh))
934                         err = -EIO;
935                 brelse(bh);
936                 spin_lock(&lru_list_lock);
937         }
938         
939         spin_unlock(&lru_list_lock);
940         err2 = osync_buffers_list(list);
941
942         if (err)
943                 return err;
944         else
945                 return err2;
946 }
947
948 /*
949  * osync is designed to support O_SYNC io.  It waits synchronously for
950  * all already-submitted IO to complete, but does not queue any new
951  * writes to the disk.
952  *
953  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
954  * you dirty the buffers, and then use osync_buffers_list to wait for
955  * completion.  Any other dirty buffers which are not yet queued for
956  * write will not be flushed to disk by the osync.
957  */
958 static int osync_buffers_list(struct list_head *list)
959 {
960         struct buffer_head *bh;
961         struct list_head *p;
962         int err = 0;
963
964         spin_lock(&lru_list_lock);
965         
966  repeat:
967         list_for_each_prev(p, list) {
968                 bh = BH_ENTRY(p);
969                 if (buffer_locked(bh)) {
970                         get_bh(bh);
971                         spin_unlock(&lru_list_lock);
972                         wait_on_buffer(bh);
973                         if (!buffer_uptodate(bh))
974                                 err = -EIO;
975                         brelse(bh);
976                         spin_lock(&lru_list_lock);
977                         goto repeat;
978                 }
979         }
980
981         spin_unlock(&lru_list_lock);
982         return err;
983 }
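/*
 * Sketch of the O_SYNC pattern described above (illustrative; which
 * list is passed depends on the caller): queue each write as the
 * buffer is dirtied, then wait for the whole list:
 *
 *      mark_buffer_dirty(bh);
 *      ll_rw_block(WRITE, 1, &bh);
 *      ...
 *      err = osync_buffers_list(&inode->i_dirty_buffers);
 */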
984
985 /*
986  * Invalidate any and all dirty buffers on a given inode.  We are
987  * probably unmounting the fs, but that doesn't mean we have already
988  * done a sync().  Just drop the buffers from the inode list.
989  */
990 void invalidate_inode_buffers(struct inode *inode)
991 {
992         struct list_head * entry;
993         
994         spin_lock(&lru_list_lock);
995         while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
996                 remove_inode_queue(BH_ENTRY(entry));
997         while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
998                 remove_inode_queue(BH_ENTRY(entry));
999         spin_unlock(&lru_list_lock);
1000 }
1001
1002
1003 /*
1004  * Ok, this is getblk, and it isn't very clear, again to hinder
1005  * race-conditions. Most of the code is seldom used, (ie repeating),
1006  * so it should be much more efficient than it looks.
1007  *
1008  * The algorithm is changed: hopefully better, and an elusive bug removed.
1009  *
1010  * 14.02.92: changed it to sync dirty buffers a bit: better performance
1011  * when the filesystem starts to get full of dirty blocks (I hope).
1012  */
1013 struct buffer_head * getblk(kdev_t dev, int block, int size)
1014 {
1015         for (;;) {
1016                 struct buffer_head * bh;
1017
1018                 bh = get_hash_table(dev, block, size);
1019                 if (bh) {
1020                         touch_buffer(bh);
1021                         return bh;
1022                 }
1023
1024                 if (!grow_buffers(dev, block, size))
1025                         free_more_memory();
1026         }
1027 }
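/*
 * Typical caller pattern (illustrative):
 *
 *      struct buffer_head *bh = getblk(dev, block, size);
 *      if (!buffer_uptodate(bh)) {
 *              ll_rw_block(READ, 1, &bh);
 *              wait_on_buffer(bh);
 *      }
 *      ...
 *      brelse(bh);
 *
 * which is essentially what bread() further down does for callers that
 * also want the block read in.
 */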
1028
1029 /* -1 -> no need to flush
1030     0 -> async flush
1031     1 -> sync flush (wait for I/O completion) */
1032 static int balance_dirty_state(void)
1033 {
1034         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1035
1036         dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1037         tot = nr_free_buffer_pages();
1038
1039         dirty *= 100;
1040         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1041         hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1042
1043         /* First, check for the "real" dirty limit. */
1044         if (dirty > soft_dirty_limit) {
1045                 if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1046                         return 1;
1047                 return 0;
1048         }
1049
1050         return -1;
1051 }
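/*
 * A worked example (numbers purely illustrative): with the default
 * nfract = 30 and nfract_sync = 60, if nr_free_buffer_pages() reports
 * 10000 pages then balance_dirty_state() returns 0 (async flush) once
 * more than 3000 pages worth of buffers are dirty, and 1 (synchronous
 * throttling) once more than 6000 pages are dirty, except for PF_NOIO
 * tasks, which are never made to wait.
 */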
1052
1053 static int bdflush_stop(void)
1054 {
1055         unsigned long dirty, tot, dirty_limit;
1056
1057         dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1058         tot = nr_free_buffer_pages();
1059
1060         dirty *= 100;
1061         dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1062
1063         if (!laptop_mode && dirty > dirty_limit)
1064                 return 0;
1065         return 1;
1066 }
1067
1068 /*
1069  * if a new dirty buffer is created we need to balance bdflush.
1070  *
1071  * in the future we might want to make bdflush aware of different
1072  * pressures on different devices - thus the (currently unused)
1073  * 'dev' parameter.
1074  */
1075 void balance_dirty(void)
1076 {
1077         int state = balance_dirty_state();
1078
1079         if (state < 0)
1080                 return;
1081
1082         wakeup_bdflush();
1083
1084         /*
1085          * And if we're _really_ out of balance, wait for
1086          * some of the dirty/locked buffers ourselves.
1087          * This will throttle heavy writers.
1088          */
1089         if (state > 0) {
1090                 spin_lock(&lru_list_lock);
1091                 write_some_buffers(NODEV);
1092         }
1093 }
1094 EXPORT_SYMBOL(balance_dirty);
1095
1096 inline void fastcall __mark_dirty(struct buffer_head *bh)
1097 {
1098         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1099         refile_buffer(bh);
1100 }
1101
1102 /* atomic version, the user must call balance_dirty() by hand
1103    as soon as it becomes possible to block */
1104 void fastcall __mark_buffer_dirty(struct buffer_head *bh)
1105 {
1106         if (!atomic_set_buffer_dirty(bh))
1107                 __mark_dirty(bh);
1108 }
1109
1110 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1111 {
1112         if (!atomic_set_buffer_dirty(bh)) {
1113                 if (block_dump)
1114                         printk("%s: dirtied buffer\n", current->comm);
1115                 __mark_dirty(bh);
1116                 balance_dirty();
1117         }
1118 }
1119
1120 void set_buffer_flushtime(struct buffer_head *bh)
1121 {
1122         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1123 }
1124 EXPORT_SYMBOL(set_buffer_flushtime);
1125
1126 int get_buffer_flushtime(void)
1127 {
1128         return bdf_prm.b_un.interval;
1129 }
1130 EXPORT_SYMBOL(get_buffer_flushtime);
1131
1132 /*
1133  * A buffer may need to be moved from one buffer list to another
1134  * (e.g. in case it is not shared any more). Handle this.
1135  */
1136 static void __refile_buffer(struct buffer_head *bh)
1137 {
1138         int dispose = BUF_CLEAN;
1139         if (buffer_locked(bh))
1140                 dispose = BUF_LOCKED;
1141         if (buffer_dirty(bh))
1142                 dispose = BUF_DIRTY;
1143         if (dispose != bh->b_list) {
1144                 __remove_from_lru_list(bh);
1145                 bh->b_list = dispose;
1146                 if (dispose == BUF_CLEAN)
1147                         remove_inode_queue(bh);
1148                 __insert_into_lru_list(bh, dispose);
1149         }
1150 }
1151
1152 void refile_buffer(struct buffer_head *bh)
1153 {
1154         spin_lock(&lru_list_lock);
1155         __refile_buffer(bh);
1156         spin_unlock(&lru_list_lock);
1157 }
1158
1159 /*
1160  * Release a buffer head
1161  */
1162 void __brelse(struct buffer_head * buf)
1163 {
1164         if (atomic_read(&buf->b_count)) {
1165                 put_bh(buf);
1166                 return;
1167         }
1168         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1169 }
1170
1171 /*
1172  * bforget() is like brelse(), except it discards any
1173  * potentially dirty data.
1174  */
1175 void __bforget(struct buffer_head * buf)
1176 {
1177         mark_buffer_clean(buf);
1178         __brelse(buf);
1179 }
1180
1181 /**
1182  *      bread() - reads a specified block and returns the bh
1183  *      @block: block number to read
1184  *      @size: size (in bytes) to read
1185  * 
1186  *      Reads the specified block, and returns the buffer head that
1187  *      contains it. It returns NULL if the block was unreadable.
1188  */
1189 struct buffer_head * bread(kdev_t dev, int block, int size)
1190 {
1191         struct buffer_head * bh;
1192
1193         bh = getblk(dev, block, size);
1194         if (buffer_uptodate(bh))
1195                 return bh;
1196         set_bit(BH_Sync, &bh->b_state);
1197         ll_rw_block(READ, 1, &bh);
1198         wait_on_buffer(bh);
1199         if (buffer_uptodate(bh))
1200                 return bh;
1201         brelse(bh);
1202         return NULL;
1203 }
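/*
 * Example (illustrative): reading a metadata block the way most
 * filesystems do:
 *
 *      struct buffer_head *bh = bread(dev, blocknr, blocksize);
 *      if (!bh)
 *              return -EIO;
 *      ... use bh->b_data ...
 *      brelse(bh);
 */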
1204
1205 /*
1206  * Note: the caller should wake up the buffer_wait list if needed.
1207  */
1208 static void __put_unused_buffer_head(struct buffer_head * bh)
1209 {
1210         if (unlikely(buffer_attached(bh)))
1211                 BUG();
1212         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1213                 kmem_cache_free(bh_cachep, bh);
1214         } else {
1215                 bh->b_dev = B_FREE;
1216                 bh->b_blocknr = -1;
1217                 bh->b_this_page = NULL;
1218
1219                 nr_unused_buffer_heads++;
1220                 bh->b_next_free = unused_list;
1221                 unused_list = bh;
1222         }
1223 }
1224
1225 void put_unused_buffer_head(struct buffer_head *bh)
1226 {
1227         spin_lock(&unused_list_lock);
1228         __put_unused_buffer_head(bh);
1229         spin_unlock(&unused_list_lock);
1230 }
1231 EXPORT_SYMBOL(put_unused_buffer_head);
1232
1233 /*
1234  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1235  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1236  * buffer heads is now handled in create_buffers().
1237  */ 
1238 struct buffer_head * get_unused_buffer_head(int async)
1239 {
1240         struct buffer_head * bh;
1241
1242         spin_lock(&unused_list_lock);
1243         if (nr_unused_buffer_heads > NR_RESERVED) {
1244                 bh = unused_list;
1245                 unused_list = bh->b_next_free;
1246                 nr_unused_buffer_heads--;
1247                 spin_unlock(&unused_list_lock);
1248                 return bh;
1249         }
1250         spin_unlock(&unused_list_lock);
1251
1252         /* This is critical.  We can't call out to the FS
1253          * to get more buffer heads, because the FS may need
1254          * more buffer-heads itself.  Thus SLAB_NOFS.
1255          */
1256         if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1257                 bh->b_blocknr = -1;
1258                 bh->b_this_page = NULL;
1259                 return bh;
1260         }
1261
1262         /*
1263          * If we need an async buffer, use the reserved buffer heads.
1264          * Non-PF_MEMALLOC tasks can just loop in create_buffers().
1265          */
1266         if (async && (current->flags & PF_MEMALLOC)) {
1267                 spin_lock(&unused_list_lock);
1268                 if (unused_list) {
1269                         bh = unused_list;
1270                         unused_list = bh->b_next_free;
1271                         nr_unused_buffer_heads--;
1272                         spin_unlock(&unused_list_lock);
1273                         return bh;
1274                 }
1275                 spin_unlock(&unused_list_lock);
1276         }
1277
1278         return NULL;
1279 }
1280 EXPORT_SYMBOL(get_unused_buffer_head);
1281
1282 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1283 {
1284         if (offset >= PAGE_SIZE)
1285                 BUG();
1286
1287         if (PageHighMem(page)) {
1288                 bh->b_data = (char *)offset;
1289         } else {
1290                 bh->b_data = page_address(page) + offset;
1291         }
1292         bh->b_page = page;
1293 }
1294 EXPORT_SYMBOL(set_bh_page);
1295
1296 /*
1297  * Create the appropriate buffers when given a page for data area and
1298  * the size of each buffer.. Use the bh->b_this_page linked list to
1299  * follow the buffers created.  Return NULL if unable to create more
1300  * buffers.
1301  * The async flag is used to differentiate async IO (paging, swapping)
1302  * from ordinary buffer allocations, and only async requests are allowed
1303  * to sleep waiting for buffer heads. 
1304  */
1305 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1306 {
1307         struct buffer_head *bh, *head;
1308         long offset;
1309
1310 try_again:
1311         head = NULL;
1312         offset = PAGE_SIZE;
1313         while ((offset -= size) >= 0) {
1314                 bh = get_unused_buffer_head(async);
1315                 if (!bh)
1316                         goto no_grow;
1317
1318                 bh->b_dev = NODEV;
1319                 bh->b_this_page = head;
1320                 head = bh;
1321
1322                 bh->b_state = 0;
1323                 bh->b_next_free = NULL;
1324                 bh->b_pprev = NULL;
1325                 atomic_set(&bh->b_count, 0);
1326                 bh->b_size = size;
1327
1328                 set_bh_page(bh, page, offset);
1329
1330                 bh->b_list = BUF_CLEAN;
1331                 bh->b_end_io = NULL;
1332         }
1333         return head;
1334 /*
1335  * In case anything failed, we just free everything we got.
1336  */
1337 no_grow:
1338         if (head) {
1339                 spin_lock(&unused_list_lock);
1340                 do {
1341                         bh = head;
1342                         head = head->b_this_page;
1343                         __put_unused_buffer_head(bh);
1344                 } while (head);
1345                 spin_unlock(&unused_list_lock);
1346
1347                 /* Wake up any waiters ... */
1348                 wake_up(&buffer_wait);
1349         }
1350
1351         /*
1352          * Return failure for non-async IO requests.  Async IO requests
1353          * are not allowed to fail, so we have to wait until buffer heads
1354          * become available.  But we don't want tasks sleeping with 
1355          * partially complete buffers, so all were released above.
1356          */
1357         if (!async)
1358                 return NULL;
1359
1360         /* We're _really_ low on memory. Now we just
1361          * wait for old buffer heads to become free due to
1362          * finishing IO.  Since this is an async request and
1363          * the reserve list is empty, we're sure there are 
1364          * async buffer heads in use.
1365          */
1366         run_task_queue(&tq_disk);
1367
1368         free_more_memory();
1369         goto try_again;
1370 }
1371
1372 /*
1373  * Called when truncating a buffer on a page completely.
1374  */
1375 static void discard_buffer(struct buffer_head * bh)
1376 {
1377         if (buffer_mapped(bh) || buffer_delay(bh)) {
1378                 mark_buffer_clean(bh);
1379                 lock_buffer(bh);
1380                 clear_bit(BH_Uptodate, &bh->b_state);
1381                 clear_bit(BH_Mapped, &bh->b_state);
1382                 clear_bit(BH_Req, &bh->b_state);
1383                 clear_bit(BH_New, &bh->b_state);
1384                 clear_bit(BH_Delay, &bh->b_state);
1385                 remove_from_queues(bh);
1386                 unlock_buffer(bh);
1387         }
1388 }
1389
1390 /**
1391  * try_to_release_page - release old fs-specific metadata on a page
1392  *
1393  */
1394
1395 int try_to_release_page(struct page * page, int gfp_mask)
1396 {
1397         if (!PageLocked(page))
1398                 BUG();
1399         
1400         if (!page->mapping)
1401                 goto try_to_free;
1402         if (!page->mapping->a_ops->releasepage)
1403                 goto try_to_free;
1404         if (page->mapping->a_ops->releasepage(page, gfp_mask))
1405                 goto try_to_free;
1406         /*
1407          * We couldn't release buffer metadata; don't even bother trying
1408          * to release buffers.
1409          */
1410         return 0;
1411 try_to_free:    
1412         return try_to_free_buffers(page, gfp_mask);
1413 }
1414
1415 /*
1416  * We don't have to release all buffers here, but
1417  * we have to be sure that no dirty buffer is left
1418  * and no IO is going on (no buffer is locked), because
1419  * we have truncated the file and are going to free the
1420  * blocks on-disk..
1421  */
1422 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1423 {
1424         struct buffer_head *head, *bh, *next;
1425         unsigned int curr_off = 0;
1426
1427         if (!PageLocked(page))
1428                 BUG();
1429         if (!page->buffers)
1430                 return 1;
1431
1432         head = page->buffers;
1433         bh = head;
1434         do {
1435                 unsigned int next_off = curr_off + bh->b_size;
1436                 next = bh->b_this_page;
1437
1438                 /*
1439                  * is this block fully flushed?
1440                  */
1441                 if (offset <= curr_off)
1442                         discard_buffer(bh);
1443                 curr_off = next_off;
1444                 bh = next;
1445         } while (bh != head);
1446
1447         /*
1448          * subtle. We release buffer-heads only if this is
1449          * the 'final' flushpage. We have invalidated the get_block
1450          * cached value unconditionally, so real IO is not
1451          * possible anymore.
1452          *
1453          * If the free doesn't work out, the buffers can be
1454          * left around - they just turn into anonymous buffers
1455          * instead.
1456          */
1457         if (!offset) {
1458                 if (!try_to_release_page(page, 0))
1459                         return 0;
1460         }
1461
1462         return 1;
1463 }
1464
1465 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1466 {
1467         struct buffer_head *bh, *head, *tail;
1468
1469         /* FIXME: create_buffers should fail if there's not enough memory */
1470         head = create_buffers(page, blocksize, 1);
1471         if (page->buffers)
1472                 BUG();
1473
1474         bh = head;
1475         do {
1476                 bh->b_dev = dev;
1477                 bh->b_blocknr = 0;
1478                 bh->b_end_io = NULL;
1479                 tail = bh;
1480                 bh = bh->b_this_page;
1481         } while (bh);
1482         tail->b_this_page = head;
1483         page->buffers = head;
1484         page_cache_get(page);
1485 }
1486 EXPORT_SYMBOL(create_empty_buffers);
1487
1488 /*
1489  * We are taking a block for data and we don't want any output from any
1490  * buffer-cache aliases starting from the return of this function and
1491  * until the moment when something explicitly marks the buffer
1492  * dirty (hopefully that will not happen until we free that block ;-)
1493  * We don't even need to mark it not-uptodate - nobody can expect
1494  * anything from a newly allocated buffer anyway. We used to use
1495  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1496  * don't want to mark the alias unmapped, for example - it would confuse
1497  * anyone who might pick it with bread() afterwards...
1498  */
1499
1500 static void unmap_underlying_metadata(struct buffer_head * bh)
1501 {
1502         struct buffer_head *old_bh;
1503
1504         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1505         if (old_bh) {
1506                 mark_buffer_clean(old_bh);
1507                 wait_on_buffer(old_bh);
1508                 clear_bit(BH_Req, &old_bh->b_state);
1509                 __brelse(old_bh);
1510         }
1511 }
1512
1513 /*
1514  * NOTE! All mapped/uptodate combinations are valid:
1515  *
1516  *      Mapped  Uptodate        Meaning
1517  *
1518  *      No      No              "unknown" - must do get_block()
1519  *      No      Yes             "hole" - zero-filled
1520  *      Yes     No              "allocated" - allocated on disk, not read in
1521  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1522  *
1523  * "Dirty" is valid only with the last case (mapped+uptodate).
1524  */
1525
1526 /*
1527  * block_write_full_page() is SMP threaded - the kernel lock is not held.
1528  */
1529 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1530 {
1531         int err, i;
1532         unsigned long block;
1533         struct buffer_head *bh, *head;
1534         int need_unlock;
1535
1536         if (!PageLocked(page))
1537                 BUG();
1538
1539         if (!page->buffers)
1540                 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1541         head = page->buffers;
1542
1543         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1544
1545         bh = head;
1546         i = 0;
1547
1548         /* Stage 1: make sure we have all the buffers mapped! */
1549         do {
1550                 /*
1551                  * If the buffer isn't up-to-date, we can't be sure
1552                  * that the buffer has been initialized with the proper
1553                  * block number information etc..
1554                  *
1555                  * Leave it to the low-level FS to make all those
1556                  * decisions (block #0 may actually be a valid block)
1557                  */
1558                 if (!buffer_mapped(bh)) {
1559                         err = get_block(inode, block, bh, 1);
1560                         if (err)
1561                                 goto out;
1562                         if (buffer_new(bh))
1563                                 unmap_underlying_metadata(bh);
1564                 }
1565                 bh = bh->b_this_page;
1566                 block++;
1567         } while (bh != head);
1568
1569         /* Stage 2: lock the buffers, mark them clean */
1570         do {
1571                 lock_buffer(bh);
1572                 set_buffer_async_io(bh);
1573                 set_bit(BH_Uptodate, &bh->b_state);
1574                 clear_bit(BH_Dirty, &bh->b_state);
1575                 bh = bh->b_this_page;
1576         } while (bh != head);
1577
1578         /* Stage 3: submit the IO */
1579         do {
1580                 struct buffer_head *next = bh->b_this_page;
1581                 submit_bh(WRITE, bh);
1582                 bh = next;
1583         } while (bh != head);
1584
1585         /* Done - end_buffer_io_async will unlock */
1586         SetPageUptodate(page);
1587
1588         wakeup_page_waiters(page);
1589
1590         return 0;
1591
1592 out:
1593         /*
1594          * ENOSPC, or some other error.  We may already have added some
1595          * blocks to the file, so we need to write these out to avoid
1596          * exposing stale data.
1597          */
1598         ClearPageUptodate(page);
1599         bh = head;
1600         need_unlock = 1;
1601         /* Recovery: lock and submit the mapped buffers */
1602         do {
1603                 if (buffer_mapped(bh)) {
1604                         lock_buffer(bh);
1605                         set_buffer_async_io(bh);
1606                         need_unlock = 0;
1607                 }
1608                 bh = bh->b_this_page;
1609         } while (bh != head);
1610         do {
1611                 struct buffer_head *next = bh->b_this_page;
1612                 if (buffer_mapped(bh)) {
1613                         set_bit(BH_Uptodate, &bh->b_state);
1614                         clear_bit(BH_Dirty, &bh->b_state);
1615                         submit_bh(WRITE, bh);
1616                 }
1617                 bh = next;
1618         } while (bh != head);
1619         if (need_unlock)
1620                 UnlockPage(page);
1621         wakeup_page_waiters(page);
1622         return err;
1623 }
1624
1625 static int __block_prepare_write(struct inode *inode, struct page *page,
1626                 unsigned from, unsigned to, get_block_t *get_block)
1627 {
1628         unsigned block_start, block_end;
1629         unsigned long block;
1630         int err = 0;
1631         unsigned blocksize, bbits;
1632         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1633         char *kaddr = kmap(page);
1634
1635         blocksize = 1 << inode->i_blkbits;
1636         if (!page->buffers)
1637                 create_empty_buffers(page, inode->i_dev, blocksize);
1638         head = page->buffers;
1639
1640         bbits = inode->i_blkbits;
1641         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1642
1643         for(bh = head, block_start = 0; bh != head || !block_start;
1644             block++, block_start=block_end, bh = bh->b_this_page) {
1645                 if (!bh)
1646                         BUG();
1647                 block_end = block_start+blocksize;
1648                 if (block_end <= from)
1649                         continue;
1650                 if (block_start >= to)
1651                         break;
1652                 clear_bit(BH_New, &bh->b_state);
1653                 if (!buffer_mapped(bh)) {
1654                         err = get_block(inode, block, bh, 1);
1655                         if (err)
1656                                 goto out;
1657                         if (buffer_new(bh)) {
1658                                 unmap_underlying_metadata(bh);
1659                                 if (Page_Uptodate(page)) {
1660                                         set_bit(BH_Uptodate, &bh->b_state);
1661                                         continue;
1662                                 }
1663                                 if (block_end > to)
1664                                         memset(kaddr+to, 0, block_end-to);
1665                                 if (block_start < from)
1666                                         memset(kaddr+block_start, 0, from-block_start);
1667                                 if (block_end > to || block_start < from)
1668                                         flush_dcache_page(page);
1669                                 continue;
1670                         }
1671                 }
1672                 if (Page_Uptodate(page)) {
1673                         set_bit(BH_Uptodate, &bh->b_state);
1674                         continue; 
1675                 }
1676                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1677                      (block_start < from || block_end > to)) {
1678                         ll_rw_block(READ, 1, &bh);
1679                         *wait_bh++=bh;
1680                 }
1681         }
1682         /*
1683          * If we issued read requests - let them complete.
1684          */
1685         while(wait_bh > wait) {
1686                 wait_on_buffer(*--wait_bh);
1687                 if (!buffer_uptodate(*wait_bh))
1688                         return -EIO;
1689         }
1690         return 0;
1691 out:
1692         /*
1693          * Zero out any newly allocated blocks to avoid exposing stale
1694          * data.  If BH_New is set, we know that the block was newly
1695          * allocated in the above loop.
1696          *
1697          * In detail, a buffer can be new and uptodate because:
1698          * 1) there is a hole in an uptodate page: get_block(create) allocates
1699          *    the block, so the buffer is new and we also mark it uptodate;
1700          * 2) the buffer is not mapped but is uptodate due to a previous partial read.
1701          *
1702          * We can always ignore uptodate buffers here, if you mark a buffer
1703          * uptodate you must make sure it contains the right data first.
1704          *
1705          * We must stop the "undo/clear" fixup pass not at the caller's "to"
1706          * but at the last block that the main loop successfully reached.
1707          */
1708         bh = head;
1709         to = block_start; /* stop at the last successfully handled block */
1710         block_start = 0;
1711         do {
1712                 block_end = block_start+blocksize;
1713                 if (block_end <= from)
1714                         goto next_bh;
1715                 if (block_start >= to)
1716                         break;
1717                 if (buffer_new(bh) && !buffer_uptodate(bh)) {
1718                         memset(kaddr+block_start, 0, bh->b_size);
1719                         flush_dcache_page(page);
1720                         set_bit(BH_Uptodate, &bh->b_state);
1721                         mark_buffer_dirty(bh);
1722                 }
1723 next_bh:
1724                 block_start = block_end;
1725                 bh = bh->b_this_page;
1726         } while (bh != head);
1727         return err;
1728 }
1729
1730 static int __block_commit_write(struct inode *inode, struct page *page,
1731                 unsigned from, unsigned to)
1732 {
1733         unsigned block_start, block_end;
1734         int partial = 0, need_balance_dirty = 0;
1735         unsigned blocksize;
1736         struct buffer_head *bh, *head;
1737
1738         blocksize = 1 << inode->i_blkbits;
1739
1740         for(bh = head = page->buffers, block_start = 0;
1741             bh != head || !block_start;
1742             block_start=block_end, bh = bh->b_this_page) {
1743                 block_end = block_start + blocksize;
1744                 if (block_end <= from || block_start >= to) {
1745                         if (!buffer_uptodate(bh))
1746                                 partial = 1;
1747                 } else {
1748                         set_bit(BH_Uptodate, &bh->b_state);
1749                         if (!atomic_set_buffer_dirty(bh)) {
1750                                 __mark_dirty(bh);
1751                                 buffer_insert_inode_data_queue(bh, inode);
1752                                 need_balance_dirty = 1;
1753                         }
1754                 }
1755         }
1756
1757         if (need_balance_dirty)
1758                 balance_dirty();
1759         /*
1760          * If this (potentially partial) write happened to make all the
1761          * buffers uptodate then we can optimize away a bogus readpage()
1762          * for the next read().  Here we 'discover' whether the page went
1763          * uptodate as a result of the write.
1764          */
1765         if (!partial)
1766                 SetPageUptodate(page);
1767         return 0;
1768 }
1769
1770 /*
1771  * Generic "read page" function for block devices that have the normal
1772  * get_block functionality. This is most of the block device filesystems.
1773  * Reads the page asynchronously --- the unlock_buffer() and
1774  * mark_buffer_uptodate() functions propagate buffer state into the
1775  * page struct once IO has completed.
1776  */
1777 int block_read_full_page(struct page *page, get_block_t *get_block)
1778 {
1779         struct inode *inode = page->mapping->host;
1780         unsigned long iblock, lblock;
1781         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1782         unsigned int blocksize, blocks;
1783         int nr, i;
1784
1785         if (!PageLocked(page))
1786                 PAGE_BUG(page);
1787         blocksize = 1 << inode->i_blkbits;
1788         if (!page->buffers)
1789                 create_empty_buffers(page, inode->i_dev, blocksize);
1790         head = page->buffers;
1791
1792         blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1793         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794         lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1795         bh = head;
1796         nr = 0;
1797         i = 0;
1798
1799         do {
1800                 if (buffer_uptodate(bh))
1801                         continue;
1802
1803                 if (!buffer_mapped(bh)) {
1804                         if (iblock < lblock) {
1805                                 if (get_block(inode, iblock, bh, 0))
1806                                         SetPageError(page);
1807                         }
1808                         if (!buffer_mapped(bh)) {
1809                                 memset(kmap(page) + i*blocksize, 0, blocksize);
1810                                 flush_dcache_page(page);
1811                                 kunmap(page);
1812                                 set_bit(BH_Uptodate, &bh->b_state);
1813                                 continue;
1814                         }
1815                         /* get_block() might have updated the buffer synchronously */
1816                         if (buffer_uptodate(bh))
1817                                 continue;
1818                 }
1819
1820                 arr[nr] = bh;
1821                 nr++;
1822         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1823
1824         if (!nr) {
1825                 /*
1826                  * All buffers are uptodate - we can set the page uptodate
1827                  * as well. But not if get_block() returned an error.
1828                  */
1829                 if (!PageError(page))
1830                         SetPageUptodate(page);
1831                 UnlockPage(page);
1832                 return 0;
1833         }
1834
1835         /* Stage two: lock the buffers */
1836         for (i = 0; i < nr; i++) {
1837                 struct buffer_head * bh = arr[i];
1838                 lock_buffer(bh);
1839                 set_buffer_async_io(bh);
1840         }
1841
1842         /* Stage 3: start the IO */
1843         for (i = 0; i < nr; i++) {
1844                 struct buffer_head * bh = arr[i];
1845                 if (buffer_uptodate(bh))
1846                         end_buffer_io_async(bh, 1);
1847                 else
1848                         submit_bh(READ, bh);
1849         }
1850
1851         wakeup_page_waiters(page);
1852         
1853         return 0;
1854 }
1855
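/*
 * Editorial sketch, not part of the original source: for a filesystem with
 * an ordinary get_block routine (the name foo_get_block below is
 * hypothetical), the readpage() address_space operation is typically just
 * a wrapper around block_read_full_page():
 *
 *	static int foo_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, foo_get_block);
 *	}
 */
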
1856 /* utility function for filesystems that need to do work on expanding
1857  * truncates.  Uses prepare/commit_write to allow the filesystem to
1858  * deal with the hole.  
1859  */
1860 int generic_cont_expand(struct inode *inode, loff_t size)
1861 {
1862         struct address_space *mapping = inode->i_mapping;
1863         struct page *page;
1864         unsigned long index, offset, limit;
1865         int err;
1866
1867         err = -EFBIG;
1868         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1869         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1870                 send_sig(SIGXFSZ, current, 0);
1871                 goto out;
1872         }
1873         if (size > inode->i_sb->s_maxbytes)
1874                 goto out;
1875
1876         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1877
1878         /* ugh.  in prepare/commit_write, if from==to==start of block, we
1879          * skip the prepare.  make sure we never send an offset for the
1880          * start of a block.
1881          */
1882         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1883                 offset++;
1884         }
1885         index = size >> PAGE_CACHE_SHIFT;
1886         err = -ENOMEM;
1887         page = grab_cache_page(mapping, index);
1888         if (!page)
1889                 goto out;
1890         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1891         if (!err) {
1892                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1893         }
1894         UnlockPage(page);
1895         page_cache_release(page);
1896         if (err > 0)
1897                 err = 0;
1898 out:
1899         return err;
1900 }
1901
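/*
 * Editorial sketch: a filesystem that cannot represent holes might call
 * generic_cont_expand() from its notify_change()/setattr path when the file
 * size grows, so the gap is filled through prepare/commit_write before the
 * new size is applied.  The foo_* names are hypothetical:
 *
 *	static int foo_notify_change(struct dentry *dentry, struct iattr *attr)
 *	{
 *		struct inode *inode = dentry->d_inode;
 *		int error = 0;
 *
 *		if ((attr->ia_valid & ATTR_SIZE) &&
 *		    attr->ia_size > inode->i_size)
 *			error = generic_cont_expand(inode, attr->ia_size);
 *		if (!error)
 *			error = inode_setattr(inode, attr);
 *		return error;
 *	}
 */
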
1902 /*
1903  * For moronic filesystems that do not allow holes in files.
1904  * We may have to extend the file.
1905  */
1906
1907 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1908 {
1909         struct address_space *mapping = page->mapping;
1910         struct inode *inode = mapping->host;
1911         struct page *new_page;
1912         unsigned long pgpos;
1913         long status;
1914         unsigned zerofrom;
1915         unsigned blocksize = 1 << inode->i_blkbits;
1916         char *kaddr;
1917
1918         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1919                 status = -ENOMEM;
1920                 new_page = grab_cache_page(mapping, pgpos);
1921                 if (!new_page)
1922                         goto out;
1923                 /* we might sleep */
1924                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1925                         UnlockPage(new_page);
1926                         page_cache_release(new_page);
1927                         continue;
1928                 }
1929                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1930                 if (zerofrom & (blocksize-1)) {
1931                         *bytes |= (blocksize-1);
1932                         (*bytes)++;
1933                 }
1934                 status = __block_prepare_write(inode, new_page, zerofrom,
1935                                                 PAGE_CACHE_SIZE, get_block);
1936                 if (status)
1937                         goto out_unmap;
1938                 kaddr = page_address(new_page);
1939                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1940                 flush_dcache_page(new_page);
1941                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1942                 kunmap(new_page);
1943                 UnlockPage(new_page);
1944                 page_cache_release(new_page);
1945         }
1946
1947         if (page->index < pgpos) {
1948                 /* completely inside the area */
1949                 zerofrom = offset;
1950         } else {
1951                 /* page covers the boundary, find the boundary offset */
1952                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1953
1954                 /* if we are expanding the file, the last block will be filled */
1955                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1956                         *bytes |= (blocksize-1);
1957                         (*bytes)++;
1958                 }
1959
1960                 /* starting below the boundary? Nothing to zero out */
1961                 if (offset <= zerofrom)
1962                         zerofrom = offset;
1963         }
1964         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1965         if (status)
1966                 goto out1;
1967         kaddr = page_address(page);
1968         if (zerofrom < offset) {
1969                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1970                 flush_dcache_page(page);
1971                 __block_commit_write(inode, page, zerofrom, offset);
1972         }
1973         return 0;
1974 out1:
1975         ClearPageUptodate(page);
1976         kunmap(page);
1977         return status;
1978
1979 out_unmap:
1980         ClearPageUptodate(new_page);
1981         kunmap(new_page);
1982         UnlockPage(new_page);
1983         page_cache_release(new_page);
1984 out:
1985         return status;
1986 }
1987
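/*
 * Editorial sketch: such a filesystem wires its prepare_write() operation to
 * cont_prepare_write(), passing a pointer to the number of bytes it has
 * allocated on disk so far.  Here that counter is assumed to live in a
 * hypothetical per-inode structure returned by foo_i():
 *
 *	static int foo_prepare_write(struct file *file, struct page *page,
 *				     unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *
 *		return cont_prepare_write(page, from, to, foo_get_block,
 *					  &foo_i(inode)->mmu_private);
 *	}
 */
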
1988 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1989                         get_block_t *get_block)
1990 {
1991         struct inode *inode = page->mapping->host;
1992         int err = __block_prepare_write(inode, page, from, to, get_block);
1993         if (err) {
1994                 ClearPageUptodate(page);
1995                 kunmap(page);
1996         }
1997         return err;
1998 }
1999
2000 int block_commit_write(struct page *page, unsigned from, unsigned to)
2001 {
2002         struct inode *inode = page->mapping->host;
2003         __block_commit_write(inode,page,from,to);
2004         kunmap(page);
2005         return 0;
2006 }
2007
2008 int generic_commit_write(struct file *file, struct page *page,
2009                 unsigned from, unsigned to)
2010 {
2011         struct inode *inode = page->mapping->host;
2012         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2013         __block_commit_write(inode,page,from,to);
2014         kunmap(page);
2015         if (pos > inode->i_size) {
2016                 inode->i_size = pos;
2017                 mark_inode_dirty(inode);
2018         }
2019         return 0;
2020 }
2021
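/*
 * Editorial sketch of how a write path drives the prepare/commit pair above
 * (error handling trimmed).  grab_cache_page() returns the page locked,
 * prepare_write leaves it kmapped, and generic_commit_write() kunmaps it
 * and updates i_size:
 *
 *	page = grab_cache_page(mapping, index);
 *	if (!page)
 *		return -ENOMEM;
 *	err = mapping->a_ops->prepare_write(file, page, offset, offset + bytes);
 *	if (!err) {
 *		copy_from_user(page_address(page) + offset, buf, bytes);
 *		flush_dcache_page(page);
 *		err = mapping->a_ops->commit_write(file, page, offset,
 *						   offset + bytes);
 *	}
 *	UnlockPage(page);
 *	page_cache_release(page);
 */
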
2022 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2023 {
2024         unsigned long index = from >> PAGE_CACHE_SHIFT;
2025         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2026         unsigned blocksize, iblock, length, pos;
2027         struct inode *inode = mapping->host;
2028         struct page *page;
2029         struct buffer_head *bh;
2030         int err;
2031
2032         blocksize = 1 << inode->i_blkbits;
2033         length = offset & (blocksize - 1);
2034
2035         /* Block boundary? Nothing to do */
2036         if (!length)
2037                 return 0;
2038
2039         length = blocksize - length;
2040         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2041         
2042         page = grab_cache_page(mapping, index);
2043         err = -ENOMEM;
2044         if (!page)
2045                 goto out;
2046
2047         if (!page->buffers)
2048                 create_empty_buffers(page, inode->i_dev, blocksize);
2049
2050         /* Find the buffer that contains "offset" */
2051         bh = page->buffers;
2052         pos = blocksize;
2053         while (offset >= pos) {
2054                 bh = bh->b_this_page;
2055                 iblock++;
2056                 pos += blocksize;
2057         }
2058
2059         err = 0;
2060         if (!buffer_mapped(bh)) {
2061                 /* Hole? Nothing to do */
2062                 if (buffer_uptodate(bh))
2063                         goto unlock;
2064                 get_block(inode, iblock, bh, 0);
2065                 /* Still unmapped? Nothing to do */
2066                 if (!buffer_mapped(bh))
2067                         goto unlock;
2068         }
2069
2070         /* Ok, it's mapped. Make sure it's up-to-date */
2071         if (Page_Uptodate(page))
2072                 set_bit(BH_Uptodate, &bh->b_state);
2073
2074         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2075                 err = -EIO;
2076                 ll_rw_block(READ, 1, &bh);
2077                 wait_on_buffer(bh);
2078                 /* Uhhuh. Read error. Complain and punt. */
2079                 if (!buffer_uptodate(bh))
2080                         goto unlock;
2081         }
2082
2083         memset(kmap(page) + offset, 0, length);
2084         flush_dcache_page(page);
2085         kunmap(page);
2086
2087         if (!atomic_set_buffer_dirty(bh)) {
2088                 __mark_dirty(bh);
2089                 buffer_insert_inode_data_queue(bh, inode);
2090                 balance_dirty();
2091         }
2092
2093         err = 0;
2094
2095 unlock:
2096         UnlockPage(page);
2097         page_cache_release(page);
2098 out:
2099         return err;
2100 }
2101
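/*
 * Editorial sketch: a filesystem calls block_truncate_page() from its
 * truncate path to zero the tail of the last, partial block before trimming
 * the block pointers themselves (foo_get_block is hypothetical):
 *
 *	block_truncate_page(inode->i_mapping, inode->i_size, foo_get_block);
 */
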
2102 int block_write_full_page(struct page *page, get_block_t *get_block)
2103 {
2104         struct inode *inode = page->mapping->host;
2105         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2106         unsigned offset;
2107         int err;
2108
2109         /* easy case */
2110         if (page->index < end_index)
2111                 return __block_write_full_page(inode, page, get_block);
2112
2113         /* things got complicated... */
2114         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2115         /* OK, are we completely out? */
2116         if (page->index >= end_index+1 || !offset) {
2117                 UnlockPage(page);
2118                 return -EIO;
2119         }
2120
2121         /* Sigh... will have to work, then... */
2122         err = __block_prepare_write(inode, page, 0, offset, get_block);
2123         if (!err) {
2124                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2125                 flush_dcache_page(page);
2126                 __block_commit_write(inode,page,0,offset);
2127 done:
2128                 kunmap(page);
2129                 UnlockPage(page);
2130                 return err;
2131         }
2132         ClearPageUptodate(page);
2133         goto done;
2134 }
2135
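/*
 * Editorial sketch: taken together, the generic helpers in this file let a
 * simple block-based filesystem describe its whole address_space in terms
 * of a single get_block routine.  The foo_* wrappers are hypothetical,
 * shaped like the sketches next to the corresponding helpers:
 *
 *	static int foo_writepage(struct page *page)
 *	{
 *		return block_write_full_page(page, foo_get_block);
 *	}
 *
 *	static struct address_space_operations foo_aops = {
 *		readpage:	foo_readpage,
 *		writepage:	foo_writepage,
 *		sync_page:	block_sync_page,
 *		prepare_write:	foo_prepare_write,
 *		commit_write:	generic_commit_write,
 *		bmap:		foo_bmap,
 *	};
 */
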
2136 /*
2137  * Commence writeout of all the buffers against a page.  The
2138  * page must be locked.   Returns zero on success or a negative
2139  * errno.
2140  */
2141 int writeout_one_page(struct page *page)
2142 {
2143         struct buffer_head *bh, *head = page->buffers;
2144
2145         if (!PageLocked(page))
2146                 BUG();
2147         bh = head;
2148         do {
2149                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2150                         continue;
2151
2152                 bh->b_flushtime = jiffies;
2153                 ll_rw_block(WRITE, 1, &bh);     
2154         } while ((bh = bh->b_this_page) != head);
2155         return 0;
2156 }
2157 EXPORT_SYMBOL(writeout_one_page);
2158
2159 /*
2160  * Wait for completion of I/O of all buffers against a page.  The page
2161  * must be locked.  Returns zero on success or a negative errno.
2162  */
2163 int waitfor_one_page(struct page *page)
2164 {
2165         int error = 0;
2166         struct buffer_head *bh, *head = page->buffers;
2167
2168         bh = head;
2169         do {
2170                 wait_on_buffer(bh);
2171                 if (buffer_req(bh) && !buffer_uptodate(bh))
2172                         error = -EIO;
2173         } while ((bh = bh->b_this_page) != head);
2174         return error;
2175 }
2176 EXPORT_SYMBOL(waitfor_one_page);
2177
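/*
 * Editorial sketch: the two helpers above pair up to flush one page
 * synchronously.  Both expect the caller to hold the page lock, so a
 * hypothetical helper would look like:
 *
 *	static int foo_flush_page(struct page *page)
 *	{
 *		int err = writeout_one_page(page);
 *
 *		if (!err)
 *			err = waitfor_one_page(page);
 *		return err;
 *	}
 */
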
2178 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2179 {
2180         struct buffer_head tmp;
2181         struct inode *inode = mapping->host;
2182         tmp.b_state = 0;
2183         tmp.b_blocknr = 0;
2184         get_block(inode, block, &tmp, 0);
2185         return tmp.b_blocknr;
2186 }
2187
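/*
 * Editorial sketch: the bmap() address_space operation (used by the FIBMAP
 * ioctl and by swap files) is normally a one-liner (foo_get_block is
 * hypothetical):
 *
 *	static int foo_bmap(struct address_space *mapping, long block)
 *	{
 *		return generic_block_bmap(mapping, block, foo_get_block);
 *	}
 */
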
2188 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2189 {
2190         int i, nr_blocks, retval;
2191         unsigned long * blocks = iobuf->blocks;
2192         int length;
2193         int beyond_eof = 0;
2194         
2195         length = iobuf->length;
2196         nr_blocks = length / blocksize;
2197         /* build the blocklist */
2198         for (i = 0; i < nr_blocks; i++, blocknr++) {
2199                 struct buffer_head bh;
2200
2201                 bh.b_state = 0;
2202                 bh.b_dev = inode->i_dev;
2203                 bh.b_size = blocksize;
2204                 bh.b_page = NULL;
2205
2206                 if (((loff_t) blocknr) * blocksize >= inode->i_size)
2207                         beyond_eof = 1;
2208
2209                 /* Only allow get_block to create new blocks if we are safely
2210                    beyond EOF.  O_DIRECT is unsafe inside sparse files. */
2211                 retval = get_block(inode, blocknr, &bh, 
2212                                    ((rw != READ) && beyond_eof));
2213
2214                 if (retval) {
2215                         if (!i)
2216                                 /* report error to userspace */
2217                                 goto out;
2218                         else
2219                                 /* do short I/O until 'i' */
2220                                 break;
2221                 }
2222
2223                 if (rw == READ) {
2224                         if (buffer_new(&bh))
2225                                 BUG();
2226                         if (!buffer_mapped(&bh)) {
2227                                 /* there was a hole in the filesystem */
2228                                 blocks[i] = -1UL;
2229                                 continue;
2230                         }
2231                 } else {
2232                         if (buffer_new(&bh))
2233                                 unmap_underlying_metadata(&bh);
2234                         if (!buffer_mapped(&bh))
2235                                 /* upper layers need to pass the error on or
2236                                  * fall back to buffered IO. */
2237                                 return -ENOTBLK;
2238                 }
2239                 blocks[i] = bh.b_blocknr;
2240         }
2241
2242         /* patch length to handle short I/O */
2243         iobuf->length = i * blocksize;
2244         if (!beyond_eof)
2245                 up(&inode->i_sem);
2246         retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2247         if (!beyond_eof)
2248                 down(&inode->i_sem);
2249         /* restore orig length */
2250         iobuf->length = length;
2251  out:
2252
2253         return retval;
2254 }
2255
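/*
 * Editorial sketch: an O_DIRECT-capable filesystem supplies its get_block
 * routine and block size through a thin direct_IO() wrapper (foo_* names
 * hypothetical):
 *
 *	static int foo_direct_IO(int rw, struct inode *inode,
 *				 struct kiobuf *iobuf,
 *				 unsigned long blocknr, int blocksize)
 *	{
 *		return generic_direct_IO(rw, inode, iobuf, blocknr,
 *					 blocksize, foo_get_block);
 *	}
 */
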
2256 /*
2257  * IO completion routine for a buffer_head being used for kiobuf IO: we
2258  * can't dispatch the kiobuf callback until io_count reaches 0.  
2259  */
2260
2261 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2262 {
2263         struct kiobuf *kiobuf;
2264         
2265         mark_buffer_uptodate(bh, uptodate);
2266
2267         kiobuf = bh->b_private;
2268         end_kio_request(kiobuf, uptodate);
2269         unlock_buffer(bh);
2270 }
2271
2272 /*
2273  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2274  * for them to complete.  Clean up the buffer_heads afterwards.  
2275  */
2276
2277 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2278 {
2279         int iosize, err;
2280         int i;
2281         struct buffer_head *tmp;
2282
2283         iosize = 0;
2284         err = 0;
2285
2286         for (i = nr; --i >= 0; ) {
2287                 iosize += size;
2288                 tmp = bh[i];
2289                 wait_on_buffer(tmp);
2290                 
2291                 if (!buffer_uptodate(tmp)) {
2292                         /* We are traversing bh'es in reverse order so
2293                            clearing iosize on error calculates the
2294                            amount of IO before the first error. */
2295                         iosize = 0;
2296                         err = -EIO;
2297                 }
2298         }
2299         
2300         if (iosize)
2301                 return iosize;
2302         return err;
2303 }
2304
2305 /*
2306  * Start I/O on a physical range of kernel memory, defined by a vector
2307  * of kiobuf structs (much like a user-space iovec list).
2308  *
2309  * The kiobuf must already be locked for IO.  IO is submitted
2310  * asynchronously: you need to check page->locked and page->uptodate.
2311  *
2312  * It is up to the caller to make sure that there are enough blocks
2313  * passed in to completely map the iobufs to disk.
2314  */
2315
2316 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2317                kdev_t dev, unsigned long b[], int size)
2318 {
2319         int             err;
2320         int             length;
2321         int             transferred;
2322         int             i;
2323         int             bufind;
2324         int             pageind;
2325         int             bhind;
2326         int             offset;
2327         unsigned long   blocknr;
2328         struct kiobuf * iobuf = NULL;
2329         struct page *   map;
2330         struct buffer_head *tmp, **bhs = NULL;
2331
2332         if (!nr)
2333                 return 0;
2334         
2335         /* 
2336          * First, do some alignment and validity checks 
2337          */
2338         for (i = 0; i < nr; i++) {
2339                 iobuf = iovec[i];
2340                 if ((iobuf->offset & (size-1)) ||
2341                     (iobuf->length & (size-1)))
2342                         return -EINVAL;
2343                 if (!iobuf->nr_pages)
2344                         panic("brw_kiovec: iobuf not initialised");
2345         }
2346
2347         /* 
2348          * OK to walk down the iovec doing page IO on each page we find. 
2349          */
2350         bufind = bhind = transferred = err = 0;
2351         for (i = 0; i < nr; i++) {
2352                 iobuf = iovec[i];
2353                 offset = iobuf->offset;
2354                 length = iobuf->length;
2355                 iobuf->errno = 0;
2356                 if (!bhs)
2357                         bhs = iobuf->bh;
2358                 
2359                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2360                         map  = iobuf->maplist[pageind];
2361                         if (!map) {
2362                                 err = -EFAULT;
2363                                 goto finished;
2364                         }
2365                         
2366                         while (length > 0) {
2367                                 blocknr = b[bufind++];
2368                                 if (blocknr == -1UL) {
2369                                         if (rw == READ) {
2370                                                 /* there was a hole in the filesystem */
2371                                                 memset(kmap(map) + offset, 0, size);
2372                                                 flush_dcache_page(map);
2373                                                 kunmap(map);
2374
2375                                                 transferred += size;
2376                                                 goto skip_block;
2377                                         } else
2378                                                 BUG();
2379                                 }
2380                                 tmp = bhs[bhind++];
2381
2382                                 tmp->b_size = size;
2383                                 set_bh_page(tmp, map, offset);
2384                                 tmp->b_this_page = tmp;
2385
2386                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2387                                 tmp->b_dev = dev;
2388                                 tmp->b_blocknr = blocknr;
2389                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2390
2391                                 if (rw == WRITE) {
2392                                         set_bit(BH_Uptodate, &tmp->b_state);
2393                                         clear_bit(BH_Dirty, &tmp->b_state);
2394                                 } else
2395                                         set_bit(BH_Uptodate, &tmp->b_state);
2396
2397                                 atomic_inc(&iobuf->io_count);
2398                                 submit_bh(rw, tmp);
2399                                 /* 
2400                                  * Wait for IO if we have got too much 
2401                                  */
2402                                 if (bhind >= KIO_MAX_SECTORS) {
2403                                         kiobuf_wait_for_io(iobuf); /* wake-one */
2404                                         err = wait_kio(rw, bhind, bhs, size);
2405                                         if (err >= 0)
2406                                                 transferred += err;
2407                                         else
2408                                                 goto finished;
2409                                         bhind = 0;
2410                                 }
2411
2412                         skip_block:
2413                                 length -= size;
2414                                 offset += size;
2415
2416                                 if (offset >= PAGE_SIZE) {
2417                                         offset = 0;
2418                                         break;
2419                                 }
2420                         } /* End of block loop */
2421                 } /* End of page loop */                
2422         } /* End of iovec loop */
2423
2424         /* Is there any IO still left to submit? */
2425         if (bhind) {
2426                 kiobuf_wait_for_io(iobuf); /* wake-one */
2427                 err = wait_kio(rw, bhind, bhs, size);
2428                 if (err >= 0)
2429                         transferred += err;
2430                 else
2431                         goto finished;
2432         }
2433
2434  finished:
2435         if (transferred)
2436                 return transferred;
2437         return err;
2438 }
2439
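/*
 * Editorial sketch of a typical brw_kiovec() caller, raw-I/O style: pin the
 * user buffer into a kiobuf, build the block list, then let brw_kiovec()
 * drive the transfer.  rw, dev, user_addr, len, first_block and blocksize
 * are assumed to be supplied by the caller, and the block list is assumed
 * to fit in KIO_MAX_SECTORS entries:
 *
 *	struct kiobuf *iobuf;
 *	unsigned long blocks[KIO_MAX_SECTORS];
 *	int err, i, nblocks = len / blocksize;
 *
 *	err = alloc_kiovec(1, &iobuf);
 *	if (err)
 *		return err;
 *	err = map_user_kiobuf(rw, iobuf, user_addr, len);
 *	if (!err) {
 *		for (i = 0; i < nblocks; i++)
 *			blocks[i] = first_block + i;
 *		err = brw_kiovec(rw, 1, &iobuf, dev, blocks, blocksize);
 *		unmap_kiobuf(iobuf);
 *	}
 *	free_kiovec(1, &iobuf);
 */
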
2440 /*
2441  * Start I/O on a page.
2442  * This function expects the page to be locked and may return
2443  * before I/O is complete. You then have to check page->locked
2444  * and page->uptodate.
2445  *
2446  * brw_page() is SMP-safe, although it is currently called with the
2447  * kernel lock held - the code itself does not require it.
2448  *
2449  * FIXME: we need a swapper_inode->get_block function to remove
2450  *        some of the bmap kludges and interface ugliness here.
2451  */
2452 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2453 {
2454         struct buffer_head *head, *bh;
2455
2456         if (!PageLocked(page))
2457                 panic("brw_page: page not locked for I/O");
2458
2459         if (!page->buffers)
2460                 create_empty_buffers(page, dev, size);
2461         head = bh = page->buffers;
2462
2463         /* Stage 1: lock all the buffers */
2464         do {
2465                 lock_buffer(bh);
2466                 bh->b_blocknr = *(b++);
2467                 set_bit(BH_Mapped, &bh->b_state);
2468                 set_buffer_async_io(bh);
2469                 bh = bh->b_this_page;
2470         } while (bh != head);
2471
2472         /* Stage 2: start the IO */
2473         do {
2474                 struct buffer_head *next = bh->b_this_page;
2475                 submit_bh(rw, bh);
2476                 bh = next;
2477         } while (bh != head);
2478         wakeup_page_waiters(page);
2479         return 0;
2480 }
2481
2482 int block_symlink(struct inode *inode, const char *symname, int len)
2483 {
2484         struct address_space *mapping = inode->i_mapping;
2485         struct page *page = grab_cache_page(mapping, 0);
2486         int err = -ENOMEM;
2487         char *kaddr;
2488
2489         if (!page)
2490                 goto fail;
2491         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2492         if (err)
2493                 goto fail_map;
2494         kaddr = page_address(page);
2495         memcpy(kaddr, symname, len-1);
2496         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2497         /*
2498          * Notice that we are _not_ going to block here - end of page is
2499          * Notice that we are _not_ going to block here - the end of the page
2500          * is unmapped, so this will only try to map the rest of the page, see
2501          * ->i_size will be enough for everything) and zero it out.
2502          * OTOH it's obviously correct and should make the page up-to-date.
2503          */
2504         err = mapping->a_ops->readpage(NULL, page);
2505         wait_on_page(page);
2506         page_cache_release(page);
2507         if (err < 0)
2508                 goto fail;
2509         mark_inode_dirty(inode);
2510         return 0;
2511 fail_map:
2512         UnlockPage(page);
2513         page_cache_release(page);
2514 fail:
2515         return err;
2516 }
2517
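/*
 * Editorial sketch: a filesystem's symlink() method calls block_symlink()
 * on the freshly created symlink inode, passing the target length including
 * the trailing NUL (block_symlink() stores len-1 bytes):
 *
 *	err = block_symlink(inode, symname, strlen(symname) + 1);
 */
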
2518 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2519 {
2520         struct buffer_head *bh, *tail;
2521
2522         bh = head;
2523         do {
2524                 tail = bh;
2525                 bh = bh->b_this_page;
2526         } while (bh);
2527         tail->b_this_page = head;
2528         page->buffers = head;
2529         page_cache_get(page);
2530 }
2531
2532 /*
2533  * Create the page-cache page that contains the requested block
2534  */
2535 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2536 {
2537         struct page * page;
2538         struct buffer_head *bh;
2539
2540         page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2541         if (!page)
2542                 return NULL;
2543
2544         if (!PageLocked(page))
2545                 BUG();
2546
2547         bh = page->buffers;
2548         if (bh) {
2549                 if (bh->b_size == size)
2550                         return page;
2551                 if (!try_to_free_buffers(page, GFP_NOFS))
2552                         goto failed;
2553         }
2554
2555         bh = create_buffers(page, size, 0);
2556         if (!bh)
2557                 goto failed;
2558         link_dev_buffers(page, bh);
2559         return page;
2560
2561 failed:
2562         UnlockPage(page);
2563         page_cache_release(page);
2564         return NULL;
2565 }
2566
2567 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2568 {
2569         struct buffer_head *head = page->buffers;
2570         struct buffer_head *bh = head;
2571         unsigned int uptodate;
2572
2573         uptodate = 1 << BH_Mapped;
2574         if (Page_Uptodate(page))
2575                 uptodate |= 1 << BH_Uptodate;
2576
2577         write_lock(&hash_table_lock);
2578         do {
2579                 if (!(bh->b_state & (1 << BH_Mapped))) {
2580                         init_buffer(bh, NULL, NULL);
2581                         bh->b_dev = dev;
2582                         bh->b_blocknr = block;
2583                         bh->b_state = uptodate;
2584                 }
2585
2586                 /* Insert the buffer into the hash lists if necessary */
2587                 if (!bh->b_pprev)
2588                         __insert_into_hash_list(bh);
2589
2590                 block++;
2591                 bh = bh->b_this_page;
2592         } while (bh != head);
2593         write_unlock(&hash_table_lock);
2594 }
2595
2596 /*
2597  * Try to increase the number of buffers available: the size argument
2598  * is used to determine what kind of buffers we want.
2599  */
2600 static int grow_buffers(kdev_t dev, unsigned long block, int size)
2601 {
2602         struct page * page;
2603         struct block_device *bdev;
2604         unsigned long index;
2605         int sizebits;
2606
2607         /* Size must be multiple of hard sectorsize */
2608         if (size & (get_hardsect_size(dev)-1))
2609                 BUG();
2610         /* Size must be within 512 bytes and PAGE_SIZE */
2611         if (size < 512 || size > PAGE_SIZE)
2612                 BUG();
2613
2614         sizebits = -1;
2615         do {
2616                 sizebits++;
2617         } while ((size << sizebits) < PAGE_SIZE);
2618
2619         index = block >> sizebits;
2620         block = index << sizebits;
2621
2622         bdev = bdget(kdev_t_to_nr(dev));
2623         if (!bdev) {
2624                 printk("No block device for %s\n", kdevname(dev));
2625                 BUG();
2626         }
2627
2628         /* Create a page with the proper size buffers.. */
2629         page = grow_dev_page(bdev, index, size);
2630
2631         /* This is "wrong" - talk to Al Viro */
2632         atomic_dec(&bdev->bd_count);
2633         if (!page)
2634                 return 0;
2635
2636         /* Hash in the buffers on the hash list */
2637         hash_page_buffers(page, dev, block, size);
2638         UnlockPage(page);
2639         page_cache_release(page);
2640
2641         /* We hashed up this page, so increment buffermem */
2642         atomic_inc(&buffermem_pages);
2643         return 1;
2644 }
2645
2646 /*
2647  * The first time the VM inspects a page which has locked buffers, it
2648  * will just mark it as needing to be waited upon during the scan of the page LRU.
2649  * BH_Wait_IO is used for this.
2650  *
2651  * The second time the VM visits the page, if it still has locked
2652  * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
2653  *
2654  * The third time the VM visits the page, if the I/O hasn't completed
2655  * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
2656  * used for this.
2657  *
2658  * There is also the case of buffers which were locked by someone else
2659  * - write(2) callers, bdflush, etc.  There can be a huge number of these
2660  * and we don't want to just skip them all and fail the page allocation. 
2661  * We want to be able to wait on these buffers as well.
2662  *
2663  * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2664  * underway against the buffer, no matter who started it - we know
2665  * that the buffer will eventually come unlocked, and so it's safe to
2666  * wait on it.
2667  *
2668  * The caller holds the page lock and the caller will free this page
2669  * into current->local_page, so by waiting on the page's buffers the
2670  * caller is guaranteed to obtain this page.
2671  *
2672  * sync_page_buffers() will sort-of return true if all the buffers
2673  * against this page are freeable, so try_to_free_buffers() should
2674  * try to free the page's buffers a second time.  This is a bit
2675  * broken for blocksize < PAGE_CACHE_SIZE, but not in an important way.
2676  */
2677 static int sync_page_buffers(struct buffer_head *head)
2678 {
2679         struct buffer_head * bh = head;
2680         int tryagain = 1;
2681
2682         do {
2683                 if (!buffer_dirty(bh) && !buffer_locked(bh))
2684                         continue;
2685
2686                 /* Don't start IO first time around.. */
2687                 if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2688                         tryagain = 0;
2689                         continue;
2690                 }
2691
2692                 /* Second time through we start actively writing out.. */
2693                 if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2694                         if (unlikely(!buffer_launder(bh))) {
2695                                 tryagain = 0;
2696                                 continue;
2697                         }
2698                         wait_on_buffer(bh);
2699                         tryagain = 1;
2700                         continue;
2701                 }
2702
2703                 if (!atomic_set_buffer_clean(bh)) {
2704                         unlock_buffer(bh);
2705                         continue;
2706                 }
2707
2708                 __mark_buffer_clean(bh);
2709                 get_bh(bh);
2710                 bh->b_end_io = end_buffer_io_sync;
2711                 submit_bh(WRITE, bh);
2712                 tryagain = 0;
2713         } while ((bh = bh->b_this_page) != head);
2714
2715         return tryagain;
2716 }
2717
2718 /*
2719  * Can the buffer be thrown out?
2720  */
2721 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock))
2722 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2723
2724 /*
2725  * try_to_free_buffers() checks if all the buffers on this particular page
2726  * are unused, and frees the page if so.
2727  *
2728  * Wake up bdflush() if this fails - if we're running low on memory due
2729  * to dirty buffers, we need to flush them out as quickly as possible.
2730  *
2731  * NOTE: There are quite a number of ways that threads of control can
2732  *       obtain a reference to a buffer head within a page.  So we must
2733  *       lock out all of these paths to cleanly toss the page.
2734  */
2735 int fastcall try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2736 {
2737         struct buffer_head * tmp, * bh = page->buffers;
2738
2739 cleaned_buffers_try_again:
2740         spin_lock(&lru_list_lock);
2741         write_lock(&hash_table_lock);
2742         tmp = bh;
2743         do {
2744                 if (buffer_busy(tmp))
2745                         goto busy_buffer_page;
2746                 tmp = tmp->b_this_page;
2747         } while (tmp != bh);
2748
2749         spin_lock(&unused_list_lock);
2750         tmp = bh;
2751
2752         /* if this buffer was hashed, this page counts as buffermem */
2753         if (bh->b_pprev)
2754                 atomic_dec(&buffermem_pages);
2755         do {
2756                 struct buffer_head * p = tmp;
2757                 tmp = tmp->b_this_page;
2758
2759                 if (p->b_dev == B_FREE) BUG();
2760
2761                 remove_inode_queue(p);
2762                 __remove_from_queues(p);
2763                 __put_unused_buffer_head(p);
2764         } while (tmp != bh);
2765         spin_unlock(&unused_list_lock);
2766
2767         /* Wake up anyone waiting for buffer heads */
2768         wake_up(&buffer_wait);
2769
2770         /* And free the page */
2771         page->buffers = NULL;
2772         page_cache_release(page);
2773         write_unlock(&hash_table_lock);
2774         spin_unlock(&lru_list_lock);
2775         return 1;
2776
2777 busy_buffer_page:
2778         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2779         write_unlock(&hash_table_lock);
2780         spin_unlock(&lru_list_lock);
2781         gfp_mask = pf_gfp_mask(gfp_mask);
2782         if (gfp_mask & __GFP_IO) {
2783                 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2784                         if (sync_page_buffers(bh)) {
2785                                 /* no IO or waiting next time */
2786                                 gfp_mask = 0;
2787                                 goto cleaned_buffers_try_again;
2788                         }
2789                 }
2790         }
2791         if (balance_dirty_state() >= 0)
2792                 wakeup_bdflush();
2793         return 0;
2794 }
2795 EXPORT_SYMBOL(try_to_free_buffers);
2796
2797 /* ================== Debugging =================== */
2798
2799 void show_buffers(void)
2800 {
2801 #ifdef CONFIG_SMP
2802         struct buffer_head * bh;
2803         int delalloc = 0, found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2804         int nlist;
2805         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2806 #endif
2807
2808         printk("Buffer memory:   %6dkB\n",
2809                 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2810
2811         printk("Cache memory:   %6ldkB\n",
2812                 (page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2813
2814 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2815         if (!spin_trylock(&lru_list_lock))
2816                 return;
2817         for(nlist = 0; nlist < NR_LIST; nlist++) {
2818                 delalloc = found = locked = dirty = used = lastused = 0;
2819                 bh = lru_list[nlist];
2820                 if(!bh) continue;
2821
2822                 do {
2823                         found++;
2824                         if (buffer_locked(bh))
2825                                 locked++;
2826                         if (buffer_dirty(bh))
2827                                 dirty++;
2828                         if (buffer_delay(bh))
2829                                 delalloc++;
2830                         if (atomic_read(&bh->b_count))
2831                                 used++, lastused = found;
2832                         bh = bh->b_next_free;
2833                 } while (bh != lru_list[nlist]);
2834                 {
2835                         int tmp = nr_buffers_type[nlist];
2836                         if (found != tmp)
2837                                 printk("%9s: BUG -> found %d, reported %d\n",
2838                                        buf_types[nlist], found, tmp);
2839                 }
2840                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2841                        "%d locked, %d dirty, %d delay\n",
2842                        buf_types[nlist], found, size_buffers_type[nlist]>>(10-9),
2843                        used, lastused, locked, dirty, delalloc);
2844         }
2845         spin_unlock(&lru_list_lock);
2846 #endif
2847 }
2848
2849 /* ===================== Init ======================= */
2850
2851 /*
2852  * allocate the hash table and init the free list
2853  * Use gfp() for the hash table to decrease TLB misses, use
2854  * SLAB cache for buffer heads.
2855  */
2856 void __init buffer_init(unsigned long mempages)
2857 {
2858         int order, i;
2859         unsigned int nr_hash;
2860
2861         /* The buffer cache hash table is less important these days,
2862          * trim it a bit.
2863          */
2864         mempages >>= 14;
2865
2866         mempages *= sizeof(struct buffer_head *);
2867
2868         for (order = 0; (1 << order) < mempages; order++)
2869                 ;
2870
2871         /* try to allocate something until we get it or we're asking
2872            for something that is really too small */
2873
2874         do {
2875                 unsigned long tmp;
2876
2877                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2878                 bh_hash_mask = (nr_hash - 1);
2879
2880                 tmp = nr_hash;
2881                 bh_hash_shift = 0;
2882                 while((tmp >>= 1UL) != 0UL)
2883                         bh_hash_shift++;
2884
2885                 hash_table = (struct buffer_head **)
2886                     __get_free_pages(GFP_ATOMIC, order);
2887         } while (hash_table == NULL && --order > 0);
2888         printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2889                nr_hash, order, (PAGE_SIZE << order));
2890
2891         if (!hash_table)
2892                 panic("Failed to allocate buffer hash table\n");
2893
2894         /* Setup hash chains. */
2895         for(i = 0; i < nr_hash; i++)
2896                 hash_table[i] = NULL;
2897
2898         /* Setup lru lists. */
2899         for(i = 0; i < NR_LIST; i++)
2900                 lru_list[i] = NULL;
2901
2902 }
2903
2904
2905 /* ====================== bdflush support =================== */
2906
2907 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2908  * response to dirty buffers.  Once this process is activated, we write back
2909  * a limited number of buffers to the disks and then go back to sleep again.
2910  */
2911
2912 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2913
2914 void wakeup_bdflush(void)
2915 {
2916         wake_up_interruptible(&bdflush_wait);
2917 }
2918
2919 void wakeup_kupdate(void)
2920 {
2921         if (waitqueue_active(&kupdate_wait))
2922                 wake_up(&kupdate_wait);
2923 }
2924
2925 /* 
2926  * Here we attempt to write back old buffers.  We also try to flush inodes 
2927  * and supers as well, since this function is essentially "update", and 
2928  * otherwise there would be no way of ensuring that these quantities ever 
2929  * get written back.  Ideally, we would have a timestamp on the inodes
2930  * and superblocks so that we could write back only the old ones as well.
2931  */
2932
2933 static int sync_old_buffers(void)
2934 {
2935         lock_kernel();
2936         sync_unlocked_inodes();
2937         sync_supers(0, 0);
2938         unlock_kernel();
2939
2940         for (;;) {
2941                 struct buffer_head *bh;
2942
2943                 spin_lock(&lru_list_lock);
2944                 bh = lru_list[BUF_DIRTY];
2945                 if (!bh)
2946                         break;
2947                 if (time_before(jiffies, bh->b_flushtime) && !laptop_mode)
2948                         break;
2949                 if (write_some_buffers(NODEV))
2950                         continue;
2951                 return 0;
2952         }
2953         spin_unlock(&lru_list_lock);
2954         return 0;
2955 }
2956
2957 int block_sync_page(struct page *page)
2958 {
2959         run_task_queue(&tq_disk);
2960         return 0;
2961 }
2962
2963 /* This is the interface to bdflush.  As we get more sophisticated, we can
2964  * pass tuning parameters to this "process", to adjust how it behaves. 
2965  * We would want to verify each parameter, however, to make sure that it 
2966  * is reasonable. */
2967
2968 asmlinkage long sys_bdflush(int func, long data)
2969 {
2970         if (!capable(CAP_SYS_ADMIN))
2971                 return -EPERM;
2972
2973         if (func == 1) {
2974                 /* do_exit directly and let kupdate do its work alone. */
2975                 do_exit(0);
2976 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2977          a syscall that doesn't care about the current mm context. */
2978                 int error;
2979                 struct mm_struct *user_mm;
2980
2981                 /*
2982                  * bdflush will spend all of its time in kernel-space,
2983                  * without touching user-space, so we can switch it into
2984                  * 'lazy TLB mode' to reduce the cost of context-switches
2985                  * to and from bdflush.
2986                  */
2987                 user_mm = start_lazy_tlb();
2988                 error = sync_old_buffers();
2989                 end_lazy_tlb(user_mm);
2990                 return error;
2991 #endif
2992         }
2993
2994         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2995         if (func >= 2) {
2996                 int i = (func-2) >> 1;
2997                 if (i >= 0 && i < N_PARAM) {
2998                         if ((func & 1) == 0)
2999                                 return put_user(bdf_prm.data[i], (int*)data);
3000
3001                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
3002                                 bdf_prm.data[i] = data;
3003                                 return 0;
3004                         }
3005                 }
3006                 return -EINVAL;
3007         }
3008
3009         /* Func 0 used to launch the actual bdflush and then never
3010          * return (unless explicitly killed).  We return zero here to
3011          * remain semi-compatible with present update(8) programs.
3012          */
3013         return 0;
3014 }
3015
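/*
 * Editorial sketch: from userspace the parameter block can still be read and
 * tuned through this syscall.  For func >= 2, parameter i = (func - 2) >> 1
 * is read when func is even and written when func is odd.  Assuming the C
 * library exposes SYS_bdflush:
 *
 *	int nfract;
 *
 *	syscall(SYS_bdflush, 2, (long) &nfract);	reads bdf_prm.data[0]
 *	syscall(SYS_bdflush, 3, 30L);			sets bdf_prm.data[0] to 30
 */
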
3016 /*
3017  * This is the actual bdflush daemon itself. It used to be started from
3018  * the syscall above, but now we launch it ourselves internally with
3019  * kernel_thread(...)  directly after the first thread in init/main.c
3020  */
3021 int bdflush(void *startup)
3022 {
3023         struct task_struct *tsk = current;
3024
3025         /*
3026          *      We have a bare-bones task_struct, and really should fill
3027          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
3028          *      display semi-sane things. Not real crucial though...  
3029          */
3030
3031         tsk->session = 1;
3032         tsk->pgrp = 1;
3033         strcpy(tsk->comm, "bdflush");
3034
3035         /* avoid getting signals */
3036         spin_lock_irq(&tsk->sigmask_lock);
3037         flush_signals(tsk);
3038         sigfillset(&tsk->blocked);
3039         recalc_sigpending(tsk);
3040         spin_unlock_irq(&tsk->sigmask_lock);
3041
3042         complete((struct completion *)startup);
3043
3044         /*
3045          * FIXME: The ndirty logic here is wrong.  It's supposed to
3046          * send bdflush back to sleep after writing ndirty buffers.
3047          * In fact, the test is wrong so bdflush will in fact
3048          * sleep when bdflush_stop() returns true.
3049          *
3050          * FIXME: If it proves useful to implement ndirty properly,
3051          * then perhaps the value of ndirty should be scaled by the
3052          * amount of memory in the machine.
3053          */
3054         for (;;) {
3055                 int ndirty = bdf_prm.b_un.ndirty;
3056
3057                 CHECK_EMERGENCY_SYNC
3058
3059                 while (ndirty > 0) {
3060                         spin_lock(&lru_list_lock);
3061                         if (!write_some_buffers(NODEV))
3062                                 break;
3063                         ndirty -= NRSYNC;
3064                 }
3065                 if (ndirty > 0 || bdflush_stop())
3066                         interruptible_sleep_on(&bdflush_wait);
3067         }
3068 }
3069
3070 /*
3071  * This is the kernel update daemon.  It used to live in userspace, but
3072  * since it needs to run reliably we do not want it killable by mistake.
3073  * You don't need to change your userspace configuration since
3074  * the userspace `update` will do_exit(0) at the first sys_bdflush().
3075  */
3076 int kupdate(void *startup)
3077 {
3078         struct task_struct * tsk = current;
3079         int interval;
3080
3081         tsk->session = 1;
3082         tsk->pgrp = 1;
3083         strcpy(tsk->comm, "kupdated");
3084
3085         /* sigstop and sigcont will stop and wakeup kupdate */
3086         spin_lock_irq(&tsk->sigmask_lock);
3087         sigfillset(&tsk->blocked);
3088         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
3089         recalc_sigpending(tsk);
3090         spin_unlock_irq(&tsk->sigmask_lock);
3091
3092         complete((struct completion *)startup);
3093
3094         for (;;) {
3095                 DECLARE_WAITQUEUE(wait, tsk);
3096
3097                 add_wait_queue(&kupdate_wait, &wait);
3098
3099                 /* update interval */
3100                 interval = bdf_prm.b_un.interval;
3101                 if (interval) {
3102                         tsk->state = TASK_INTERRUPTIBLE;
3103                         schedule_timeout(interval);
3104                 } else {
3105                         tsk->state = TASK_STOPPED;
3106                         schedule(); /* wait for SIGCONT */
3107                 }
3108                 remove_wait_queue(&kupdate_wait, &wait);
3109                 /* check for sigstop */
3110                 if (signal_pending(tsk)) {
3111                         int sig, stopped = 0;
3112                         struct siginfo info;
3113
3114                         spin_lock_irq(&tsk->sigmask_lock);
3115                         sig = dequeue_signal(&current->blocked, &info);
3116                         if (sig == SIGSTOP)
3117                                 stopped = 1;
3118                         spin_unlock_irq(&tsk->sigmask_lock);
3119                         if (stopped) {
3120                                 tsk->state = TASK_STOPPED;
3121                                 schedule(); /* wait for SIGCONT */
3122                         }
3123                 }
3124 #ifdef DEBUG
3125                 printk(KERN_DEBUG "kupdate() activated...\n");
3126 #endif
3127                 sync_old_buffers();
3128                 if (laptop_mode)
3129                         fsync_dev(NODEV);
3130                 run_task_queue(&tq_disk);
3131         }
3132 }
3133
3134 static int __init bdflush_init(void)
3135 {
3136         static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3137
3138         kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3139         wait_for_completion(&startup);
3140         kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3141         wait_for_completion(&startup);
3142         return 0;
3143 }
3144
3145 module_init(bdflush_init)
3146