1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6
7 /*
8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
9  * been avoided by NEVER letting an interrupt change a buffer (except for the
10  * data, of course), but instead letting the caller do it.
11  */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17  */
18
19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
20  * hash table, use SLAB cache for buffer heads. -DaveM
21  */
22
23 /* Added 32k buffer block sizes - these are required on older ARM systems.
24  * - RMK
25  */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/module.h>
49 #include <linux/completion.h>
50
51 #include <asm/uaccess.h>
52 #include <asm/io.h>
53 #include <asm/bitops.h>
54 #include <asm/mmu_context.h>
55
56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
58                                              number of unused buffer heads */
59
60 /* Anti-deadlock ordering:
61  *      lru_list_lock > hash_table_lock > unused_list_lock
62  */
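
/*
 * A minimal illustrative sketch (kept disabled) of nested acquisition
 * that respects this ordering: outermost lock first, released in the
 * reverse order.  The three locks themselves are declared just below.
 */
#if 0
        spin_lock(&lru_list_lock);
        write_lock(&hash_table_lock);
        spin_lock(&unused_list_lock);
        /* ... touch the hash chains, lru lists and unused list ... */
        spin_unlock(&unused_list_lock);
        write_unlock(&hash_table_lock);
        spin_unlock(&lru_list_lock);
#endif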
63
64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
65
66 /*
67  * Hash table gook..
68  */
69 static unsigned int bh_hash_mask;
70 static unsigned int bh_hash_shift;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
73
74 static struct buffer_head *lru_list[NR_LIST];
75
76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77 #define lru_list_lock  lru_list_lock_cacheline.lock
78
79 static int nr_buffers_type[NR_LIST];
80 static unsigned long size_buffers_type[NR_LIST];
81
82 static struct buffer_head * unused_list;
83 static int nr_unused_buffer_heads;
84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
86
87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
88 static int osync_buffers_list(struct list_head *);
89 static void __refile_buffer(struct buffer_head *);
90
91 /* This is used by some architectures to estimate available memory. */
92 atomic_t buffermem_pages = ATOMIC_INIT(0);
93
94 /* Here is the parameter block for the bdflush process. If you add or
95  * remove any of the parameters, make sure to update kernel/sysctl.c
96  * and the documentation at linux/Documentation/sysctl/vm.txt.
97  */
98
99 #define N_PARAM 9
100
101 /* The dummy values in this structure are left in there for compatibility
102  * with old programs that play with the /proc entries.
103  */
104 union bdflush_param {
105         struct {
106                 int nfract;     /* Percentage of buffer cache dirty to 
107                                    activate bdflush */
108                 int ndirty;     /* Maximum number of dirty blocks to write out per
109                                    wake-cycle */
110                 int dummy2;     /* old "nrefill" */
111                 int dummy3;     /* unused */
112                 int interval;   /* jiffies delay between kupdate flushes */
113                 int age_buffer; /* Time for normal buffer to age before we flush it */
114                 int nfract_sync;/* Percentage of buffer cache dirty to 
115                                    activate bdflush synchronously */
116                 int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
117                 int dummy5;     /* unused */
118         } b_un;
119         unsigned int data[N_PARAM];
120 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
121
122 /* These are the min and max parameter values that we will allow to be assigned */
123 int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
124 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
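
/*
 * A hedged sketch (kept disabled) of how an update to one of these
 * parameters is presumably validated: the new value is checked against
 * the bdflush_min[]/bdflush_max[] tables before being stored into
 * bdf_prm.data[].  The real checks live in the bdflush syscall and in
 * kernel/sysctl.c (see the comment above); this only illustrates the
 * union/array layout, and example_set_bdflush_param() is hypothetical.
 */
#if 0
static int example_set_bdflush_param(unsigned int idx, int val)
{
        if (idx >= N_PARAM)
                return -EINVAL;
        if (val < bdflush_min[idx] || val > bdflush_max[idx])
                return -EINVAL;
        bdf_prm.data[idx] = val;
        return 0;
}
#endif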
125
126 void unlock_buffer(struct buffer_head *bh)
127 {
128         clear_bit(BH_Wait_IO, &bh->b_state);
129         clear_bit(BH_Launder, &bh->b_state);
130         /*
131          * When a locked buffer is visible to the I/O layer BH_Launder
132          * is set. This means before unlocking we must clear BH_Launder,
133          * do an mb() on alpha, and then clear BH_Lock, so no reader can
134          * see BH_Launder set on an unlocked buffer and risk deadlocking.
135          */
136         smp_mb__after_clear_bit();
137         clear_bit(BH_Lock, &bh->b_state);
138         smp_mb__after_clear_bit();
139         if (waitqueue_active(&bh->b_wait))
140                 wake_up(&bh->b_wait);
141 }
142
143 /*
144  * Note that the real wait_on_buffer() is an inline function that checks
145  * that the buffer is locked before calling this, so that unnecessary disk
146  * unplugging does not occur.
147  */
148 void __wait_on_buffer(struct buffer_head * bh)
149 {
150         struct task_struct *tsk = current;
151         DECLARE_WAITQUEUE(wait, tsk);
152
153         get_bh(bh);
154         add_wait_queue(&bh->b_wait, &wait);
155         do {
156                 run_task_queue(&tq_disk);
157                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
158                 if (!buffer_locked(bh))
159                         break;
160                 schedule();
161         } while (buffer_locked(bh));
162         tsk->state = TASK_RUNNING;
163         remove_wait_queue(&bh->b_wait, &wait);
164         put_bh(bh);
165 }
166
167 /*
168  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
169  * unlock the buffer. This is what ll_rw_block uses too.
170  */
171 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173         mark_buffer_uptodate(bh, uptodate);
174         unlock_buffer(bh);
175         put_bh(bh);
176 }
177
178 /*
179  * The buffers have been marked clean and locked.  Just submit the dang
180  * things.. 
181  */
182 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
183 {
184         do {
185                 struct buffer_head * bh = *array++;
186                 bh->b_end_io = end_buffer_io_sync;
187                 submit_bh(WRITE, bh);
188         } while (--count);
189 }
190
191 /*
192  * Write some buffers from the head of the dirty queue.
193  *
194  * This must be called with the LRU lock held, and will
195  * return without it!
196  */
197 #define NRSYNC (32)
198 static int write_some_buffers(kdev_t dev)
199 {
200         struct buffer_head *next;
201         struct buffer_head *array[NRSYNC];
202         unsigned int count;
203         int nr;
204
205         next = lru_list[BUF_DIRTY];
206         nr = nr_buffers_type[BUF_DIRTY];
207         count = 0;
208         while (next && --nr >= 0) {
209                 struct buffer_head * bh = next;
210                 next = bh->b_next_free;
211
212                 if (dev != NODEV && bh->b_dev != dev)
213                         continue;
214                 if (test_and_set_bit(BH_Lock, &bh->b_state))
215                         continue;
216                 if (atomic_set_buffer_clean(bh)) {
217                         __refile_buffer(bh);
218                         get_bh(bh);
219                         array[count++] = bh;
220                         if (count < NRSYNC)
221                                 continue;
222
223                         spin_unlock(&lru_list_lock);
224                         write_locked_buffers(array, count);
225                         return -EAGAIN;
226                 }
227                 unlock_buffer(bh);
228                 __refile_buffer(bh);
229         }
230         spin_unlock(&lru_list_lock);
231
232         if (count)
233                 write_locked_buffers(array, count);
234         return 0;
235 }
236
237 /*
238  * Write out all buffers on the dirty list.
239  */
240 static void write_unlocked_buffers(kdev_t dev)
241 {
242         do
243                 spin_lock(&lru_list_lock);
244         while (write_some_buffers(dev));
245 }
246
247 /*
248  * Wait for a buffer on the proper list.
249  *
250  * This must be called with the LRU lock held, and
251  * will return with it released.
252  */
253 static int wait_for_buffers(kdev_t dev, int index, int refile)
254 {
255         struct buffer_head * next;
256         int nr;
257
258         next = lru_list[index];
259         nr = nr_buffers_type[index];
260         while (next && --nr >= 0) {
261                 struct buffer_head *bh = next;
262                 next = bh->b_next_free;
263
264                 if (!buffer_locked(bh)) {
265                         if (refile)
266                                 __refile_buffer(bh);
267                         continue;
268                 }
269                 if (dev != NODEV && bh->b_dev != dev)
270                         continue;
271
272                 get_bh(bh);
273                 spin_unlock(&lru_list_lock);
274                 wait_on_buffer (bh);
275                 put_bh(bh);
276                 return -EAGAIN;
277         }
278         spin_unlock(&lru_list_lock);
279         return 0;
280 }
281
282 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
283 {
284         do {
285                 spin_lock(&lru_list_lock);
286         } while (wait_for_buffers(dev, index, refile));
287         return 0;
288 }
289
290 /* Call sync_buffers with wait!=0 to ensure that the call does not
291  * return until all buffer writes have completed.  Sync() may return
292  * before the writes have finished; fsync() may not.
293  */
294
295 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
296  * spontaneously dirty themselves without ever brelse being called.
297  * We will ultimately want to put these in a separate list, but for
298  * now we search all of the lists for dirty buffers.
299  */
300 int sync_buffers(kdev_t dev, int wait)
301 {
302         int err = 0;
303
304         /* One pass for no-wait, three for wait:
305          * 0) write out all dirty, unlocked buffers;
306          * 1) wait for all dirty locked buffers;
307          * 2) write out all dirty, unlocked buffers;
308          * 3) wait for completion by waiting for all buffers to unlock.
309          */
310         write_unlocked_buffers(dev);
311         if (wait) {
312                 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
313                 write_unlocked_buffers(dev);
314                 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
315         }
316         return err;
317 }
318
319 int fsync_super(struct super_block *sb)
320 {
321         kdev_t dev = sb->s_dev;
322         sync_buffers(dev, 0);
323
324         lock_kernel();
325         sync_inodes_sb(sb);
326         DQUOT_SYNC(dev);
327         lock_super(sb);
328         if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
329                 sb->s_op->write_super(sb);
330         unlock_super(sb);
331         if (sb->s_op && sb->s_op->sync_fs)
332                 sb->s_op->sync_fs(sb);
333         unlock_kernel();
334
335         return sync_buffers(dev, 1);
336 }
337
338 int fsync_no_super(kdev_t dev)
339 {
340         sync_buffers(dev, 0);
341         return sync_buffers(dev, 1);
342 }
343
344 int fsync_dev(kdev_t dev)
345 {
346         sync_buffers(dev, 0);
347
348         lock_kernel();
349         sync_inodes(dev);
350         DQUOT_SYNC(dev);
351         sync_supers(dev, 1);
352         unlock_kernel();
353
354         return sync_buffers(dev, 1);
355 }
356
357 /*
358  * There's no real reason to pretend we should
359  * ever do anything differently
360  */
361 void sync_dev(kdev_t dev)
362 {
363         fsync_dev(dev);
364 }
365
366 asmlinkage long sys_sync(void)
367 {
368         fsync_dev(0);
369         return 0;
370 }
371
372 /*
373  *      filp may be NULL if called via the msync of a vma.
374  */
375  
376 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
377 {
378         struct inode * inode = dentry->d_inode;
379         struct super_block * sb;
380         kdev_t dev;
381         int ret;
382
383         lock_kernel();
384         /* sync the inode to buffers */
385         write_inode_now(inode, 0);
386
387         /* sync the superblock to buffers */
388         sb = inode->i_sb;
389         lock_super(sb);
390         if (sb->s_op && sb->s_op->write_super)
391                 sb->s_op->write_super(sb);
392         unlock_super(sb);
393
394         /* .. finally sync the buffers to disk */
395         dev = inode->i_dev;
396         ret = sync_buffers(dev, 1);
397         unlock_kernel();
398         return ret;
399 }
400
401 asmlinkage long sys_fsync(unsigned int fd)
402 {
403         struct file * file;
404         struct dentry * dentry;
405         struct inode * inode;
406         int ret, err;
407
408         ret = -EBADF;
409         file = fget(fd);
410         if (!file)
411                 goto out;
412
413         dentry = file->f_dentry;
414         inode = dentry->d_inode;
415
416         ret = -EINVAL;
417         if (!file->f_op || !file->f_op->fsync) {
418                 /* Why?  We can still call filemap_fdatasync */
419                 goto out_putf;
420         }
421
422         /* We need to protect against concurrent writers.. */
423         down(&inode->i_sem);
424         ret = filemap_fdatasync(inode->i_mapping);
425         err = file->f_op->fsync(file, dentry, 0);
426         if (err && !ret)
427                 ret = err;
428         err = filemap_fdatawait(inode->i_mapping);
429         if (err && !ret)
430                 ret = err;
431         up(&inode->i_sem);
432
433 out_putf:
434         fput(file);
435 out:
436         return ret;
437 }
438
439 asmlinkage long sys_fdatasync(unsigned int fd)
440 {
441         struct file * file;
442         struct dentry * dentry;
443         struct inode * inode;
444         int ret, err;
445
446         ret = -EBADF;
447         file = fget(fd);
448         if (!file)
449                 goto out;
450
451         dentry = file->f_dentry;
452         inode = dentry->d_inode;
453
454         ret = -EINVAL;
455         if (!file->f_op || !file->f_op->fsync)
456                 goto out_putf;
457
458         down(&inode->i_sem);
459         ret = filemap_fdatasync(inode->i_mapping);
460         err = file->f_op->fsync(file, dentry, 1);
461         if (err && !ret)
462                 ret = err;
463         err = filemap_fdatawait(inode->i_mapping);
464         if (err && !ret)
465                 ret = err;
466         up(&inode->i_sem);
467
468 out_putf:
469         fput(file);
470 out:
471         return ret;
472 }
473
474 /* After several hours of tedious analysis, the following hash
475  * function won.  Do not mess with it... -DaveM
476  */
477 #define _hashfn(dev,block)      \
478         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
479          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
480           ((block) << (bh_hash_shift - 12))))
481 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
482
483 static inline void __insert_into_hash_list(struct buffer_head *bh)
484 {
485         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
486         struct buffer_head *next = *head;
487
488         *head = bh;
489         bh->b_pprev = head;
490         bh->b_next = next;
491         if (next != NULL)
492                 next->b_pprev = &bh->b_next;
493 }
494
495 static __inline__ void __hash_unlink(struct buffer_head *bh)
496 {
497         struct buffer_head **pprev = bh->b_pprev;
498         if (pprev) {
499                 struct buffer_head *next = bh->b_next;
500                 if (next)
501                         next->b_pprev = pprev;
502                 *pprev = next;
503                 bh->b_pprev = NULL;
504         }
505 }
506
507 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
508 {
509         struct buffer_head **bhp = &lru_list[blist];
510
511         if (bh->b_prev_free || bh->b_next_free) BUG();
512
513         if(!*bhp) {
514                 *bhp = bh;
515                 bh->b_prev_free = bh;
516         }
517         bh->b_next_free = *bhp;
518         bh->b_prev_free = (*bhp)->b_prev_free;
519         (*bhp)->b_prev_free->b_next_free = bh;
520         (*bhp)->b_prev_free = bh;
521         nr_buffers_type[blist]++;
522         size_buffers_type[blist] += bh->b_size;
523 }
524
525 static void __remove_from_lru_list(struct buffer_head * bh)
526 {
527         struct buffer_head *next = bh->b_next_free;
528         if (next) {
529                 struct buffer_head *prev = bh->b_prev_free;
530                 int blist = bh->b_list;
531
532                 prev->b_next_free = next;
533                 next->b_prev_free = prev;
534                 if (lru_list[blist] == bh) {
535                         if (next == bh)
536                                 next = NULL;
537                         lru_list[blist] = next;
538                 }
539                 bh->b_next_free = NULL;
540                 bh->b_prev_free = NULL;
541                 nr_buffers_type[blist]--;
542                 size_buffers_type[blist] -= bh->b_size;
543         }
544 }
545
546 /* must be called with both the hash_table_lock and the lru_list_lock
547    held */
548 static void __remove_from_queues(struct buffer_head *bh)
549 {
550         __hash_unlink(bh);
551         __remove_from_lru_list(bh);
552 }
553
554 static void remove_from_queues(struct buffer_head *bh)
555 {
556         spin_lock(&lru_list_lock);
557         write_lock(&hash_table_lock);
558         __remove_from_queues(bh);
559         write_unlock(&hash_table_lock); 
560         spin_unlock(&lru_list_lock);
561 }
562
563 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
564 {
565         struct buffer_head *bh, **p = &hash(dev, block);
566
567         read_lock(&hash_table_lock);
568
569         for (;;) {
570                 bh = *p;
571                 if (!bh)
572                         break;
573                 p = &bh->b_next;
574                 if (bh->b_blocknr != block)
575                         continue;
576                 if (bh->b_size != size)
577                         continue;
578                 if (bh->b_dev != dev)
579                         continue;
580                 get_bh(bh);
581                 break;
582         }
583
584         read_unlock(&hash_table_lock);
585         return bh;
586 }
587
588 void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
589 {
590         spin_lock(&lru_list_lock);
591         if (buffer_attached(bh))
592                 list_del(&bh->b_inode_buffers);
593         set_buffer_attached(bh);
594         list_add(&bh->b_inode_buffers, list);
595         spin_unlock(&lru_list_lock);
596 }
597
598 /*
599  * The caller must have the lru_list lock before calling the 
600  * remove_inode_queue functions.
601  */
602 static void __remove_inode_queue(struct buffer_head *bh)
603 {
604         list_del(&bh->b_inode_buffers);
605         clear_buffer_attached(bh);
606 }
607
608 static inline void remove_inode_queue(struct buffer_head *bh)
609 {
610         if (buffer_attached(bh))
611                 __remove_inode_queue(bh);
612 }
613
614 int inode_has_buffers(struct inode *inode)
615 {
616         int ret;
617         
618         spin_lock(&lru_list_lock);
619         ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
620         spin_unlock(&lru_list_lock);
621         
622         return ret;
623 }
624
625 /* If invalidate_buffers() will trash dirty buffers, it means some kind
626    of fs corruption is going on. Trashing dirty data always imply losing
627    information that was supposed to be just stored on the physical layer
628    by the user.
629
630    Thus invalidate_buffers in general usage is not allowed to trash
631    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
632    be preserved.  These buffers are simply skipped.
633   
634    We also skip buffers which are still in use.  For example this can
635    happen if a userspace program is reading the block device.
636
637    NOTE: If the user removed a removable-media disk while there was still
638    dirty data not yet synced to disk (due to a bug in the device driver or
639    to a user error), then by not destroying the dirty buffers we could
640    generate corruption on the next media inserted as well; thus a parameter
641    is necessary to handle this case in the safest way possible (trying not
642    to corrupt the newly inserted disk with data belonging to the old, now
643    corrupted disk). Also, for the ramdisk the natural way to release the
644    ramdisk memory is to destroy its dirty buffers.
645
646    These are two special cases. Normal usage implies that the device driver
647    issues a sync on the device (without waiting for I/O completion) and
648    then an invalidate_buffers call that doesn't trash dirty buffers.
649
650    For handling cache coherency with the blkdev pagecache, the 'update' case
651    has been introduced. It is needed to re-read from disk any pinned
652    buffer. NOTE: re-reading from disk is destructive so we can do it only
653    when we assume nobody is changing the buffercache under our I/O and when
654    we think the disk contains more recent information than the buffercache.
655    The update == 1 pass marks the buffers we need to update, the update == 2
656    pass does the actual I/O. */
657 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
658 {
659         int i, nlist, slept;
660         struct buffer_head * bh, * bh_next;
661         kdev_t dev = to_kdev_t(bdev->bd_dev);   /* will become bdev */
662
663  retry:
664         slept = 0;
665         spin_lock(&lru_list_lock);
666         for(nlist = 0; nlist < NR_LIST; nlist++) {
667                 bh = lru_list[nlist];
668                 if (!bh)
669                         continue;
670                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
671                         bh_next = bh->b_next_free;
672
673                         /* Another device? */
674                         if (bh->b_dev != dev)
675                                 continue;
676                         /* Not hashed? */
677                         if (!bh->b_pprev)
678                                 continue;
679                         if (buffer_locked(bh)) {
680                                 get_bh(bh);
681                                 spin_unlock(&lru_list_lock);
682                                 wait_on_buffer(bh);
683                                 slept = 1;
684                                 spin_lock(&lru_list_lock);
685                                 put_bh(bh);
686                         }
687
688                         write_lock(&hash_table_lock);
689                         /* All buffers in the lru lists are mapped */
690                         if (!buffer_mapped(bh))
691                                 BUG();
692                         if (buffer_dirty(bh) && destroy_dirty_buffers)
693                                 printk("invalidate: dirty buffer\n");
694                         if (!atomic_read(&bh->b_count)) {
695                                 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
696                                         remove_inode_queue(bh);
697                                 }
698                         } else if (!bdev->bd_openers)
699                                 printk("invalidate: busy buffer\n");
700
701                         write_unlock(&hash_table_lock);
702                         if (slept)
703                                 goto out;
704                 }
705         }
706 out:
707         spin_unlock(&lru_list_lock);
708         if (slept)
709                 goto retry;
710
711         /* Get rid of the page cache */
712         invalidate_inode_pages(bdev->bd_inode);
713 }
714
715 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
716 {
717         struct block_device *bdev = bdget(dev);
718         if (bdev) {
719                 invalidate_bdev(bdev, destroy_dirty_buffers);
720                 bdput(bdev);
721         }
722 }
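
/*
 * A hedged sketch (kept disabled) of the "normal usage" described in the
 * big comment above invalidate_bdev(): sync the device without waiting
 * for I/O completion, then invalidate without trashing dirty buffers.
 * example_flush_and_invalidate() and its 'dev' argument are hypothetical.
 */
#if 0
static void example_flush_and_invalidate(kdev_t dev)
{
        sync_buffers(dev, 0);           /* start writeback, don't wait */
        __invalidate_buffers(dev, 0);   /* dirty and busy buffers are skipped */
}
#endif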
723
724 static void free_more_memory(void)
725 {
726         balance_dirty();
727         wakeup_bdflush();
728         try_to_free_pages(GFP_NOIO);
729         run_task_queue(&tq_disk);
730         yield();
731 }
732
733 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
734 {
735         bh->b_list = BUF_CLEAN;
736         bh->b_end_io = handler;
737         bh->b_private = private;
738 }
739
740 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
741 {
742         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
743         unsigned long flags;
744         struct buffer_head *tmp;
745         struct page *page;
746         int fullup = 1;
747
748         mark_buffer_uptodate(bh, uptodate);
749
750         /* This is a temporary buffer used for page I/O. */
751         page = bh->b_page;
752
753         if (!uptodate)
754                 SetPageError(page);
755
756         /*
757          * Be _very_ careful from here on. Bad things can happen if
758          * two buffer heads end IO at almost the same time and both
759          * decide that the page is now completely done.
760          *
761          * Async buffer_heads are here only as labels for IO, and get
762          * thrown away once the IO for this page is complete.  IO is
763          * deemed complete once all buffers have been visited
764          * (b_count==0) and are now unlocked. We must make sure that
765          * only the _last_ buffer that decrements its count is the one
766          * that unlocks the page.
767          */
768         spin_lock_irqsave(&page_uptodate_lock, flags);
769         mark_buffer_async(bh, 0);
770         unlock_buffer(bh);
771         tmp = bh->b_this_page;
772         while (tmp != bh) {
773                 if (buffer_locked(tmp)) {
774                         if (buffer_async(tmp))
775                                 goto still_busy;
776                 } else if (!buffer_uptodate(tmp))
777                         fullup = 0;
778                 tmp = tmp->b_this_page;
779         }
780
781         /* OK, the async IO on this page is complete. */
782         spin_unlock_irqrestore(&page_uptodate_lock, flags);
783
784         /*
785          * If none of the buffers had errors and all were uptodate
786          * then we can set the page uptodate:
787          */
788         if (fullup && !PageError(page))
789                 SetPageUptodate(page);
790
791         UnlockPage(page);
792
793         return;
794
795 still_busy:
796         spin_unlock_irqrestore(&page_uptodate_lock, flags);
797         return;
798 }
799
800 inline void set_buffer_async_io(struct buffer_head *bh)
801 {
802         bh->b_end_io = end_buffer_io_async;
803         mark_buffer_async(bh, 1);
804 }
805
806 /*
807  * Synchronise all the inode's dirty buffers to the disk.
808  *
809  * We have conflicting pressures: we want to make sure that all
810  * initially dirty buffers get waited on, but that any subsequently
811  * dirtied buffers don't.  After all, we don't want fsync to last
812  * forever if somebody is actively writing to the file.
813  *
814  * Do this in two main stages: first we copy dirty buffers to a
815  * temporary inode list, queueing the writes as we go.  Then we clean
816  * up, waiting for those writes to complete.
817  * 
818  * During this second stage, any subsequent updates to the file may end
819  * up refiling the buffer on the original inode's dirty list again, so
820  * there is a chance we will end up with a buffer queued for write but
821  * not yet completed on that list.  So, as a final cleanup we go through
822  * the osync code to catch these locked, dirty buffers without requeuing
823  * any newly dirty buffers for write.
824  */
825 int fsync_buffers_list(struct list_head *list)
826 {
827         struct buffer_head *bh;
828         struct list_head tmp;
829         int err = 0, err2;
830         
831         INIT_LIST_HEAD(&tmp);
832         
833         spin_lock(&lru_list_lock);
834
835         while (!list_empty(list)) {
836                 bh = BH_ENTRY(list->next);
837                 list_del(&bh->b_inode_buffers);
838                 if (!buffer_dirty(bh) && !buffer_locked(bh))
839                         clear_buffer_attached(bh);
840                 else {
841                         set_buffer_attached(bh);
842                         list_add(&bh->b_inode_buffers, &tmp);
843                         if (buffer_dirty(bh)) {
844                                 get_bh(bh);
845                                 spin_unlock(&lru_list_lock);
846                         /*
847                          * Wait for I/O completion before submitting
848                          * the buffer, to be sure the write will
849                          * be effective on the latest data in
850                          * the buffer. (otherwise - if there's old
851                          * I/O in flight - write_buffer would become
852                          * a noop)
853                          */
854                                 wait_on_buffer(bh);
855                                 ll_rw_block(WRITE, 1, &bh);
856                                 brelse(bh);
857                                 spin_lock(&lru_list_lock);
858                         }
859                 }
860         }
861
862         while (!list_empty(&tmp)) {
863                 bh = BH_ENTRY(tmp.prev);
864                 remove_inode_queue(bh);
865                 get_bh(bh);
866                 spin_unlock(&lru_list_lock);
867                 wait_on_buffer(bh);
868                 if (!buffer_uptodate(bh))
869                         err = -EIO;
870                 brelse(bh);
871                 spin_lock(&lru_list_lock);
872         }
873         
874         spin_unlock(&lru_list_lock);
875         err2 = osync_buffers_list(list);
876
877         if (err)
878                 return err;
879         else
880                 return err2;
881 }
882
883 /*
884  * osync is designed to support O_SYNC io.  It waits synchronously for
885  * all already-submitted IO to complete, but does not queue any new
886  * writes to the disk.
887  *
888  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
889  * you dirty the buffers, and then use osync_buffers_list to wait for
890  * completion.  Any other dirty buffers which are not yet queued for
891  * write will not be flushed to disk by the osync.
892  */
893 static int osync_buffers_list(struct list_head *list)
894 {
895         struct buffer_head *bh;
896         struct list_head *p;
897         int err = 0;
898
899         spin_lock(&lru_list_lock);
900         
901  repeat:
902         list_for_each_prev(p, list) {
903                 bh = BH_ENTRY(p);
904                 if (buffer_locked(bh)) {
905                         get_bh(bh);
906                         spin_unlock(&lru_list_lock);
907                         wait_on_buffer(bh);
908                         if (!buffer_uptodate(bh))
909                                 err = -EIO;
910                         brelse(bh);
911                         spin_lock(&lru_list_lock);
912                         goto repeat;
913                 }
914         }
915
916         spin_unlock(&lru_list_lock);
917         return err;
918 }
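
/*
 * A hedged sketch (kept disabled) of the O_SYNC pattern described above:
 * queue each buffer write with ll_rw_block() as the buffer is dirtied,
 * then call osync_buffers_list() to wait only for the already-queued I/O.
 * example_osync_write() is hypothetical and 'list' stands for an inode's
 * dirty-buffer list; error handling is omitted.
 */
#if 0
static int example_osync_write(struct buffer_head *bh, struct list_head *list)
{
        ll_rw_block(WRITE, 1, &bh);             /* queue the write */
        return osync_buffers_list(list);        /* wait for queued I/O only */
}
#endif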
919
920 /*
921  * Invalidate any and all dirty buffers on a given inode.  We are
922  * probably unmounting the fs, but that doesn't mean we have already
923  * done a sync().  Just drop the buffers from the inode list.
924  */
925 void invalidate_inode_buffers(struct inode *inode)
926 {
927         struct list_head * entry;
928         
929         spin_lock(&lru_list_lock);
930         while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
931                 remove_inode_queue(BH_ENTRY(entry));
932         while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
933                 remove_inode_queue(BH_ENTRY(entry));
934         spin_unlock(&lru_list_lock);
935 }
936
937
938 /*
939  * Ok, this is getblk, and it isn't very clear, again to hinder
940  * race-conditions. Most of the code is seldom used, (ie repeating),
941  * so it should be much more efficient than it looks.
942  *
943  * The algorithm is changed: hopefully better, and an elusive bug removed.
944  *
945  * 14.02.92: changed it to sync dirty buffers a bit: better performance
946  * when the filesystem starts to get full of dirty blocks (I hope).
947  */
948 struct buffer_head * getblk(kdev_t dev, int block, int size)
949 {
950         for (;;) {
951                 struct buffer_head * bh;
952
953                 bh = get_hash_table(dev, block, size);
954                 if (bh) {
955                         touch_buffer(bh);
956                         return bh;
957                 }
958
959                 if (!grow_buffers(dev, block, size))
960                         free_more_memory();
961         }
962 }
963
964 /* -1 -> no need to flush
965     0 -> async flush
966     1 -> sync flush (wait for I/O completion) */
967 static int balance_dirty_state(void)
968 {
969         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
970
971         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
972         tot = nr_free_buffer_pages();
973
974         dirty *= 100;
975         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
976         hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
977
978         /* First, check for the "real" dirty limit. */
979         if (dirty > soft_dirty_limit) {
980                 if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
981                         return 1;
982                 return 0;
983         }
984
985         return -1;
986 }
987
988 static int bdflush_stop(void)
989 {
990         unsigned long dirty, tot, dirty_limit;
991
992         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
993         tot = nr_free_buffer_pages();
994
995         dirty *= 100;
996         dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
997
998         if (dirty > dirty_limit)
999                 return 0;
1000         return 1;
1001 }
1002
1003 /*
1004  * if a new dirty buffer is created we need to balance bdflush.
1005  *
1006  * in the future we might want to make bdflush aware of different
1007  * pressures on different devices - thus the (currently unused)
1008  * 'dev' parameter.
1009  */
1010 void balance_dirty(void)
1011 {
1012         int state = balance_dirty_state();
1013
1014         if (state < 0)
1015                 return;
1016
1017         wakeup_bdflush();
1018
1019         /*
1020          * And if we're _really_ out of balance, wait for
1021          * some of the dirty/locked buffers ourselves.
1022          * This will throttle heavy writers.
1023          */
1024         if (state > 0) {
1025                 spin_lock(&lru_list_lock);
1026                 write_some_buffers(NODEV);
1027         }
1028 }
1029
1030 inline void __mark_dirty(struct buffer_head *bh)
1031 {
1032         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1033         refile_buffer(bh);
1034 }
1035
1036 /* atomic version, the user must call balance_dirty() by hand
1037    as soon as it becomes possible to block */
1038 void __mark_buffer_dirty(struct buffer_head *bh)
1039 {
1040         if (!atomic_set_buffer_dirty(bh))
1041                 __mark_dirty(bh);
1042 }
1043
1044 void mark_buffer_dirty(struct buffer_head *bh)
1045 {
1046         if (!atomic_set_buffer_dirty(bh)) {
1047                 __mark_dirty(bh);
1048                 balance_dirty();
1049         }
1050 }
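
/*
 * A hedged sketch (kept disabled) of the usual update pattern: modify the
 * cached data, then mark the buffer dirty so bdflush/kupdate writes it
 * back later (roughly within age_buffer jiffies).  mark_buffer_dirty()
 * may block in balance_dirty(); __mark_buffer_dirty() is the variant for
 * atomic context.  example_update_block() is hypothetical.
 */
#if 0
static void example_update_block(struct buffer_head *bh, int offset, char byte)
{
        bh->b_data[offset] = byte;      /* modify the in-memory block */
        mark_buffer_dirty(bh);          /* schedule deferred writeback */
}
#endif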
1051
1052 void set_buffer_flushtime(struct buffer_head *bh)
1053 {
1054         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1055 }
1056 EXPORT_SYMBOL(set_buffer_flushtime);
1057
1058 /*
1059  * A buffer may need to be moved from one buffer list to another
1060  * (e.g. in case it is not shared any more). Handle this.
1061  */
1062 static void __refile_buffer(struct buffer_head *bh)
1063 {
1064         int dispose = BUF_CLEAN;
1065         if (buffer_locked(bh))
1066                 dispose = BUF_LOCKED;
1067         if (buffer_dirty(bh))
1068                 dispose = BUF_DIRTY;
1069         if (dispose != bh->b_list) {
1070                 __remove_from_lru_list(bh);
1071                 bh->b_list = dispose;
1072                 if (dispose == BUF_CLEAN)
1073                         remove_inode_queue(bh);
1074                 __insert_into_lru_list(bh, dispose);
1075         }
1076 }
1077
1078 void refile_buffer(struct buffer_head *bh)
1079 {
1080         spin_lock(&lru_list_lock);
1081         __refile_buffer(bh);
1082         spin_unlock(&lru_list_lock);
1083 }
1084
1085 /*
1086  * Release a buffer head
1087  */
1088 void __brelse(struct buffer_head * buf)
1089 {
1090         if (atomic_read(&buf->b_count)) {
1091                 put_bh(buf);
1092                 return;
1093         }
1094         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1095 }
1096
1097 /*
1098  * bforget() is like brelse(), except it discards any
1099  * potentially dirty data.
1100  */
1101 void __bforget(struct buffer_head * buf)
1102 {
1103         mark_buffer_clean(buf);
1104         __brelse(buf);
1105 }
1106
1107 /**
1108  *      bread() - reads a specified block and returns the bh
1109  *      @block: number of block
1110  *      @size: size (in bytes) to read
1111  * 
1112  *      Reads a specified block, and returns buffer head that
1113  *      contains it. It returns NULL if the block was unreadable.
1114  */
1115 struct buffer_head * bread(kdev_t dev, int block, int size)
1116 {
1117         struct buffer_head * bh;
1118
1119         bh = getblk(dev, block, size);
1120         if (buffer_uptodate(bh))
1121                 return bh;
1122         ll_rw_block(READ, 1, &bh);
1123         wait_on_buffer(bh);
1124         if (buffer_uptodate(bh))
1125                 return bh;
1126         brelse(bh);
1127         return NULL;
1128 }
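
/*
 * A sketch (kept disabled) of typical use of bread(): read a block,
 * inspect its contents, then drop the reference with brelse().
 * example_read_block() and its arguments are hypothetical.
 */
#if 0
static int example_read_block(kdev_t dev, int block, int blocksize)
{
        struct buffer_head *bh = bread(dev, block, blocksize);

        if (!bh)
                return -EIO;            /* the block was unreadable */
        /* ... use bh->b_data[0 .. blocksize-1] ... */
        brelse(bh);
        return 0;
}
#endif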
1129
1130 /*
1131  * Note: the caller should wake up the buffer_wait list if needed.
1132  */
1133 static void __put_unused_buffer_head(struct buffer_head * bh)
1134 {
1135         if (unlikely(buffer_attached(bh)))
1136                 BUG();
1137         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1138                 kmem_cache_free(bh_cachep, bh);
1139         } else {
1140                 bh->b_dev = B_FREE;
1141                 bh->b_blocknr = -1;
1142                 bh->b_this_page = NULL;
1143
1144                 nr_unused_buffer_heads++;
1145                 bh->b_next_free = unused_list;
1146                 unused_list = bh;
1147         }
1148 }
1149
1150 void put_unused_buffer_head(struct buffer_head *bh)
1151 {
1152         spin_lock(&unused_list_lock);
1153         __put_unused_buffer_head(bh);
1154         spin_unlock(&unused_list_lock);
1155 }
1156 EXPORT_SYMBOL(put_unused_buffer_head);
1157
1158 /*
1159  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1160  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1161  * buffer heads is now handled in create_buffers().
1162  */ 
1163 struct buffer_head * get_unused_buffer_head(int async)
1164 {
1165         struct buffer_head * bh;
1166
1167         spin_lock(&unused_list_lock);
1168         if (nr_unused_buffer_heads > NR_RESERVED) {
1169                 bh = unused_list;
1170                 unused_list = bh->b_next_free;
1171                 nr_unused_buffer_heads--;
1172                 spin_unlock(&unused_list_lock);
1173                 return bh;
1174         }
1175         spin_unlock(&unused_list_lock);
1176
1177         /* This is critical.  We can't call out to the FS
1178          * to get more buffer heads, because the FS may need
1179          * more buffer-heads itself.  Thus SLAB_NOFS.
1180          */
1181         if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1182                 bh->b_blocknr = -1;
1183                 bh->b_this_page = NULL;
1184                 return bh;
1185         }
1186
1187         /*
1188          * If we need an async buffer, use the reserved buffer heads.
1189          */
1190         if (async) {
1191                 spin_lock(&unused_list_lock);
1192                 if (unused_list) {
1193                         bh = unused_list;
1194                         unused_list = bh->b_next_free;
1195                         nr_unused_buffer_heads--;
1196                         spin_unlock(&unused_list_lock);
1197                         return bh;
1198                 }
1199                 spin_unlock(&unused_list_lock);
1200         }
1201
1202         return NULL;
1203 }
1204 EXPORT_SYMBOL(get_unused_buffer_head);
1205
1206 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1207 {
1208         if (offset >= PAGE_SIZE)
1209                 BUG();
1210
1211         /*
1212          * page_address will return NULL anyway for highmem pages
1213          */
1214         bh->b_data = page_address(page) + offset;
1215         bh->b_page = page;
1216 }
1217 EXPORT_SYMBOL(set_bh_page);
1218
1219 /*
1220  * Create the appropriate buffers when given a page for data area and
1221  * the size of each buffer.. Use the bh->b_this_page linked list to
1222  * follow the buffers created.  Return NULL if unable to create more
1223  * buffers.
1224  * The async flag is used to differentiate async IO (paging, swapping)
1225  * from ordinary buffer allocations, and only async requests are allowed
1226  * to sleep waiting for buffer heads. 
1227  */
1228 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1229 {
1230         struct buffer_head *bh, *head;
1231         long offset;
1232
1233 try_again:
1234         head = NULL;
1235         offset = PAGE_SIZE;
1236         while ((offset -= size) >= 0) {
1237                 bh = get_unused_buffer_head(async);
1238                 if (!bh)
1239                         goto no_grow;
1240
1241                 bh->b_dev = NODEV;
1242                 bh->b_this_page = head;
1243                 head = bh;
1244
1245                 bh->b_state = 0;
1246                 bh->b_next_free = NULL;
1247                 bh->b_pprev = NULL;
1248                 atomic_set(&bh->b_count, 0);
1249                 bh->b_size = size;
1250
1251                 set_bh_page(bh, page, offset);
1252
1253                 bh->b_list = BUF_CLEAN;
1254                 bh->b_end_io = NULL;
1255         }
1256         return head;
1257 /*
1258  * In case anything failed, we just free everything we got.
1259  */
1260 no_grow:
1261         if (head) {
1262                 spin_lock(&unused_list_lock);
1263                 do {
1264                         bh = head;
1265                         head = head->b_this_page;
1266                         __put_unused_buffer_head(bh);
1267                 } while (head);
1268                 spin_unlock(&unused_list_lock);
1269
1270                 /* Wake up any waiters ... */
1271                 wake_up(&buffer_wait);
1272         }
1273
1274         /*
1275          * Return failure for non-async IO requests.  Async IO requests
1276          * are not allowed to fail, so we have to wait until buffer heads
1277          * become available.  But we don't want tasks sleeping with 
1278          * partially complete buffers, so all were released above.
1279          */
1280         if (!async)
1281                 return NULL;
1282
1283         /* We're _really_ low on memory. Now we just
1284          * wait for old buffer heads to become free due to
1285          * finishing IO.  Since this is an async request and
1286          * the reserve list is empty, we're sure there are 
1287          * async buffer heads in use.
1288          */
1289         run_task_queue(&tq_disk);
1290
1291         free_more_memory();
1292         goto try_again;
1293 }
1294
1295 /*
1296  * Called when truncating a buffer on a page completely.
1297  */
1298 static void discard_buffer(struct buffer_head * bh)
1299 {
1300         if (buffer_mapped(bh)) {
1301                 mark_buffer_clean(bh);
1302                 lock_buffer(bh);
1303                 clear_bit(BH_Uptodate, &bh->b_state);
1304                 clear_bit(BH_Mapped, &bh->b_state);
1305                 clear_bit(BH_Req, &bh->b_state);
1306                 clear_bit(BH_New, &bh->b_state);
1307                 remove_from_queues(bh);
1308                 unlock_buffer(bh);
1309         }
1310 }
1311
1312 /**
1313  * try_to_release_page - release old fs-specific metadata on a page
1314  *
1315  */
1316
1317 int try_to_release_page(struct page * page, int gfp_mask)
1318 {
1319         if (!PageLocked(page))
1320                 BUG();
1321         
1322         if (!page->mapping)
1323                 goto try_to_free;
1324         if (!page->mapping->a_ops->releasepage)
1325                 goto try_to_free;
1326         if (page->mapping->a_ops->releasepage(page, gfp_mask))
1327                 goto try_to_free;
1328         /*
1329          * We couldn't release buffer metadata; don't even bother trying
1330          * to release buffers.
1331          */
1332         return 0;
1333 try_to_free:    
1334         return try_to_free_buffers(page, gfp_mask);
1335 }
1336
1337 /*
1338  * We don't have to release all buffers here, but
1339  * we have to be sure that no dirty buffer is left
1340  * and no IO is going on (no buffer is locked), because
1341  * we have truncated the file and are going to free the
1342  * blocks on-disk..
1343  */
1344 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1345 {
1346         struct buffer_head *head, *bh, *next;
1347         unsigned int curr_off = 0;
1348
1349         if (!PageLocked(page))
1350                 BUG();
1351         if (!page->buffers)
1352                 return 1;
1353
1354         head = page->buffers;
1355         bh = head;
1356         do {
1357                 unsigned int next_off = curr_off + bh->b_size;
1358                 next = bh->b_this_page;
1359
1360                 /*
1361                  * is this block fully flushed?
1362                  */
1363                 if (offset <= curr_off)
1364                         discard_buffer(bh);
1365                 curr_off = next_off;
1366                 bh = next;
1367         } while (bh != head);
1368
1369         /*
1370          * subtle. We release buffer-heads only if this is
1371          * the 'final' flushpage. We have invalidated the get_block
1372          * cached value unconditionally, so real IO is not
1373          * possible anymore.
1374          *
1375          * If the free doesn't work out, the buffers can be
1376          * left around - they just turn into anonymous buffers
1377          * instead.
1378          */
1379         if (!offset) {
1380                 if (!try_to_release_page(page, 0))
1381                         return 0;
1382         }
1383
1384         return 1;
1385 }
1386
1387 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1388 {
1389         struct buffer_head *bh, *head, *tail;
1390
1391         /* FIXME: create_buffers should fail if there's not enough memory */
1392         head = create_buffers(page, blocksize, 1);
1393         if (page->buffers)
1394                 BUG();
1395
1396         bh = head;
1397         do {
1398                 bh->b_dev = dev;
1399                 bh->b_blocknr = 0;
1400                 bh->b_end_io = NULL;
1401                 tail = bh;
1402                 bh = bh->b_this_page;
1403         } while (bh);
1404         tail->b_this_page = head;
1405         page->buffers = head;
1406         page_cache_get(page);
1407 }
1408 EXPORT_SYMBOL(create_empty_buffers);
1409
1410 /*
1411  * We are taking a block for data and we don't want any output from any
1412  * buffer-cache aliases starting from return from that function and
1413  * until the moment when something will explicitly mark the buffer
1414  * dirty (hopefully that will not happen until we will free that block ;-)
1415  * We don't even need to mark it not-uptodate - nobody can expect
1416  * anything from a newly allocated buffer anyway. We used to use
1417  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1418  * don't want to mark the alias unmapped, for example - it would confuse
1419  * anyone who might pick it with bread() afterwards...
1420  */
1421
1422 static void unmap_underlying_metadata(struct buffer_head * bh)
1423 {
1424         struct buffer_head *old_bh;
1425
1426         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1427         if (old_bh) {
1428                 mark_buffer_clean(old_bh);
1429                 wait_on_buffer(old_bh);
1430                 clear_bit(BH_Req, &old_bh->b_state);
1431                 __brelse(old_bh);
1432         }
1433 }
1434
1435 /*
1436  * NOTE! All mapped/uptodate combinations are valid:
1437  *
1438  *      Mapped  Uptodate        Meaning
1439  *
1440  *      No      No              "unknown" - must do get_block()
1441  *      No      Yes             "hole" - zero-filled
1442  *      Yes     No              "allocated" - allocated on disk, not read in
1443  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1444  *
1445  * "Dirty" is valid only with the last case (mapped+uptodate).
1446  */
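
/*
 * A hedged sketch (kept disabled) restating the table above in code:
 * classify a buffer_head by its Mapped/Uptodate bits.  Purely
 * illustrative; example_classify_bh() is hypothetical.
 */
#if 0
static const char *example_classify_bh(struct buffer_head *bh)
{
        if (!buffer_mapped(bh))
                return buffer_uptodate(bh) ? "hole" : "unknown";
        return buffer_uptodate(bh) ? "valid" : "allocated";
}
#endif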
1447
1448 /*
1449  * block_write_full_page() is SMP threaded - the kernel lock is not held.
1450  */
1451 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1452 {
1453         int err, i;
1454         unsigned long block;
1455         struct buffer_head *bh, *head;
1456         int need_unlock;
1457
1458         if (!PageLocked(page))
1459                 BUG();
1460
1461         if (!page->buffers)
1462                 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1463         head = page->buffers;
1464
1465         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1466
1467         bh = head;
1468         i = 0;
1469
1470         /* Stage 1: make sure we have all the buffers mapped! */
1471         do {
1472                 /*
1473                  * If the buffer isn't up-to-date, we can't be sure
1474                  * that the buffer has been initialized with the proper
1475                  * block number information etc..
1476                  *
1477                  * Leave it to the low-level FS to make all those
1478                  * decisions (block #0 may actually be a valid block)
1479                  */
1480                 if (!buffer_mapped(bh)) {
1481                         err = get_block(inode, block, bh, 1);
1482                         if (err)
1483                                 goto out;
1484                         if (buffer_new(bh))
1485                                 unmap_underlying_metadata(bh);
1486                 }
1487                 bh = bh->b_this_page;
1488                 block++;
1489         } while (bh != head);
1490
1491         /* Stage 2: lock the buffers, mark them clean */
1492         do {
1493                 lock_buffer(bh);
1494                 set_buffer_async_io(bh);
1495                 set_bit(BH_Uptodate, &bh->b_state);
1496                 clear_bit(BH_Dirty, &bh->b_state);
1497                 bh = bh->b_this_page;
1498         } while (bh != head);
1499
1500         /* Stage 3: submit the IO */
1501         do {
1502                 struct buffer_head *next = bh->b_this_page;
1503                 submit_bh(WRITE, bh);
1504                 bh = next;
1505         } while (bh != head);
1506
1507         /* Done - end_buffer_io_async will unlock */
1508         SetPageUptodate(page);
1509         return 0;
1510
1511 out:
1512         /*
1513          * ENOSPC, or some other error.  We may already have added some
1514          * blocks to the file, so we need to write these out to avoid
1515          * exposing stale data.
1516          */
1517         ClearPageUptodate(page);
1518         bh = head;
1519         need_unlock = 1;
1520         /* Recovery: lock and submit the mapped buffers */
1521         do {
1522                 if (buffer_mapped(bh)) {
1523                         lock_buffer(bh);
1524                         set_buffer_async_io(bh);
1525                         need_unlock = 0;
1526                 }
1527                 bh = bh->b_this_page;
1528         } while (bh != head);
1529         do {
1530                 struct buffer_head *next = bh->b_this_page;
1531                 if (buffer_mapped(bh)) {
1532                         set_bit(BH_Uptodate, &bh->b_state);
1533                         clear_bit(BH_Dirty, &bh->b_state);
1534                         submit_bh(WRITE, bh);
1535                 }
1536                 bh = next;
1537         } while (bh != head);
1538         if (need_unlock)
1539                 UnlockPage(page);
1540         return err;
1541 }
1542
1543 static int __block_prepare_write(struct inode *inode, struct page *page,
1544                 unsigned from, unsigned to, get_block_t *get_block)
1545 {
1546         unsigned block_start, block_end;
1547         unsigned long block;
1548         int err = 0;
1549         unsigned blocksize, bbits;
1550         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1551         char *kaddr = kmap(page);
1552
1553         blocksize = 1 << inode->i_blkbits;
1554         if (!page->buffers)
1555                 create_empty_buffers(page, inode->i_dev, blocksize);
1556         head = page->buffers;
1557
1558         bbits = inode->i_blkbits;
1559         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1560
1561         for(bh = head, block_start = 0; bh != head || !block_start;
1562             block++, block_start=block_end, bh = bh->b_this_page) {
1563                 if (!bh)
1564                         BUG();
1565                 block_end = block_start+blocksize;
1566                 if (block_end <= from)
1567                         continue;
1568                 if (block_start >= to)
1569                         break;
1570                 clear_bit(BH_New, &bh->b_state);
1571                 if (!buffer_mapped(bh)) {
1572                         err = get_block(inode, block, bh, 1);
1573                         if (err)
1574                                 goto out;
1575                         if (buffer_new(bh)) {
1576                                 unmap_underlying_metadata(bh);
1577                                 if (Page_Uptodate(page)) {
1578                                         set_bit(BH_Uptodate, &bh->b_state);
1579                                         continue;
1580                                 }
1581                                 if (block_end > to)
1582                                         memset(kaddr+to, 0, block_end-to);
1583                                 if (block_start < from)
1584                                         memset(kaddr+block_start, 0, from-block_start);
1585                                 if (block_end > to || block_start < from)
1586                                         flush_dcache_page(page);
1587                                 continue;
1588                         }
1589                 }
1590                 if (Page_Uptodate(page)) {
1591                         set_bit(BH_Uptodate, &bh->b_state);
1592                         continue; 
1593                 }
1594                 if (!buffer_uptodate(bh) &&
1595                      (block_start < from || block_end > to)) {
1596                         ll_rw_block(READ, 1, &bh);
1597                         *wait_bh++=bh;
1598                 }
1599         }
1600         /*
1601          * If we issued read requests - let them complete.
1602          */
1603         while(wait_bh > wait) {
1604                 wait_on_buffer(*--wait_bh);
1605                 if (!buffer_uptodate(*wait_bh))
1606                         return -EIO;
1607         }
1608         return 0;
1609 out:
1610         /*
1611          * Zero out any newly allocated blocks to avoid exposing stale
1612          * data.  If BH_New is set, we know that the block was newly
1613          * allocated in the above loop.
1614          *
1615          * Details: the buffer can be new and uptodate because:
1616          * 1) hole in an uptodate page: get_block(create) allocates the block,
1617          *    so the buffer is new and additionally we also mark it uptodate
1618          * 2) the buffer is not mapped but uptodate due to a previous partial read.
1619          *
1620          * We can always ignore uptodate buffers here: if you mark a buffer
1621          * uptodate you must make sure it contains the right data first.
1622          *
1623          * We must stop the "undo/clear" fixup pass not at the caller "to"
1624          * but at the last block that we successfully reached in the main loop.
1625          */
1626         bh = head;
1627         to = block_start; /* stop at the last successfully handled block */
1628         block_start = 0;
1629         do {
1630                 block_end = block_start+blocksize;
1631                 if (block_end <= from)
1632                         goto next_bh;
1633                 if (block_start >= to)
1634                         break;
1635                 if (buffer_new(bh) && !buffer_uptodate(bh)) {
1636                         memset(kaddr+block_start, 0, bh->b_size);
1637                         flush_dcache_page(page);
1638                         set_bit(BH_Uptodate, &bh->b_state);
1639                         mark_buffer_dirty(bh);
1640                 }
1641 next_bh:
1642                 block_start = block_end;
1643                 bh = bh->b_this_page;
1644         } while (bh != head);
1645         return err;
1646 }
1647
1648 static int __block_commit_write(struct inode *inode, struct page *page,
1649                 unsigned from, unsigned to)
1650 {
1651         unsigned block_start, block_end;
1652         int partial = 0, need_balance_dirty = 0;
1653         unsigned blocksize;
1654         struct buffer_head *bh, *head;
1655
1656         blocksize = 1 << inode->i_blkbits;
1657
1658         for(bh = head = page->buffers, block_start = 0;
1659             bh != head || !block_start;
1660             block_start=block_end, bh = bh->b_this_page) {
1661                 block_end = block_start + blocksize;
1662                 if (block_end <= from || block_start >= to) {
1663                         if (!buffer_uptodate(bh))
1664                                 partial = 1;
1665                 } else {
1666                         set_bit(BH_Uptodate, &bh->b_state);
1667                         if (!atomic_set_buffer_dirty(bh)) {
1668                                 __mark_dirty(bh);
1669                                 buffer_insert_inode_data_queue(bh, inode);
1670                                 need_balance_dirty = 1;
1671                         }
1672                 }
1673         }
1674
1675         if (need_balance_dirty)
1676                 balance_dirty();
1677         /*
1678          * If this is a partial write that happened to make all buffers
1679          * uptodate then we can optimize away a bogus readpage() for
1680          * the next read(). Here we 'discover' whether the page went
1681          * uptodate as a result of this (potentially partial) write.
1682          */
1683         if (!partial)
1684                 SetPageUptodate(page);
1685         return 0;
1686 }
1687
1688 /*
1689  * Generic "read page" function for block devices that have the normal
1690  * get_block functionality. This is most of the block device filesystems.
1691  * Reads the page asynchronously --- the unlock_buffer() and
1692  * mark_buffer_uptodate() functions propagate buffer state into the
1693  * page struct once IO has completed.
1694  */
1695 int block_read_full_page(struct page *page, get_block_t *get_block)
1696 {
1697         struct inode *inode = page->mapping->host;
1698         unsigned long iblock, lblock;
1699         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1700         unsigned int blocksize, blocks;
1701         int nr, i;
1702
1703         if (!PageLocked(page))
1704                 PAGE_BUG(page);
1705         blocksize = 1 << inode->i_blkbits;
1706         if (!page->buffers)
1707                 create_empty_buffers(page, inode->i_dev, blocksize);
1708         head = page->buffers;
1709
1710         blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1711         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1712         lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1713         bh = head;
1714         nr = 0;
1715         i = 0;
1716
1717         do {
1718                 if (buffer_uptodate(bh))
1719                         continue;
1720
1721                 if (!buffer_mapped(bh)) {
1722                         if (iblock < lblock) {
1723                                 if (get_block(inode, iblock, bh, 0))
1724                                         continue;
1725                         }
1726                         if (!buffer_mapped(bh)) {
1727                                 memset(kmap(page) + i*blocksize, 0, blocksize);
1728                                 flush_dcache_page(page);
1729                                 kunmap(page);
1730                                 set_bit(BH_Uptodate, &bh->b_state);
1731                                 continue;
1732                         }
1733                         /* get_block() might have updated the buffer synchronously */
1734                         if (buffer_uptodate(bh))
1735                                 continue;
1736                 }
1737
1738                 arr[nr] = bh;
1739                 nr++;
1740         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1741
1742         if (!nr) {
1743                 /*
1744                  * all buffers are uptodate - we can set the page
1745                  * uptodate as well.
1746                  */
1747                 SetPageUptodate(page);
1748                 UnlockPage(page);
1749                 return 0;
1750         }
1751
1752         /* Stage two: lock the buffers */
1753         for (i = 0; i < nr; i++) {
1754                 struct buffer_head * bh = arr[i];
1755                 lock_buffer(bh);
1756                 set_buffer_async_io(bh);
1757         }
1758
1759         /* Stage 3: start the IO */
1760         for (i = 0; i < nr; i++) {
1761                 struct buffer_head * bh = arr[i];
1762                 if (buffer_uptodate(bh))
1763                         end_buffer_io_async(bh, 1);
1764                 else
1765                         submit_bh(READ, bh);
1766         }
1767         
1768         return 0;
1769 }
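
/*
 * Illustrative sketch (not part of the build, hence #if 0): a filesystem
 * with a conventional get_block routine would typically expose
 * block_read_full_page() through its readpage address_space operation.
 * The foofs_* names below are hypothetical placeholders.
 */
#if 0
static int foofs_get_block(struct inode *inode, long iblock,
			   struct buffer_head *bh_result, int create)
{
	/* map iblock to an on-disk block, filling in bh_result */
	return 0;
}

static int foofs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, foofs_get_block);
}
#endif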
1770
1771 /* utility function for filesystems that need to do work on expanding
1772  * truncates.  Uses prepare/commit_write to allow the filesystem to
1773  * deal with the hole.  
1774  */
1775 int generic_cont_expand(struct inode *inode, loff_t size)
1776 {
1777         struct address_space *mapping = inode->i_mapping;
1778         struct page *page;
1779         unsigned long index, offset, limit;
1780         int err;
1781
1782         err = -EFBIG;
1783         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1784         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1785                 send_sig(SIGXFSZ, current, 0);
1786                 goto out;
1787         }
1788         if (size > inode->i_sb->s_maxbytes)
1789                 goto out;
1790
1791         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1792
1793         /* ugh.  in prepare/commit_write, if from==to==start of block, we 
1794         ** skip the prepare.  make sure we never send an offset for the start
1795         ** of a block
1796         */
1797         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1798                 offset++;
1799         }
1800         index = size >> PAGE_CACHE_SHIFT;
1801         err = -ENOMEM;
1802         page = grab_cache_page(mapping, index);
1803         if (!page)
1804                 goto out;
1805         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1806         if (!err) {
1807                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1808         }
1809         UnlockPage(page);
1810         page_cache_release(page);
1811         if (err > 0)
1812                 err = 0;
1813 out:
1814         return err;
1815 }
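
/*
 * Illustrative sketch, assuming a hypothetical foofs that cannot represent
 * holes: its truncate/setattr path would call generic_cont_expand() when
 * the new size is larger than the current i_size, so the intervening range
 * is allocated and zeroed through prepare/commit_write.
 */
#if 0
static int foofs_grow_file(struct inode *inode, loff_t new_size)
{
	if (new_size <= inode->i_size)
		return 0;	/* shrinking is handled elsewhere */
	return generic_cont_expand(inode, new_size);
}
#endif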
1816
1817 /*
1818  * For moronic filesystems that do not allow holes in files.
1819  * We may have to extend the file.
1820  */
1821
1822 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1823 {
1824         struct address_space *mapping = page->mapping;
1825         struct inode *inode = mapping->host;
1826         struct page *new_page;
1827         unsigned long pgpos;
1828         long status;
1829         unsigned zerofrom;
1830         unsigned blocksize = 1 << inode->i_blkbits;
1831         char *kaddr;
1832
1833         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1834                 status = -ENOMEM;
1835                 new_page = grab_cache_page(mapping, pgpos);
1836                 if (!new_page)
1837                         goto out;
1838                 /* we might sleep */
1839                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1840                         UnlockPage(new_page);
1841                         page_cache_release(new_page);
1842                         continue;
1843                 }
1844                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1845                 if (zerofrom & (blocksize-1)) {
1846                         *bytes |= (blocksize-1);
1847                         (*bytes)++;
1848                 }
1849                 status = __block_prepare_write(inode, new_page, zerofrom,
1850                                                 PAGE_CACHE_SIZE, get_block);
1851                 if (status)
1852                         goto out_unmap;
1853                 kaddr = page_address(new_page);
1854                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1855                 flush_dcache_page(new_page);
1856                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1857                 kunmap(new_page);
1858                 UnlockPage(new_page);
1859                 page_cache_release(new_page);
1860         }
1861
1862         if (page->index < pgpos) {
1863                 /* completely inside the area */
1864                 zerofrom = offset;
1865         } else {
1866                 /* page covers the boundary, find the boundary offset */
1867                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1868
1869                 /* if we are going to expand the file, the last block will be filled */
1870                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1871                         *bytes |= (blocksize-1);
1872                         (*bytes)++;
1873                 }
1874
1875                 /* starting below the boundary? Nothing to zero out */
1876                 if (offset <= zerofrom)
1877                         zerofrom = offset;
1878         }
1879         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1880         if (status)
1881                 goto out1;
1882         kaddr = page_address(page);
1883         if (zerofrom < offset) {
1884                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1885                 flush_dcache_page(page);
1886                 __block_commit_write(inode, page, zerofrom, offset);
1887         }
1888         return 0;
1889 out1:
1890         ClearPageUptodate(page);
1891         kunmap(page);
1892         return status;
1893
1894 out_unmap:
1895         ClearPageUptodate(new_page);
1896         kunmap(new_page);
1897         UnlockPage(new_page);
1898         page_cache_release(new_page);
1899 out:
1900         return status;
1901 }
1902
1903 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1904                         get_block_t *get_block)
1905 {
1906         struct inode *inode = page->mapping->host;
1907         int err = __block_prepare_write(inode, page, from, to, get_block);
1908         if (err) {
1909                 ClearPageUptodate(page);
1910                 kunmap(page);
1911         }
1912         return err;
1913 }
1914
1915 int block_commit_write(struct page *page, unsigned from, unsigned to)
1916 {
1917         struct inode *inode = page->mapping->host;
1918         __block_commit_write(inode,page,from,to);
1919         kunmap(page);
1920         return 0;
1921 }
1922
1923 int generic_commit_write(struct file *file, struct page *page,
1924                 unsigned from, unsigned to)
1925 {
1926         struct inode *inode = page->mapping->host;
1927         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1928         __block_commit_write(inode,page,from,to);
1929         kunmap(page);
1930         if (pos > inode->i_size) {
1931                 inode->i_size = pos;
1932                 mark_inode_dirty(inode);
1933         }
1934         return 0;
1935 }
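
/*
 * Illustrative sketch: block-based filesystems normally consume the
 * prepare/commit helpers above through their address_space_operations.
 * The foofs_* names are hypothetical; block_write_full_page() and
 * block_sync_page() are defined further down in this file.
 */
#if 0
static int foofs_writepage(struct page *page)
{
	return block_write_full_page(page, foofs_get_block);
}

static int foofs_prepare_write(struct file *file, struct page *page,
			       unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, foofs_get_block);
}

static struct address_space_operations foofs_aops = {
	readpage:	foofs_readpage,
	writepage:	foofs_writepage,
	sync_page:	block_sync_page,
	prepare_write:	foofs_prepare_write,
	commit_write:	generic_commit_write,
};
#endif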
1936
1937 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1938 {
1939         unsigned long index = from >> PAGE_CACHE_SHIFT;
1940         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1941         unsigned blocksize, iblock, length, pos;
1942         struct inode *inode = mapping->host;
1943         struct page *page;
1944         struct buffer_head *bh;
1945         int err;
1946
1947         blocksize = 1 << inode->i_blkbits;
1948         length = offset & (blocksize - 1);
1949
1950         /* Block boundary? Nothing to do */
1951         if (!length)
1952                 return 0;
1953
1954         length = blocksize - length;
1955         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1956         
1957         page = grab_cache_page(mapping, index);
1958         err = -ENOMEM;
1959         if (!page)
1960                 goto out;
1961
1962         if (!page->buffers)
1963                 create_empty_buffers(page, inode->i_dev, blocksize);
1964
1965         /* Find the buffer that contains "offset" */
1966         bh = page->buffers;
1967         pos = blocksize;
1968         while (offset >= pos) {
1969                 bh = bh->b_this_page;
1970                 iblock++;
1971                 pos += blocksize;
1972         }
1973
1974         err = 0;
1975         if (!buffer_mapped(bh)) {
1976                 /* Hole? Nothing to do */
1977                 if (buffer_uptodate(bh))
1978                         goto unlock;
1979                 get_block(inode, iblock, bh, 0);
1980                 /* Still unmapped? Nothing to do */
1981                 if (!buffer_mapped(bh))
1982                         goto unlock;
1983         }
1984
1985         /* Ok, it's mapped. Make sure it's up-to-date */
1986         if (Page_Uptodate(page))
1987                 set_bit(BH_Uptodate, &bh->b_state);
1988
1989         if (!buffer_uptodate(bh)) {
1990                 err = -EIO;
1991                 ll_rw_block(READ, 1, &bh);
1992                 wait_on_buffer(bh);
1993                 /* Uhhuh. Read error. Complain and punt. */
1994                 if (!buffer_uptodate(bh))
1995                         goto unlock;
1996         }
1997
1998         memset(kmap(page) + offset, 0, length);
1999         flush_dcache_page(page);
2000         kunmap(page);
2001
2002         if (!atomic_set_buffer_dirty(bh)) {
2003                 __mark_dirty(bh);
2004                 buffer_insert_inode_data_queue(bh, inode);
2005                 balance_dirty();
2006         }
2007
2008         err = 0;
2009
2010 unlock:
2011         UnlockPage(page);
2012         page_cache_release(page);
2013 out:
2014         return err;
2015 }
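
/*
 * Illustrative sketch: block_truncate_page() is normally called from a
 * filesystem's truncate path to zero out the tail of the last, partial
 * block before the on-disk blocks beyond the new size are released.
 * foofs_truncate is hypothetical.
 */
#if 0
static void foofs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, foofs_get_block);
	/* ...then free the on-disk blocks beyond i_size... */
}
#endif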
2016
2017 int block_write_full_page(struct page *page, get_block_t *get_block)
2018 {
2019         struct inode *inode = page->mapping->host;
2020         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2021         unsigned offset;
2022         int err;
2023
2024         /* easy case */
2025         if (page->index < end_index)
2026                 return __block_write_full_page(inode, page, get_block);
2027
2028         /* things got complicated... */
2029         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2030         /* OK, are we completely out? */
2031         if (page->index >= end_index+1 || !offset) {
2032                 UnlockPage(page);
2033                 return -EIO;
2034         }
2035
2036         /* Sigh... will have to work, then... */
2037         err = __block_prepare_write(inode, page, 0, offset, get_block);
2038         if (!err) {
2039                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2040                 flush_dcache_page(page);
2041                 __block_commit_write(inode,page,0,offset);
2042 done:
2043                 kunmap(page);
2044                 UnlockPage(page);
2045                 return err;
2046         }
2047         ClearPageUptodate(page);
2048         goto done;
2049 }
2050
2051 /*
2052  * Commence writeout of all the buffers against a page.  The
2053  * page must be locked.   Returns zero on success or a negative
2054  * errno.
2055  */
2056 int writeout_one_page(struct page *page)
2057 {
2058         struct buffer_head *bh, *head = page->buffers;
2059
2060         if (!PageLocked(page))
2061                 BUG();
2062         bh = head;
2063         do {
2064                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2065                         continue;
2066
2067                 bh->b_flushtime = jiffies;
2068                 ll_rw_block(WRITE, 1, &bh);     
2069         } while ((bh = bh->b_this_page) != head);
2070         return 0;
2071 }
2072 EXPORT_SYMBOL(writeout_one_page);
2073
2074 /*
2075  * Wait for completion of I/O of all buffers against a page.  The page
2076  * must be locked.  Returns zero on success or a negative errno.
2077  */
2078 int waitfor_one_page(struct page *page)
2079 {
2080         int error = 0;
2081         struct buffer_head *bh, *head = page->buffers;
2082
2083         bh = head;
2084         do {
2085                 wait_on_buffer(bh);
2086                 if (buffer_req(bh) && !buffer_uptodate(bh))
2087                         error = -EIO;
2088         } while ((bh = bh->b_this_page) != head);
2089         return error;
2090 }
2091 EXPORT_SYMBOL(waitfor_one_page);
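
/*
 * Illustrative sketch: callers usually pair the two helpers above to flush
 * a single page synchronously - start writeout under the page lock, then
 * wait for the buffers.  The helper name is hypothetical and error handling
 * is abridged.
 */
#if 0
static int example_sync_one_page(struct page *page)
{
	int err;

	lock_page(page);
	err = writeout_one_page(page);
	if (!err)
		err = waitfor_one_page(page);
	UnlockPage(page);
	return err;
}
#endif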
2092
2093 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2094 {
2095         struct buffer_head tmp;
2096         struct inode *inode = mapping->host;
2097         tmp.b_state = 0;
2098         tmp.b_blocknr = 0;
2099         get_block(inode, block, &tmp, 0);
2100         return tmp.b_blocknr;
2101 }
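
/*
 * Illustrative sketch: the bmap address_space operation is usually just a
 * thin wrapper around generic_block_bmap().  foofs_bmap and foofs_get_block
 * are hypothetical.
 */
#if 0
static int foofs_bmap(struct address_space *mapping, long block)
{
	return generic_block_bmap(mapping, block, foofs_get_block);
}
#endif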
2102
2103 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2104 {
2105         int i, nr_blocks, retval;
2106         unsigned long * blocks = iobuf->blocks;
2107         int length;
2108
2109         length = iobuf->length;
2110         nr_blocks = length / blocksize;
2111         /* build the blocklist */
2112         for (i = 0; i < nr_blocks; i++, blocknr++) {
2113                 struct buffer_head bh;
2114
2115                 bh.b_state = 0;
2116                 bh.b_dev = inode->i_dev;
2117                 bh.b_size = blocksize;
2118
2119                 retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
2120                 if (retval) {
2121                         if (!i)
2122                                 /* report error to userspace */
2123                                 goto out;
2124                         else
2125                                 /* do short I/O until 'i' */
2126                                 break;
2127                 }
2128
2129                 if (rw == READ) {
2130                         if (buffer_new(&bh))
2131                                 BUG();
2132                         if (!buffer_mapped(&bh)) {
2133                                 /* there was a hole in the filesystem */
2134                                 blocks[i] = -1UL;
2135                                 continue;
2136                         }
2137                 } else {
2138                         if (buffer_new(&bh))
2139                                 unmap_underlying_metadata(&bh);
2140                         if (!buffer_mapped(&bh))
2141                                 BUG();
2142                 }
2143                 blocks[i] = bh.b_blocknr;
2144         }
2145
2146         /* patch length to handle short I/O */
2147         iobuf->length = i * blocksize;
2148         retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2149         /* restore orig length */
2150         iobuf->length = length;
2151  out:
2152
2153         return retval;
2154 }
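
/*
 * Illustrative sketch: a filesystem's direct_IO address_space operation is
 * typically a one-line wrapper that supplies its get_block routine.
 * foofs_direct_IO and foofs_get_block are hypothetical.
 */
#if 0
static int foofs_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
			   unsigned long blocknr, int blocksize)
{
	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize,
				 foofs_get_block);
}
#endif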
2155
2156 /*
2157  * IO completion routine for a buffer_head being used for kiobuf IO: we
2158  * can't dispatch the kiobuf callback until io_count reaches 0.  
2159  */
2160
2161 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2162 {
2163         struct kiobuf *kiobuf;
2164         
2165         mark_buffer_uptodate(bh, uptodate);
2166
2167         kiobuf = bh->b_private;
2168         unlock_buffer(bh);
2169         end_kio_request(kiobuf, uptodate);
2170 }
2171
2172 /*
2173  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2174  * for them to complete.  Clean up the buffer_heads afterwards.  
2175  */
2176
2177 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2178 {
2179         int iosize, err;
2180         int i;
2181         struct buffer_head *tmp;
2182
2183         iosize = 0;
2184         err = 0;
2185
2186         for (i = nr; --i >= 0; ) {
2187                 iosize += size;
2188                 tmp = bh[i];
2189                 if (buffer_locked(tmp)) {
2190                         wait_on_buffer(tmp);
2191                 }
2192                 
2193                 if (!buffer_uptodate(tmp)) {
2194                         /* We are traversing bh'es in reverse order so
2195                            clearing iosize on error calculates the
2196                            amount of IO before the first error. */
2197                         iosize = 0;
2198                         err = -EIO;
2199                 }
2200         }
2201         
2202         if (iosize)
2203                 return iosize;
2204         return err;
2205 }
2206
2207 /*
2208  * Start I/O on a physical range of kernel memory, defined by a vector
2209  * of kiobuf structs (much like a user-space iovec list).
2210  *
2211  * The kiobuf must already be locked for IO.  IO is submitted
2212  * asynchronously: you need to check page->locked and page->uptodate.
2213  *
2214  * It is up to the caller to make sure that there are enough blocks
2215  * passed in to completely map the iobufs to disk.
2216  */
2217
2218 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2219                kdev_t dev, unsigned long b[], int size)
2220 {
2221         int             err;
2222         int             length;
2223         int             transferred;
2224         int             i;
2225         int             bufind;
2226         int             pageind;
2227         int             bhind;
2228         int             offset;
2229         unsigned long   blocknr;
2230         struct kiobuf * iobuf = NULL;
2231         struct page *   map;
2232         struct buffer_head *tmp, **bhs = NULL;
2233
2234         if (!nr)
2235                 return 0;
2236         
2237         /* 
2238          * First, do some alignment and validity checks 
2239          */
2240         for (i = 0; i < nr; i++) {
2241                 iobuf = iovec[i];
2242                 if ((iobuf->offset & (size-1)) ||
2243                     (iobuf->length & (size-1)))
2244                         return -EINVAL;
2245                 if (!iobuf->nr_pages)
2246                         panic("brw_kiovec: iobuf not initialised");
2247         }
2248
2249         /* 
2250          * OK to walk down the iovec doing page IO on each page we find. 
2251          */
2252         bufind = bhind = transferred = err = 0;
2253         for (i = 0; i < nr; i++) {
2254                 iobuf = iovec[i];
2255                 offset = iobuf->offset;
2256                 length = iobuf->length;
2257                 iobuf->errno = 0;
2258                 if (!bhs)
2259                         bhs = iobuf->bh;
2260                 
2261                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2262                         map  = iobuf->maplist[pageind];
2263                         if (!map) {
2264                                 err = -EFAULT;
2265                                 goto finished;
2266                         }
2267                         
2268                         while (length > 0) {
2269                                 blocknr = b[bufind++];
2270                                 if (blocknr == -1UL) {
2271                                         if (rw == READ) {
2272                                                 /* there was a hole in the filesystem */
2273                                                 memset(kmap(map) + offset, 0, size);
2274                                                 flush_dcache_page(map);
2275                                                 kunmap(map);
2276
2277                                                 transferred += size;
2278                                                 goto skip_block;
2279                                         } else
2280                                                 BUG();
2281                                 }
2282                                 tmp = bhs[bhind++];
2283
2284                                 tmp->b_size = size;
2285                                 set_bh_page(tmp, map, offset);
2286                                 tmp->b_this_page = tmp;
2287
2288                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2289                                 tmp->b_dev = dev;
2290                                 tmp->b_blocknr = blocknr;
2291                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2292
2293                                 if (rw == WRITE) {
2294                                         set_bit(BH_Uptodate, &tmp->b_state);
2295                                         clear_bit(BH_Dirty, &tmp->b_state);
2296                                 } else
2297                                         set_bit(BH_Uptodate, &tmp->b_state);
2298
2299                                 atomic_inc(&iobuf->io_count);
2300                                 submit_bh(rw, tmp);
2301                                 /* 
2302                                  * Wait for IO if we have got too much 
2303                                  */
2304                                 if (bhind >= KIO_MAX_SECTORS) {
2305                                         kiobuf_wait_for_io(iobuf); /* wake-one */
2306                                         err = wait_kio(rw, bhind, bhs, size);
2307                                         if (err >= 0)
2308                                                 transferred += err;
2309                                         else
2310                                                 goto finished;
2311                                         bhind = 0;
2312                                 }
2313
2314                         skip_block:
2315                                 length -= size;
2316                                 offset += size;
2317
2318                                 if (offset >= PAGE_SIZE) {
2319                                         offset = 0;
2320                                         break;
2321                                 }
2322                         } /* End of block loop */
2323                 } /* End of page loop */                
2324         } /* End of iovec loop */
2325
2326         /* Is there any IO still left to submit? */
2327         if (bhind) {
2328                 kiobuf_wait_for_io(iobuf); /* wake-one */
2329                 err = wait_kio(rw, bhind, bhs, size);
2330                 if (err >= 0)
2331                         transferred += err;
2332                 else
2333                         goto finished;
2334         }
2335
2336  finished:
2337         if (transferred)
2338                 return transferred;
2339         return err;
2340 }
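
/*
 * Illustrative sketch of a brw_kiovec() caller, modeled loosely on the way
 * raw-I/O style users drive this interface: pin a user buffer into a kiobuf,
 * fill in the block list, submit, then tear everything down.  The helper
 * name is hypothetical, error handling is abridged, and the whole request
 * is assumed to fit in a single kiobuf.
 */
#if 0
static int example_kiovec_read(kdev_t dev, unsigned long first_block,
			       unsigned long uaddr, size_t len, int blocksize)
{
	struct kiobuf *iobuf;
	int i, nblocks = len / blocksize, err;

	err = alloc_kiovec(1, &iobuf);
	if (err)
		return err;
	err = map_user_kiobuf(READ, iobuf, uaddr, len);
	if (err)
		goto out_free;

	/* caller supplies the block list; here the blocks are contiguous */
	for (i = 0; i < nblocks; i++)
		iobuf->blocks[i] = first_block + i;

	err = brw_kiovec(READ, 1, &iobuf, dev, iobuf->blocks, blocksize);

	unmap_kiobuf(iobuf);
out_free:
	free_kiovec(1, &iobuf);
	return err;
}
#endif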
2341
2342 /*
2343  * Start I/O on a page.
2344  * This function expects the page to be locked and may return
2345  * before I/O is complete. You then have to check page->locked
2346  * and page->uptodate.
2347  *
2348  * brw_page() is SMP-safe, although it's being called with the
2349  * kernel lock held - but the code is ready.
2350  *
2351  * FIXME: we need a swapper_inode->get_block function to remove
2352  *        some of the bmap kludges and interface ugliness here.
2353  */
2354 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2355 {
2356         struct buffer_head *head, *bh;
2357
2358         if (!PageLocked(page))
2359                 panic("brw_page: page not locked for I/O");
2360
2361         if (!page->buffers)
2362                 create_empty_buffers(page, dev, size);
2363         head = bh = page->buffers;
2364
2365         /* Stage 1: lock all the buffers */
2366         do {
2367                 lock_buffer(bh);
2368                 bh->b_blocknr = *(b++);
2369                 set_bit(BH_Mapped, &bh->b_state);
2370                 set_buffer_async_io(bh);
2371                 bh = bh->b_this_page;
2372         } while (bh != head);
2373
2374         /* Stage 2: start the IO */
2375         do {
2376                 struct buffer_head *next = bh->b_this_page;
2377                 submit_bh(rw, bh);
2378                 bh = next;
2379         } while (bh != head);
2380         return 0;
2381 }
2382
2383 int block_symlink(struct inode *inode, const char *symname, int len)
2384 {
2385         struct address_space *mapping = inode->i_mapping;
2386         struct page *page = grab_cache_page(mapping, 0);
2387         int err = -ENOMEM;
2388         char *kaddr;
2389
2390         if (!page)
2391                 goto fail;
2392         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2393         if (err)
2394                 goto fail_map;
2395         kaddr = page_address(page);
2396         memcpy(kaddr, symname, len-1);
2397         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2398         /*
2399          * Notice that we are _not_ going to block here - end of page is
2400          * unmapped, so this will only try to map the rest of the page, see
2401          * that it is unmapped (typically it will not even look into the inode -
2402          * ->i_size will be enough for everything) and zero it out.
2403          * OTOH it's obviously correct and should make the page up-to-date.
2404          */
2405         err = mapping->a_ops->readpage(NULL, page);
2406         wait_on_page(page);
2407         page_cache_release(page);
2408         if (err < 0)
2409                 goto fail;
2410         mark_inode_dirty(inode);
2411         return 0;
2412 fail_map:
2413         UnlockPage(page);
2414         page_cache_release(page);
2415 fail:
2416         return err;
2417 }
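
/*
 * Illustrative sketch: a filesystem's symlink() method typically calls
 * block_symlink() with the name length including the trailing NUL, after
 * pointing the inode at the page-based symlink operations.  The foofs_*
 * names are hypothetical and the error handling is heavily abridged.
 */
#if 0
static int foofs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct inode *inode = foofs_new_inode(dir);	/* hypothetical */
	int err;

	inode->i_op = &page_symlink_inode_operations;
	inode->i_mapping->a_ops = &foofs_aops;
	err = block_symlink(inode, symname, strlen(symname) + 1);
	/* ...instantiate the dentry, or clean up the inode on error... */
	return err;
}
#endif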
2418
2419 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2420 {
2421         struct buffer_head *bh, *tail;
2422
2423         bh = head;
2424         do {
2425                 tail = bh;
2426                 bh = bh->b_this_page;
2427         } while (bh);
2428         tail->b_this_page = head;
2429         page->buffers = head;
2430         page_cache_get(page);
2431 }
2432
2433 /*
2434  * Create the page-cache page that contains the requested block
2435  */
2436 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2437 {
2438         struct page * page;
2439         struct buffer_head *bh;
2440
2441         page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2442         if (!page)
2443                 return NULL;
2444
2445         if (!PageLocked(page))
2446                 BUG();
2447
2448         bh = page->buffers;
2449         if (bh) {
2450                 if (bh->b_size == size)
2451                         return page;
2452                 if (!try_to_free_buffers(page, GFP_NOFS))
2453                         goto failed;
2454         }
2455
2456         bh = create_buffers(page, size, 0);
2457         if (!bh)
2458                 goto failed;
2459         link_dev_buffers(page, bh);
2460         return page;
2461
2462 failed:
2463         UnlockPage(page);
2464         page_cache_release(page);
2465         return NULL;
2466 }
2467
2468 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2469 {
2470         struct buffer_head *head = page->buffers;
2471         struct buffer_head *bh = head;
2472         unsigned int uptodate;
2473
2474         uptodate = 1 << BH_Mapped;
2475         if (Page_Uptodate(page))
2476                 uptodate |= 1 << BH_Uptodate;
2477
2478         write_lock(&hash_table_lock);
2479         do {
2480                 if (!(bh->b_state & (1 << BH_Mapped))) {
2481                         init_buffer(bh, NULL, NULL);
2482                         bh->b_dev = dev;
2483                         bh->b_blocknr = block;
2484                         bh->b_state = uptodate;
2485                 }
2486
2487                 /* Insert the buffer into the hash lists if necessary */
2488                 if (!bh->b_pprev)
2489                         __insert_into_hash_list(bh);
2490
2491                 block++;
2492                 bh = bh->b_this_page;
2493         } while (bh != head);
2494         write_unlock(&hash_table_lock);
2495 }
2496
2497 /*
2498  * Try to increase the number of buffers available: the size argument
2499  * is used to determine what kind of buffers we want.
2500  */
2501 static int grow_buffers(kdev_t dev, unsigned long block, int size)
2502 {
2503         struct page * page;
2504         struct block_device *bdev;
2505         unsigned long index;
2506         int sizebits;
2507
2508         /* Size must be multiple of hard sectorsize */
2509         if (size & (get_hardsect_size(dev)-1))
2510                 BUG();
2511         /* Size must be within 512 bytes and PAGE_SIZE */
2512         if (size < 512 || size > PAGE_SIZE)
2513                 BUG();
2514
2515         sizebits = -1;
2516         do {
2517                 sizebits++;
2518         } while ((size << sizebits) < PAGE_SIZE);
2519
2520         index = block >> sizebits;
2521         block = index << sizebits;
2522
2523         bdev = bdget(kdev_t_to_nr(dev));
2524         if (!bdev) {
2525                 printk("No block device for %s\n", kdevname(dev));
2526                 BUG();
2527         }
2528
2529         /* Create a page with the proper size buffers.. */
2530         page = grow_dev_page(bdev, index, size);
2531
2532         /* This is "wrong" - talk to Al Viro */
2533         atomic_dec(&bdev->bd_count);
2534         if (!page)
2535                 return 0;
2536
2537         /* Hash in the buffers on the hash list */
2538         hash_page_buffers(page, dev, block, size);
2539         UnlockPage(page);
2540         page_cache_release(page);
2541
2542         /* We hashed up this page, so increment buffermem */
2543         atomic_inc(&buffermem_pages);
2544         return 1;
2545 }
2546
2547 /*
2548  * The first time the VM inspects a page which has locked buffers, it
2549  * will just mark it as needing to be waited upon on the next scan of the page LRU.
2550  * BH_Wait_IO is used for this.
2551  *
2552  * The second time the VM visits the page, if it still has locked
2553  * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
2554  *
2555  * The third time the VM visits the page, if the I/O hasn't completed
2556  * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
2557  * used for this.
2558  *
2559  * There is also the case of buffers which were locked by someone else
2560  * - write(2) callers, bdflush, etc.  There can be a huge number of these
2561  * and we don't want to just skip them all and fail the page allocation. 
2562  * We want to be able to wait on these buffers as well.
2563  *
2564  * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2565  * underway against the buffer, doesn't matter who started it - we know
2566  * that the buffer will eventually come unlocked, and so it's safe to
2567  * wait on it.
2568  *
2569  * The caller holds the page lock and the caller will free this page
2570  * into current->local_page, so by waiting on the page's buffers the
2571  * caller is guaranteed to obtain this page.
2572  *
2573  * sync_page_buffers() will sort-of return true if all the buffers
2574  * against this page are freeable, so try_to_free_buffers() should
2575  * try to free the page's buffers a second time.  This is a bit
2576  * broken for blocksize < PAGE_CACHE_SIZE, but not in a way that matters much.
2577  */
2578 static int sync_page_buffers(struct buffer_head *head)
2579 {
2580         struct buffer_head * bh = head;
2581         int tryagain = 1;
2582
2583         do {
2584                 if (!buffer_dirty(bh) && !buffer_locked(bh))
2585                         continue;
2586
2587                 /* Don't start IO first time around.. */
2588                 if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2589                         tryagain = 0;
2590                         continue;
2591                 }
2592
2593                 /* Second time through we start actively writing out.. */
2594                 if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2595                         if (unlikely(!buffer_launder(bh))) {
2596                                 tryagain = 0;
2597                                 continue;
2598                         }
2599                         wait_on_buffer(bh);
2600                         tryagain = 1;
2601                         continue;
2602                 }
2603
2604                 if (!atomic_set_buffer_clean(bh)) {
2605                         unlock_buffer(bh);
2606                         continue;
2607                 }
2608
2609                 __mark_buffer_clean(bh);
2610                 get_bh(bh);
2611                 bh->b_end_io = end_buffer_io_sync;
2612                 submit_bh(WRITE, bh);
2613                 tryagain = 0;
2614         } while ((bh = bh->b_this_page) != head);
2615
2616         return tryagain;
2617 }
2618
2619 /*
2620  * Can the buffer be thrown out?
2621  */
2622 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock))
2623 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2624
2625 /*
2626  * try_to_free_buffers() checks if all the buffers on this particular page
2627  * are unused, and frees the page if so.
2628  *
2629  * Wake up bdflush() if this fails - if we're running low on memory due
2630  * to dirty buffers, we need to flush them out as quickly as possible.
2631  *
2632  * NOTE: There are quite a number of ways that threads of control can
2633  *       obtain a reference to a buffer head within a page.  So we must
2634  *       lock out all of these paths to cleanly toss the page.
2635  */
2636 int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2637 {
2638         struct buffer_head * tmp, * bh = page->buffers;
2639
2640 cleaned_buffers_try_again:
2641         spin_lock(&lru_list_lock);
2642         write_lock(&hash_table_lock);
2643         tmp = bh;
2644         do {
2645                 if (buffer_busy(tmp))
2646                         goto busy_buffer_page;
2647                 tmp = tmp->b_this_page;
2648         } while (tmp != bh);
2649
2650         spin_lock(&unused_list_lock);
2651         tmp = bh;
2652
2653         /* if this buffer was hashed, this page counts as buffermem */
2654         if (bh->b_pprev)
2655                 atomic_dec(&buffermem_pages);
2656         do {
2657                 struct buffer_head * p = tmp;
2658                 tmp = tmp->b_this_page;
2659
2660                 if (p->b_dev == B_FREE) BUG();
2661
2662                 remove_inode_queue(p);
2663                 __remove_from_queues(p);
2664                 __put_unused_buffer_head(p);
2665         } while (tmp != bh);
2666         spin_unlock(&unused_list_lock);
2667
2668         /* Wake up anyone waiting for buffer heads */
2669         wake_up(&buffer_wait);
2670
2671         /* And free the page */
2672         page->buffers = NULL;
2673         page_cache_release(page);
2674         write_unlock(&hash_table_lock);
2675         spin_unlock(&lru_list_lock);
2676         return 1;
2677
2678 busy_buffer_page:
2679         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2680         write_unlock(&hash_table_lock);
2681         spin_unlock(&lru_list_lock);
2682         gfp_mask = pf_gfp_mask(gfp_mask);
2683         if (gfp_mask & __GFP_IO) {
2684                 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2685                         if (sync_page_buffers(bh)) {
2686                                 /* no IO or waiting next time */
2687                                 gfp_mask = 0;
2688                                 goto cleaned_buffers_try_again;
2689                         }
2690                 }
2691         }
2692         if (balance_dirty_state() >= 0)
2693                 wakeup_bdflush();
2694         return 0;
2695 }
2696 EXPORT_SYMBOL(try_to_free_buffers);
2697
2698 /* ================== Debugging =================== */
2699
2700 void show_buffers(void)
2701 {
2702 #ifdef CONFIG_SMP
2703         struct buffer_head * bh;
2704         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2705         int nlist;
2706         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2707 #endif
2708
2709         printk("Buffer memory:   %6dkB\n",
2710                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2711
2712         printk("Cache memory:   %6dkB\n",
2713                         (atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2714
2715 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2716         if (!spin_trylock(&lru_list_lock))
2717                 return;
2718         for(nlist = 0; nlist < NR_LIST; nlist++) {
2719                 found = locked = dirty = used = lastused = 0;
2720                 bh = lru_list[nlist];
2721                 if(!bh) continue;
2722
2723                 do {
2724                         found++;
2725                         if (buffer_locked(bh))
2726                                 locked++;
2727                         if (buffer_dirty(bh))
2728                                 dirty++;
2729                         if (atomic_read(&bh->b_count))
2730                                 used++, lastused = found;
2731                         bh = bh->b_next_free;
2732                 } while (bh != lru_list[nlist]);
2733                 {
2734                         int tmp = nr_buffers_type[nlist];
2735                         if (found != tmp)
2736                                 printk("%9s: BUG -> found %d, reported %d\n",
2737                                        buf_types[nlist], found, tmp);
2738                 }
2739                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2740                        "%d locked, %d dirty\n",
2741                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2742                        used, lastused, locked, dirty);
2743         }
2744         spin_unlock(&lru_list_lock);
2745 #endif
2746 }
2747
2748 /* ===================== Init ======================= */
2749
2750 /*
2751  * allocate the hash table and init the free list
2752  * Use gfp() for the hash table to decrease TLB misses, use
2753  * SLAB cache for buffer heads.
2754  */
2755 void __init buffer_init(unsigned long mempages)
2756 {
2757         int order, i;
2758         unsigned int nr_hash;
2759
2760         /* The buffer cache hash table is less important these days,
2761          * trim it a bit.
2762          */
2763         mempages >>= 14;
2764
2765         mempages *= sizeof(struct buffer_head *);
2766
2767         for (order = 0; (1 << order) < mempages; order++)
2768                 ;
2769
2770         /* try to allocate something until we get it or we're asking
2771            for something that is really too small */
2772
2773         do {
2774                 unsigned long tmp;
2775
2776                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2777                 bh_hash_mask = (nr_hash - 1);
2778
2779                 tmp = nr_hash;
2780                 bh_hash_shift = 0;
2781                 while((tmp >>= 1UL) != 0UL)
2782                         bh_hash_shift++;
2783
2784                 hash_table = (struct buffer_head **)
2785                     __get_free_pages(GFP_ATOMIC, order);
2786         } while (hash_table == NULL && --order > 0);
2787         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2788                nr_hash, order, (PAGE_SIZE << order));
2789
2790         if (!hash_table)
2791                 panic("Failed to allocate buffer hash table\n");
2792
2793         /* Setup hash chains. */
2794         for(i = 0; i < nr_hash; i++)
2795                 hash_table[i] = NULL;
2796
2797         /* Setup lru lists. */
2798         for(i = 0; i < NR_LIST; i++)
2799                 lru_list[i] = NULL;
2800
2801 }
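
/*
 * Worked example of the sizing above (a sketch, assuming 4 KB pages and
 * 32-bit pointers): with 128 MB of RAM, mempages starts at 32768; >> 14
 * gives 2; multiplying by sizeof(struct buffer_head *) gives 8; the
 * smallest order with (1 << order) >= 8 is 3, so 8 pages (32 KB) are
 * allocated and nr_hash ends up as 32768 / 4 = 8192 hash buckets.
 */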
2802
2803
2804 /* ====================== bdflush support =================== */
2805
2806 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2807  * response to dirty buffers.  Once this process is activated, we write back
2808  * a limited number of buffers to the disks and then go back to sleep again.
2809  */
2810
2811 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2812
2813 void wakeup_bdflush(void)
2814 {
2815         wake_up_interruptible(&bdflush_wait);
2816 }
2817
2818 /* 
2819  * Here we attempt to write back old buffers.  We also try to flush inodes 
2820  * and supers, since this function is essentially "update", and 
2821  * otherwise there would be no way of ensuring that these quantities ever 
2822  * get written back.  Ideally, we would have a timestamp on the inodes
2823  * and superblocks so that we could write back only the old ones as well.
2824  */
2825
2826 static int sync_old_buffers(void)
2827 {
2828         lock_kernel();
2829         sync_unlocked_inodes();
2830         sync_supers(0, 0);
2831         unlock_kernel();
2832
2833         for (;;) {
2834                 struct buffer_head *bh;
2835
2836                 spin_lock(&lru_list_lock);
2837                 bh = lru_list[BUF_DIRTY];
2838                 if (!bh || time_before(jiffies, bh->b_flushtime))
2839                         break;
2840                 if (write_some_buffers(NODEV))
2841                         continue;
2842                 return 0;
2843         }
2844         spin_unlock(&lru_list_lock);
2845         return 0;
2846 }
2847
2848 int block_sync_page(struct page *page)
2849 {
2850         run_task_queue(&tq_disk);
2851         return 0;
2852 }
2853
2854 /* This is the interface to bdflush.  As we get more sophisticated, we can
2855  * pass tuning parameters to this "process", to adjust how it behaves. 
2856  * We would want to verify each parameter, however, to make sure that it 
2857  * is reasonable. */
2858
2859 asmlinkage long sys_bdflush(int func, long data)
2860 {
2861         if (!capable(CAP_SYS_ADMIN))
2862                 return -EPERM;
2863
2864         if (func == 1) {
2865                 /* do_exit directly and let kupdate do its work alone. */
2866                 do_exit(0);
2867 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2868          a syscall that doesn't care about the current mm context. */
2869                 int error;
2870                 struct mm_struct *user_mm;
2871
2872                 /*
2873                  * bdflush will spend all of its time in kernel-space,
2874                  * without touching user-space, so we can switch it into
2875                  * 'lazy TLB mode' to reduce the cost of context-switches
2876                  * to and from bdflush.
2877                  */
2878                 user_mm = start_lazy_tlb();
2879                 error = sync_old_buffers();
2880                 end_lazy_tlb(user_mm);
2881                 return error;
2882 #endif
2883         }
2884
2885         /* Basically func 2 reads param 0, func 3 writes param 0, func 4 reads param 1, etc */
2886         if (func >= 2) {
2887                 int i = (func-2) >> 1;
2888                 if (i >= 0 && i < N_PARAM) {
2889                         if ((func & 1) == 0)
2890                                 return put_user(bdf_prm.data[i], (int*)data);
2891
2892                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2893                                 bdf_prm.data[i] = data;
2894                                 return 0;
2895                         }
2896                 }
2897                 return -EINVAL;
2898         }
2899
2900         /* Func 0 used to launch the actual bdflush and then never
2901          * return (unless explicitly killed). We return zero here to 
2902          * remain semi-compatible with present update(8) programs.
2903          */
2904         return 0;
2905 }
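
/*
 * Worked example of the func encoding above, as seen from userspace (a
 * sketch, assuming the historical bdflush(2) library wrapper; reads pass a
 * pointer cast to long, writes pass the new value directly):
 */
#if 0
static void example_tune_bdflush(void)
{
	long value;

	bdflush(2, (long) &value);	/* func 2: read parameter 0      */
	bdflush(3, 40);			/* func 3: set parameter 0 to 40 */
	bdflush(4, (long) &value);	/* func 4: read parameter 1      */
}
#endif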
2906
2907 /*
2908  * This is the actual bdflush daemon itself. It used to be started from
2909  * the syscall above, but now we launch it ourselves internally with
2910  * kernel_thread(...)  directly after the first thread in init/main.c
2911  */
2912 int bdflush(void *startup)
2913 {
2914         struct task_struct *tsk = current;
2915
2916         /*
2917          *      We have a bare-bones task_struct, and really should fill
2918          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2919          *      display semi-sane things. Not real crucial though...  
2920          */
2921
2922         tsk->session = 1;
2923         tsk->pgrp = 1;
2924         strcpy(tsk->comm, "bdflush");
2925
2926         /* avoid getting signals */
2927         spin_lock_irq(&tsk->sigmask_lock);
2928         flush_signals(tsk);
2929         sigfillset(&tsk->blocked);
2930         recalc_sigpending(tsk);
2931         spin_unlock_irq(&tsk->sigmask_lock);
2932
2933         complete((struct completion *)startup);
2934
2935         /*
2936          * FIXME: The ndirty logic here is wrong.  It's supposed to
2937          * send bdflush back to sleep after writing ndirty buffers.
2938          * In fact the test is wrong, so bdflush will actually
2939          * sleep when bdflush_stop() returns true.
2940          *
2941          * FIXME: If it proves useful to implement ndirty properly,
2942          * then perhaps the value of ndirty should be scaled by the
2943          * amount of memory in the machine.
2944          */
2945         for (;;) {
2946                 int ndirty = bdf_prm.b_un.ndirty;
2947
2948                 CHECK_EMERGENCY_SYNC
2949
2950                 while (ndirty > 0) {
2951                         spin_lock(&lru_list_lock);
2952                         if (!write_some_buffers(NODEV))
2953                                 break;
2954                         ndirty -= NRSYNC;
2955                 }
2956                 if (ndirty > 0 || bdflush_stop())
2957                         interruptible_sleep_on(&bdflush_wait);
2958         }
2959 }
2960
2961 /*
2962  * This is the kernel update daemon. It used to live in userspace,
2963  * but since it needs to run reliably we do not want it to be killable by mistake.
2964  * You don't need to change your userspace configuration since
2965  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2966  */
2967 int kupdate(void *startup)
2968 {
2969         struct task_struct * tsk = current;
2970         int interval;
2971
2972         tsk->session = 1;
2973         tsk->pgrp = 1;
2974         strcpy(tsk->comm, "kupdated");
2975
2976         /* sigstop and sigcont will stop and wakeup kupdate */
2977         spin_lock_irq(&tsk->sigmask_lock);
2978         sigfillset(&tsk->blocked);
2979         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2980         recalc_sigpending(tsk);
2981         spin_unlock_irq(&tsk->sigmask_lock);
2982
2983         complete((struct completion *)startup);
2984
2985         for (;;) {
2986                 /* update interval */
2987                 interval = bdf_prm.b_un.interval;
2988                 if (interval) {
2989                         tsk->state = TASK_INTERRUPTIBLE;
2990                         schedule_timeout(interval);
2991                 } else {
2992                 stop_kupdate:
2993                         tsk->state = TASK_STOPPED;
2994                         schedule(); /* wait for SIGCONT */
2995                 }
2996                 /* check for sigstop */
2997                 if (signal_pending(tsk)) {
2998                         int stopped = 0;
2999                         spin_lock_irq(&tsk->sigmask_lock);
3000                         if (sigismember(&tsk->pending.signal, SIGSTOP)) {
3001                                 sigdelset(&tsk->pending.signal, SIGSTOP);
3002                                 stopped = 1;
3003                         }
3004                         recalc_sigpending(tsk);
3005                         spin_unlock_irq(&tsk->sigmask_lock);
3006                         if (stopped)
3007                                 goto stop_kupdate;
3008                 }
3009 #ifdef DEBUG
3010                 printk(KERN_DEBUG "kupdate() activated...\n");
3011 #endif
3012                 sync_old_buffers();
3013                 run_task_queue(&tq_disk);
3014         }
3015 }
3016
3017 static int __init bdflush_init(void)
3018 {
3019         static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3020
3021         kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3022         wait_for_completion(&startup);
3023         kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3024         wait_for_completion(&startup);
3025         return 0;
3026 }
3027
3028 module_init(bdflush_init)
3029