[linux-2.4.git] / fs / xfs / linux-2.4 / xfs_buf.c
1 /*
2  * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms of version 2 of the GNU General Public License as
6  * published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it would be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11  *
12  * Further, this software is distributed without any warranty that it is
13  * free of the rightful claim of any third person regarding infringement
14  * or the like.  Any license provided herein, whether implied or
15  * otherwise, applies only to this software file.  Patent licenses, if
16  * any, provided herein do not apply to combinations of this program with
17  * other software, or any other product whatsoever.
18  *
19  * You should have received a copy of the GNU General Public License along
20  * with this program; if not, write the Free Software Foundation, Inc., 59
21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22  *
23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24  * Mountain View, CA  94043, or:
25  *
26  * http://www.sgi.com
27  *
28  * For further information regarding this notice, see:
29  *
30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31  */
32
33 /*
34  *      The xfs_buf.c code provides an abstract buffer cache model on top
35  *      of the Linux page cache.  Cached metadata blocks for a file system
36  *      are hashed to the inode for the block device.  xfs_buf.c assembles
37  *      buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
38  *
39  *      Written by Steve Lord, Jim Mostek, Russell Cattelan
40  *                  and Rajagopal Ananthanarayanan ("ananth") at SGI.
41  *
42  */
43
44 #include <linux/stddef.h>
45 #include <linux/errno.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <linux/init.h>
49 #include <linux/vmalloc.h>
50 #include <linux/blkdev.h>
51 #include <linux/locks.h>
52 #include <linux/sysctl.h>
53 #include <linux/proc_fs.h>
54
55 #include "xfs_linux.h"
56
57 #define BN_ALIGN_MASK   ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
58
59 #ifndef GFP_READAHEAD
60 #define GFP_READAHEAD   0
61 #endif
62
63 /*
64  * A backport of the 2.5 scheduler is used by many vendors of 2.4-based
65  * distributions.
66  * We can only guess its presence by the lack of the SCHED_YIELD flag.
67  * If the heuristic doesn't work, change this define by hand.
68  */
69 #ifndef SCHED_YIELD
70 #define __HAVE_NEW_SCHEDULER    1
71 #endif
72
73 /*
74  * cpumask_t is used for supporting NR_CPUS > BITS_PER_LONG.
75  * If support for this is present, migrate_to_cpu exists and provides
76  * a wrapper around the set_cpus_allowed routine.
77  */
78 #ifdef copy_cpumask
79 #define __HAVE_CPUMASK_T        1
80 #endif
81
82 #ifndef __HAVE_CPUMASK_T
83 # ifndef __HAVE_NEW_SCHEDULER
84 #  define migrate_to_cpu(cpu)   \
85         do { current->cpus_allowed = 1UL << (cpu); } while (0)
86 # else
87 #  define migrate_to_cpu(cpu)   \
88         set_cpus_allowed(current, 1UL << (cpu))
89 # endif
90 #endif
91
92 #ifndef VM_MAP
93 #define VM_MAP  VM_ALLOC
94 #endif
95
96 /*
97  * File wide globals
98  */
99
100 STATIC kmem_cache_t *pagebuf_cache;
101 STATIC kmem_shaker_t pagebuf_shake;
102
103 #define MAX_IO_DAEMONS          NR_CPUS
104 #define CPU_TO_DAEMON(cpu)      (cpu)
105 STATIC int pb_logio_daemons[MAX_IO_DAEMONS];
106 STATIC struct list_head pagebuf_logiodone_tq[MAX_IO_DAEMONS];
107 STATIC wait_queue_head_t pagebuf_logiodone_wait[MAX_IO_DAEMONS];
108 STATIC int pb_dataio_daemons[MAX_IO_DAEMONS];
109 STATIC struct list_head pagebuf_dataiodone_tq[MAX_IO_DAEMONS];
110 STATIC wait_queue_head_t pagebuf_dataiodone_wait[MAX_IO_DAEMONS];
111
112 /*
113  * For pre-allocated buffer head pool
114  */
115
116 #define NR_RESERVED_BH  64
117 static wait_queue_head_t        pb_resv_bh_wait;
118 static spinlock_t               pb_resv_bh_lock = SPIN_LOCK_UNLOCKED;
119 struct buffer_head              *pb_resv_bh = NULL;     /* list of bh */
120 int                             pb_resv_bh_cnt = 0;     /* # of bh available */
121
122 STATIC void _pagebuf_ioapply(xfs_buf_t *);
123 STATIC int pagebuf_daemon_wakeup(int, unsigned int);
124 STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
125 STATIC void pagebuf_runall_queues(struct list_head[]);
126
127 /*
128  * Pagebuf debugging
129  */
130
131 #ifdef PAGEBUF_TRACE
132 void
133 pagebuf_trace(
134         xfs_buf_t       *pb,
135         char            *id,
136         void            *data,
137         void            *ra)
138 {
139         ktrace_enter(pagebuf_trace_buf,
140                 pb, id,
141                 (void *)(unsigned long)pb->pb_flags,
142                 (void *)(unsigned long)pb->pb_hold.counter,
143                 (void *)(unsigned long)pb->pb_sema.count.counter,
144                 (void *)current,
145                 data, ra,
146                 (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
147                 (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
148                 (void *)(unsigned long)pb->pb_buffer_length,
149                 NULL, NULL, NULL, NULL, NULL);
150 }
151 ktrace_t *pagebuf_trace_buf;
152 #define PAGEBUF_TRACE_SIZE      4096
153 #define PB_TRACE(pb, id, data)  \
154         pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
155 #else
156 #define PB_TRACE(pb, id, data)  do { } while (0)
157 #endif
158
159 #ifdef PAGEBUF_LOCK_TRACKING
160 # define PB_SET_OWNER(pb)       ((pb)->pb_last_holder = current->pid)
161 # define PB_CLEAR_OWNER(pb)     ((pb)->pb_last_holder = -1)
162 # define PB_GET_OWNER(pb)       ((pb)->pb_last_holder)
163 #else
164 # define PB_SET_OWNER(pb)       do { } while (0)
165 # define PB_CLEAR_OWNER(pb)     do { } while (0)
166 # define PB_GET_OWNER(pb)       do { } while (0)
167 #endif
168
169 /*
170  * Pagebuf allocation / freeing.
171  */
172
173 #define pb_to_gfp(flags) \
174         (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
175          ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
176
177 #define pb_to_km(flags) \
178          (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
179
180
181 #define pagebuf_allocate(flags) \
182         kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
183 #define pagebuf_deallocate(pb) \
184         kmem_zone_free(pagebuf_cache, (pb));
185
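/*
 * Illustrative only (not compiled; gfp/km are placeholder variables):
 * how the flag bits select an allocation policy via the macros above.
 * Readahead allocations must never block, PBF_DONT_BLOCK callers must
 * not recurse into the filesystem, and everything else may sleep.
 */
#if 0
	gfp = pb_to_gfp(PBF_READ_AHEAD);	/* GFP_READAHEAD - fail fast  */
	gfp = pb_to_gfp(PBF_DONT_BLOCK);	/* GFP_NOFS - no FS recursion */
	gfp = pb_to_gfp(0);			/* GFP_KERNEL - may sleep     */
	km  = pb_to_km(PBF_DONT_BLOCK);		/* KM_NOFS                    */
	km  = pb_to_km(0);			/* KM_SLEEP                   */
#endif
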
186 /*
187  * Pagebuf hashing
188  */
189
190 #define NBITS   8
191 #define NHASH   (1<<NBITS)
192
193 typedef struct {
194         struct list_head        pb_hash;
195         spinlock_t              pb_hash_lock;
196 } pb_hash_t;
197
198 STATIC pb_hash_t        pbhash[NHASH];
199 #define pb_hash(pb)     &pbhash[pb->pb_hash_index]
200
201 STATIC int
202 _bhash(
203         struct block_device *bdev,
204         loff_t          base)
205 {
206         int             bit, hval;
207
208         base >>= 9;
209         base ^= (unsigned long)bdev / L1_CACHE_BYTES;
210         for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
211                 hval ^= (int)base & (NHASH-1);
212                 base >>= NBITS;
213         }
214         return hval;
215 }
216
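/*
 * Worked example (illustrative only, ignoring the block_device term for
 * clarity): a byte offset has its low 9 bits shifted away and is then
 * folded NBITS (8) bits at a time into one of the NHASH (256) buckets.
 * The offset below is made up purely to show the fold:
 *
 *	base = 0x12345600 >> 9 = 0x91a2b
 *	hval = 0x2b ^ 0x1a ^ 0x09 = 0x38	-> pbhash[0x38]
 */
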
217 /*
218  * Mapping of multi-page buffers into contiguous virtual space
219  */
220
221 typedef struct a_list {
222         void            *vm_addr;
223         struct a_list   *next;
224 } a_list_t;
225
226 STATIC a_list_t         *as_free_head;
227 STATIC int              as_list_len;
228 STATIC spinlock_t       as_lock = SPIN_LOCK_UNLOCKED;
229
230 /*
231  * Try to batch vunmaps because they are costly.
232  */
233 STATIC void
234 free_address(
235         void            *addr)
236 {
237         a_list_t        *aentry;
238
239         aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
240         if (aentry) {
241                 spin_lock(&as_lock);
242                 aentry->next = as_free_head;
243                 aentry->vm_addr = addr;
244                 as_free_head = aentry;
245                 as_list_len++;
246                 spin_unlock(&as_lock);
247         } else {
248                 vunmap(addr);
249         }
250 }
251
252 STATIC void
253 purge_addresses(void)
254 {
255         a_list_t        *aentry, *old;
256
257         if (as_free_head == NULL)
258                 return;
259
260         spin_lock(&as_lock);
261         aentry = as_free_head;
262         as_free_head = NULL;
263         as_list_len = 0;
264         spin_unlock(&as_lock);
265
266         while ((old = aentry) != NULL) {
267                 vunmap(aentry->vm_addr);
268                 aentry = aentry->next;
269                 kfree(old);
270         }
271 }
272
273 /*
274  *      Internal pagebuf object manipulation
275  */
276
277 STATIC void
278 _pagebuf_initialize(
279         xfs_buf_t               *pb,
280         xfs_buftarg_t           *target,
281         loff_t                  range_base,
282         size_t                  range_length,
283         page_buf_flags_t        flags)
284 {
285         /*
286          * We don't want certain flags to appear in pb->pb_flags.
287          */
288         flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
289
290         memset(pb, 0, sizeof(xfs_buf_t));
291         atomic_set(&pb->pb_hold, 1);
292         init_MUTEX_LOCKED(&pb->pb_iodonesema);
293         INIT_LIST_HEAD(&pb->pb_list);
294         INIT_LIST_HEAD(&pb->pb_hash_list);
295         init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
296         PB_SET_OWNER(pb);
297         pb->pb_target = target;
298         pb->pb_file_offset = range_base;
299         /*
300          * Set buffer_length and count_desired to the same value initially.
301          * I/O routines should use count_desired, which will be the same in
302          * most cases but may be reset (e.g. XFS recovery).
303          */
304         pb->pb_buffer_length = pb->pb_count_desired = range_length;
305         pb->pb_flags = flags | PBF_NONE;
306         pb->pb_bn = XFS_BUF_DADDR_NULL;
307         atomic_set(&pb->pb_pin_count, 0);
308         init_waitqueue_head(&pb->pb_waiters);
309
310         XFS_STATS_INC(pb_create);
311         PB_TRACE(pb, "initialize", target);
312 }
313
314 /*
315  * Allocate a page array capable of holding a specified number
316  * of pages, and point the page buf at it.
317  */
318 STATIC int
319 _pagebuf_get_pages(
320         xfs_buf_t               *pb,
321         int                     page_count,
322         page_buf_flags_t        flags)
323 {
324         /* Make sure that we have a page list */
325         if (pb->pb_pages == NULL) {
326                 pb->pb_offset = page_buf_poff(pb->pb_file_offset);
327                 pb->pb_page_count = page_count;
328                 if (page_count <= PB_PAGES) {
329                         pb->pb_pages = pb->pb_page_array;
330                 } else {
331                         pb->pb_pages = kmem_alloc(sizeof(struct page *) *
332                                         page_count, pb_to_km(flags));
333                         if (pb->pb_pages == NULL)
334                                 return -ENOMEM;
335                 }
336                 memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
337         }
338         return 0;
339 }
340
341 /*
342  *      Frees pb_pages if it was malloced.
343  */
344 STATIC void
345 _pagebuf_free_pages(
346         xfs_buf_t       *bp)
347 {
348         if (bp->pb_pages != bp->pb_page_array) {
349                 kmem_free(bp->pb_pages,
350                           bp->pb_page_count * sizeof(struct page *));
351         }
352 }
353
354 /*
355  *      Releases the specified buffer.
356  *
357  *      The modification state of any associated pages is left unchanged.
358  *      The buffer must not be on any hash - use pagebuf_rele instead for
359  *      hashed and refcounted buffers.
360  */
361 void
362 pagebuf_free(
363         xfs_buf_t               *bp)
364 {
365         PB_TRACE(bp, "free", 0);
366
367         ASSERT(list_empty(&bp->pb_hash_list));
368
369         if (bp->pb_flags & _PBF_PAGE_CACHE) {
370                 uint            i;
371
372                 if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
373                         free_address(bp->pb_addr - bp->pb_offset);
374
375                 for (i = 0; i < bp->pb_page_count; i++)
376                         page_cache_release(bp->pb_pages[i]);
377                 _pagebuf_free_pages(bp);
378         } else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
379                  /*
380                   * XXX(hch): bp->pb_count_desired might be incorrect (see
381                   * pagebuf_associate_memory for details), but fortunately
382                   * the Linux version of kmem_free ignores the len argument..
383                   */
384                 kmem_free(bp->pb_addr, bp->pb_count_desired);
385                 _pagebuf_free_pages(bp);
386         }
387
388         pagebuf_deallocate(bp);
389 }
390
391 /*
392  *      Finds all pages for the buffer in question and builds its page list.
393  */
394 STATIC int
395 _pagebuf_lookup_pages(
396         xfs_buf_t               *bp,
397         uint                    flags)
398 {
399         struct address_space    *mapping = bp->pb_target->pbr_mapping;
400         size_t                  blocksize = bp->pb_target->pbr_bsize;
401         int                     gfp_mask = pb_to_gfp(flags);
402         unsigned short          page_count, i;
403         pgoff_t                 first;
404         loff_t                  end;
405         int                     error;
406
407         end = bp->pb_file_offset + bp->pb_buffer_length;
408         page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
409
410         error = _pagebuf_get_pages(bp, page_count, flags);
411         if (unlikely(error))
412                 return error;
413         bp->pb_flags |= _PBF_PAGE_CACHE;
414
415         first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
416
417         for (i = 0; i < bp->pb_page_count; i++) {
418                 struct page     *page;
419                 uint            retries = 0;
420
421               retry:
422                 page = find_or_create_page(mapping, first + i, gfp_mask);
423                 if (unlikely(page == NULL)) {
424                         if (flags & PBF_READ_AHEAD) {
425                                 bp->pb_page_count = i;
426                                 for (i = 0; i < bp->pb_page_count; i++)
427                                         unlock_page(bp->pb_pages[i]);
428                                 return -ENOMEM;
429                         }
430
431                         /*
432                          * This could deadlock.
433                          *
434                          * But until all the XFS lowlevel code is revamped to
435                          * handle buffer allocation failures we can't do much.
436                          */
437                         if (!(++retries % 100))
438                                 printk(KERN_ERR
439                                         "possible deadlock in %s (mode:0x%x)\n",
440                                         __FUNCTION__, gfp_mask);
441
442                         XFS_STATS_INC(pb_page_retries);
443                         pagebuf_daemon_wakeup(0, gfp_mask);
444                         set_current_state(TASK_UNINTERRUPTIBLE);
445                         schedule_timeout(10);
446                         goto retry;
447                 }
448
449                 XFS_STATS_INC(pb_page_found);
450
451                 /* if we need to do I/O on a page record the fact */
452                 if (!Page_Uptodate(page)) {
453                         page_count--;
454                         if (blocksize == PAGE_CACHE_SIZE && (flags & PBF_READ))
455                                 bp->pb_locked = 1;
456                 }
457
458                 bp->pb_pages[i] = page;
459         }
460
461         if (!bp->pb_locked) {
462                 for (i = 0; i < bp->pb_page_count; i++)
463                         unlock_page(bp->pb_pages[i]);
464         }
465
466         if (page_count) {
467                 /* if we have any uptodate pages, mark that in the buffer */
468                 bp->pb_flags &= ~PBF_NONE;
469
470                 /* if some pages aren't uptodate, mark that in the buffer */
471                 if (page_count != bp->pb_page_count)
472                         bp->pb_flags |= PBF_PARTIAL;
473         }
474
475         PB_TRACE(bp, "lookup_pages", (long)page_count);
476         return error;
477 }
478
479 /*
480  *      Map buffer into kernel address-space if necessary.
481  */
482 STATIC int
483 _pagebuf_map_pages(
484         xfs_buf_t               *bp,
485         uint                    flags)
486 {
487         /* A single page buffer is always mappable */
488         if (bp->pb_page_count == 1) {
489                 bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
490                 bp->pb_flags |= PBF_MAPPED;
491         } else if (flags & PBF_MAPPED) {
492                 if (as_list_len > 64)
493                         purge_addresses();
494                 bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
495                                 VM_MAP, PAGE_KERNEL);
496                 if (unlikely(bp->pb_addr == NULL))
497                         return -ENOMEM;
498                 bp->pb_addr += bp->pb_offset;
499                 bp->pb_flags |= PBF_MAPPED;
500         }
501
502         return 0;
503 }
504
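/*
 * Illustrative only (not compiled): the usual sequence for building a
 * mapped multi-page buffer, mirroring xfs_buf_get_flags() below.  Note
 * that _pagebuf_map_pages() drains the deferred vunmap list via
 * purge_addresses() once it grows past 64 entries, so vmap() address
 * space is not exhausted by stale mappings.
 */
#if 0
	error = _pagebuf_lookup_pages(pb, flags);	/* attach page cache pages */
	if (!error)
		error = _pagebuf_map_pages(pb, PBF_MAPPED); /* vmap() if > 1 page */
#endif
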
505 /*
506  *      Pre-allocation of a pool of buffer heads for use in
507  *      low-memory situations.
508  */
509
510 /*
511  *      _pagebuf_prealloc_bh
512  *
513  *      Pre-allocate a pool of "count" buffer heads at startup.
514  *      Puts them on a list at "pb_resv_bh"
515  *      Returns number of bh actually allocated to pool.
516  */
517 STATIC int
518 _pagebuf_prealloc_bh(
519         int                     count)
520 {
521         struct buffer_head      *bh;
522         int                     i;
523
524         for (i = 0; i < count; i++) {
525                 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
526                 if (!bh)
527                         break;
528                 bh->b_pprev = &pb_resv_bh;
529                 bh->b_next = pb_resv_bh;
530                 pb_resv_bh = bh;
531                 pb_resv_bh_cnt++;
532         }
533         return i;
534 }
535
536 /*
537  *      _pagebuf_get_prealloc_bh
538  *
539  *      Get one buffer head from our pre-allocated pool.
540  *      If pool is empty, sleep 'til one comes back in.
541  *      Returns aforementioned buffer head.
542  */
543 STATIC struct buffer_head *
544 _pagebuf_get_prealloc_bh(void)
545 {
546         unsigned long           flags;
547         struct buffer_head      *bh;
548         DECLARE_WAITQUEUE       (wait, current);
549
550         spin_lock_irqsave(&pb_resv_bh_lock, flags);
551
552         if (pb_resv_bh_cnt < 1) {
553                 add_wait_queue(&pb_resv_bh_wait, &wait);
554                 do {
555                         set_current_state(TASK_UNINTERRUPTIBLE);
556                         spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
557                         run_task_queue(&tq_disk);
558                         schedule();
559                         spin_lock_irqsave(&pb_resv_bh_lock, flags);
560                 } while (pb_resv_bh_cnt < 1);
561                 __set_current_state(TASK_RUNNING);
562                 remove_wait_queue(&pb_resv_bh_wait, &wait);
563         }
564
565         BUG_ON(pb_resv_bh_cnt < 1);
566         BUG_ON(!pb_resv_bh);
567
568         bh = pb_resv_bh;
569         pb_resv_bh = bh->b_next;
570         pb_resv_bh_cnt--;
571
572         spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
573         return bh;
574 }
575
576 /*
577  *      _pagebuf_free_bh
578  *
579  *      Take care of buffer heads that we're finished with.
580  *      Call this instead of just kmem_cache_free(bh_cachep, bh)
581  *      when you're done with a bh.
582  *
583  *      If our pre-allocated pool is full, just free the buffer head.
584  *      Otherwise, put it back in the pool, and wake up anybody
585  *      waiting for one.
586  */
587 STATIC inline void
588 _pagebuf_free_bh(
589         struct buffer_head      *bh)
590 {
591         unsigned long           flags;
592         int                     free;
593
594         if (! (free = pb_resv_bh_cnt >= NR_RESERVED_BH)) {
595                 spin_lock_irqsave(&pb_resv_bh_lock, flags);
596
597                 if (! (free = pb_resv_bh_cnt >= NR_RESERVED_BH)) {
598                         bh->b_pprev = &pb_resv_bh;
599                         bh->b_next = pb_resv_bh;
600                         pb_resv_bh = bh;
601                         pb_resv_bh_cnt++;
602
603                         if (waitqueue_active(&pb_resv_bh_wait)) {
604                                 wake_up(&pb_resv_bh_wait);
605                         }
606                 }
607
608                 spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
609         }
610         if (free) {
611                 kmem_cache_free(bh_cachep, bh);
612         }
613 }
614
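/*
 * Illustrative only (not compiled): how the reserved pool is meant to be
 * used, mirroring _pagebuf_page_io() and _end_io_pagebuf() further down.
 * The pool is only a fallback for slab allocation failure, and every
 * buffer head obtained from it must go back through _pagebuf_free_bh()
 * so the pool is refilled and waiters are woken.
 */
#if 0
	bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
	if (!bh)
		bh = _pagebuf_get_prealloc_bh();	/* may sleep for a free bh */
	/* ... set up and submit the I/O ... */
	_pagebuf_free_bh(bh);				/* refill pool or free to slab */
#endif
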
615 /*
616  *      Finding and Reading Buffers
617  */
618
619 /*
620  *      _pagebuf_find
621  *
622  *      Looks up, and creates if absent, a lockable buffer for
623  *      a given range of an inode.  The buffer is returned
624  *      locked.  If other overlapping buffers exist, they are
625  *      released before the new buffer is created and locked,
626  *      which may imply that this call will block until those buffers
627  *      are unlocked.  No I/O is implied by this call.
628  */
629 xfs_buf_t *
630 _pagebuf_find(                          /* find buffer for block        */
631         xfs_buftarg_t           *target,/* target for block             */
632         loff_t                  ioff,   /* starting offset of range     */
633         size_t                  isize,  /* length of range              */
634         page_buf_flags_t        flags,  /* PBF_TRYLOCK                  */
635         xfs_buf_t               *new_pb)/* newly allocated buffer       */
636 {
637         loff_t                  range_base;
638         size_t                  range_length;
639         int                     hval;
640         pb_hash_t               *h;
641         xfs_buf_t               *pb, *n;
642         int                     not_locked;
643
644         range_base = (ioff << BBSHIFT);
645         range_length = (isize << BBSHIFT);
646
647         /* Ensure we never do IOs smaller than the sector size */
648         BUG_ON(range_length < (1 << target->pbr_sshift));
649
650         /* Ensure we never do IOs that are not sector aligned */
651         BUG_ON(range_base & (loff_t)target->pbr_smask);
652
653         hval = _bhash(target->pbr_bdev, range_base);
654         h = &pbhash[hval];
655
656         spin_lock(&h->pb_hash_lock);
657
658         list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
659                 if (pb->pb_target == target &&
660                     pb->pb_file_offset == range_base &&
661                     pb->pb_buffer_length == range_length) {
662                         /* If we look at something bring it to the
663                          * front of the list for next time
664                          */
665                         atomic_inc(&pb->pb_hold);
666                         list_move(&pb->pb_hash_list, &h->pb_hash);
667                         goto found;
668                 }
669         }
670
671         /* No match found */
672         if (new_pb) {
673                 _pagebuf_initialize(new_pb, target, range_base,
674                                 range_length, flags);
675                 new_pb->pb_hash_index = hval;
676                 list_add(&new_pb->pb_hash_list, &h->pb_hash);
677         } else {
678                 XFS_STATS_INC(pb_miss_locked);
679         }
680
681         spin_unlock(&h->pb_hash_lock);
682         return (new_pb);
683
684 found:
685         spin_unlock(&h->pb_hash_lock);
686
687         /* Attempt to get the semaphore without sleeping,
688          * if this does not work then we need to drop the
689          * spinlock and do a hard attempt on the semaphore.
690          */
691         not_locked = down_trylock(&pb->pb_sema);
692         if (not_locked) {
693                 if (!(flags & PBF_TRYLOCK)) {
694                         /* wait for buffer ownership */
695                         PB_TRACE(pb, "get_lock", 0);
696                         pagebuf_lock(pb);
697                         XFS_STATS_INC(pb_get_locked_waited);
698                 } else {
699                         /* We asked for a trylock and failed; no need
700                          * to look at file offset and length here - we
701                          * know that this pagebuf at least overlaps our
702                          * pagebuf and is locked, therefore our buffer
703                          * either does not exist, or is this buffer.
704                          */
705
706                         pagebuf_rele(pb);
707                         XFS_STATS_INC(pb_busy_locked);
708                         return (NULL);
709                 }
710         } else {
711                 /* trylock worked */
712                 PB_SET_OWNER(pb);
713         }
714
715         if (pb->pb_flags & PBF_STALE)
716                 pb->pb_flags &= PBF_MAPPED;
717         PB_TRACE(pb, "got_lock", 0);
718         XFS_STATS_INC(pb_get_locked);
719         return (pb);
720 }
721
722 /*
723  *      xfs_buf_get_flags assembles a buffer covering the specified range.
724  *
725  *      Storage in memory for all portions of the buffer will be allocated,
726  *      although backing storage may not be.
727  */
728 xfs_buf_t *
729 xfs_buf_get_flags(                      /* allocate a buffer            */
730         xfs_buftarg_t           *target,/* target for buffer            */
731         loff_t                  ioff,   /* starting offset of range     */
732         size_t                  isize,  /* length of range              */
733         page_buf_flags_t        flags)  /* PBF_TRYLOCK                  */
734 {
735         xfs_buf_t               *pb, *new_pb;
736         int                     error = 0, i;
737
738         new_pb = pagebuf_allocate(flags);
739         if (unlikely(!new_pb))
740                 return NULL;
741
742         pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
743         if (pb == new_pb) {
744                 error = _pagebuf_lookup_pages(pb, flags);
745                 if (error)
746                         goto no_buffer;
747         } else {
748                 pagebuf_deallocate(new_pb);
749                 if (unlikely(pb == NULL))
750                         return NULL;
751         }
752
753         for (i = 0; i < pb->pb_page_count; i++)
754                 mark_page_accessed(pb->pb_pages[i]);
755
756         if (!(pb->pb_flags & PBF_MAPPED)) {
757                 error = _pagebuf_map_pages(pb, flags);
758                 if (unlikely(error)) {
759                         printk(KERN_WARNING "%s: failed to map pages\n",
760                                         __FUNCTION__);
761                         goto no_buffer;
762                 }
763         }
764
765         XFS_STATS_INC(pb_get);
766
767         /*
768          * Always fill in the block number now, the mapped cases can do
769          * their own overlay of this later.
770          */
771         pb->pb_bn = ioff;
772         pb->pb_count_desired = pb->pb_buffer_length;
773
774         PB_TRACE(pb, "get", (unsigned long)flags);
775         return pb;
776
777  no_buffer:
778         if (flags & (PBF_LOCK | PBF_TRYLOCK))
779                 pagebuf_unlock(pb);
780         pagebuf_rele(pb);
781         return NULL;
782 }
783
784 xfs_buf_t *
785 xfs_buf_read_flags(
786         xfs_buftarg_t           *target,
787         loff_t                  ioff,
788         size_t                  isize,
789         page_buf_flags_t        flags)
790 {
791         xfs_buf_t               *pb;
792
793         flags |= PBF_READ;
794
795         pb = xfs_buf_get_flags(target, ioff, isize, flags);
796         if (pb) {
797                 if (PBF_NOT_DONE(pb)) {
798                         PB_TRACE(pb, "read", (unsigned long)flags);
799                         XFS_STATS_INC(pb_get_read);
800                         pagebuf_iostart(pb, flags);
801                 } else if (flags & PBF_ASYNC) {
802                         PB_TRACE(pb, "read_async", (unsigned long)flags);
803                         /*
804                          * Read ahead call which is already satisfied,
805                          * drop the buffer
806                          */
807                         goto no_buffer;
808                 } else {
809                         PB_TRACE(pb, "read_done", (unsigned long)flags);
810                         /* We do not want read in the flags */
811                         pb->pb_flags &= ~PBF_READ;
812                 }
813         }
814
815         return pb;
816
817  no_buffer:
818         if (flags & (PBF_LOCK | PBF_TRYLOCK))
819                 pagebuf_unlock(pb);
820         pagebuf_rele(pb);
821         return NULL;
822 }
823
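/*
 * Illustrative only (not compiled; blkno/nbblks are placeholders for a
 * 512-byte-block offset and length): a typical synchronous metadata read
 * through this interface.  The buffer comes back locked and held, so the
 * caller checks pb_error and drops both references with xfs_buf_relse().
 */
#if 0
	pb = xfs_buf_read_flags(target, blkno, nbblks, PBF_LOCK);
	if (pb) {
		if (pb->pb_error)
			/* handle the I/O error */;
		xfs_buf_relse(pb);
	}
#endif
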
824 /*
825  * Create a skeletal pagebuf (no pages associated with it).
826  */
827 xfs_buf_t *
828 pagebuf_lookup(
829         xfs_buftarg_t           *target,
830         loff_t                  ioff,
831         size_t                  isize,
832         page_buf_flags_t        flags)
833 {
834         xfs_buf_t               *pb;
835
836         flags |= _PBF_PRIVATE_BH;
837         pb = pagebuf_allocate(flags);
838         if (pb) {
839                 _pagebuf_initialize(pb, target, ioff, isize, flags);
840         }
841         return pb;
842 }
843
844 /*
845  * If we are not low on memory then do the readahead in a deadlock
846  * safe manner.
847  */
848 void
849 pagebuf_readahead(
850         xfs_buftarg_t           *target,
851         loff_t                  ioff,
852         size_t                  isize,
853         page_buf_flags_t        flags)
854 {
855         flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
856         xfs_buf_read_flags(target, ioff, isize, flags);
857 }
858
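/*
 * Illustrative only (not compiled; next_blkno/nbblks are placeholders):
 * readahead is simply an async, trylocked read with PBF_READ_AHEAD set,
 * so under memory pressure _pagebuf_lookup_pages() gives up instead of
 * sleeping, and an already-uptodate buffer is dropped immediately in
 * xfs_buf_read_flags() above.
 */
#if 0
	pagebuf_readahead(target, next_blkno, nbblks, 0);
	/* later, the blocking read finds the pages (hopefully) uptodate */
	pb = xfs_buf_read_flags(target, next_blkno, nbblks, PBF_LOCK);
#endif
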
859 xfs_buf_t *
860 pagebuf_get_empty(
861         size_t                  len,
862         xfs_buftarg_t           *target)
863 {
864         xfs_buf_t               *pb;
865
866         pb = pagebuf_allocate(0);
867         if (pb)
868                 _pagebuf_initialize(pb, target, 0, len, 0);
869         return pb;
870 }
871
872 static inline struct page *
873 mem_to_page(
874         void                    *addr)
875 {
876         if (((unsigned long)addr < VMALLOC_START) ||
877             ((unsigned long)addr >= VMALLOC_END)) {
878                 return virt_to_page(addr);
879         } else {
880                 return vmalloc_to_page(addr);
881         }
882 }
883
884 int
885 pagebuf_associate_memory(
886         xfs_buf_t               *pb,
887         void                    *mem,
888         size_t                  len)
889 {
890         int                     rval;
891         int                     i = 0;
892         size_t                  ptr;
893         size_t                  end, end_cur;
894         off_t                   offset;
895         int                     page_count;
896
897         page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
898         offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
899         if (offset && (len > PAGE_CACHE_SIZE))
900                 page_count++;
901
902         /* Free any previous set of page pointers */
903         if (pb->pb_pages)
904                 _pagebuf_free_pages(pb);
905
906         pb->pb_pages = NULL;
907         pb->pb_addr = mem;
908
909         rval = _pagebuf_get_pages(pb, page_count, 0);
910         if (rval)
911                 return rval;
912
913         pb->pb_offset = offset;
914         ptr = (size_t) mem & PAGE_CACHE_MASK;
915         end = PAGE_CACHE_ALIGN((size_t) mem + len);
916         end_cur = end;
917         /* set up first page */
918         pb->pb_pages[0] = mem_to_page(mem);
919
920         ptr += PAGE_CACHE_SIZE;
921         pb->pb_page_count = ++i;
922         while (ptr < end) {
923                 pb->pb_pages[i] = mem_to_page((void *)ptr);
924                 pb->pb_page_count = ++i;
925                 ptr += PAGE_CACHE_SIZE;
926         }
927         pb->pb_locked = 0;
928
929         pb->pb_count_desired = pb->pb_buffer_length = len;
930         pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
931
932         return 0;
933 }
934
935 xfs_buf_t *
936 pagebuf_get_no_daddr(
937         size_t                  len,
938         xfs_buftarg_t           *target)
939 {
940         size_t                  malloc_len = len;
941         xfs_buf_t               *bp;
942         void                    *data;
943         int                     error;
944
945         bp = pagebuf_allocate(0);
946         if (unlikely(bp == NULL))
947                 goto fail;
948         _pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
949
950  try_again:
951         data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
952         if (unlikely(data == NULL))
953                 goto fail_free_buf;
954
955         /* check whether alignment matches.. */
956         if ((__psunsigned_t)data !=
957             ((__psunsigned_t)data & ~target->pbr_smask)) {
958                 /* .. else double the size and try again */
959                 kmem_free(data, malloc_len);
960                 malloc_len <<= 1;
961                 goto try_again;
962         }
963
964         error = pagebuf_associate_memory(bp, data, len);
965         if (error)
966                 goto fail_free_mem;
967         bp->pb_flags |= _PBF_KMEM_ALLOC;
968
969         pagebuf_unlock(bp);
970
971         PB_TRACE(bp, "no_daddr", data);
972         return bp;
973  fail_free_mem:
974         kmem_free(data, malloc_len);
975  fail_free_buf:
976         pagebuf_free(bp);
977  fail:
978         return NULL;
979 }
980
981 /*
982  *      pagebuf_hold
983  *
984  *      Increment reference count on buffer, to hold the buffer concurrently
985  *      with another thread which may release (free) the buffer asynchronously.
986  *
987  *      Must hold the buffer already to call this function.
988  */
989 void
990 pagebuf_hold(
991         xfs_buf_t               *pb)
992 {
993         atomic_inc(&pb->pb_hold);
994         PB_TRACE(pb, "hold", 0);
995 }
996
997 /*
998  *      pagebuf_rele
999  *
1000  *      pagebuf_rele releases a hold on the specified buffer.  If the
1001  *      hold count is 1, pagebuf_rele calls pagebuf_free.
1002  */
1003 void
1004 pagebuf_rele(
1005         xfs_buf_t               *pb)
1006 {
1007         pb_hash_t               *hash = pb_hash(pb);
1008
1009         PB_TRACE(pb, "rele", pb->pb_relse);
1010
1011         if (atomic_dec_and_lock(&pb->pb_hold, &hash->pb_hash_lock)) {
1012                 int             do_free = 1;
1013
1014                 if (pb->pb_relse) {
1015                         atomic_inc(&pb->pb_hold);
1016                         spin_unlock(&hash->pb_hash_lock);
1017                         (*(pb->pb_relse)) (pb);
1018                         spin_lock(&hash->pb_hash_lock);
1019                         do_free = 0;
1020                 }
1021
1022                 if (pb->pb_flags & PBF_DELWRI) {
1023                         pb->pb_flags |= PBF_ASYNC;
1024                         atomic_inc(&pb->pb_hold);
1025                         pagebuf_delwri_queue(pb, 0);
1026                         do_free = 0;
1027                 } else if (pb->pb_flags & PBF_FS_MANAGED) {
1028                         do_free = 0;
1029                 }
1030
1031                 if (do_free) {
1032                         list_del_init(&pb->pb_hash_list);
1033                         spin_unlock(&hash->pb_hash_lock);
1034                         xfs_buf_free(pb);
1035                 } else {
1036                         spin_unlock(&hash->pb_hash_lock);
1037                 }
1038         }
1039 }
1040
1041
1042 /*
1043  *      Mutual exclusion on buffers.  Locking model:
1044  *
1045  *      Buffers associated with inodes for which buffer locking
1046  *      is not enabled are not protected by semaphores, and are
1047  *      assumed to be exclusively owned by the caller.  There is a
1048  *      spinlock in the buffer, used by the caller when concurrent
1049  *      access is possible.
1050  */
1051
1052 /*
1053  *      pagebuf_cond_lock
1054  *
1055  *      pagebuf_cond_lock locks a buffer object, if it is not already locked.
1056  *      Note that this in no way
1057  *      locks the underlying pages, so it is only useful for synchronizing
1058  *      concurrent use of page buffer objects, not for synchronizing independent
1059  *      access to the underlying pages.
1060  */
1061 int
1062 pagebuf_cond_lock(                      /* lock buffer, if not locked   */
1063                                         /* returns -EBUSY if locked)    */
1064         xfs_buf_t               *pb)
1065 {
1066         int                     locked;
1067
1068         locked = down_trylock(&pb->pb_sema) == 0;
1069         if (locked) {
1070                 PB_SET_OWNER(pb);
1071         }
1072         PB_TRACE(pb, "cond_lock", (long)locked);
1073         return(locked ? 0 : -EBUSY);
1074 }
1075
1076 #if defined(DEBUG) || defined(XFS_BLI_TRACE)
1077 /*
1078  *      pagebuf_lock_value
1079  *
1080  *      Return lock value for a pagebuf
1081  */
1082 int
1083 pagebuf_lock_value(
1084         xfs_buf_t               *pb)
1085 {
1086         return(atomic_read(&pb->pb_sema.count));
1087 }
1088 #endif
1089
1090 /*
1091  *      pagebuf_lock
1092  *
1093  *      pagebuf_lock locks a buffer object.  Note that this in no way
1094  *      locks the underlying pages, so it is only useful for synchronizing
1095  *      concurrent use of page buffer objects, not for synchronizing independent
1096  *      access to the underlying pages.
1097  */
1098 int
1099 pagebuf_lock(
1100         xfs_buf_t               *pb)
1101 {
1102         PB_TRACE(pb, "lock", 0);
1103         if (atomic_read(&pb->pb_io_remaining))
1104                 run_task_queue(&tq_disk);
1105         down(&pb->pb_sema);
1106         PB_SET_OWNER(pb);
1107         PB_TRACE(pb, "locked", 0);
1108         return 0;
1109 }
1110
1111 /*
1112  *      pagebuf_unlock
1113  *
1114  *      pagebuf_unlock releases the lock on the buffer object created by
1115  *      pagebuf_lock or pagebuf_cond_lock (not any
1116  *      pinning of underlying pages created by pagebuf_pin).
1117  */
1118 void
1119 pagebuf_unlock(                         /* unlock buffer                */
1120         xfs_buf_t               *pb)    /* buffer to unlock             */
1121 {
1122         PB_CLEAR_OWNER(pb);
1123         up(&pb->pb_sema);
1124         PB_TRACE(pb, "unlock", 0);
1125 }
1126
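/*
 * Illustrative only (not compiled): the trylock-then-block pattern these
 * primitives support.  pagebuf_cond_lock() returns 0 on success and
 * -EBUSY if another thread owns the buffer; pagebuf_lock() always sleeps
 * until the semaphore is obtained.
 */
#if 0
	if (pagebuf_cond_lock(pb) == -EBUSY)
		pagebuf_lock(pb);	/* block until the current owner unlocks */
	/* ... buffer is now owned by this thread ... */
	pagebuf_unlock(pb);
#endif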
1127
1128 /*
1129  *      Pinning Buffer Storage in Memory
1130  */
1131
1132 /*
1133  *      pagebuf_pin
1134  *
1135  *      pagebuf_pin locks all of the memory represented by a buffer in
1136  *      memory.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
1137  *      the same or different buffers affecting a given page, will
1138  *      properly count the number of outstanding "pin" requests.  The
1139  *      buffer may be released after the pagebuf_pin and a different
1140  *      buffer used when calling pagebuf_unpin, if desired.
1141  *      pagebuf_pin should be used by the file system when it wants to be
1142  *      assured that no attempt will be made to force the affected
1143  *      memory to disk.  It does not assure that a given logical page
1144  *      will not be moved to a different physical page.
1145  */
1146 void
1147 pagebuf_pin(
1148         xfs_buf_t               *pb)
1149 {
1150         atomic_inc(&pb->pb_pin_count);
1151         PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
1152 }
1153
1154 /*
1155  *      pagebuf_unpin
1156  *
1157  *      pagebuf_unpin reverses the locking of memory performed by
1158  *      pagebuf_pin.  Note that both functions affect the logical
1159  *      pages associated with the buffer, not the buffer itself.
1160  */
1161 void
1162 pagebuf_unpin(
1163         xfs_buf_t               *pb)
1164 {
1165         if (atomic_dec_and_test(&pb->pb_pin_count)) {
1166                 wake_up_all(&pb->pb_waiters);
1167         }
1168         PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
1169 }
1170
1171 int
1172 pagebuf_ispin(
1173         xfs_buf_t               *pb)
1174 {
1175         return atomic_read(&pb->pb_pin_count);
1176 }
1177
1178 /*
1179  *      pagebuf_wait_unpin
1180  *
1181  *      pagebuf_wait_unpin waits until all of the memory associated
1182  *      with the buffer is no longer locked in memory.  It returns
1183  *      immediately if none of the affected pages are locked.
1184  */
1185 static inline void
1186 _pagebuf_wait_unpin(
1187         xfs_buf_t               *pb)
1188 {
1189         DECLARE_WAITQUEUE       (wait, current);
1190
1191         if (atomic_read(&pb->pb_pin_count) == 0)
1192                 return;
1193
1194         add_wait_queue(&pb->pb_waiters, &wait);
1195         for (;;) {
1196                 set_current_state(TASK_UNINTERRUPTIBLE);
1197                 if (atomic_read(&pb->pb_pin_count) == 0)
1198                         break;
1199                 if (atomic_read(&pb->pb_io_remaining))
1200                         run_task_queue(&tq_disk);
1201                 schedule();
1202         }
1203         remove_wait_queue(&pb->pb_waiters, &wait);
1204         set_current_state(TASK_RUNNING);
1205 }
1206
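/*
 * Illustrative only (not compiled): pin/unpin bracketing as described
 * above.  While the pin count is non-zero the buffer's memory may not be
 * forced to disk; a writer calling _pagebuf_wait_unpin() blocks until
 * the count drains back to zero.
 */
#if 0
	pagebuf_pin(pb);
	/* ... contents must not be written back during this window ... */
	pagebuf_unpin(pb);	/* wakes anyone in _pagebuf_wait_unpin() */
#endif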
1207
1208 /*
1209  *      Buffer Utility Routines
1210  */
1211
1212 /*
1213  *      pagebuf_iodone
1214  *
1215  *      pagebuf_iodone marks a buffer for which I/O is in progress
1216  *      done with respect to that I/O.  The pb_iodone routine, if
1217  *      present, will be called as a side-effect.
1218  */
1219 void
1220 pagebuf_iodone_sched(
1221         void                    *v)
1222 {
1223         xfs_buf_t               *bp = (xfs_buf_t *)v;
1224
1225         if (bp->pb_iodone)
1226                 (*(bp->pb_iodone))(bp);
1227         else if (bp->pb_flags & PBF_ASYNC)
1228                 xfs_buf_relse(bp);
1229 }
1230
1231 void
1232 pagebuf_iodone(
1233         xfs_buf_t               *pb,
1234         int                     dataio,
1235         int                     schedule)
1236 {
1237         pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
1238         if (pb->pb_error == 0) {
1239                 pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
1240         }
1241
1242         PB_TRACE(pb, "iodone", pb->pb_iodone);
1243
1244         if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
1245                 if (schedule) {
1246                         int     daemon = CPU_TO_DAEMON(smp_processor_id());
1247
1248                         INIT_TQUEUE(&pb->pb_iodone_sched,
1249                                 pagebuf_iodone_sched, (void *)pb);
1250                         queue_task(&pb->pb_iodone_sched, dataio ?
1251                                 &pagebuf_dataiodone_tq[daemon] :
1252                                 &pagebuf_logiodone_tq[daemon]);
1253                         wake_up(dataio ?
1254                                 &pagebuf_dataiodone_wait[daemon] :
1255                                 &pagebuf_logiodone_wait[daemon]);
1256                 } else {
1257                         pagebuf_iodone_sched(pb);
1258                 }
1259         } else {
1260                 up(&pb->pb_iodonesema);
1261         }
1262 }
1263
1264 /*
1265  *      pagebuf_ioerror
1266  *
1267  *      pagebuf_ioerror sets the error code for a buffer.
1268  */
1269 void
1270 pagebuf_ioerror(                        /* mark/clear buffer error flag */
1271         xfs_buf_t               *pb,    /* buffer to mark               */
1272         int                     error)  /* error to store (0 if none)   */
1273 {
1274         ASSERT(error >= 0 && error <= 0xffff);
1275         pb->pb_error = (unsigned short)error;
1276         PB_TRACE(pb, "ioerror", (unsigned long)error);
1277 }
1278
1279 /*
1280  *      pagebuf_iostart
1281  *
1282  *      pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
1283  *      If necessary, it will arrange for any disk space allocation required,
1284  *      and it will break up the request if the block mappings require it.
1285  *      The pb_iodone routine in the buffer supplied will only be called
1286  *      when all of the subsidiary I/O requests, if any, have been completed.
1287  *      pagebuf_iostart calls the pagebuf_ioinitiate routine or
1288  *      pagebuf_iorequest, if the former routine is not defined, to start
1289  *      the I/O on a given low-level request.
1290  */
1291 int
1292 pagebuf_iostart(                        /* start I/O on a buffer          */
1293         xfs_buf_t               *pb,    /* buffer to start                */
1294         page_buf_flags_t        flags)  /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
1295                                         /* PBF_WRITE, PBF_DELWRI,         */
1296                                         /* PBF_DONT_BLOCK                 */
1297 {
1298         int                     status = 0;
1299
1300         PB_TRACE(pb, "iostart", (unsigned long)flags);
1301
1302         if (flags & PBF_DELWRI) {
1303                 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
1304                 pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
1305                 pagebuf_delwri_queue(pb, 1);
1306                 return status;
1307         }
1308
1309         pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
1310                         PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1311         pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
1312                         PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1313
1314         BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
1315
1316         /* For writes allow an alternate strategy routine to precede
1317          * the actual I/O request (which may not be issued at all in
1318          * a shutdown situation, for example).
1319          */
1320         status = (flags & PBF_WRITE) ?
1321                 pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
1322
1323         /* Wait for I/O if we are not an async request.
1324          * Note: async I/O request completion will release the buffer,
1325          * and that can already be done by this point.  So using the
1326          * buffer pointer from here on, after async I/O, is invalid.
1327          */
1328         if (!status && !(flags & PBF_ASYNC))
1329                 status = pagebuf_iowait(pb);
1330
1331         return status;
1332 }
1333
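/*
 * Illustrative only (not compiled): the three ways callers typically
 * drive I/O through pagebuf_iostart(), per the flag handling above.  A
 * delayed write only queues the buffer, an async request returns once
 * the I/O is issued (completion releases the buffer), and a plain
 * synchronous request waits in pagebuf_iowait().
 */
#if 0
	status = pagebuf_iostart(pb, PBF_DELWRI);		/* queue for later writeout */
	status = pagebuf_iostart(pb, PBF_WRITE | PBF_ASYNC);	/* fire and forget */
	status = pagebuf_iostart(pb, PBF_READ);			/* blocks until completion */
#endif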
1334
1335 /*
1336  * Helper routines for pagebuf_iorequest (pagebuf I/O completion)
1337  */
1338
1339 STATIC __inline__ int
1340 _pagebuf_iolocked(
1341         xfs_buf_t               *pb)
1342 {
1343         ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1344         if (pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE)
1345                 return pb->pb_locked;
1346         if (pb->pb_flags & PBF_READ)
1347                 return pb->pb_locked;
1348         return (pb->pb_flags & _PBF_PAGE_CACHE);
1349 }
1350
1351 STATIC void
1352 _pagebuf_iodone(
1353         xfs_buf_t               *pb,
1354         int                     schedule)
1355 {
1356         int                     i;
1357
1358         if (atomic_dec_and_test(&pb->pb_io_remaining) != 1)
1359                 return;
1360
1361         if (_pagebuf_iolocked(pb))
1362                 for (i = 0; i < pb->pb_page_count; i++)
1363                         unlock_page(pb->pb_pages[i]);
1364         pb->pb_locked = 0;
1365         pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
1366 }
1367
1368 STATIC void
1369 _end_io_pagebuf(
1370         struct buffer_head      *bh,
1371         int                     uptodate,
1372         int                     fullpage)
1373 {
1374         struct page             *page = bh->b_page;
1375         xfs_buf_t               *pb = (xfs_buf_t *)bh->b_private;
1376
1377         mark_buffer_uptodate(bh, uptodate);
1378         put_bh(bh);
1379
1380         if (!uptodate) {
1381                 SetPageError(page);
1382                 pb->pb_error = EIO;
1383         }
1384
1385         if (fullpage) {
1386                 unlock_buffer(bh);
1387                 _pagebuf_free_bh(bh);
1388                 if (!PageError(page))
1389                         SetPageUptodate(page);
1390         } else {
1391                 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
1392                 struct buffer_head *bp;
1393                 unsigned long flags;
1394
1395                 ASSERT(PageLocked(page));
1396                 spin_lock_irqsave(&page_uptodate_lock, flags);
1397                 clear_buffer_async(bh);
1398                 unlock_buffer(bh);
1399                 for (bp = bh->b_this_page; bp != bh; bp = bp->b_this_page) {
1400                         if (buffer_locked(bp)) {
1401                                 if (buffer_async(bp))
1402                                         break;
1403                         } else if (!buffer_uptodate(bp))
1404                                 break;
1405                 }
1406                 spin_unlock_irqrestore(&page_uptodate_lock, flags);
1407                 if (bp == bh && !PageError(page))
1408                         SetPageUptodate(page);
1409         }
1410
1411         _pagebuf_iodone(pb, 1);
1412 }
1413
1414 STATIC void
1415 _pagebuf_end_io_complete_pages(
1416         struct buffer_head      *bh,
1417         int                     uptodate)
1418 {
1419         _end_io_pagebuf(bh, uptodate, 1);
1420 }
1421
1422 STATIC void
1423 _pagebuf_end_io_partial_pages(
1424         struct buffer_head      *bh,
1425         int                     uptodate)
1426 {
1427         _end_io_pagebuf(bh, uptodate, 0);
1428 }
1429
1430 /*
1431  *      Handling of buftargs.
1432  */
1433
1434 /*
1435  * Wait for any bufs with callbacks that have been submitted but
1436  * have not yet returned... walk the hash list for the target.
1437  */
1438 void
1439 xfs_wait_buftarg(
1440         xfs_buftarg_t *target)
1441 {
1442         xfs_buf_t       *pb, *n;
1443         pb_hash_t       *h;
1444         int             i;
1445
1446         for (i = 0; i < NHASH; i++) {
1447                 h = &pbhash[i];
1448 again:
1449                 spin_lock(&h->pb_hash_lock);
1450                 list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
1451                         if (pb->pb_target == target &&
1452                                         !(pb->pb_flags & PBF_FS_MANAGED)) {
1453                                 spin_unlock(&h->pb_hash_lock);
1454                                 delay(100);
1455                                 goto again;
1456                         }
1457                 }
1458                 spin_unlock(&h->pb_hash_lock);
1459         }
1460 }
1461
1462 void
1463 xfs_free_buftarg(
1464         xfs_buftarg_t           *btp,
1465         int                     external)
1466 {
1467         xfs_flush_buftarg(btp, 1);
1468         if (external)
1469                 xfs_blkdev_put(btp->pbr_bdev);
1470         iput(btp->pbr_mapping->host);
1471         kmem_free(btp, sizeof(*btp));
1472 }
1473
1474 void
1475 xfs_incore_relse(
1476         xfs_buftarg_t           *btp,
1477         int                     delwri_only,
1478         int                     wait)
1479 {
1480         destroy_buffers(btp->pbr_kdev);
1481         truncate_inode_pages(btp->pbr_mapping, 0LL);
1482 }
1483
1484 int
1485 xfs_setsize_buftarg(
1486         xfs_buftarg_t           *btp,
1487         unsigned int            blocksize,
1488         unsigned int            sectorsize)
1489 {
1490         btp->pbr_bsize = blocksize;
1491         btp->pbr_sshift = ffs(sectorsize) - 1;
1492         btp->pbr_smask = sectorsize - 1;
1493
1494         if (set_blocksize(btp->pbr_kdev, sectorsize)) {
1495                 printk(KERN_WARNING
1496                         "XFS: Cannot set_blocksize to %u on device 0x%x\n",
1497                         sectorsize, kdev_t_to_nr(btp->pbr_kdev));
1498                 return EINVAL;
1499         }
1500         return 0;
1501 }
1502
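/*
 * Worked example (illustrative only): for a common 512-byte sector
 * device, xfs_setsize_buftarg(btp, PAGE_CACHE_SIZE, 512) leaves
 *
 *	btp->pbr_bsize  = PAGE_CACHE_SIZE	(typically 4096)
 *	btp->pbr_sshift = ffs(512) - 1 = 9
 *	btp->pbr_smask  = 512 - 1 = 0x1ff
 *
 * which is exactly what _pagebuf_find() checks I/O ranges against.
 */
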
1503 STATIC int
1504 xfs_mapping_buftarg(
1505         xfs_buftarg_t           *btp,
1506         struct block_device     *bdev)
1507 {
1508         kdev_t                  kdev;
1509         struct inode            *inode;
1510         struct address_space    *mapping;
1511         static struct address_space_operations mapping_aops = {
1512                 .sync_page = block_sync_page,
1513         };
1514
1515         kdev = to_kdev_t(bdev->bd_dev);
1516         inode = new_inode(bdev->bd_inode->i_sb);
1517         if (!inode) {
1518                 printk(KERN_WARNING
1519                         "XFS: Cannot allocate mapping inode for device %s\n",
1520                         XFS_BUFTARG_NAME(btp));
1521                 return ENOMEM;
1522         }
1523         inode->i_mode = S_IFBLK;
1524         inode->i_dev  = kdev;
1525         inode->i_rdev = kdev;
1526         inode->i_bdev = bdev;
1527         mapping = &inode->i_data;
1528         mapping->a_ops = &mapping_aops;
1529         mapping->gfp_mask = GFP_KERNEL;
1530         btp->pbr_mapping = mapping;
1531         return 0;
1532 }
1533
1534 xfs_buftarg_t *
1535 xfs_alloc_buftarg(
1536         struct block_device     *bdev)
1537 {
1538         xfs_buftarg_t           *btp;
1539         kdev_t                  kdev;
1540
1541         btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1542
1543         kdev = to_kdev_t(bdev->bd_dev);
1544         btp->pbr_dev =  bdev->bd_dev;
1545         btp->pbr_kdev = kdev;
1546         btp->pbr_bdev = bdev;
1547         switch (MAJOR(btp->pbr_dev)) {
1548         case MD_MAJOR:
1549         case EVMS_MAJOR:
1550                 btp->pbr_flags = PBR_ALIGNED_ONLY;
1551                 break;
1552         case LOOP_MAJOR:
1553         case LVM_BLK_MAJOR:
1554                 btp->pbr_flags = PBR_SECTOR_ONLY;
1555                 break;
1556         }
1557         if (xfs_setsize_buftarg(btp, PAGE_CACHE_SIZE, get_hardsect_size(kdev)))
1558                 goto error;
1559         if (xfs_mapping_buftarg(btp, bdev))
1560                 goto error;
1561         return btp;
1562
1563 error:
1564         kmem_free(btp, sizeof(*btp));
1565         return NULL;
1566 }
1567
1568 /*
1569  * Initiate I/O on part of a page we are interested in
1570  */
1571 STATIC int
1572 _pagebuf_page_io(
1573         struct page             *page,  /* Page structure we are dealing with */
1574         xfs_buftarg_t           *pbr,   /* device parameters (bsz, ssz, dev) */
1575         xfs_buf_t               *pb,    /* pagebuf holding it, can be NULL */
1576         xfs_daddr_t             bn,     /* starting block number */
1577         size_t                  pg_offset,      /* starting offset in page */
1578         size_t                  pg_length,      /* count of data to process */
1579         int                     rw,     /* read/write operation */
1580         int                     flush)
1581 {
1582         size_t                  sector;
1583         size_t                  blk_length = 0;
1584         struct buffer_head      *bh, *head, *bufferlist[MAX_BUF_PER_PAGE];
1585         int                     sector_shift = pbr->pbr_sshift;
1586         int                     i = 0, cnt = 0;
1587         int                     public_bh = 0;
1588         int                     multi_ok;
1589
1590         if ((pbr->pbr_bsize < PAGE_CACHE_SIZE) &&
1591             !(pb->pb_flags & _PBF_PRIVATE_BH)) {
1592                 int             cache_ok;
1593
1594                 cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE));
1595                 public_bh = multi_ok = 1;
1596                 sector = 1 << sector_shift;
1597
1598                 ASSERT(PageLocked(page));
1599                 if (!page_has_buffers(page))
1600                         create_empty_buffers(page, pbr->pbr_kdev, sector);
1601
1602                 i = sector >> BBSHIFT;
1603                 bn -= (pg_offset >> BBSHIFT);
1604
1605                 /* Find buffer_heads belonging to just this pagebuf */
1606                 bh = head = page_buffers(page);
1607                 do {
1608                         if (buffer_uptodate(bh) && cache_ok)
1609                                 continue;
1610                         if (blk_length < pg_offset)
1611                                 continue;
1612                         if (blk_length >= pg_offset + pg_length)
1613                                 break;
1614
1615                         lock_buffer(bh);
1616                         get_bh(bh);
1617                         bh->b_size = sector;
1618                         bh->b_blocknr = bn;
1619                         bufferlist[cnt++] = bh;
1620
1621                 } while ((bn += i),
1622                          (blk_length += sector),
1623                           (bh = bh->b_this_page) != head);
1624
1625                 goto request;
1626         }
1627
1628         /* Calculate the block offsets and length we will be using */
1629         if (pg_offset) {
1630                 size_t          block_offset;
1631
1632                 block_offset = pg_offset >> sector_shift;
1633                 block_offset = pg_offset - (block_offset << sector_shift);
1634                 blk_length = (pg_length + block_offset + pbr->pbr_smask) >>
1635                                                                 sector_shift;
1636         } else {
1637                 blk_length = (pg_length + pbr->pbr_smask) >> sector_shift;
1638         }
1639
1640         /* This will attempt to make a request bigger than the sector
1641          * size if we are well aligned.
1642          */
1643         switch (pb->pb_target->pbr_flags) {
1644         case 0:
1645                 sector = blk_length << sector_shift;
1646                 blk_length = 1;
1647                 break;
1648         case PBR_ALIGNED_ONLY:
1649                 if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) &&
1650                     (((unsigned int) bn) & BN_ALIGN_MASK) == 0) {
1651                         sector = blk_length << sector_shift;
1652                         blk_length = 1;
1653                         break;
1654                 }
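                /* FALLTHROUGH - unaligned request, treat as sector-only */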
1655         case PBR_SECTOR_ONLY:
1656                 /* Fallthrough, same as default */
1657         default:
1658                 sector = 1 << sector_shift;
1659         }
1660
1661         /* If we are doing I/O larger than the bh->b_size field can
1662          * hold, we need to split this request up.
1663          */
1664         while (sector > ((1ULL << NBBY * sizeof(bh->b_size)) - 1)) {
1665                 sector >>= 1;
1666                 blk_length++;
1667         }
1668
1669         multi_ok = (blk_length != 1);
1670         i = sector >> BBSHIFT;
1671
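        /*
         * Allocate a private buffer_head for each chunk of the request,
         * falling back to the reserved pool if the slab allocator cannot
         * supply one.
         */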
1672         for (; blk_length > 0; bn += i, blk_length--, pg_offset += sector) {
1673                 bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
1674                 if (!bh)
1675                         bh = _pagebuf_get_prealloc_bh();
1676                 memset(bh, 0, sizeof(*bh));
1677                 bh->b_blocknr = bn;
1678                 bh->b_size = sector;
1679                 bh->b_dev = pbr->pbr_kdev;
1680                 set_buffer_locked(bh);
1681                 set_bh_page(bh, page, pg_offset);
1682                 init_waitqueue_head(&bh->b_wait);
1683                 atomic_set(&bh->b_count, 1);
1684                 bufferlist[cnt++] = bh;
1685         }
1686
1687 request:
1688         if (cnt) {
1689                 void    (*callback)(struct buffer_head *, int);
1690
1691                 callback = (multi_ok && public_bh) ?
1692                                 _pagebuf_end_io_partial_pages :
1693                                 _pagebuf_end_io_complete_pages;
1694
1695                 /* Account for additional buffers in progress */
1696                 atomic_add(cnt, &pb->pb_io_remaining);
1697
1698 #ifdef RQ_WRITE_ORDERED
1699                 if (flush)
1700                         set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
1701 #endif
1702
1703                 for (i = 0; i < cnt; i++) {
1704                         bh = bufferlist[i];
1705                         init_buffer(bh, callback, pb);
1706                         bh->b_rdev = bh->b_dev;
1707                         bh->b_rsector = bh->b_blocknr;
1708                         set_buffer_mapped(bh);
1709                         set_buffer_async(bh);
1710                         set_buffer_req(bh);
1711                         if (rw == WRITE)
1712                                 set_buffer_uptodate(bh);
1713                         generic_make_request(rw, bh);
1714                 }
1715                 return 0;
1716         }
1717
1718         /*
1719          * We have no I/O to submit; let the caller know that
1720          * we have skipped over this page entirely.
1721          */
1722         return 1;
1723 }
1724
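/*
 * Handle one page of a pagebuf: work out the disk block this page maps
 * to, take the page lock where required, and hand the page off to
 * _pagebuf_page_io for submission.
 */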
1725 STATIC void
1726 _pagebuf_page_apply(
1727         xfs_buf_t               *pb,
1728         loff_t                  offset,
1729         struct page             *page,
1730         size_t                  pg_offset,
1731         size_t                  pg_length,
1732         int                     last)
1733 {
1734         xfs_daddr_t             bn = pb->pb_bn;
1735         xfs_buftarg_t           *pbr = pb->pb_target;
1736         loff_t                  pb_offset;
1737         int                     status, locking;
1738
1739         ASSERT(page);
1740         ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1741
1742         if ((pbr->pbr_bsize == PAGE_CACHE_SIZE) &&
1743             (pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
1744             (pb->pb_flags & PBF_READ) && pb->pb_locked) {
1745                 bn -= (pb->pb_offset >> BBSHIFT);
1746                 pg_offset = 0;
1747                 pg_length = PAGE_CACHE_SIZE;
1748         } else {
1749                 pb_offset = offset - pb->pb_file_offset;
1750                 if (pb_offset) {
1751                         bn += (pb_offset + BBMASK) >> BBSHIFT;
1752                 }
1753         }
1754
1755         locking = _pagebuf_iolocked(pb);
1756         if (pb->pb_flags & PBF_WRITE) {
1757                 if (locking && !pb->pb_locked)
1758                         lock_page(page);
1759                 status = _pagebuf_page_io(page, pbr, pb, bn,
1760                                 pg_offset, pg_length, WRITE,
1761                                 last && (pb->pb_flags & PBF_FLUSH));
1762         } else {
1763                 status = _pagebuf_page_io(page, pbr, pb, bn,
1764                                 pg_offset, pg_length, READ, 0);
1765         }
1766         if (status && locking && !(pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE))
1767                 unlock_page(page);
1768 }
1769
1770 /*
1771  *      pagebuf_iorequest -- the core I/O request routine.
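 *
 *      Delayed write buffers are simply queued for a later flush by the
 *      delwri daemon rather than being submitted here.  For all other
 *      buffers we take an extra hold, submit I/O for each page through
 *      _pagebuf_ioapply(), and then drop our initial "I/O in progress"
 *      count so completion can run once all sub-requests have finished.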
1772  */
1773 int
1774 pagebuf_iorequest(                      /* start real I/O               */
1775         xfs_buf_t               *pb)    /* buffer to convey to device   */
1776 {
1777         PB_TRACE(pb, "iorequest", 0);
1778
1779         if (pb->pb_flags & PBF_DELWRI) {
1780                 pagebuf_delwri_queue(pb, 1);
1781                 return 0;
1782         }
1783
1784         if (pb->pb_flags & PBF_WRITE) {
1785                 _pagebuf_wait_unpin(pb);
1786         }
1787
1788         pagebuf_hold(pb);
1789
1790         /* Set the count to 1 initially, so that an I/O completion
1791          * callout which fires before we have submitted all of the
1792          * I/O cannot call pagebuf_iodone too early.
1793          */
1794         atomic_set(&pb->pb_io_remaining, 1);
1795         _pagebuf_ioapply(pb);
1796         _pagebuf_iodone(pb, 0);
1797
1798         pagebuf_rele(pb);
1799         return 0;
1800 }
1801
1802 /*
1803  *      pagebuf_iowait
1804  *
1805  *      pagebuf_iowait waits for I/O to complete on the buffer supplied.
1806  *      It returns immediately if no I/O is pending.  In any case, it returns
1807  *      the error code, if any, or 0 if there is no error.
1808  */
1809 int
1810 pagebuf_iowait(
1811         xfs_buf_t               *pb)
1812 {
1813         PB_TRACE(pb, "iowait", 0);
1814         if (atomic_read(&pb->pb_io_remaining))
1815                 run_task_queue(&tq_disk);
1816         if ((pb->pb_flags & PBF_FS_DATAIOD))
1817                 pagebuf_runall_queues(pagebuf_dataiodone_tq);
1818         down(&pb->pb_iodonesema);
1819         PB_TRACE(pb, "iowaited", (long)pb->pb_error);
1820         return pb->pb_error;
1821 }
1822
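/*
 *      pagebuf_offset
 *
 *      Return a kernel virtual address for the given byte offset within
 *      the buffer's data area.
 */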
1823 caddr_t
1824 pagebuf_offset(
1825         xfs_buf_t               *pb,
1826         size_t                  offset)
1827 {
1828         struct page             *page;
1829
1830         offset += pb->pb_offset;
1831
1832         page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
1833         return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
1834 }
1835
1836 /*
1837  *      pagebuf_iomove
1838  *
1839  *      Move data into or out of a buffer.
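 *
 *      For example, to copy "len" bytes from a caller-supplied buffer
 *      "data" into the start of a pagebuf (illustrative only):
 *
 *              pagebuf_iomove(pb, 0, len, data, PBRW_WRITE);
 *
 *      PBRW_READ copies in the opposite direction; PBRW_ZERO zeroes the
 *      range and does not dereference the data pointer.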
1840  */
1841 void
1842 pagebuf_iomove(
1843         xfs_buf_t               *pb,    /* buffer to process            */
1844         size_t                  boff,   /* starting buffer offset       */
1845         size_t                  bsize,  /* length to copy               */
1846         caddr_t                 data,   /* data address                 */
1847         page_buf_rw_t           mode)   /* read/write flag              */
1848 {
1849         size_t                  bend, cpoff, csize;
1850         struct page             *page;
1851
1852         bend = boff + bsize;
1853         while (boff < bend) {
1854                 page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
1855                 cpoff = page_buf_poff(boff + pb->pb_offset);
1856                 csize = min_t(size_t,
1857                               PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
1858
1859                 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1860
1861                 switch (mode) {
1862                 case PBRW_ZERO:
1863                         memset(page_address(page) + cpoff, 0, csize);
1864                         break;
1865                 case PBRW_READ:
1866                         memcpy(data, page_address(page) + cpoff, csize);
1867                         break;
1868                 case PBRW_WRITE:
1869                         memcpy(page_address(page) + cpoff, data, csize);
1870                 }
1871
1872                 boff += csize;
1873                 data += csize;
1874         }
1875 }
1876
1877 /*
1878  *      _pagebuf_ioapply
1879  *
1880  *      Applies _pagebuf_page_apply to each page of the xfs_buf_t.
1881  */
1882 STATIC void
1883 _pagebuf_ioapply(                       /* apply function to pages      */
1884         xfs_buf_t               *pb)    /* buffer to examine            */
1885 {
1886         int                     index;
1887         loff_t                  buffer_offset = pb->pb_file_offset;
1888         size_t                  buffer_len = pb->pb_count_desired;
1889         size_t                  page_offset, len;
1890         size_t                  cur_offset, cur_len;
1891
1892         cur_offset = pb->pb_offset;
1893         cur_len = buffer_len;
1894
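        /*
         * For targets with a block size smaller than the page size, lock
         * down every page up front (unless this is direct I/O) so that the
         * shared buffer_head path, which expects locked pages, can be used.
         */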
1895         if (!pb->pb_locked && !(pb->pb_flags & PBF_DIRECTIO) &&
1896             (pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE)) {
1897                 for (index = 0; index < pb->pb_page_count; index++)
1898                         lock_page(pb->pb_pages[index]);
1899                 pb->pb_locked = 1;
1900         }
1901
1902         for (index = 0; index < pb->pb_page_count; index++) {
1903                 if (cur_len == 0)
1904                         break;
1905                 if (cur_offset >= PAGE_CACHE_SIZE) {
1906                         cur_offset -= PAGE_CACHE_SIZE;
1907                         continue;
1908                 }
1909
1910                 page_offset = cur_offset;
1911                 cur_offset = 0;
1912
1913                 len = PAGE_CACHE_SIZE - page_offset;
1914                 if (len > cur_len)
1915                         len = cur_len;
1916                 cur_len -= len;
1917
1918                 _pagebuf_page_apply(pb, buffer_offset,
1919                                 pb->pb_pages[index], page_offset, len,
1920                                 index + 1 == pb->pb_page_count);
1921                 buffer_offset += len;
1922                 buffer_len -= len;
1923         }
1924
1925         /*
1926          * Run the block device task queue here, while we still have a
1927          * hold on the pagebuf (it is important that we hold it here).
1928          */
1929         if (pb->pb_flags & _PBF_RUN_QUEUES) {
1930                 pb->pb_flags &= ~_PBF_RUN_QUEUES;
1931                 if (atomic_read(&pb->pb_io_remaining) > 1)
1932                         run_task_queue(&tq_disk);
1933         }
1934 }
1935
1936
1937 /*
1938  * Delayed write buffer list handling
1939  */
1940
1941 STATIC LIST_HEAD(pbd_delwrite_queue);
1942 STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
1943
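/*
 * Add a buffer to the tail of the delayed write queue (moving it to the
 * tail if it is already queued) and restamp its queue time, so that the
 * flush daemon writes it out once it has aged sufficiently.
 */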
1944 STATIC void
1945 pagebuf_delwri_queue(
1946         xfs_buf_t               *pb,
1947         int                     unlock)
1948 {
1949         PB_TRACE(pb, "delwri_q", (long)unlock);
1950         ASSERT(pb->pb_flags & PBF_DELWRI);
1951
1952         spin_lock(&pbd_delwrite_lock);
1953         /* If already in the queue, dequeue and place at tail */
1954         if (!list_empty(&pb->pb_list)) {
1955                 if (unlock)
1956                         atomic_dec(&pb->pb_hold);
1957                 list_del(&pb->pb_list);
1958         }
1959
1960         list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
1961         pb->pb_queuetime = jiffies;
1962         spin_unlock(&pbd_delwrite_lock);
1963
1964         if (unlock)
1965                 pagebuf_unlock(pb);
1966 }
1967
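/*
 * Remove a buffer from the delayed write queue, clearing PBF_DELWRI and
 * dropping the queue's reference if the buffer was actually queued.
 */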
1968 void
1969 pagebuf_delwri_dequeue(
1970         xfs_buf_t               *pb)
1971 {
1972         int                     dequeued = 0;
1973
1974         spin_lock(&pbd_delwrite_lock);
1975         if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
1976                 list_del_init(&pb->pb_list);
1977                 dequeued = 1;
1978         }
1979         pb->pb_flags &= ~PBF_DELWRI;
1980         spin_unlock(&pbd_delwrite_lock);
1981
1982         if (dequeued)
1983                 pagebuf_rele(pb);
1984
1985         PB_TRACE(pb, "delwri_dq", (long)dequeued);
1986 }
1987
1988
1989 /*
1990  * The pagebuf iodone daemons
1991  */
1992
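/*
 * Common body for the per-CPU I/O completion daemons: bind to the given
 * CPU, block all signals, then loop running the per-CPU completion task
 * queue whenever woken, until asked to exit.
 */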
1993 STATIC int
1994 pagebuf_iodone_daemon(
1995         void                    *__bind_cpu,
1996         const char              *name,
1997         int                     pagebuf_daemons[],
1998         struct list_head        pagebuf_iodone_tq[],
1999         wait_queue_head_t       pagebuf_iodone_wait[])
2000 {
2001         int                     bind_cpu, cpu;
2002         DECLARE_WAITQUEUE       (wait, current);
2003
2004         bind_cpu = (int) (long)__bind_cpu;
2005         cpu = CPU_TO_DAEMON(cpu_logical_map(bind_cpu));
2006
2007         /*  Set up the thread  */
2008         daemonize();
2009
2010         /* Avoid signals */
2011         sigmask_lock();
2012         sigfillset(&current->blocked);
2013         __recalc_sigpending(current);
2014         sigmask_unlock();
2015
2016         /* Migrate to the right CPU */
2017         migrate_to_cpu(cpu);
2018 #ifdef __HAVE_NEW_SCHEDULER
2019         if (smp_processor_id() != cpu)
2020                 BUG();
2021 #else
2022         while (smp_processor_id() != cpu)
2023                 schedule();
2024 #endif
2025
2026         sprintf(current->comm, "%s/%d", name, bind_cpu);
2027         INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
2028         init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
2029         __set_current_state(TASK_INTERRUPTIBLE);
2030         mb();
2031
2032         pagebuf_daemons[cpu] = 1;
2033
2034         for (;;) {
2035                 add_wait_queue(&pagebuf_iodone_wait[cpu], &wait);
2036
2037                 if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
2038                         __set_task_state(current, TASK_RUNNING);
2039                 schedule();
2040                 remove_wait_queue(&pagebuf_iodone_wait[cpu], &wait);
2041                 run_task_queue(&pagebuf_iodone_tq[cpu]);
2042                 if (pagebuf_daemons[cpu] == 0)
2043                         break;
2044                 __set_current_state(TASK_INTERRUPTIBLE);
2045         }
2046
2047         pagebuf_daemons[cpu] = -1;
2048         wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
2049         return 0;
2050 }
2051
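/*
 * Push any pending completion work on every CPU's task queue directly
 * from the calling context.
 */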
2052 STATIC void
2053 pagebuf_runall_queues(
2054         struct list_head        pagebuf_iodone_tq[])
2055 {
2056         int     pcpu, cpu;
2057
2058         for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2059                 pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2060
2061                 run_task_queue(&pagebuf_iodone_tq[pcpu]);
2062         }
2063 }
2064
2065 STATIC int
2066 pagebuf_logiodone_daemon(
2067         void                    *__bind_cpu)
2068 {
2069         return pagebuf_iodone_daemon(__bind_cpu, "xfslogd", pb_logio_daemons,
2070                         pagebuf_logiodone_tq, pagebuf_logiodone_wait);
2071 }
2072
2073 STATIC int
2074 pagebuf_dataiodone_daemon(
2075         void                    *__bind_cpu)
2076 {
2077         return pagebuf_iodone_daemon(__bind_cpu, "xfsdatad", pb_dataio_daemons,
2078                         pagebuf_dataiodone_tq, pagebuf_dataiodone_wait);
2079 }
2080
2081
2082 /* Defines for pagebuf daemon */
2083 STATIC DECLARE_COMPLETION(pagebuf_daemon_done);
2084 STATIC struct task_struct *pagebuf_daemon_task;
2085 STATIC int pagebuf_daemon_active;
2086 STATIC int force_flush;
2087
2088
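/*
 * Memory shaker callback: force an immediate flush of the delayed write
 * queue by waking the xfsbufd daemon.
 */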
2089 STATIC int
2090 pagebuf_daemon_wakeup(
2091         int                     priority,
2092         unsigned int            mask)
2093 {
2094         force_flush = 1;
2095         barrier();
2096         wake_up_process(pagebuf_daemon_task);
2097         return 0;
2098 }
2099
2100 STATIC int
2101 pagebuf_daemon(
2102         void                    *data)
2103 {
2104         struct list_head        tmp;
2105         unsigned long           age;
2106         xfs_buf_t               *pb, *n;
2107         int                     count;
2108
2109         /*  Set up the thread  */
2110         daemonize();
2111
2112         /* Mark it active */
2113         pagebuf_daemon_task = current;
2114         pagebuf_daemon_active = 1;
2115         barrier();
2116
2117         /* Avoid signals */
2118         sigmask_lock();
2119         sigfillset(&current->blocked);
2120         __recalc_sigpending(current);
2121         sigmask_unlock();
2122
2123         strcpy(current->comm, "xfsbufd");
2124         current->flags |= PF_MEMALLOC;
2125
2126         INIT_LIST_HEAD(&tmp);
2127         do {
2128                 set_current_state(TASK_INTERRUPTIBLE);
2129                 schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
2130
2131                 count = 0;
2132                 age = (xfs_buf_age_centisecs * HZ) / 100;
2133                 spin_lock(&pbd_delwrite_lock);
2134                 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
2135                         PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
2136                         ASSERT(pb->pb_flags & PBF_DELWRI);
2137
2138                         if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
2139                                 if (!force_flush &&
2140                                     time_before(jiffies,
2141                                                 pb->pb_queuetime + age)) {
2142                                         pagebuf_unlock(pb);
2143                                         break;
2144                                 }
2145
2146                                 pb->pb_flags &= ~PBF_DELWRI;
2147                                 pb->pb_flags |= PBF_WRITE;
2148                                 list_move(&pb->pb_list, &tmp);
2149                                 count++;
2150                         }
2151                 }
2152                 spin_unlock(&pbd_delwrite_lock);
2153
2154                 while (!list_empty(&tmp)) {
2155                         pb = list_entry(tmp.next, xfs_buf_t, pb_list);
2156                         list_del_init(&pb->pb_list);
2157                         pagebuf_iostrategy(pb);
2158                 }
2159
2160                 if (as_list_len > 0)
2161                         purge_addresses();
2162                 if (count)
2163                         run_task_queue(&tq_disk);
2164
2165                 force_flush = 0;
2166         } while (pagebuf_daemon_active);
2167
2168         complete_and_exit(&pagebuf_daemon_done, 0);
2169 }
2170
2171 /*
2172  * Flush out all delayed write buffers queued against the given target,
2173  * skipping pinned buffers and returning the number skipped.  Used, for
2174  * example, in filesystem error handling to preserve metadata consistency.
2175  */
2176 int
2177 xfs_flush_buftarg(
2178         xfs_buftarg_t           *target,
2179         int                     wait)
2180 {
2181         struct list_head        tmp;
2182         xfs_buf_t               *pb, *n;
2183         int                     pincount = 0;
2184         int                     flush_cnt = 0;
2185
2186         pagebuf_runall_queues(pagebuf_dataiodone_tq);
2187         pagebuf_runall_queues(pagebuf_logiodone_tq);
2188
2189         INIT_LIST_HEAD(&tmp);
2190         spin_lock(&pbd_delwrite_lock);
2191         list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
2192
2193                 if (pb->pb_target != target)
2194                         continue;
2195
2196                 ASSERT(pb->pb_flags & PBF_DELWRI);
2197                 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
2198                 if (pagebuf_ispin(pb)) {
2199                         pincount++;
2200                         continue;
2201                 }
2202
2203                 pb->pb_flags &= ~PBF_DELWRI;
2204                 pb->pb_flags |= PBF_WRITE;
2205                 list_move(&pb->pb_list, &tmp);
2206         }
2207         spin_unlock(&pbd_delwrite_lock);
2208
2209         /*
2210          * Dropped the delayed write list lock; now walk the temporary list
2211          */
2212         list_for_each_entry_safe(pb, n, &tmp, pb_list) {
2213
2214                 if (wait)
2215                         pb->pb_flags &= ~PBF_ASYNC;
2216                 else
2217                         list_del_init(&pb->pb_list);
2218
2219                 pagebuf_lock(pb);
2220                 pagebuf_iostrategy(pb);
2221
2222                 if (++flush_cnt > 32) {
2223                         run_task_queue(&tq_disk);
2224                         flush_cnt = 0;
2225                 }
2226         }
2227
2228         run_task_queue(&tq_disk);
2229
2230         /*
2231          * Remaining list items must be flushed before returning
2232          */
2233         while (!list_empty(&tmp)) {
2234                 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
2235
2236                 list_del_init(&pb->pb_list);
2237
2238                 xfs_iowait(pb);
2239                 xfs_buf_relse(pb);
2240         }
2241
2242         return pincount;
2243 }
2244
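/*
 * Start the delayed write daemon plus one log-I/O and one data-I/O
 * completion daemon per CPU, waiting for each completion daemon to
 * signal that it is running before moving on.
 */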
2245 STATIC int
2246 pagebuf_daemon_start(void)
2247 {
2248         int             cpu, pcpu;
2249
2250         kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES|CLONE_VM);
2251
2252         for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2253                 pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2254
2255                 if (kernel_thread(pagebuf_logiodone_daemon,
2256                                 (void *)(long) cpu,
2257                                 CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
2258                         printk("pagebuf_logiodone daemon failed to start\n");
2259                 } else {
2260                         while (!pb_logio_daemons[pcpu])
2261                                 yield();
2262                 }
2263         }
2264         for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2265                 pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2266
2267                 if (kernel_thread(pagebuf_dataiodone_daemon,
2268                                 (void *)(long) cpu,
2269                                 CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
2270                         printk("pagebuf_dataiodone daemon failed to start\n");
2271                 } else {
2272                         while (!pb_dataio_daemons[pcpu])
2273                                 yield();
2274                 }
2275         }
2276         return 0;
2277 }
2278
2279 /*
2280  * pagebuf_daemon_stop
2281  *
2282  * Note: do not mark as __exit; it is called from pagebuf_terminate.
2283  */
2284 STATIC void
2285 pagebuf_daemon_stop(void)
2286 {
2287         int             cpu, pcpu;
2288
2289         pagebuf_daemon_active = 0;
2290         barrier();
2291         wait_for_completion(&pagebuf_daemon_done);
2292
2293         for (pcpu = 0; pcpu < min(smp_num_cpus, MAX_IO_DAEMONS); pcpu++) {
2294                 cpu = CPU_TO_DAEMON(cpu_logical_map(pcpu));
2295
2296                 pb_logio_daemons[cpu] = 0;
2297                 wake_up(&pagebuf_logiodone_wait[cpu]);
2298                 wait_event_interruptible(pagebuf_logiodone_wait[cpu],
2299                                 pb_logio_daemons[cpu] == -1);
2300
2301                 pb_dataio_daemons[cpu] = 0;
2302                 wake_up(&pagebuf_dataiodone_wait[cpu]);
2303                 wait_event_interruptible(pagebuf_dataiodone_wait[cpu],
2304                                 pb_dataio_daemons[cpu] == -1);
2305         }
2306 }
2307
2308 /*
2309  *      Initialization and Termination
2310  */
2311
2312 int __init
2313 pagebuf_init(void)
2314 {
2315         int                     i;
2316
2317         pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
2318                         SLAB_HWCACHE_ALIGN, NULL, NULL);
2319         if (pagebuf_cache == NULL) {
2320                 printk("XFS: couldn't init xfs_buf_t cache\n");
2321                 return -ENOMEM;
2322         }
2323
2324         if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) {
2325                 printk("XFS: couldn't allocate %d reserved buffers\n",
2326                         NR_RESERVED_BH);
2327                 kmem_zone_destroy(pagebuf_cache);
2328                 return -ENOMEM;
2329         }
2330         init_waitqueue_head(&pb_resv_bh_wait);
2331
2332 #ifdef PAGEBUF_TRACE
2333         pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
2334 #endif
2335
2336         pagebuf_daemon_start();
2337
2338         pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup);
2339         if (pagebuf_shake == NULL) {
2340                 pagebuf_terminate();
2341                 return -ENOMEM;
2342         }
2343
2344         for (i = 0; i < NHASH; i++) {
2345                 spin_lock_init(&pbhash[i].pb_hash_lock);
2346                 INIT_LIST_HEAD(&pbhash[i].pb_hash);
2347         }
2348
2349         return 0;
2350 }
2351
2352 /*
2353  *      pagebuf_terminate.
2354  *
2355  *      Note: do not mark as __exit; this is also called from the __init code.
2356  */
2357 void
2358 pagebuf_terminate(void)
2359 {
2360         pagebuf_daemon_stop();
2361
2362 #ifdef PAGEBUF_TRACE
2363         ktrace_free(pagebuf_trace_buf);
2364 #endif
2365
2366         kmem_zone_destroy(pagebuf_cache);
2367         kmem_shake_deregister(pagebuf_shake);
2368 }