4 * The pageout daemon, decides which pages to evict (swap out) and
5 * does the actual work of freeing them.
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 * Swap reorganised 29.12.95, Stephen Tweedie.
10 * kswapd added: 7.1.96 sct
11 * Removed kswapd_ctl limits, and swap out as many pages as needed
12 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
13 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
14 * Multiqueue VM started 5.8.00, Rik van Riel.
17 #include <linux/slab.h>
18 #include <linux/kernel_stat.h>
19 #include <linux/swap.h>
20 #include <linux/swapctl.h>
21 #include <linux/smp_lock.h>
22 #include <linux/pagemap.h>
23 #include <linux/init.h>
24 #include <linux/highmem.h>
25 #include <linux/file.h>
27 #include <asm/pgalloc.h>
/*
 * Page-reclaim tunables.  Each vm_* integer below is a knob read by the
 * reclaim paths in this file.  ("vm_passes" is described here but its
 * definition is not visible in this excerpt.)
 * NOTE(review): the embedded line numbering is discontinuous — some of
 * the original comment/definition lines are elided in this excerpt.
 */
30 * "vm_passes" is the number of vm passes before failing the
31 * memory balancing. Take into account 3 passes are needed
32 * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
33 * of the inactive list at each pass.
38 * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
39 * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
40 * scan 1/6 of the inactive lists during a normal aging round.
42 int vm_cache_scan_ratio = 6;
45 * "vm_mapped_ratio" controls the pageout rate, the smaller, the earlier
46 * we'll start to pageout.
48 int vm_mapped_ratio = 100;
51 * "vm_lru_balance_ratio" controls the balance between active and
52 * inactive cache. The bigger vm_lru_balance_ratio is, the easier the
53 * active cache will grow, because we'll rotate the active list
54 * slowly. A value of 2 means we'll go towards a balance of
55 * 1/3 of the cache being inactive.
57 int vm_lru_balance_ratio = 2;
60 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
61 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
62 * the unused-inode, dentry and dquot caches will be freed during a normal
65 int vm_vfs_scan_ratio = 6;
68 * "vm_anon_lru" selects whether to immediately insert anon pages in the
69 * lru. Immediately means as soon as they're allocated during the
72 * If this is set to 0, they're inserted only after the first
75 * Having anon pages immediately inserted in the lru allows the
76 * VM to know better when it's worthwhile to start swapping
77 * anonymous ram, it will start to swap earlier and it should
78 * swap smoother and faster, but it will decrease scalability
79 * on the >16-ways of an order of magnitude. Big SMP/NUMA
80 * definitely can't take a hit on a global spinlock at
81 * every anon page allocation. So this is off by default.
83 * Low ram machines that swap all the time want to turn
84 * this on (i.e. set to 1).
89 * The swap-out function returns 1 if it successfully
90 * scanned all the pages it was asked to (`count').
91 * It returns zero if it couldn't do anything,
93 * rss may decrease because pages are shared, but this
94 * doesn't count as having freed a page.
/*
 * try_to_swap_out - attempt to unmap one pte and move its page toward the
 * swap cache (anonymous) or leave it to the page cache (file-backed) so a
 * later pass can free it.  The caller (swap_out_pmd) subtracts the return
 * value from its remaining page count.
 * NOTE(review): the embedded line numbering is discontinuous — local
 * declarations, returns and closing braces are elided from this excerpt,
 * so the visible statements are not contiguous.
 */
97 /* mm->page_table_lock is held. mmap_sem is not held */
98 static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
103 /* Don't look at this pte if it's been accessed recently. */
104 if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
105 mark_page_accessed(page);
109 /* Don't bother unmapping pages that are active */
110 if (PageActive(page))
113 /* Don't bother replenishing zones not under pressure.. */
114 if (!memclass(page_zone(page), classzone))
117 if (TryLockPage(page))
120 /* From this point on, the odds are that we're going to
121 * nuke this pte, so read and clear the pte. This hook
122 * is needed on CPUs which update the accessed and dirty
125 flush_cache_page(vma, address);
126 pte = ptep_get_and_clear(page_table);
127 flush_tlb_page(vma, address);
130 set_page_dirty(page);
133 * Is the page already in the swap cache? If so, then
134 * we can just drop our reference to it without doing
135 * any IO - it's already up-to-date on disk.
137 if (PageSwapCache(page)) {
138 entry.val = page->index;
139 swap_duplicate(entry);
141 set_pte(page_table, swp_entry_to_pte(entry));
/* "freeable": only us and the swap cache hold references (one extra is
 * tolerated when the page has buffers attached). */
146 int freeable = page_count(page) - !!page->buffers <= 2;
147 page_cache_release(page);
153 * Is it a clean page? Then it must be recoverable
154 * by just paging it in again, and we can just drop
155 * it.. or if it's dirty but has backing store,
156 * just mark the page dirty and drop it.
158 * However, this won't actually free any real
159 * memory, as the page will just be in the page cache
160 * somewhere, and as such we should just continue
163 * Basically, this just makes it possible for us to do
164 * some real work in the future in "refill_inactive()".
168 if (!PageDirty(page))
172 * Anonymous buffercache pages can be left behind by
173 * concurrent truncate and pagefault.
179 * This is a dirty, swappable page. First of all,
180 * get a suitable swap entry for it, and make sure
181 * we have the swap cache set up to associate the
182 * page with that swap entry.
185 entry = get_swap_page();
188 /* Add it to the swap cache and mark it dirty
189 * (adding to the page cache will clear the dirty
190 * and uptodate bits, so we need to do it again)
192 if (add_to_swap_cache(page, entry) == 0) {
193 SetPageUptodate(page);
194 set_page_dirty(page);
197 /* Raced with "speculative" read_swap_cache_async */
201 /* No swap space left */
/* Failure path: restore the pte we cleared above so the mapping is intact. */
203 set_pte(page_table, pte);
/*
 * swap_out_pmd - walk the ptes covered by one pmd, calling try_to_swap_out()
 * on each present, valid, non-reserved page; records the scan cursor in
 * mm->swap_address so a later pass resumes where this one stopped.
 * NOTE(review): loop head, early exits and closing braces are elided in
 * this excerpt (embedded numbering is discontinuous).
 */
208 /* mm->page_table_lock is held. mmap_sem is not held */
209 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
212 unsigned long pmd_end;
222 pte = pte_offset(dir, address);
224 pmd_end = (address + PMD_SIZE) & PMD_MASK;
229 if (pte_present(*pte)) {
230 struct page *page = pte_page(*pte);
232 if (VALID_PAGE(page) && !PageReserved(page)) {
233 count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
235 address += PAGE_SIZE;
240 address += PAGE_SIZE;
242 } while (address && (address < end));
243 mm->swap_address = address;
/*
 * swap_out_pgd - walk the pmds under one pgd entry, delegating each to
 * swap_out_pmd(); clamps the scan end to the pgd boundary.
 * NOTE(review): several lines (declarations, loop head, returns) are
 * elided in this excerpt.
 */
247 /* mm->page_table_lock is held. mmap_sem is not held */
248 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
251 unsigned long pgd_end;
261 pmd = pmd_offset(dir, address);
263 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
264 if (pgd_end && (end > pgd_end))
268 count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
271 address = (address + PMD_SIZE) & PMD_MASK;
273 } while (address && (address < end));
/*
 * swap_out_vma - scan one vma's address range pgd-by-pgd via swap_out_pgd(),
 * skipping VM_RESERVED areas entirely.
 * NOTE(review): lines (including the computation of 'end' and the returns)
 * are elided in this excerpt.
 */
277 /* mm->page_table_lock is held. mmap_sem is not held */
278 static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
283 /* Don't swap out areas which are reserved */
284 if (vma->vm_flags & VM_RESERVED)
287 pgdir = pgd_offset(mm, address);
290 BUG_ON(address >= end);
292 count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
295 address = (address + PGDIR_SIZE) & PGDIR_MASK;
297 } while (address && (address < end));
/* Round-robin cursor over the mm list used by swap_out()/swap_out_mm();
 * starts at init_mm and is compared against in swap_out_mm() to detect
 * races with mm teardown. */
301 /* Placeholder for swap_out(): may be updated by fork.c:mmput() */
302 struct mm_struct *swap_mm = &init_mm;
305 * Returns remaining count of pages to be swapped out by followup call.
307 static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
309 unsigned long address;
310 struct vm_area_struct* vma;
313 * Find the proper vm-area after freezing the vma chain
316 spin_lock(&mm->page_table_lock);
317 address = mm->swap_address;
318 if (address == TASK_SIZE || swap_mm != mm) {
319 /* We raced: don't count this mm but try again */
323 vma = find_vma(mm, address);
325 if (address < vma->vm_start)
326 address = vma->vm_start;
329 count = swap_out_vma(mm, vma, address, count, classzone);
335 address = vma->vm_start;
338 /* Indicate that we reached the end of address space */
339 mm->swap_address = TASK_SIZE;
342 spin_unlock(&mm->page_table_lock);
/*
 * swap_out - pick mms off the global mmlist in round-robin order and push
 * up to SWAP_CLUSTER_MAX pages out of them for the given classzone.
 * mm_users is bumped so the chosen mm cannot vanish while the mmlist
 * lock is dropped.
 * NOTE(review): loop structure and exit paths are partially elided in
 * this excerpt.
 */
346 static int FASTCALL(swap_out(zone_t * classzone));
347 static int fastcall swap_out(zone_t * classzone)
349 int counter, nr_pages = SWAP_CLUSTER_MAX;
350 struct mm_struct *mm;
352 counter = mmlist_nr << 1;
354 if (unlikely(current->need_resched)) {
355 __set_current_state(TASK_RUNNING);
359 spin_lock(&mmlist_lock);
/* Skip fully-scanned mms (resetting their cursor) and init_mm itself. */
361 while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
362 mm->swap_address = 0;
363 mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
369 /* Make sure the mm doesn't disappear when we drop the lock.. */
370 atomic_inc(&mm->mm_users);
371 spin_unlock(&mmlist_lock);
373 nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
379 } while (--counter >= 0);
384 spin_unlock(&mmlist_lock);
/*
 * shrink_cache - core inactive-list reclaim.  Scans up to 1/vm_cache_scan_ratio
 * of the LRU, writing back dirty pagecache pages, dropping buffers, and
 * freeing clean unmapped pages.  When too many mapped pages are met
 * (max_mapped exhausted) it falls back to shrinking the VFS caches and
 * calling swap_out() to unmap pages first.
 * NOTE(review): the embedded line numbering is discontinuous — gotos,
 * returns, unlock/continue lines and closing braces are elided from this
 * excerpt; do not assume the visible statements are contiguous.
 */
388 static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
389 static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
390 static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
392 struct list_head * entry;
393 int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
394 int max_mapped = vm_mapped_ratio * nr_pages;
396 while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
399 if (unlikely(current->need_resched)) {
400 spin_unlock(&pagemap_lru_lock);
401 __set_current_state(TASK_RUNNING);
403 spin_lock(&pagemap_lru_lock);
407 page = list_entry(entry, struct page, lru);
409 BUG_ON(!PageLRU(page));
410 BUG_ON(PageActive(page));
/* Rotate the page to the head of the inactive list while we look at it. */
413 list_add(entry, &inactive_list);
416 * Zero page counts can happen because we unlink the pages
417 * _after_ decrementing the usage count..
419 if (unlikely(!page_count(page)))
422 if (!memclass(page_zone(page), classzone))
427 /* Racy check to avoid trylocking when not worthwhile */
428 if (!page->buffers && (page_count(page) != 1 || !page->mapping))
432 * The page is locked. IO in progress?
433 * Move it to the back of the list.
435 if (unlikely(TryLockPage(page))) {
436 if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
437 page_cache_get(page);
438 spin_unlock(&pagemap_lru_lock);
440 page_cache_release(page);
441 spin_lock(&pagemap_lru_lock);
446 if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
448 * It is not critical here to write it only if
449 * the page is unmapped because any direct writer
450 * like O_DIRECT would set the PG_dirty bitflag
451 * on the physical page after having successfully
452 * pinned it and after the I/O to the page is finished,
453 * so the direct writes to the page cannot get lost.
455 int (*writepage)(struct page *);
457 writepage = page->mapping->a_ops->writepage;
458 if ((gfp_mask & __GFP_FS) && writepage) {
459 ClearPageDirty(page);
460 SetPageLaunder(page);
461 page_cache_get(page);
462 spin_unlock(&pagemap_lru_lock);
465 page_cache_release(page);
467 spin_lock(&pagemap_lru_lock);
473 * If the page has buffers, try to free the buffer mappings
474 * associated with this page. If we succeed we try to free
478 spin_unlock(&pagemap_lru_lock);
480 /* avoid freeing a locked page */
481 page_cache_get(page);
483 if (try_to_release_page(page, gfp_mask)) {
484 if (!page->mapping) {
486 * We must not allow an anon page
487 * with no buffers to be visible on
488 * the LRU, so we unlock the page after
489 * taking the lru lock
491 spin_lock(&pagemap_lru_lock);
493 __lru_cache_del(page);
495 /* effectively free the page here */
496 page_cache_release(page);
503 * The page is still in pagecache so undo the stuff
504 * before the try_to_release_page since we've not
505 * finished and we can now try the next step.
507 page_cache_release(page);
509 spin_lock(&pagemap_lru_lock);
512 /* failed to drop the buffers so stop here */
514 page_cache_release(page);
516 spin_lock(&pagemap_lru_lock);
521 spin_lock(&pagecache_lock);
524 * This is the non-racy check for busy page.
525 * It is critical to check PageDirty _after_ we made sure
526 * the page is freeable so not in use by anybody.
527 * At this point we're guaranteed that page->buffers is NULL,
528 * nobody can refill page->buffers under us because we still
529 * hold the page lock.
531 if (!page->mapping || page_count(page) > 1) {
532 spin_unlock(&pagecache_lock);
/* Too many mapped/busy pages seen: shrink slab + VFS caches and start
 * swapping before resuming the LRU scan. */
535 if (--max_mapped < 0) {
536 spin_unlock(&pagemap_lru_lock);
538 nr_pages -= kmem_cache_reap(gfp_mask);
542 shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
543 shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
545 shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
548 if (!*failed_swapout)
549 *failed_swapout = !swap_out(classzone);
551 max_mapped = nr_pages * vm_mapped_ratio;
553 spin_lock(&pagemap_lru_lock);
554 refill_inactive(nr_pages, classzone);
560 if (PageDirty(page)) {
561 spin_unlock(&pagecache_lock);
/* Point of no return: detach from LRU and page/swap cache, then free. */
566 __lru_cache_del(page);
568 /* point of no return */
569 if (likely(!PageSwapCache(page))) {
570 __remove_inode_page(page);
571 spin_unlock(&pagecache_lock);
574 swap.val = page->index;
575 __delete_from_swap_cache(page);
576 spin_unlock(&pagecache_lock);
582 /* effectively free the page here */
583 page_cache_release(page);
589 spin_unlock(&pagemap_lru_lock);
/*
 * refill_inactive - age the active list: move roughly
 * nr_pages * active / (inactive * vm_lru_balance_ratio + 1) unreferenced
 * pages from the active to the inactive list; recently-referenced pages
 * are rotated back to the head of the active list instead.
 * NOTE(review): loop bookkeeping lines are elided in this excerpt.
 */
596 * This moves pages from the active list to
599 * We move them the other way when we see the
600 * reference bit on the page.
602 static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
604 struct list_head * entry;
607 ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
609 entry = active_list.prev;
610 while (ratio && entry != &active_list) {
613 page = list_entry(entry, struct page, lru);
615 if (PageTestandClearReferenced(page)) {
616 list_del(&page->lru);
617 list_add(&page->lru, &active_list);
623 del_page_from_active_list(page);
624 add_page_to_inactive_list(page);
625 SetPageReferenced(page);
/* Rotate the list head so the next aging round resumes where we stopped. */
628 if (entry != &active_list) {
629 list_del(&active_list);
630 list_add(&active_list, entry);
/*
 * shrink_caches - one reclaim pass: reap slab caches first, then age the
 * active list and shrink the inactive list via shrink_cache().
 * NOTE(review): early-return and unlock lines are elided in this excerpt.
 */
634 static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
635 static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
637 nr_pages -= kmem_cache_reap(gfp_mask);
641 spin_lock(&pagemap_lru_lock);
642 refill_inactive(nr_pages, classzone);
644 nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
/*
 * try_to_free_pages_zone - repeat shrink_caches()/VFS-shrink/swap_out passes
 * (up to vm_passes tries) until SWAP_CLUSTER_MAX pages are freed for the
 * classzone, falling through to the OOM killer if everything fails.
 * NOTE(review): the pass loop structure and return statements are elided
 * in this excerpt.
 */
650 static int check_classzone_need_balance(zone_t * classzone);
652 int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
654 gfp_mask = pf_gfp_mask(gfp_mask);
657 int tries = vm_passes;
/* Without __GFP_IO we must not start swap I/O, so mark swapout as failed. */
658 int failed_swapout = !(gfp_mask & __GFP_IO);
659 int nr_pages = SWAP_CLUSTER_MAX;
662 nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
665 shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
666 shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
668 shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
671 failed_swapout = !swap_out(classzone);
674 #ifdef CONFIG_OOM_KILLER
/* Never OOM-kill init (pid 1). */
677 if (likely(current->pid != 1))
679 if (!check_classzone_need_balance(classzone))
682 __set_current_state(TASK_RUNNING);
/*
 * try_to_free_pages - reclaim from the first (preferred) zone of every
 * node's zonelist for the given gfp_mask.  PF_FREE_PAGES is temporarily
 * cleared so freed pages go to the allocator, not to the current task.
 * NOTE(review): declaration of 'error' and the return are elided in this
 * excerpt.
 */
690 int fastcall try_to_free_pages(unsigned int gfp_mask)
693 zonelist_t *zonelist;
694 unsigned long pf_free_pages;
697 pf_free_pages = current->flags & PF_FREE_PAGES;
698 current->flags &= ~PF_FREE_PAGES;
700 for_each_pgdat(pgdat) {
701 zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
702 error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
705 current->flags |= pf_free_pages;
/* Wait queue kswapd sleeps on; allocators wake it when memory runs low. */
709 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
/*
 * check_classzone_need_balance - walk from classzone down to the node's
 * lowest zone, checking free pages against the high watermark for this
 * class index.  NOTE(review): the returns and loop tail are elided in
 * this excerpt.
 */
711 static int check_classzone_need_balance(zone_t * classzone)
714 int class_idx = zone_idx(classzone);
716 first_zone = classzone->zone_pgdat->node_zones;
717 while (classzone >= first_zone) {
718 if (classzone->free_pages > classzone->watermarks[class_idx].high)
/*
 * kswapd_balance_pgdat - reclaim from each zone of one node (highest zone
 * first) that has need_balance set.  When reclaim fails completely we nap
 * 5 seconds rather than busy-loop.  Returns nonzero if any zone still
 * needs balancing.
 * NOTE(review): loop/brace lines are elided in this excerpt.
 */
725 static int kswapd_balance_pgdat(pg_data_t * pgdat)
727 int need_more_balance = 0, i;
730 for (i = pgdat->nr_zones-1; i >= 0; i--) {
731 zone = pgdat->node_zones + i;
732 if (unlikely(current->need_resched))
734 if (!zone->need_balance || !zone->size)
736 if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
737 zone->need_balance = 0;
738 __set_current_state(TASK_INTERRUPTIBLE);
/* Reclaim made no progress: back off so we don't spin on a hopeless zone. */
739 schedule_timeout(HZ*5);
742 if (check_classzone_need_balance(zone))
743 need_more_balance = 1;
745 zone->need_balance = 0;
748 return need_more_balance;
/*
 * kswapd_balance - keep balancing every node until no node reports that
 * it needs more work.
 */
751 static void kswapd_balance(void)
753 int need_more_balance;
757 need_more_balance = 0;
759 for_each_pgdat(pgdat)
760 need_more_balance |= kswapd_balance_pgdat(pgdat);
761 } while (need_more_balance);
/*
 * kswapd_can_sleep_pgdat - true when no zone of this node needs balancing.
 * NOTE(review): the loop body's return statements are elided in this
 * excerpt.
 */
764 static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
769 for (i = pgdat->nr_zones-1; i >= 0; i--) {
770 zone = pgdat->node_zones + i;
771 if (!zone->need_balance || !zone->size)
/*
 * kswapd_can_sleep - true only when every node says it can sleep.
 * NOTE(review): return statements are elided in this excerpt.
 */
779 static int kswapd_can_sleep(void)
783 for_each_pgdat(pgdat) {
784 if (!kswapd_can_sleep_pgdat(pgdat))
/*
 * kswapd - main loop of the background pageout daemon: sleep on kswapd_wait
 * until woken by a low-memory allocator, then call kswapd_balance() (the
 * call itself is elided in this excerpt) and kick disk I/O.
 * NOTE(review): the embedded line numbering is discontinuous — the main
 * for(;;) loop head, the kswapd_balance() call and schedule() are elided.
 */
792 * The background pageout daemon, started as a kernel thread
793 * from the init process.
795 * This basically trickles out pages so that we have _some_
796 * free memory available even if there is no other activity
797 * that frees anything up. This is needed for things like routing
798 * etc, where we otherwise might have all activity going on in
799 * asynchronous contexts that cannot page things out.
801 * If there are applications that are active memory-allocators
802 * (most normal use), this basically shouldn't matter.
804 int kswapd(void *unused)
806 struct task_struct *tsk = current;
807 DECLARE_WAITQUEUE(wait, tsk);
810 strcpy(tsk->comm, "kswapd");
/* Block all signals: kswapd must never be killed or interrupted by them. */
811 sigfillset(&tsk->blocked);
814 * Tell the memory management that we're a "memory allocator",
815 * and that if we need more memory we should get access to it
816 * regardless (see "__alloc_pages()"). "kswapd" should
817 * never get caught in the normal page freeing logic.
819 * (Kswapd normally doesn't need memory anyway, but sometimes
820 * you need a small amount of memory in order to be able to
821 * page out something else, and this flag essentially protects
822 * us from recursively trying to free more memory as we're
823 * trying to free the first piece of memory in the first place).
825 tsk->flags |= PF_MEMALLOC;
831 __set_current_state(TASK_INTERRUPTIBLE);
832 add_wait_queue(&kswapd_wait, &wait);
835 if (kswapd_can_sleep())
838 __set_current_state(TASK_RUNNING);
839 remove_wait_queue(&kswapd_wait, &wait);
842 * If we actually get into a low-memory situation,
843 * the processes needing more memory will wake us
844 * up on a more timely basis.
847 run_task_queue(&tq_disk);
/*
 * kswapd_init - boot-time hook: spawn the kswapd kernel thread sharing
 * fs/files/signal context with its parent.
 * NOTE(review): the return statement is elided in this excerpt.
 */
851 static int __init kswapd_init(void)
853 printk("Starting kswapd\n");
855 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
859 module_init(kswapd_init)