 *  The pageout daemon decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>
/*
 * The "priority" of VM scanning is how much of the queues we
 * will scan in one go. A value of 6 for DEF_PRIORITY implies
 * that we'll scan 1/64th of the queues ("queue_length >> 6")
 * during a normal aging round.
 */
#define DEF_PRIORITY (6)
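/*
 * try_to_free_pages_zone() below starts at DEF_PRIORITY and decrements
 * the priority value on every unsuccessful round, so repeated failure to
 * reclaim makes each successive pass scan a larger share of the queues.
 */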
/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */
/* mm->page_table_lock is held. mmap_sem is not held */
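/*
 * try_to_swap_out() unmaps a single pte: it returns 1 when the pte was
 * dropped and the page looks freeable (only the swap/page cache reference
 * and the one we are about to drop remain), and 0 otherwise.
 */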
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
	/* Don't look at this pte if it's been accessed recently. */
	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
		mark_page_accessed(page);
		return 0;
	}
	/* Don't bother unmapping pages that are active */
	if (PageActive(page))
		return 0;
	/* Don't bother replenishing zones not under pressure.. */
	if (!memclass(page_zone(page), classzone))
		return 0;
	if (TryLockPage(page))
		return 0;
	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);
	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
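		/*
		 * A page is "freeable" here if, ignoring a possible buffer
		 * reference, only the swap/page cache's reference and the
		 * one held for the pte we just dropped remain.
		 */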
		int freeable = page_count(page) - !!page->buffers <= 2;
		page_cache_release(page);
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it.. or if it's dirty but has backing store,
	 * just mark the page dirty and drop it.
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (!PageDirty(page))
	/*
	 * Anonymous buffercache pages can be left behind by
	 * concurrent truncate and pagefault.
	 */
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();

	/* Add it to the swap cache and mark it dirty
	 * (adding to the page cache will clear the dirty
	 * and uptodate bits, so we need to do it again)
	 */
	if (add_to_swap_cache(page, entry) == 0) {
		SetPageUptodate(page);
		set_page_dirty(page);

	/* Raced with "speculative" read_swap_cache_async */

	/* No swap space left */
	set_pte(page_table, pte);
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
	pte_t * pte;
	unsigned long pmd_end;

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;

	do {
		if (pte_present(*pte)) {
			struct page *page = pte_page(*pte);

			if (VALID_PAGE(page) && !PageReserved(page)) {
				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
				if (!count) {
					address += PAGE_SIZE;
					break;
				}
			}
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
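	/* Remember how far we got, so that the next pass over this mm resumes here. */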
	mm->swap_address = address;
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
	pmd_t * pmd;
	unsigned long pgd_end;

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
	/* Don't swap out areas which are reserved */
	if (vma->vm_flags & VM_RESERVED)
		return count;

	pgdir = pgd_offset(mm, address);

	BUG_ON(address >= end);
	do {
		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;
/* Returns the remaining count of pages to be swapped out by a follow-up call. */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	if (address == TASK_SIZE || swap_mm != mm) {
		/* We raced: don't count this mm but try again */
		++*mmcounter;
		goto out_unlock;
	}
	vma = find_vma(mm, address);
	if (address < vma->vm_start)
		address = vma->vm_start;

	count = swap_out_vma(mm, vma, address, count, classzone);
	address = vma->vm_start;

	/* Indicate that we reached the end of address space */
	mm->swap_address = TASK_SIZE;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return count;
static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
	int counter, nr_pages = SWAP_CLUSTER_MAX;
	struct mm_struct *mm;

		if (unlikely(current->need_resched)) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&mmlist_lock);
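		/*
		 * Walk the global mmlist round-robin, starting where the last
		 * call left off (swap_mm), skipping init_mm and any mm whose
		 * address space has already been scanned to the end.
		 */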
		mm = swap_mm;
		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
			mm->swap_address = 0;
			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
		/* Make sure the mm doesn't disappear when we drop the lock.. */
		atomic_inc(&mm->mm_users);
		spin_unlock(&mmlist_lock);

		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

	} while (--counter >= 0);

	spin_unlock(&mmlist_lock);
static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
	struct list_head * entry;
	int max_scan = nr_inactive_pages / priority;
	int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);
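	/*
	 * max_scan bounds how many inactive pages this pass will look at;
	 * max_mapped bounds how many mapped pages we may skip before giving
	 * up on the cache and calling swap_out() instead.
	 */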
	spin_lock(&pagemap_lru_lock);
	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
		struct page * page;

		if (unlikely(current->need_resched)) {
			spin_unlock(&pagemap_lru_lock);
			__set_current_state(TASK_RUNNING);
			schedule();
			spin_lock(&pagemap_lru_lock);
			continue;
		}

		page = list_entry(entry, struct page, lru);
		BUG_ON(!PageLRU(page));
		BUG_ON(PageActive(page));

		list_del(entry);
		list_add(entry, &inactive_list);
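		/*
		 * The page has just been moved from the tail to the head of
		 * the inactive list, so if we skip it below it gets a full
		 * trip around the list before it is examined again.
		 */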
		/*
		 * Zero page counts can happen because we unlink the pages
		 * _after_ decrementing the usage count..
		 */
		if (unlikely(!page_count(page)))
			continue;

		if (!memclass(page_zone(page), classzone))
			continue;

		/* Racy check to avoid trylocking when not worthwhile */
		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (unlikely(TryLockPage(page))) {
			if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);
				wait_on_page(page);
				page_cache_release(page);
				spin_lock(&pagemap_lru_lock);
			}
			continue;
		}
		if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
			/*
			 * It is not critical here to write it only if
			 * the page is unmapped, because any direct writer
			 * like O_DIRECT would set the PG_dirty bitflag
			 * on the physical page after having successfully
			 * pinned it, and only after the I/O to the page
			 * is finished, so the direct writes to the page
			 * cannot get lost.
			 */
			int (*writepage)(struct page *);

			writepage = page->mapping->a_ops->writepage;
			if ((gfp_mask & __GFP_FS) && writepage) {
				ClearPageDirty(page);
				SetPageLaunder(page);
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);

				writepage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}
		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 */
		if (page->buffers) {
			spin_unlock(&pagemap_lru_lock);

			/* avoid freeing a locked page */
			page_cache_get(page);

			if (try_to_release_page(page, gfp_mask)) {
				if (!page->mapping) {
					/*
					 * We must not allow an anon page
					 * with no buffers to be visible on
					 * the LRU, so we unlock the page after
					 * taking the lru lock.
					 */
					spin_lock(&pagemap_lru_lock);
					UnlockPage(page);
					__lru_cache_del(page);

					/* effectively free the page here */
					page_cache_release(page);
				} else {
					/*
					 * The page is still in the pagecache, so undo
					 * the work done before try_to_release_page();
					 * we've not finished with it and can now try
					 * the next step.
					 */
					page_cache_release(page);

					spin_lock(&pagemap_lru_lock);
				}
			} else {
				/* failed to drop the buffers so stop here */
				UnlockPage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
			}
		}
		spin_lock(&pagecache_lock);

		/*
		 * This is the non-racy check for a busy page.
		 */
		if (!page->mapping || !is_page_cache_freeable(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			if (--max_mapped >= 0)
				continue;
			/*
			 * Alert! We've found too many mapped pages on the
			 * inactive list, so we start swapping out now!
			 */
			spin_unlock(&pagemap_lru_lock);
			swap_out(priority, gfp_mask, classzone);
			return nr_pages;
		}
		/*
		 * It is critical to check PageDirty _after_ we made sure
		 * the page is freeable (so it is not in use by anybody).
		 */
		if (PageDirty(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			continue;
		}

		/* point of no return */
		if (likely(!PageSwapCache(page))) {
			__remove_inode_page(page);
			spin_unlock(&pagecache_lock);
		} else {
			swp_entry_t swap;
			swap.val = page->index;
			__delete_from_swap_cache(page);
			spin_unlock(&pagecache_lock);
			swap_free(swap);
		}

		__lru_cache_del(page);
		UnlockPage(page);

		/* effectively free the page here */
		page_cache_release(page);
	spin_unlock(&pagemap_lru_lock);
	return nr_pages;
/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void refill_inactive(int nr_pages)
	struct list_head * entry;

	spin_lock(&pagemap_lru_lock);
	entry = active_list.prev;
	while (nr_pages && entry != &active_list) {
		struct page * page;

		page = list_entry(entry, struct page, lru);
		entry = entry->prev;
		if (PageTestandClearReferenced(page)) {
			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
		}

		nr_pages--;

		del_page_from_active_list(page);
		add_page_to_inactive_list(page);
		SetPageReferenced(page);
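		/*
		 * Leave the referenced bit set on the page we just
		 * deactivated, so that a single further access is enough for
		 * mark_page_accessed() to move it back to the active list.
		 */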
	}
	spin_unlock(&pagemap_lru_lock);
static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
	int chunk_size = nr_pages;
	unsigned long ratio;

	nr_pages -= kmem_cache_reap(gfp_mask);
	if (nr_pages <= 0)
		return 0;

	nr_pages = chunk_size;
	/* try to keep the active list 2/3 of the size of the cache */
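	/*
	 * With the 2:1 active:inactive target, the formula below deactivates
	 * about nr_pages pages when the lists are already at the target
	 * ratio, and proportionally more when the active list has grown
	 * beyond it.
	 */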
	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
	refill_inactive(ratio);

	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
	shrink_dcache_memory(priority, gfp_mask);
	shrink_icache_memory(priority, gfp_mask);
	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
	int priority = DEF_PRIORITY;
	int nr_pages = SWAP_CLUSTER_MAX;

	gfp_mask = pf_gfp_mask(gfp_mask);
	do {
		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
		if (nr_pages <= 0)
			return 1;
	} while (--priority);
	/*
	 * Hmm.. Cache shrink failed - time to kill something?
	 * Mhwahahhaha! This is the part I really like. Giggle.
	 */
int try_to_free_pages(unsigned int gfp_mask)
	pg_data_t *pgdat;
	zonelist_t *zonelist;
	unsigned long pf_free_pages;
	int error = 0;

	pf_free_pages = current->flags & PF_FREE_PAGES;
	current->flags &= ~PF_FREE_PAGES;
	for_each_pgdat(pgdat) {
		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
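		/*
		 * zones[0] is the preferred zone of this node's zonelist for
		 * the given gfp_mask; reclaiming against it as the classzone
		 * also covers every lower zone via memclass().
		 */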
		error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
	}
	current->flags |= pf_free_pages;
	return error;
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

static int check_classzone_need_balance(zone_t * classzone)
	zone_t * first_classzone;

	first_classzone = classzone->zone_pgdat->node_zones;
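	/*
	 * Walk from the classzone down to the lowest zone of this node: if
	 * any zone in that range is above its pages_high watermark, requests
	 * can still be satisfied from it and no balancing is needed.
	 */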
	while (classzone >= first_classzone) {
		if (classzone->free_pages > classzone->pages_high)
			return 0;
		classzone--;
static int kswapd_balance_pgdat(pg_data_t * pgdat)
	int need_more_balance = 0, i;
	zone_t * zone;
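	/* Balance every zone of this node, from the highest zone downwards. */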
	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (unlikely(current->need_resched))
			schedule();
		if (!zone->need_balance)
			continue;
		if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
			zone->need_balance = 0;
			__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);
			continue;
		}
		if (check_classzone_need_balance(zone))
			need_more_balance = 1;
		else
			zone->need_balance = 0;
	}

	return need_more_balance;
static void kswapd_balance(void)
	int need_more_balance;
	pg_data_t * pgdat;

	do {
		need_more_balance = 0;

		for_each_pgdat(pgdat)
			need_more_balance |= kswapd_balance_pgdat(pgdat);
	} while (need_more_balance);
static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!zone->need_balance)
			continue;
		return 0;
	}
	return 1;
static int kswapd_can_sleep(void)
	for_each_pgdat(pgdat) {
		if (!kswapd_can_sleep_pgdat(pgdat))
			return 0;
	}
	return 1;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	daemonize();
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&kswapd_wait, &wait);

		if (kswapd_can_sleep())
			schedule();

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&kswapd_wait, &wait);
		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		kswapd_balance();
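		/*
		 * Kick the disk task queue so that any I/O queued up by the
		 * balancing pass above is actually submitted before kswapd
		 * goes back to sleep.
		 */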
		run_task_queue(&tq_disk);
static int __init kswapd_init(void)
	printk("Starting kswapd\n");
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;

module_init(kswapd_init)