/*
 *  linux/mm/vmscan.c
 *
 *  The pageout daemon, decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * "vm_passes" is the number of vm passes before failing the
 * memory balancing. Take into account that 3 passes are needed
 * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
 * of the inactive list at each pass.
 */
int vm_passes = 60;

/*
 * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
 * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
 * scan 1/6 of the inactive lists during a normal aging round.
 */
int vm_cache_scan_ratio = 6;

/*
 * "vm_mapped_ratio" controls the pageout rate: the smaller it is,
 * the earlier we'll start to page out.
 */
int vm_mapped_ratio = 100;

/*
 * "vm_lru_balance_ratio" controls the balance between active and
 * inactive cache. The bigger vm_lru_balance_ratio is, the easier the
 * active cache will grow, because we'll rotate the active list
 * slowly. A value of 2 means we'll go towards a balance of
 * 1/3 of the cache being inactive.
 */
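/*
 * Annotation (not in the original source): a sketch of where the 1/3
 * figure comes from.  refill_inactive() below computes its deactivation
 * quota roughly as
 *
 *      ratio = nr_pages * nr_active / (nr_inactive * vm_lru_balance_ratio + 1)
 *
 * so the quota matches the reclaim request (nr_pages) when nr_active is
 * about vm_lru_balance_ratio times nr_inactive.  With the default value
 * of 2 that equilibrium leaves 1/(2 + 1) = 1/3 of the cache on the
 * inactive list; a bigger value lets the active list grow larger before
 * deactivation speeds up.
 */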
int vm_lru_balance_ratio = 2;

/*
 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
 * the unused-inode, dentry and dquot caches will be freed during a normal
 * aging round.
 */
int vm_vfs_scan_ratio = 6;

/*
 * "vm_anon_lru" selects whether to immediately insert anon pages in the
 * lru. Immediately means as soon as they're allocated during the
 * page faults.
 *
 * If this is set to 0, they're inserted only after the first
 * swapout.
 *
 * Having anon pages immediately inserted in the lru allows the
 * VM to know better when it's worthwhile to start swapping
 * anonymous ram, it will start to swap earlier and it should
 * swap smoother and faster, but it will decrease scalability
 * on >16-way machines by an order of magnitude. Big SMP/NUMA
 * definitely can't take a hit on a global spinlock at
 * every anon page allocation. So this is off by default.
 *
 * Low-ram machines that swap all the time want to turn
 * this on (i.e. set to 1).
 */
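/*
 * Annotation (not in the original source): these vm_* knobs are plain
 * global ints so they can be tuned at runtime; 2.4 trees carrying this
 * VM normally export them through sysctl (kernel/sysctl.c), typically
 * under /proc/sys/vm/.  Whether a given tree exports all of them is an
 * assumption -- check its sysctl.c.
 */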
int vm_anon_lru = 0;

/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
        pte_t pte;
        swp_entry_t entry;

        /* Don't look at this pte if it's been accessed recently. */
        if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
                mark_page_accessed(page);
                return 0;
        }

        /* Don't bother unmapping pages that are active */
        if (PageActive(page))
                return 0;

        /* Don't bother replenishing zones not under pressure.. */
        if (!memclass(page_zone(page), classzone))
                return 0;

        if (TryLockPage(page))
                return 0;

        /* From this point on, the odds are that we're going to
         * nuke this pte, so read and clear the pte.  This hook
         * is needed on CPUs which update the accessed and dirty
         * bits in hardware.
         */
        flush_cache_page(vma, address);
        pte = ptep_get_and_clear(page_table);
        flush_tlb_page(vma, address);

        if (pte_dirty(pte))
                set_page_dirty(page);

        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         */
        if (PageSwapCache(page)) {
                entry.val = page->index;
                swap_duplicate(entry);
set_swap_pte:
                set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
                mm->rss--;
                UnlockPage(page);
                {
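                        /*
                         * Annotation (not in the original source): the page
                         * was unlocked above and the reference the pte held
                         * is dropped by page_cache_release() below.  The test
                         * below appears to mean: ignoring the extra reference
                         * held by any buffers, at most two references remain
                         * (the page/swap cache and the one we are about to
                         * drop), i.e. unmapping this pte made the page
                         * reclaimable.
                         */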
                        int freeable = page_count(page) - !!page->buffers <= 2;
                        page_cache_release(page);
                        return freeable;
                }
        }

        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..  or if it's dirty but has backing store,
         * just mark the page dirty and drop it.
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "refill_inactive()".
         */
        if (page->mapping)
                goto drop_pte;
        if (!PageDirty(page))
                goto drop_pte;

        /*
         * Anonymous buffercache pages can be left behind by
         * concurrent truncate and pagefault.
         */
        if (page->buffers)
                goto preserve;

        /*
         * This is a dirty, swappable page.  First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        for (;;) {
                entry = get_swap_page();
                if (!entry.val)
                        break;
                /* Add it to the swap cache and mark it dirty
                 * (adding to the page cache will clear the dirty
                 * and uptodate bits, so we need to do it again)
                 */
                if (add_to_swap_cache(page, entry) == 0) {
                        SetPageUptodate(page);
                        set_page_dirty(page);
                        goto set_swap_pte;
                }
                /* Raced with "speculative" read_swap_cache_async */
                swap_free(entry);
        }

        /* No swap space left */
preserve:
        set_pte(page_table, pte);
        UnlockPage(page);
        return 0;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return count;
        if (pmd_bad(*dir)) {
                pmd_ERROR(*dir);
                pmd_clear(dir);
                return count;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                if (pte_present(*pte)) {
                        struct page *page = pte_page(*pte);

                        if (VALID_PAGE(page) && !PageReserved(page)) {
                                count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
                                if (!count) {
                                        address += PAGE_SIZE;
                                        break;
                                }
                        }
                }
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        mm->swap_address = address;
        return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return count;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return count;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
                if (!count)
                        break;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are reserved */
        if (vma->vm_flags & VM_RESERVED)
                return count;

        pgdir = pgd_offset(mm, address);

        end = vma->vm_end;
        BUG_ON(address >= end);
        do {
                count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
                if (!count)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return count;
}

/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;

/*
 * Returns remaining count of pages to be swapped out by followup call.
 */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Find the proper vm-area after freezing the vma chain
         * and ptes.
         */
        spin_lock(&mm->page_table_lock);
        address = mm->swap_address;
        if (address == TASK_SIZE || swap_mm != mm) {
                /* We raced: don't count this mm but try again */
                ++*mmcounter;
                goto out_unlock;
        }
        vma = find_vma(mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        count = swap_out_vma(mm, vma, address, count, classzone);
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        if (!count)
                                goto out_unlock;
                        address = vma->vm_start;
                }
        }
        /* Indicate that we reached the end of address space */
        mm->swap_address = TASK_SIZE;

out_unlock:
        spin_unlock(&mm->page_table_lock);
        return count;
}

static int FASTCALL(swap_out(zone_t * classzone));
static int fastcall swap_out(zone_t * classzone)
{
        int counter, nr_pages = SWAP_CLUSTER_MAX;
        struct mm_struct *mm;

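        /*
         * Annotation (not in the original source): the counter allows
         * roughly two passes over the mmlist per call; swap_out_mm()
         * bumps it back up when it detects it raced with another
         * reclaimer, so a raced mm is not charged against us.
         */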
        counter = mmlist_nr << 1;
        do {
                if (unlikely(current->need_resched)) {
                        __set_current_state(TASK_RUNNING);
                        schedule();
                }

                spin_lock(&mmlist_lock);
                mm = swap_mm;
                while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
                        mm->swap_address = 0;
                        mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
                        if (mm == swap_mm)
                                goto empty;
                        swap_mm = mm;
                }

                /* Make sure the mm doesn't disappear when we drop the lock.. */
                atomic_inc(&mm->mm_users);
                spin_unlock(&mmlist_lock);

                nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

                mmput(mm);

                if (!nr_pages)
                        return 1;
        } while (--counter >= 0);

        return 0;

empty:
        spin_unlock(&mmlist_lock);
        return 0;
}

static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
{
        struct list_head * entry;
        int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
        int max_mapped = vm_mapped_ratio * nr_pages;
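        /*
         * Annotation (not in the original source): max_scan bounds how many
         * inactive pages we are willing to look at in this pass (roughly
         * 1/vm_cache_scan_ratio of the classzone's LRU pages), while
         * max_mapped is the budget of mapped/unfreeable pages we tolerate
         * before falling back to the slab/VFS shrinkers and swap_out()
         * further down.
         */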

        while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
                struct page * page;

                if (unlikely(current->need_resched)) {
                        spin_unlock(&pagemap_lru_lock);
                        __set_current_state(TASK_RUNNING);
                        schedule();
                        spin_lock(&pagemap_lru_lock);
                        continue;
                }

                page = list_entry(entry, struct page, lru);

                BUG_ON(!PageLRU(page));
                BUG_ON(PageActive(page));

                list_del(entry);
                list_add(entry, &inactive_list);

                /*
                 * Zero page counts can happen because we unlink the pages
                 * _after_ decrementing the usage count..
                 */
                if (unlikely(!page_count(page)))
                        continue;

                if (!memclass(page_zone(page), classzone))
                        continue;

                max_scan--;

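                /*
                 * Annotation (not in the original source): a page with no
                 * buffers whose count is above one is still referenced by
                 * something other than the pagecache (typically it is
                 * mapped into page tables), and an anon page without a
                 * mapping is not in the swap cache yet; neither can be
                 * freed here, so they are diverted to the page_mapped
                 * accounting below instead of being trylocked.
                 */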
                /* Racy check to avoid trylocking when not worthwhile */
                if (!page->buffers && (page_count(page) != 1 || !page->mapping))
                        goto page_mapped;

                /*
                 * The page is locked. IO in progress?
                 * Move it to the back of the list.
                 */
                if (unlikely(TryLockPage(page))) {
                        if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
                                page_cache_get(page);
                                spin_unlock(&pagemap_lru_lock);
                                wait_on_page(page);
                                page_cache_release(page);
                                spin_lock(&pagemap_lru_lock);
                        }
                        continue;
                }

                if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
                        /*
                         * It is not critical here to write it only if
                         * the page is unmapped, because any direct writer
                         * like O_DIRECT would set the PG_dirty bitflag
                         * on the physical page after having successfully
                         * pinned it and after the I/O to the page is finished,
                         * so the direct writes to the page cannot get lost.
                         */
                        int (*writepage)(struct page *);

                        writepage = page->mapping->a_ops->writepage;
                        if ((gfp_mask & __GFP_FS) && writepage) {
                                ClearPageDirty(page);
                                SetPageLaunder(page);
                                page_cache_get(page);
                                spin_unlock(&pagemap_lru_lock);

                                writepage(page);
                                page_cache_release(page);

                                spin_lock(&pagemap_lru_lock);
                                continue;
                        }
                }

                /*
                 * If the page has buffers, try to free the buffer mappings
                 * associated with this page. If we succeed we try to free
                 * the page as well.
                 */
                if (page->buffers) {
                        spin_unlock(&pagemap_lru_lock);

                        /* avoid freeing a locked page */
                        page_cache_get(page);

                        if (try_to_release_page(page, gfp_mask)) {
                                if (!page->mapping) {
                                        /*
                                         * We must not allow an anon page
                                         * with no buffers to be visible on
                                         * the LRU, so we unlock the page after
                                         * taking the lru lock
                                         */
                                        spin_lock(&pagemap_lru_lock);
                                        UnlockPage(page);
                                        __lru_cache_del(page);

                                        /* effectively free the page here */
                                        page_cache_release(page);

                                        if (--nr_pages)
                                                continue;
                                        break;
                                } else {
                                        /*
                                         * The page is still in the pagecache, so undo
                                         * the reference we took before calling
                                         * try_to_release_page: we've not finished with
                                         * the page and can now try the next step.
                                         */
                                        page_cache_release(page);

                                        spin_lock(&pagemap_lru_lock);
                                }
                        } else {
                                /* failed to drop the buffers so stop here */
                                UnlockPage(page);
                                page_cache_release(page);

                                spin_lock(&pagemap_lru_lock);
                                continue;
                        }
                }

                spin_lock(&pagecache_lock);

                /*
                 * This is the non-racy check for a busy page.
                 * It is critical to check PageDirty _after_ we made sure
                 * the page is freeable, so it is not in use by anybody.
                 * At this point we're guaranteed that page->buffers is NULL,
                 * and nobody can refill page->buffers under us because we
                 * still hold the page lock.
                 */
                if (!page->mapping || page_count(page) > 1) {
                        spin_unlock(&pagecache_lock);
                        UnlockPage(page);
page_mapped:
                        if (--max_mapped < 0) {
                                spin_unlock(&pagemap_lru_lock);

                                nr_pages -= kmem_cache_reap(gfp_mask);
                                if (nr_pages <= 0)
                                        goto out;

                                shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
                                shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
                                shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif

                                if (!*failed_swapout)
                                        *failed_swapout = !swap_out(classzone);

                                max_mapped = nr_pages * vm_mapped_ratio;

                                spin_lock(&pagemap_lru_lock);
                                refill_inactive(nr_pages, classzone);
                        }
                        continue;
                }
                smp_rmb();
                if (PageDirty(page)) {
                        spin_unlock(&pagecache_lock);
                        UnlockPage(page);
                        continue;
                }

                __lru_cache_del(page);

                /* point of no return */
                if (likely(!PageSwapCache(page))) {
                        __remove_inode_page(page);
                        spin_unlock(&pagecache_lock);
                } else {
                        swp_entry_t swap;
                        swap.val = page->index;
                        __delete_from_swap_cache(page);
                        spin_unlock(&pagecache_lock);
                        swap_free(swap);
                }

                UnlockPage(page);

                /* effectively free the page here */
                page_cache_release(page);

                if (--nr_pages)
                        continue;
                break;
        }
        spin_unlock(&pagemap_lru_lock);

 out:
        return nr_pages;
}

/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
{
        struct list_head * entry;
        unsigned long ratio;

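        /*
         * Annotation (not in the original source): "ratio" is the
         * deactivation quota for this pass; see the comment at the
         * vm_lru_balance_ratio definition near the top of the file for
         * how the default settles around 1/3 of the cache inactive.
         */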
        ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);

        entry = active_list.prev;
        while (ratio && entry != &active_list) {
                struct page * page;

                page = list_entry(entry, struct page, lru);
                entry = entry->prev;
                if (PageTestandClearReferenced(page)) {
                        list_del(&page->lru);
                        list_add(&page->lru, &active_list);
                        continue;
                }

                ratio--;

                del_page_from_active_list(page);
                add_page_to_inactive_list(page);
                SetPageReferenced(page);
        }

        if (entry != &active_list) {
                list_del(&active_list);
                list_add(&active_list, entry);
        }
}

static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
{
        nr_pages -= kmem_cache_reap(gfp_mask);
        if (nr_pages <= 0)
                goto out;

        spin_lock(&pagemap_lru_lock);
        refill_inactive(nr_pages, classzone);

        nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);

out:
        return nr_pages;
}

static int check_classzone_need_balance(zone_t * classzone);

int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
{
        gfp_mask = pf_gfp_mask(gfp_mask);

        for (;;) {
                int tries = vm_passes;
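                /*
                 * Annotation (not in the original source): when the caller
                 * cannot do I/O (__GFP_IO clear) we pretend swapout has
                 * already failed, so neither this loop nor shrink_cache()
                 * will call swap_out().
                 */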
                int failed_swapout = !(gfp_mask & __GFP_IO);
                int nr_pages = SWAP_CLUSTER_MAX;

                do {
                        nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
                        if (nr_pages <= 0)
                                return 1;
                        shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
                        shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
                        shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif
                        if (!failed_swapout)
                                failed_swapout = !swap_out(classzone);
                } while (--tries);

#ifdef CONFIG_OOM_KILLER
                out_of_memory();
#else
                if (likely(current->pid != 1))
                        break;
                if (!check_classzone_need_balance(classzone))
                        break;

                __set_current_state(TASK_RUNNING);
                yield();
#endif
        }

        return 0;
}

int fastcall try_to_free_pages(unsigned int gfp_mask)
{
        pg_data_t *pgdat;
        zonelist_t *zonelist;
        unsigned long pf_free_pages;
        int error = 0;

        pf_free_pages = current->flags & PF_FREE_PAGES;
        current->flags &= ~PF_FREE_PAGES;

        for_each_pgdat(pgdat) {
                zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
                error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
        }

        current->flags |= pf_free_pages;
        return error;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

static int check_classzone_need_balance(zone_t * classzone)
{
        zone_t * first_zone;
        int class_idx = zone_idx(classzone);

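        /*
         * Annotation (not in the original source): allocations limited to
         * this classzone can fall back to any lower zone of the node, so
         * no further balancing is needed as soon as one zone in that
         * fallback chain is above the high watermark for this class index.
         */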
        first_zone = classzone->zone_pgdat->node_zones;
        while (classzone >= first_zone) {
                if (classzone->free_pages > classzone->watermarks[class_idx].high)
                        return 0;
                classzone--;
        }
        return 1;
}

static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
        int need_more_balance = 0, i;
        zone_t * zone;

        for (i = pgdat->nr_zones-1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (unlikely(current->need_resched))
                        schedule();
                if (!zone->need_balance || !zone->size)
                        continue;
                if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
                        zone->need_balance = 0;
                        __set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(HZ*5);
                        continue;
                }
                if (check_classzone_need_balance(zone))
                        need_more_balance = 1;
                else
                        zone->need_balance = 0;
        }

        return need_more_balance;
}

static void kswapd_balance(void)
{
        int need_more_balance;
        pg_data_t * pgdat;

        do {
                need_more_balance = 0;

                for_each_pgdat(pgdat)
                        need_more_balance |= kswapd_balance_pgdat(pgdat);
        } while (need_more_balance);
}

static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
{
        zone_t * zone;
        int i;

        for (i = pgdat->nr_zones-1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (!zone->need_balance || !zone->size)
                        continue;
                return 0;
        }

        return 1;
}

static int kswapd_can_sleep(void)
{
        pg_data_t * pgdat;

        for_each_pgdat(pgdat) {
                if (!kswapd_can_sleep_pgdat(pgdat))
                        return 0;
        }

        return 1;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        daemonize();
        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;

        /*
         * Kswapd main loop.
         */
        for (;;) {
                __set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue(&kswapd_wait, &wait);

                mb();
                if (kswapd_can_sleep())
                        schedule();

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&kswapd_wait, &wait);

                /*
                 * If we actually get into a low-memory situation,
                 * the processes needing more memory will wake us
                 * up on a more timely basis.
                 */
                kswapd_balance();
                run_task_queue(&tq_disk);
        }
}

static int __init kswapd_init(void)
{
        printk("Starting kswapd\n");
        swap_setup();
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        return 0;
}

module_init(kswapd_init)