 *  The pageout daemon decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>
/*
 * The "priority" of VM scanning is how much of the queues we
 * will scan in one go. A value of 6 for DEF_PRIORITY implies
 * that we'll scan 1/64th of the queues ("queue_length >> 6")
 * during a normal aging round.
 */
#define DEF_PRIORITY (6)
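/*
 * try_to_free_pages_zone() below starts at DEF_PRIORITY and decrements
 * the priority value on every unsuccessful round, so repeated failure to
 * reclaim makes each successive pass scan a larger share of the queues.
 */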
/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */
/* mm->page_table_lock is held. mmap_sem is not held */
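/*
 * try_to_swap_out() unmaps a single pte: it returns 1 when the pte was
 * dropped and the page looks freeable (only the swap/page cache reference
 * and the one we are about to drop remain), and 0 otherwise.
 */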
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
	/* Don't look at this pte if it's been accessed recently. */
	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
		mark_page_accessed(page);
		return 0;
	}
	/* Don't bother unmapping pages that are active */
	if (PageActive(page))
		return 0;
	/* Don't bother replenishing zones not under pressure.. */
	if (!memclass(page_zone(page), classzone))
		return 0;
	if (TryLockPage(page))
		return 0;
	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);
	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
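		/*
		 * A page is "freeable" here if, ignoring a possible buffer
		 * reference, only the swap/page cache's reference and the
		 * one held for the pte we just dropped remain.
		 */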
		int freeable = page_count(page) - !!page->buffers <= 2;
		page_cache_release(page);
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it.. or if it's dirty but has backing store,
	 * just mark the page dirty and drop it.
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (!PageDirty(page))
	/*
	 * Anonymous buffercache pages can be left behind by
	 * concurrent truncate and pagefault.
	 */
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();

	/* Add it to the swap cache and mark it dirty
	 * (adding to the page cache will clear the dirty
	 * and uptodate bits, so we need to do it again)
	 */
	if (add_to_swap_cache(page, entry) == 0) {
		SetPageUptodate(page);
		set_page_dirty(page);

	/* Raced with "speculative" read_swap_cache_async */

	/* No swap space left */
	set_pte(page_table, pte);
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
	pte_t * pte;
	unsigned long pmd_end;

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;

	do {
		if (pte_present(*pte)) {
			struct page *page = pte_page(*pte);

			if (VALID_PAGE(page) && !PageReserved(page)) {
				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
				if (!count) {
					address += PAGE_SIZE;
					break;
				}
			}
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
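	/* Remember how far we got, so that the next pass over this mm resumes here. */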
	mm->swap_address = address;
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
	pmd_t * pmd;
	unsigned long pgd_end;

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
	/* Don't swap out areas which are reserved */
	if (vma->vm_flags & VM_RESERVED)
		return count;

	pgdir = pgd_offset(mm, address);

	BUG_ON(address >= end);
	do {
		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;
/* Returns the remaining count of pages to be swapped out by a follow-up call. */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	if (address == TASK_SIZE || swap_mm != mm) {
		/* We raced: don't count this mm but try again */
		++*mmcounter;
		goto out_unlock;
	}
	vma = find_vma(mm, address);
	if (address < vma->vm_start)
		address = vma->vm_start;

	count = swap_out_vma(mm, vma, address, count, classzone);
	address = vma->vm_start;

	/* Indicate that we reached the end of address space */
	mm->swap_address = TASK_SIZE;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return count;
static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
	int counter, nr_pages = SWAP_CLUSTER_MAX;
	struct mm_struct *mm;

		if (unlikely(current->need_resched)) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&mmlist_lock);
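		/*
		 * Walk the global mmlist round-robin, starting where the last
		 * call left off (swap_mm), skipping init_mm and any mm whose
		 * address space has already been scanned to the end.
		 */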
		mm = swap_mm;
		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
			mm->swap_address = 0;
			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
		/* Make sure the mm doesn't disappear when we drop the lock.. */
		atomic_inc(&mm->mm_users);
		spin_unlock(&mmlist_lock);

		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

	} while (--counter >= 0);

	spin_unlock(&mmlist_lock);
static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
	struct list_head * entry;
	int max_scan = nr_inactive_pages / priority;
	int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);
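	/*
	 * max_scan bounds how many inactive pages this pass will look at;
	 * max_mapped bounds how many mapped pages we may skip before giving
	 * up on the cache and calling swap_out() instead.
	 */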
	spin_lock(&pagemap_lru_lock);
	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
		struct page * page;

		if (unlikely(current->need_resched)) {
			spin_unlock(&pagemap_lru_lock);
			__set_current_state(TASK_RUNNING);
			schedule();
			spin_lock(&pagemap_lru_lock);
			continue;
		}

		page = list_entry(entry, struct page, lru);
		BUG_ON(!PageLRU(page));
		BUG_ON(PageActive(page));

		list_del(entry);
		list_add(entry, &inactive_list);
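		/*
		 * The page has just been moved from the tail to the head of
		 * the inactive list, so if we skip it below it gets a full
		 * trip around the list before it is examined again.
		 */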
		/*
		 * Zero page counts can happen because we unlink the pages
		 * _after_ decrementing the usage count..
		 */
		if (unlikely(!page_count(page)))
			continue;

		if (!memclass(page_zone(page), classzone))
			continue;

		/* Racy check to avoid trylocking when not worthwhile */
		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (unlikely(TryLockPage(page))) {
			if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);
				wait_on_page(page);
				page_cache_release(page);
				spin_lock(&pagemap_lru_lock);
			}
			continue;
		}
		if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
			/*
			 * It is not critical here to write it only if
			 * the page is unmapped, because any direct writer
			 * like O_DIRECT would set the PG_dirty bitflag
			 * on the physical page after having successfully
			 * pinned it, and only after the I/O to the page
			 * is finished, so the direct writes to the page
			 * cannot get lost.
			 */
			int (*writepage)(struct page *);

			writepage = page->mapping->a_ops->writepage;
			if ((gfp_mask & __GFP_FS) && writepage) {
				ClearPageDirty(page);
				SetPageLaunder(page);
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);

				writepage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}
		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 */
		if (page->buffers) {
			spin_unlock(&pagemap_lru_lock);

			/* avoid freeing a locked page */
			page_cache_get(page);

			if (try_to_release_page(page, gfp_mask)) {
				if (!page->mapping) {
					/*
					 * We must not allow an anon page
					 * with no buffers to be visible on
					 * the LRU, so we unlock the page after
					 * taking the lru lock.
					 */
					spin_lock(&pagemap_lru_lock);
					UnlockPage(page);
					__lru_cache_del(page);

					/* effectively free the page here */
					page_cache_release(page);
				} else {
					/*
					 * The page is still in the pagecache, so undo
					 * the work done before try_to_release_page();
					 * we've not finished with it and can now try
					 * the next step.
					 */
					page_cache_release(page);

					spin_lock(&pagemap_lru_lock);
				}
			} else {
				/* failed to drop the buffers so stop here */
				UnlockPage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
			}
		}
		spin_lock(&pagecache_lock);

		/*
		 * This is the non-racy check for a busy page.
		 */
		if (!page->mapping || !is_page_cache_freeable(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			if (--max_mapped >= 0)
				continue;
			/*
			 * Alert! We've found too many mapped pages on the
			 * inactive list, so we start swapping out now!
			 */
			spin_unlock(&pagemap_lru_lock);
			swap_out(priority, gfp_mask, classzone);
			return nr_pages;
		}
		/*
		 * It is critical to check PageDirty _after_ we made sure
		 * the page is freeable (so it is not in use by anybody).
		 */
		if (PageDirty(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			continue;
		}

		/* point of no return */
		if (likely(!PageSwapCache(page))) {
			__remove_inode_page(page);
			spin_unlock(&pagecache_lock);
		} else {
			swp_entry_t swap;
			swap.val = page->index;
			__delete_from_swap_cache(page);
			spin_unlock(&pagecache_lock);
			swap_free(swap);
		}

		__lru_cache_del(page);
		UnlockPage(page);

		/* effectively free the page here */
		page_cache_release(page);
	spin_unlock(&pagemap_lru_lock);
	return nr_pages;
/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void refill_inactive(int nr_pages)
	struct list_head * entry;

	spin_lock(&pagemap_lru_lock);
	entry = active_list.prev;
	while (nr_pages && entry != &active_list) {
		struct page * page;

		page = list_entry(entry, struct page, lru);
		entry = entry->prev;
		if (PageTestandClearReferenced(page)) {
			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
		}

		nr_pages--;

		del_page_from_active_list(page);
		add_page_to_inactive_list(page);
		SetPageReferenced(page);
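		/*
		 * Leave the referenced bit set on the page we just
		 * deactivated, so that a single further access is enough for
		 * mark_page_accessed() to move it back to the active list.
		 */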
	}
	spin_unlock(&pagemap_lru_lock);
static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
	int chunk_size = nr_pages;
	unsigned long ratio;

	nr_pages -= kmem_cache_reap(gfp_mask);
	if (nr_pages <= 0)
		return 0;

	nr_pages = chunk_size;
	/* try to keep the active list 2/3 of the size of the cache */
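	/*
	 * With the 2:1 active:inactive target, the formula below deactivates
	 * about nr_pages pages when the lists are already at the target
	 * ratio, and proportionally more when the active list has grown
	 * beyond it.
	 */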
	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
	refill_inactive(ratio);

	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
	shrink_dcache_memory(priority, gfp_mask);
	shrink_icache_memory(priority, gfp_mask);
	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
	int priority = DEF_PRIORITY;
	int nr_pages = SWAP_CLUSTER_MAX;

	gfp_mask = pf_gfp_mask(gfp_mask);
	do {
		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
		if (nr_pages <= 0)
			return 1;
	} while (--priority);
	/*
	 * Hmm.. Cache shrink failed - time to kill something?
	 * Mhwahahhaha! This is the part I really like. Giggle.
	 */
int try_to_free_pages(unsigned int gfp_mask)
	pg_data_t *pgdat;
	zonelist_t *zonelist;
	unsigned long pf_free_pages;
	int error = 0;

	pf_free_pages = current->flags & PF_FREE_PAGES;
	current->flags &= ~PF_FREE_PAGES;
	for_each_pgdat(pgdat) {
		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
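		/*
		 * zones[0] is the preferred zone of this node's zonelist for
		 * the given gfp_mask; reclaiming against it as the classzone
		 * also covers every lower zone via memclass().
		 */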
		error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
	}
	current->flags |= pf_free_pages;
	return error;
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

static int check_classzone_need_balance(zone_t * classzone)
	zone_t * first_classzone;

	first_classzone = classzone->zone_pgdat->node_zones;
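	/*
	 * Walk from the classzone down to the lowest zone of this node: if
	 * any zone in that range is above its pages_high watermark, requests
	 * can still be satisfied from it and no balancing is needed.
	 */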
	while (classzone >= first_classzone) {
		if (classzone->free_pages > classzone->pages_high)
			return 0;
		classzone--;
static int kswapd_balance_pgdat(pg_data_t * pgdat)
	int need_more_balance = 0, i;
	zone_t * zone;
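	/* Balance every zone of this node, from the highest zone downwards. */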
	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (unlikely(current->need_resched))
			schedule();
		if (!zone->need_balance)
			continue;
		if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
			zone->need_balance = 0;
			__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);
			continue;
		}
		if (check_classzone_need_balance(zone))
			need_more_balance = 1;
		else
			zone->need_balance = 0;
	}

	return need_more_balance;
static void kswapd_balance(void)
	int need_more_balance;
	pg_data_t * pgdat;

	do {
		need_more_balance = 0;

		for_each_pgdat(pgdat)
			need_more_balance |= kswapd_balance_pgdat(pgdat);
	} while (need_more_balance);
static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!zone->need_balance)
			continue;
		return 0;
	}
	return 1;
static int kswapd_can_sleep(void)
	for_each_pgdat(pgdat) {
		if (!kswapd_can_sleep_pgdat(pgdat))
			return 0;
	}
	return 1;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	daemonize();
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&kswapd_wait, &wait);

		if (kswapd_can_sleep())
			schedule();

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&kswapd_wait, &wait);
		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		kswapd_balance();
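		/*
		 * Kick the disk task queue so that any I/O queued up by the
		 * balancing pass above is actually submitted before kswapd
		 * goes back to sleep.
		 */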
		run_task_queue(&tq_disk);
static int __init kswapd_init(void)
	printk("Starting kswapd\n");
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;

module_init(kswapd_init)