/*
 *  linux/mm/vmscan.c
 *
 *  The pageout daemon, decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * "vm_passes" is the number of vm passes before failing the
 * memory balancing. Take into account that 3 passes are needed
 * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
 * of the inactive list at each pass.
 */
int vm_passes = 60;

/*
 * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
 * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
 * scan 1/6 of the inactive lists during a normal aging round.
 */
int vm_cache_scan_ratio = 6;

/*
 * "vm_mapped_ratio" controls the pageout rate: the smaller it is,
 * the earlier we'll start to page out.
 */
int vm_mapped_ratio = 100;

/*
 * "vm_lru_balance_ratio" controls the balance between active and
 * inactive cache. The bigger vm_lru_balance_ratio is, the easier the
 * active cache will grow, because we'll rotate the active list
 * slowly. A value of 2 means we'll go towards a balance of
 * 1/3 of the cache being inactive.
 */
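/*
 * Annotation (not in the original source): a sketch of where the 1/3
 * figure comes from.  refill_inactive() below computes its deactivation
 * quota roughly as
 *
 *      ratio = nr_pages * nr_active / (nr_inactive * vm_lru_balance_ratio + 1)
 *
 * so the quota matches the reclaim request (nr_pages) when nr_active is
 * about vm_lru_balance_ratio times nr_inactive.  With the default value
 * of 2 that equilibrium leaves 1/(2 + 1) = 1/3 of the cache on the
 * inactive list; a bigger value lets the active list grow larger before
 * deactivation speeds up.
 */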
int vm_lru_balance_ratio = 2;

/*
 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
 * the unused-inode, dentry and dquot caches will be freed during a normal
 * aging round.
 */
int vm_vfs_scan_ratio = 6;

/*
 * "vm_anon_lru" selects whether to immediately insert anon pages in the
 * lru. Immediately means as soon as they're allocated during the
 * page faults.
 *
 * If this is set to 0, they're inserted only after the first
 * swapout.
 *
 * Having anon pages immediately inserted in the lru allows the
 * VM to know better when it's worthwhile to start swapping
 * anonymous ram, it will start to swap earlier and it should
 * swap smoother and faster, but it will decrease scalability
 * on >16-way machines by an order of magnitude. Big SMP/NUMA
 * definitely can't take a hit on a global spinlock at
 * every anon page allocation. So this is off by default.
 *
 * Low-ram machines that swap all the time want to turn
 * this on (i.e. set to 1).
 */
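/*
 * Annotation (not in the original source): these vm_* knobs are plain
 * global ints so they can be tuned at runtime; 2.4 trees carrying this
 * VM normally export them through sysctl (kernel/sysctl.c), typically
 * under /proc/sys/vm/.  Whether a given tree exports all of them is an
 * assumption -- check its sysctl.c.
 */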
int vm_anon_lru = 0;

/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
        pte_t pte;
        swp_entry_t entry;

        /* Don't look at this pte if it's been accessed recently. */
        if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
                mark_page_accessed(page);
                return 0;
        }

        /* Don't bother unmapping pages that are active */
        if (PageActive(page))
                return 0;

        /* Don't bother replenishing zones not under pressure.. */
        if (!memclass(page_zone(page), classzone))
                return 0;

        if (TryLockPage(page))
                return 0;

        /* From this point on, the odds are that we're going to
         * nuke this pte, so read and clear the pte.  This hook
         * is needed on CPUs which update the accessed and dirty
         * bits in hardware.
         */
        flush_cache_page(vma, address);
        pte = ptep_get_and_clear(page_table);
        flush_tlb_page(vma, address);

        if (pte_dirty(pte))
                set_page_dirty(page);

        /*
         * Is the page already in the swap cache? If so, then
         * we can just drop our reference to it without doing
         * any IO - it's already up-to-date on disk.
         */
        if (PageSwapCache(page)) {
                entry.val = page->index;
                swap_duplicate(entry);
set_swap_pte:
                set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
                mm->rss--;
                UnlockPage(page);
                {
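                        /*
                         * Annotation (not in the original source): the page
                         * was unlocked above and the reference the pte held
                         * is dropped by page_cache_release() below.  The test
                         * below appears to mean: ignoring the extra reference
                         * held by any buffers, at most two references remain
                         * (the page/swap cache and the one we are about to
                         * drop), i.e. unmapping this pte made the page
                         * reclaimable.
                         */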
                        int freeable = page_count(page) - !!page->buffers <= 2;
                        page_cache_release(page);
                        return freeable;
                }
        }

        /*
         * Is it a clean page? Then it must be recoverable
         * by just paging it in again, and we can just drop
         * it..  or if it's dirty but has backing store,
         * just mark the page dirty and drop it.
         *
         * However, this won't actually free any real
         * memory, as the page will just be in the page cache
         * somewhere, and as such we should just continue
         * our scan.
         *
         * Basically, this just makes it possible for us to do
         * some real work in the future in "refill_inactive()".
         */
        if (page->mapping)
                goto drop_pte;
        if (!PageDirty(page))
                goto drop_pte;

        /*
         * Anonymous buffercache pages can be left behind by
         * concurrent truncate and pagefault.
         */
        if (page->buffers)
                goto preserve;

        /*
         * This is a dirty, swappable page.  First of all,
         * get a suitable swap entry for it, and make sure
         * we have the swap cache set up to associate the
         * page with that swap entry.
         */
        for (;;) {
                entry = get_swap_page();
                if (!entry.val)
                        break;
                /* Add it to the swap cache and mark it dirty
                 * (adding to the page cache will clear the dirty
                 * and uptodate bits, so we need to do it again)
                 */
                if (add_to_swap_cache(page, entry) == 0) {
                        SetPageUptodate(page);
                        set_page_dirty(page);
                        goto set_swap_pte;
                }
                /* Raced with "speculative" read_swap_cache_async */
                swap_free(entry);
        }

        /* No swap space left */
preserve:
        set_pte(page_table, pte);
        UnlockPage(page);
        return 0;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return count;
        if (pmd_bad(*dir)) {
                pmd_ERROR(*dir);
                pmd_clear(dir);
                return count;
        }

        pte = pte_offset(dir, address);

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                if (pte_present(*pte)) {
                        struct page *page = pte_page(*pte);

                        if (VALID_PAGE(page) && !PageReserved(page)) {
                                count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
                                if (!count) {
                                        address += PAGE_SIZE;
                                        break;
                                }
                        }
                }
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        mm->swap_address = address;
        return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return count;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return count;
        }

        pmd = pmd_offset(dir, address);

        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
                if (!count)
                        break;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
        pgd_t *pgdir;
        unsigned long end;

        /* Don't swap out areas which are reserved */
        if (vma->vm_flags & VM_RESERVED)
                return count;

        pgdir = pgd_offset(mm, address);

        end = vma->vm_end;
        BUG_ON(address >= end);
        do {
                count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
                if (!count)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return count;
}

/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;

/*
 * Returns remaining count of pages to be swapped out by followup call.
 */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Find the proper vm-area after freezing the vma chain
         * and ptes.
         */
        spin_lock(&mm->page_table_lock);
        address = mm->swap_address;
        if (address == TASK_SIZE || swap_mm != mm) {
                /* We raced: don't count this mm but try again */
                ++*mmcounter;
                goto out_unlock;
        }
        vma = find_vma(mm, address);
        if (vma) {
                if (address < vma->vm_start)
                        address = vma->vm_start;

                for (;;) {
                        count = swap_out_vma(mm, vma, address, count, classzone);
                        vma = vma->vm_next;
                        if (!vma)
                                break;
                        if (!count)
                                goto out_unlock;
                        address = vma->vm_start;
                }
        }
        /* Indicate that we reached the end of address space */
        mm->swap_address = TASK_SIZE;

out_unlock:
        spin_unlock(&mm->page_table_lock);
        return count;
}

static int FASTCALL(swap_out(zone_t * classzone));
static int fastcall swap_out(zone_t * classzone)
{
        int counter, nr_pages = SWAP_CLUSTER_MAX;
        struct mm_struct *mm;

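        /*
         * Annotation (not in the original source): the counter allows
         * roughly two passes over the mmlist per call; swap_out_mm()
         * bumps it back up when it detects it raced with another
         * reclaimer, so a raced mm is not charged against us.
         */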
        counter = mmlist_nr << 1;
        do {
                if (unlikely(current->need_resched)) {
                        __set_current_state(TASK_RUNNING);
                        schedule();
                }

                spin_lock(&mmlist_lock);
                mm = swap_mm;
                while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
                        mm->swap_address = 0;
                        mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
                        if (mm == swap_mm)
                                goto empty;
                        swap_mm = mm;
                }

                /* Make sure the mm doesn't disappear when we drop the lock.. */
                atomic_inc(&mm->mm_users);
                spin_unlock(&mmlist_lock);

                nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

                mmput(mm);

                if (!nr_pages)
                        return 1;
        } while (--counter >= 0);

        return 0;

empty:
        spin_unlock(&mmlist_lock);
        return 0;
}

static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
{
        struct list_head * entry;
        int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
        int max_mapped = vm_mapped_ratio * nr_pages;
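        /*
         * Annotation (not in the original source): max_scan bounds how many
         * inactive pages we are willing to look at in this pass (roughly
         * 1/vm_cache_scan_ratio of the classzone's LRU pages), while
         * max_mapped is the budget of mapped/unfreeable pages we tolerate
         * before falling back to the slab/VFS shrinkers and swap_out()
         * further down.
         */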

        while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
                struct page * page;

                if (unlikely(current->need_resched)) {
                        spin_unlock(&pagemap_lru_lock);
                        __set_current_state(TASK_RUNNING);
                        schedule();
                        spin_lock(&pagemap_lru_lock);
                        continue;
                }

                page = list_entry(entry, struct page, lru);

                BUG_ON(!PageLRU(page));
                BUG_ON(PageActive(page));

                list_del(entry);
                list_add(entry, &inactive_list);

                /*
                 * Zero page counts can happen because we unlink the pages
                 * _after_ decrementing the usage count..
                 */
                if (unlikely(!page_count(page)))
                        continue;

                if (!memclass(page_zone(page), classzone))
                        continue;

                max_scan--;

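                /*
                 * Annotation (not in the original source): a page with no
                 * buffers whose count is above one is still referenced by
                 * something other than the pagecache (typically it is
                 * mapped into page tables), and an anon page without a
                 * mapping is not in the swap cache yet; neither can be
                 * freed here, so they are diverted to the page_mapped
                 * accounting below instead of being trylocked.
                 */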
                /* Racy check to avoid trylocking when not worthwhile */
                if (!page->buffers && (page_count(page) != 1 || !page->mapping))
                        goto page_mapped;

                /*
                 * The page is locked. IO in progress?
                 * Move it to the back of the list.
                 */
                if (unlikely(TryLockPage(page))) {
                        if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
                                page_cache_get(page);
                                spin_unlock(&pagemap_lru_lock);
                                wait_on_page(page);
                                page_cache_release(page);
                                spin_lock(&pagemap_lru_lock);
                        }
                        continue;
                }

                if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
                        /*
                         * It is not critical here to write it only if
                         * the page is unmapped, because any direct writer
                         * like O_DIRECT would set the PG_dirty bitflag
                         * on the physical page after having successfully
                         * pinned it and after the I/O to the page is finished,
                         * so the direct writes to the page cannot get lost.
                         */
                        int (*writepage)(struct page *);

                        writepage = page->mapping->a_ops->writepage;
                        if ((gfp_mask & __GFP_FS) && writepage) {
                                ClearPageDirty(page);
                                SetPageLaunder(page);
                                page_cache_get(page);
                                spin_unlock(&pagemap_lru_lock);

                                writepage(page);
                                page_cache_release(page);

                                spin_lock(&pagemap_lru_lock);
                                continue;
                        }
                }

                /*
                 * If the page has buffers, try to free the buffer mappings
                 * associated with this page. If we succeed we try to free
                 * the page as well.
                 */
                if (page->buffers) {
                        spin_unlock(&pagemap_lru_lock);

                        /* avoid freeing a locked page */
                        page_cache_get(page);

                        if (try_to_release_page(page, gfp_mask)) {
                                if (!page->mapping) {
                                        /*
                                         * We must not allow an anon page
                                         * with no buffers to be visible on
                                         * the LRU, so we unlock the page after
                                         * taking the lru lock
                                         */
                                        spin_lock(&pagemap_lru_lock);
                                        UnlockPage(page);
                                        __lru_cache_del(page);

                                        /* effectively free the page here */
                                        page_cache_release(page);

                                        if (--nr_pages)
                                                continue;
                                        break;
                                } else {
                                        /*
                                         * The page is still in the pagecache, so undo
                                         * the reference we took before calling
                                         * try_to_release_page: we've not finished with
                                         * the page and can now try the next step.
                                         */
                                        page_cache_release(page);

                                        spin_lock(&pagemap_lru_lock);
                                }
                        } else {
                                /* failed to drop the buffers so stop here */
                                UnlockPage(page);
                                page_cache_release(page);

                                spin_lock(&pagemap_lru_lock);
                                continue;
                        }
                }

                spin_lock(&pagecache_lock);

                /*
                 * This is the non-racy check for a busy page.
                 * It is critical to check PageDirty _after_ we made sure
                 * the page is freeable, so it is not in use by anybody.
                 * At this point we're guaranteed that page->buffers is NULL,
                 * and nobody can refill page->buffers under us because we
                 * still hold the page lock.
                 */
                if (!page->mapping || page_count(page) > 1) {
                        spin_unlock(&pagecache_lock);
                        UnlockPage(page);
page_mapped:
                        if (--max_mapped < 0) {
                                spin_unlock(&pagemap_lru_lock);

                                nr_pages -= kmem_cache_reap(gfp_mask);
                                if (nr_pages <= 0)
                                        goto out;

                                shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
                                shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
                                shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif

                                if (!*failed_swapout)
                                        *failed_swapout = !swap_out(classzone);

                                max_mapped = nr_pages * vm_mapped_ratio;

                                spin_lock(&pagemap_lru_lock);
                                refill_inactive(nr_pages, classzone);
                        }
                        continue;
                }
                smp_rmb();
                if (PageDirty(page)) {
                        spin_unlock(&pagecache_lock);
                        UnlockPage(page);
                        continue;
                }

                __lru_cache_del(page);

                /* point of no return */
                if (likely(!PageSwapCache(page))) {
                        __remove_inode_page(page);
                        spin_unlock(&pagecache_lock);
                } else {
                        swp_entry_t swap;
                        swap.val = page->index;
                        __delete_from_swap_cache(page);
                        spin_unlock(&pagecache_lock);
                        swap_free(swap);
                }

                UnlockPage(page);

                /* effectively free the page here */
                page_cache_release(page);

                if (--nr_pages)
                        continue;
                break;
        }
        spin_unlock(&pagemap_lru_lock);

 out:
        return nr_pages;
}

/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
{
        struct list_head * entry;
        unsigned long ratio;

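        /*
         * Annotation (not in the original source): "ratio" is the
         * deactivation quota for this pass; see the comment at the
         * vm_lru_balance_ratio definition near the top of the file for
         * how the default settles around 1/3 of the cache inactive.
         */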
        ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);

        entry = active_list.prev;
        while (ratio && entry != &active_list) {
                struct page * page;

                page = list_entry(entry, struct page, lru);
                entry = entry->prev;
                if (PageTestandClearReferenced(page)) {
                        list_del(&page->lru);
                        list_add(&page->lru, &active_list);
                        continue;
                }

                ratio--;

                del_page_from_active_list(page);
                add_page_to_inactive_list(page);
                SetPageReferenced(page);
        }

        if (entry != &active_list) {
                list_del(&active_list);
                list_add(&active_list, entry);
        }
}

static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
{
        nr_pages -= kmem_cache_reap(gfp_mask);
        if (nr_pages <= 0)
                goto out;

        spin_lock(&pagemap_lru_lock);
        refill_inactive(nr_pages, classzone);

        nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);

out:
        return nr_pages;
}

static int check_classzone_need_balance(zone_t * classzone);

int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
{
        gfp_mask = pf_gfp_mask(gfp_mask);

        for (;;) {
                int tries = vm_passes;
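                /*
                 * Annotation (not in the original source): when the caller
                 * cannot do I/O (__GFP_IO clear) we pretend swapout has
                 * already failed, so neither this loop nor shrink_cache()
                 * will call swap_out().
                 */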
                int failed_swapout = !(gfp_mask & __GFP_IO);
                int nr_pages = SWAP_CLUSTER_MAX;

                do {
                        nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
                        if (nr_pages <= 0)
                                return 1;
                        shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
                        shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
                        shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif
                        if (!failed_swapout)
                                failed_swapout = !swap_out(classzone);
                } while (--tries);

#ifdef CONFIG_OOM_KILLER
                out_of_memory();
#else
                if (likely(current->pid != 1))
                        break;
                if (!check_classzone_need_balance(classzone))
                        break;

                __set_current_state(TASK_RUNNING);
                yield();
#endif
        }

        return 0;
}

int fastcall try_to_free_pages(unsigned int gfp_mask)
{
        pg_data_t *pgdat;
        zonelist_t *zonelist;
        unsigned long pf_free_pages;
        int error = 0;

        pf_free_pages = current->flags & PF_FREE_PAGES;
        current->flags &= ~PF_FREE_PAGES;

        for_each_pgdat(pgdat) {
                zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
                error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
        }

        current->flags |= pf_free_pages;
        return error;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

static int check_classzone_need_balance(zone_t * classzone)
{
        zone_t * first_zone;
        int class_idx = zone_idx(classzone);

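        /*
         * Annotation (not in the original source): allocations limited to
         * this classzone can fall back to any lower zone of the node, so
         * no further balancing is needed as soon as one zone in that
         * fallback chain is above the high watermark for this class index.
         */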
        first_zone = classzone->zone_pgdat->node_zones;
        while (classzone >= first_zone) {
                if (classzone->free_pages > classzone->watermarks[class_idx].high)
                        return 0;
                classzone--;
        }
        return 1;
}

static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
        int need_more_balance = 0, i;
        zone_t * zone;

        for (i = pgdat->nr_zones-1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (unlikely(current->need_resched))
                        schedule();
                if (!zone->need_balance || !zone->size)
                        continue;
                if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
                        zone->need_balance = 0;
                        __set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(HZ*5);
                        continue;
                }
                if (check_classzone_need_balance(zone))
                        need_more_balance = 1;
                else
                        zone->need_balance = 0;
        }

        return need_more_balance;
}

static void kswapd_balance(void)
{
        int need_more_balance;
        pg_data_t * pgdat;

        do {
                need_more_balance = 0;

                for_each_pgdat(pgdat)
                        need_more_balance |= kswapd_balance_pgdat(pgdat);
        } while (need_more_balance);
}

static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
{
        zone_t * zone;
        int i;

        for (i = pgdat->nr_zones-1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (!zone->need_balance || !zone->size)
                        continue;
                return 0;
        }

        return 1;
}

static int kswapd_can_sleep(void)
{
        pg_data_t * pgdat;

        for_each_pgdat(pgdat) {
                if (!kswapd_can_sleep_pgdat(pgdat))
                        return 0;
        }

        return 1;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        daemonize();
        strcpy(tsk->comm, "kswapd");
        sigfillset(&tsk->blocked);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC;

        /*
         * Kswapd main loop.
         */
        for (;;) {
                __set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue(&kswapd_wait, &wait);

                mb();
                if (kswapd_can_sleep())
                        schedule();

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&kswapd_wait, &wait);

                /*
                 * If we actually get into a low-memory situation,
                 * the processes needing more memory will wake us
                 * up on a more timely basis.
                 */
                kswapd_balance();
                run_task_queue(&tq_disk);
        }
}

static int __init kswapd_init(void)
{
        printk("Starting kswapd\n");
        swap_setup();
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        return 0;
}

module_init(kswapd_init)