X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=e12154d9c4ede318d1776e7636fee678485f0472;hb=9328b8faae922e52073785ed6c1eaa8565648a0e;hp=e1d3d77f4aeef89259d9e3b09e500042a42d7a87;hpb=e0f998930eb67c49f2862c58a45262ad0bc03eca;p=powerpc.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1d3d77f4a..e12154d9c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,8 @@ unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +static void fastcall free_hot_cold_page(struct page *page, int cold); + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -59,11 +62,13 @@ long nr_swap_pages; * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; EXPORT_SYMBOL(totalram_pages); -EXPORT_SYMBOL(nr_swap_pages); /* * Used by page_zone() to look up the address of the struct zone whose @@ -72,36 +77,67 @@ EXPORT_SYMBOL(nr_swap_pages); struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +#ifdef CONFIG_DEBUG_VM +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (zone != page_zone(page)) + return 0; + + return 1; +} /* * Temporary debugging check for pages not lying within a given zone. */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + if (page_outside_zone_boundaries(zone, page)) return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) - return 1; -#endif - if (zone != page_zone(page)) + if (!page_is_consistent(zone, page)) return 1; + + return 0; +} + +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ return 0; } +#endif static void bad_page(const char *function, struct page *page) { printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page); printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, + (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); printk(KERN_EMERG "Backtrace:\n"); dump_stack(); @@ -114,17 +150,13 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback); + 1 << PG_writeback ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; add_taint(TAINT_BAD_PAGE); } -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -153,7 +185,7 @@ static void prep_compound_page(struct page *page, unsigned long order) struct page *p = page + i; SetPageCompound(p); - p->private = (unsigned long)page; + set_page_private(p, (unsigned long)page); } } @@ -173,12 +205,11 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (p->private != (unsigned long)page) + if (page_private(p) != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } } -#endif /* CONFIG_HUGETLB_PAGE */ /* * function for dealing with page's order in buddy system. @@ -186,18 +217,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) * So, we don't need atomic page->flags operations here. */ static inline unsigned long page_order(struct page *page) { - return page->private; + return page_private(page); } static inline void set_page_order(struct page *page, int order) { - page->private = order; + set_page_private(page, order); __SetPagePrivate(page); } static inline void rmv_page_order(struct page *page) { __ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); } /* @@ -234,17 +265,22 @@ __find_combined_index(unsigned long page_idx, unsigned int order) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is free && - * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. - * for recording page's order, we use page->private and PG_private. + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. + * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (PagePrivate(page) && (page_order(page) == order) && - !PageReserved(page) && page_count(page) == 0) return 1; return 0; @@ -264,7 +300,7 @@ static inline int page_is_buddy(struct page *page, int order) * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PG_Private.Page's - * order is recorded in page->private field. + * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. @@ -294,17 +330,15 @@ static inline void __free_pages_bulk (struct page *page, struct free_area *area; struct page *buddy; - combined_idx = __find_combined_index(page_idx, order); buddy = __page_find_buddy(page, page_idx, order); - - if (bad_range(zone, buddy)) - break; if (!page_is_buddy(buddy, order)) break; /* Move the buddy up one level. */ + list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -314,11 +348,11 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline void free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(const char *function, struct page *page) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -327,10 +361,17 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved )))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); } /* @@ -348,11 +389,10 @@ static int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { - unsigned long flags; struct page *page = NULL; int ret = 0; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { @@ -362,19 +402,19 @@ free_pages_bulk(struct zone *zone, int count, __free_pages_bulk(page, zone, order); ret++; } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return ret; } void __free_pages_ok(struct page *page, unsigned int order) { + unsigned long flags; LIST_HEAD(list); int i; + int reserved = 0; arch_free_page(page, order); - mod_page_state(pgfree, 1 << order); - #ifndef CONFIG_MMU if (order > 0) for (i = 1 ; i < (1 << order) ; ++i) @@ -382,10 +422,49 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(__FUNCTION__, page + i); + if (reserved) + return; + list_add(&page->lru, &list); + mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } } @@ -403,8 +482,7 @@ void __free_pages_ok(struct page *page, unsigned int order) * * -- wli */ -static inline struct page * -expand(struct zone *zone, struct page *page, +static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area) { unsigned long size = 1 << high; @@ -418,34 +496,16 @@ expand(struct zone *zone, struct page *page, area->nr_free++; set_page_order(&page[size], high); } - return page; -} - -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ } /* * This page is about to be returned from the page allocator */ -static void prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -455,15 +515,24 @@ static void prep_new_page(struct page *page, int order) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved )))) bad_page(__FUNCTION__, page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; + set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); + return 0; } /* @@ -486,7 +555,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -500,21 +570,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { - unsigned long flags; int i; - int allocated = 0; - struct page *page; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); - return allocated; + spin_unlock(&zone->lock); + return i; } #ifdef CONFIG_NUMA @@ -550,6 +616,7 @@ void drain_remote_pages(void) #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { + unsigned long flags; struct zone *zone; int i; @@ -561,8 +628,10 @@ static void __drain_pages(unsigned int cpu) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; + local_irq_save(flags); pcp->count -= free_pages_bulk(zone, pcp->count, &pcp->list, 0); + local_irq_restore(flags); } } } @@ -637,7 +706,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); @@ -646,11 +714,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); if (PageAnon(page)) page->mapping = NULL; - free_pages_check(__FUNCTION__, page); + if (free_pages_check(__FUNCTION__, page)) + return; + + inc_page_state(pgfree); + kernel_map_pages(page, 1, 0); + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); list_add(&page->lru, &pcp->list); @@ -689,27 +760,27 @@ static struct page * buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; - struct page *page = NULL; + struct page *page; int cold = !!(gfp_flags & __GFP_COLD); +again: if (order == 0) { struct per_cpu_pages *pcp; + page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) + if (!pcp->count) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (pcp->count) { + if (likely(pcp->count)) { page = list_entry(pcp->list.next, struct page, lru); list_del(&page->lru); pcp->count--; } local_irq_restore(flags); put_cpu(); - } - - if (page == NULL) { + } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); spin_unlock_irqrestore(&zone->lock, flags); @@ -718,7 +789,8 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); + if (prep_new_page(page, order)) + goto again; if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -729,20 +801,28 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) return page; } +#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + /* * Return 1 if free pages are above 'mark'. This takes into account the order * of the allocation. */ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int can_try_harder, int gfp_high) + int classzone_idx, int alloc_flags) { /* free_pages my go negative - that's OK */ long min = mark, free_pages = z->free_pages - (1 << order) + 1; int o; - if (gfp_high) + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (can_try_harder) + if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) @@ -760,14 +840,47 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } -static inline int -should_reclaim_zone(struct zone *z, gfp_t gfp_mask) +/* + * get_page_from_freeliest goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, int alloc_flags) { - if (!z->reclaim_pages) - return 0; - if (gfp_mask & __GFP_NORECLAIM) - return 0; - return 1; + struct zone **z = zonelist->zones; + struct page *page = NULL; + int classzone_idx = zone_idx(*z); + + /* + * Go through the zonelist once, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + do { + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed(*z, gfp_mask)) + continue; + + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, + classzone_idx, alloc_flags)) + continue; + } + + page = buffered_rmqueue(*z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, *z); + break; + } + } while (*(++z) != NULL); + return page; } /* @@ -777,106 +890,76 @@ struct page * fastcall __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - const int wait = gfp_mask & __GFP_WAIT; - struct zone **zones, *z; + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct zone **z; struct page *page; struct reclaim_state reclaim_state; struct task_struct *p = current; - int i; - int classzone_idx; int do_retry; - int can_try_harder; + int alloc_flags; int did_some_progress; might_sleep_if(wait); - /* - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or is the caller has realtime scheduling - * policy - */ - can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ +restart: + z = zonelist->zones; /* the list of zones suitable for gfp_mask */ - if (unlikely(zones[0] == NULL)) { + if (unlikely(*z == NULL)) { /* Should this ever happen?? */ return NULL; } - classzone_idx = zone_idx(zones[0]); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) + goto got_pg; + + do { + wakeup_kswapd(*z, order); + } while (*(++z)); -restart: /* - * Go through the zonelist once, looking for a zone with enough free. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + * + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - int do_reclaim = should_reclaim_zone(z, gfp_mask); - - if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) - continue; - - /* - * If the zone is to attempt early page reclaim then this loop - * will try to reclaim pages and check the watermark a second - * time before giving up and falling back to the next zone. - */ -zone_reclaim_retry: - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) { - if (!do_reclaim) - continue; - else { - zone_reclaim(z, gfp_mask, order); - /* Only try reclaim once */ - do_reclaim = 0; - goto zone_reclaim_retry; - } - } - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - - for (i = 0; (z = zones[i]) != NULL; i++) - wakeup_kswapd(z, order); + alloc_flags = ALLOC_WMARK_MIN; + if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks to go deeper into reserves + * coming from realtime tasks go deeper into reserves. * * This is the last chance, in general, before the goto nopage. * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - if (wait && !cpuset_zone_allowed(z, gfp_mask)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + if (page) + goto got_pg; /* This allocation should allow future memory freeing. */ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { +nofail_alloc: /* go through the zonelist yet again, ignoring mins */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!cpuset_zone_allowed(z, gfp_mask)) - continue; - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; + page = get_page_from_freelist(gfp_mask, order, + zonelist, ALLOC_NO_WATERMARKS); + if (page) + goto got_pg; + if (gfp_mask & __GFP_NOFAIL) { + blk_congestion_wait(WRITE, HZ/50); + goto nofail_alloc; } } goto nopage; @@ -894,7 +977,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask); + did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -902,19 +985,10 @@ rebalance: cond_resched(); if (likely(did_some_progress)) { - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - if (!cpuset_zone_allowed(z, gfp_mask)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask, order, + zonelist, alloc_flags); + if (page) + goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { /* * Go through the zonelist yet one more time, keep @@ -922,18 +996,10 @@ rebalance: * a parallel oom killing, we must fail if we're still * under heavy pressure. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_high, - classzone_idx, 0, 0)) - continue; - - if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + if (page) + goto got_pg; out_of_memory(gfp_mask, order); goto restart; @@ -966,9 +1032,7 @@ nopage: dump_stack(); show_mem(); } - return NULL; got_pg: - zone_statistics(zonelist, z); return page; } @@ -996,7 +1060,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) * get_zeroed_page() returns a 32-bit address, which cannot represent * a highmem page */ - BUG_ON(gfp_mask & __GFP_HIGHMEM); + BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask | __GFP_ZERO, 0); if (page) @@ -1016,7 +1080,7 @@ void __pagevec_free(struct pagevec *pvec) fastcall void __free_pages(struct page *page, unsigned int order) { - if (!PageReserved(page) && put_page_testzero(page)) { + if (put_page_testzero(page)) { if (order == 0) free_hot_page(page); else @@ -1089,7 +1153,7 @@ static unsigned int nr_free_zone_pages(int offset) */ unsigned int nr_free_buffer_pages(void) { - return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); + return nr_free_zone_pages(gfp_zone(GFP_USER)); } /* @@ -1097,7 +1161,7 @@ unsigned int nr_free_buffer_pages(void) */ unsigned int nr_free_pagecache_pages(void) { - return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); } #ifdef CONFIG_HIGHMEM @@ -1135,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); - cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1193,7 +1256,7 @@ unsigned long __read_page_state(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; @@ -1305,19 +1368,15 @@ void show_free_areas(void) } else printk("\n"); - for (cpu = 0; cpu < NR_CPUS; ++cpu) { + for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - if (!cpu_possible(cpu)) - continue; - pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", + printk("cpu %d %s: high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", - pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch, pageset->pcp[temperature].count); @@ -1419,6 +1478,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli zone = pgdat->node_zones + ZONE_NORMAL; if (zone->present_pages) zonelist->zones[j++] = zone; + case ZONE_DMA32: + zone = pgdat->node_zones + ZONE_DMA32; + if (zone->present_pages) + zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; if (zone->present_pages) @@ -1428,6 +1491,18 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli return j; } +static inline int highest_zone(int zone_bits) +{ + int res = ZONE_NORMAL; + if (zone_bits & (__force int)__GFP_HIGHMEM) + res = ZONE_HIGHMEM; + if (zone_bits & (__force int)__GFP_DMA32) + res = ZONE_DMA32; + if (zone_bits & (__force int)__GFP_DMA) + res = ZONE_DMA; + return res; +} + #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) static int __initdata node_load[MAX_NUMNODES]; @@ -1524,11 +1599,7 @@ static void __init build_zonelists(pg_data_t *pgdat) zonelist = pgdat->node_zonelists + i; for (j = 0; zonelist->zones[j] != NULL; j++); - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; + k = highest_zone(i); j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j] = NULL; @@ -1549,12 +1620,7 @@ static void __init build_zonelists(pg_data_t *pgdat) zonelist = pgdat->node_zonelists + i; j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - + k = highest_zone(i); j = build_zonelists_node(pgdat, zonelist, j, k); /* * Now we build the zonelist so that it contains the zones @@ -1659,7 +1725,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1669,11 +1735,9 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 0); + set_page_count(page, 1); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); @@ -1720,14 +1784,13 @@ static int __devinit zone_batchsize(struct zone *zone) /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. + * size of the zone. But no more than 1/2 of a meg. * * OK, so we don't know how big the cache is. So guess. */ batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; + if (batch * PAGE_SIZE > 512 * 1024) + batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -1742,7 +1805,8 @@ static int __devinit zone_batchsize(struct zone *zone) * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = (1 << (fls(batch + batch/2)-1)) - 1; + return batch; } @@ -1754,16 +1818,14 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 2 * batch; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); pcp = &p->pcp[1]; /* cold*/ pcp->count = 0; - pcp->low = 0; pcp->high = 2 * batch; - pcp->batch = max(1UL, 1 * batch); + pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); } @@ -1843,11 +1905,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, if (process_zones(cpu)) ret = NOTIFY_BAD; break; -#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: case CPU_DEAD: free_zone_pagesets(cpu); break; -#endif default: break; } @@ -1857,7 +1918,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, static struct notifier_block pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; -void __init setup_per_cpu_pageset() +void __init setup_per_cpu_pageset(void) { int err; @@ -1872,6 +1933,60 @@ void __init setup_per_cpu_pageset() #endif +static __devinit +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +static __devinit void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __devinit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1881,10 +1996,11 @@ void __init setup_per_cpu_pageset() static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; @@ -1892,13 +2008,12 @@ static void __init free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; - if (j == ZONE_DMA || j == ZONE_NORMAL) + if (j < ZONE_HIGHMEM) nr_kernel_pages += realsize; nr_all_pages += realsize; @@ -1907,24 +2022,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1935,32 +2039,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - memmap_init(size, nid, j, zone_start_pfn); - zonetable_add(zone, nid, j, zone_start_pfn, size); - + init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } @@ -2138,12 +2219,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n cpu: %i pcp: %i" "\n count: %i" - "\n low: %i" "\n high: %i" "\n batch: %i", i, j, pageset->pcp[j].count, - pageset->pcp[j].low, pageset->pcp[j].high, pageset->pcp[j].batch); } @@ -2198,32 +2277,40 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", + "pgalloc_dma32", "pgalloc_dma", + "pgfree", "pgactivate", "pgdeactivate", "pgfault", "pgmajfault", + "pgrefill_high", "pgrefill_normal", + "pgrefill_dma32", "pgrefill_dma", "pgsteal_high", "pgsteal_normal", + "pgsteal_dma32", "pgsteal_dma", + "pgscan_kswapd_high", "pgscan_kswapd_normal", - + "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_direct_high", "pgscan_direct_normal", + "pgscan_direct_dma32", "pgscan_direct_dma", - "pginodesteal", + "pginodesteal", "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", @@ -2360,7 +2447,7 @@ static void setup_per_zone_lowmem_reserve(void) * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. */ -static void setup_per_zone_pages_min(void) +void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -2374,13 +2461,18 @@ static void setup_per_zone_pages_min(void) } for_each_zone(zone) { + unsigned long tmp; spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; if (is_highmem(zone)) { /* - * Often, highmem doesn't need to reserve any pages. - * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. */ int min_pages; @@ -2391,19 +2483,15 @@ static void setup_per_zone_pages_min(void) min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* + * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; + zone->pages_min = tmp; } - /* - * When interpreting these watermarks, just keep in mind that: - * zone->pages_min == (zone->pages_min * 4) / 4; - */ - zone->pages_low = (zone->pages_min * 5) / 4; - zone->pages_high = (zone->pages_min * 6) / 4; + zone->pages_low = zone->pages_min + tmp / 4; + zone->pages_high = zone->pages_min + tmp / 2; spin_unlock_irqrestore(&zone->lru_lock, flags); } }