X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=d461b23a27a1176decc9229edf45c8ae1c388472;hb=3689a0ec60bc8f56cc372c1dfa0d89dab48f7c9c;hp=ebd425c2e2a7fe8402733d1a5166ca2a69fa6678;hpb=c45aa055c32b488fc3fd73c760df372b09acf69a;p=powerpc.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ebd425c2e2..d461b23a27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -72,7 +73,9 @@ static void __free_pages_ok(struct page *page, unsigned int order); * don't need any ZONE_NORMAL reservation */ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +#ifdef CONFIG_ZONE_DMA 256, +#endif #ifdef CONFIG_ZONE_DMA32 256, #endif @@ -83,15 +86,10 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { EXPORT_SYMBOL(totalram_pages); -/* - * Used by page_zone() to look up the address of the struct zone whose - * id is encoded in the upper bits of page->flags - */ -struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; -EXPORT_SYMBOL(zone_table); - -static char *zone_names[MAX_NR_ZONES] = { +static char * const zone_names[MAX_NR_ZONES] = { +#ifdef CONFIG_ZONE_DMA "DMA", +#endif #ifdef CONFIG_ZONE_DMA32 "DMA32", #endif @@ -237,7 +235,7 @@ static void prep_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - page[1].lru.next = (void *)free_compound_page; /* set dtor */ + set_compound_page_dtor(page, free_compound_page); page[1].lru.prev = (void *)order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; @@ -401,7 +399,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(page_idx & (order_size - 1)); VM_BUG_ON(bad_range(zone, page)); - zone->free_pages += order_size; + __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; struct free_area *area; @@ -486,7 +484,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) spin_lock(&zone->lock); 
zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __free_one_page(page, zone ,order); + __free_one_page(page, zone, order); spin_unlock(&zone->lock); } @@ -605,6 +603,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); set_page_refcounted(page); + + arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); if (gfp_flags & __GFP_ZERO) @@ -635,7 +635,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) list_del(&page->lru); rmv_page_order(page); area->nr_free--; - zone->free_pages -= 1UL << order; + __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); expand(zone, page, order, current_order, area); return page; } @@ -690,9 +690,15 @@ void drain_node_pages(int nodeid) pcp = &pset->pcp[i]; if (pcp->count) { + int to_drain; + local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; + if (pcp->count >= pcp->batch) + to_drain = pcp->batch; + else + to_drain = pcp->count; + free_pages_bulk(zone, to_drain, &pcp->list, 0); + pcp->count -= to_drain; local_irq_restore(flags); } } @@ -700,7 +706,6 @@ void drain_node_pages(int nodeid) } #endif -#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { unsigned long flags; @@ -710,6 +715,9 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; + if (!populated_zone(zone)) + continue; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -722,7 +730,6 @@ static void __drain_pages(unsigned int cpu) } } } -#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_PM @@ -853,7 +860,7 @@ again: pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); if (!pcp->count) { - pcp->count += rmqueue_bulk(zone, 0, + pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (unlikely(!pcp->count)) goto failed; @@ -893,6 
+900,91 @@ failed: #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#ifdef CONFIG_FAIL_PAGE_ALLOC + +static struct fail_page_alloc_attr { + struct fault_attr attr; + + u32 ignore_gfp_highmem; + u32 ignore_gfp_wait; + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + + struct dentry *ignore_gfp_highmem_file; + struct dentry *ignore_gfp_wait_file; + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, + .ignore_gfp_highmem = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + if (gfp_mask & __GFP_NOFAIL) + return 0; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return 0; + if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + return 0; + + return should_fail(&fail_page_alloc.attr, 1 << order); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + int err; + + err = init_fault_attr_dentries(&fail_page_alloc.attr, + "fail_page_alloc"); + if (err) + return err; + dir = fail_page_alloc.attr.dentries.dir; + + fail_page_alloc.ignore_gfp_wait_file = + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_wait); + + fail_page_alloc.ignore_gfp_highmem_file = + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + + if (!fail_page_alloc.ignore_gfp_wait_file || + !fail_page_alloc.ignore_gfp_highmem_file) { + err = -ENOMEM; + debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); + debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); + cleanup_fault_attr_dentries(&fail_page_alloc.attr); + } + + return err; +} + 
+late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAIL_PAGE_ALLOC */ + +static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return 0; +} + +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + /* * Return 1 if free pages are above 'mark'. This takes into account the order * of the allocation. @@ -901,8 +993,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { /* free_pages my go negative - that's OK */ - unsigned long min = mark; - long free_pages = z->free_pages - (1 << order) + 1; + long min = mark; + long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -925,31 +1017,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } +#ifdef CONFIG_NUMA +/* + * zlc_setup - Setup for "zonelist cache". Uses cached zone data to + * skip over zones that are not allowed by the cpuset, or that have + * been recently (in last second) found to be nearly full. See further + * comments in mmzone.h. Reduces cache footprint of zonelist scans + * that have to skip over a lot of full or unallowed zones. + * + * If the zonelist cache is present in the passed in zonelist, then + * returns a pointer to the allowed node mask (either the current + * task's mems_allowed, or node_online_map.) + * + * If the zonelist cache is not available for this zonelist, does + * nothing and returns NULL. + * + * If the fullzones BITMAP in the zonelist cache is stale (more than + * a second since last zap'd) then we zap it out (clear its bits.) + * + * We hold off even calling zlc_setup, until after we've checked the + * first zone in the zonelist, on the theory that most allocations will + * be satisfied from that first zone, so best to examine that zone as + * quickly as we can. 
+ */ +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + nodemask_t *allowednodes; /* zonelist_cache approximation */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return NULL; + + if (jiffies - zlc->last_full_zap > 1 * HZ) { + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + zlc->last_full_zap = jiffies; + } + + allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? + &cpuset_current_mems_allowed : + &node_online_map; + return allowednodes; +} + +/* + * Given 'z' scanning a zonelist, run a couple of quick checks to see + * if it is worth looking at further for free memory: + * 1) Check that the zone isn't thought to be full (doesn't have its + * bit set in the zonelist_cache fullzones BITMAP). + * 2) Check that the zone's node (obtained from the zonelist_cache + * z_to_n[] mapping) is allowed in the passed in allowednodes mask. + * Return true (non-zero) if zone is worth looking at further, or + * else return false (zero) if it is not. + * + * This check -ignores- the distinction between various watermarks, + * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is + * found to be full for any variation of these watermarks, it will + * be considered full for up to one second by all requests, unless + * we are so low on memory on all allowed nodes that we are forced + * into the second scan of the zonelist. + * + * In the second scan we ignore this zonelist cache and exactly + * apply the watermarks to all zones, even if it is slower to do so. + * We are low on memory in the second scan, and should leave no stone + * unturned looking for a free page. 
+ */ +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, + nodemask_t *allowednodes) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + int n; /* node that zone *z is on */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return 1; + + i = z - zonelist->zones; + n = zlc->z_to_n[i]; + + /* This zone is worth trying if it is allowed but not full */ + return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); +} + /* - * get_page_from_freeliest goes through the zonelist trying to allocate + * Given 'z' scanning a zonelist, set the corresponding bit in + * zlc->fullzones, so that subsequent attempts to allocate a page + * from that zone don't waste time re-examining it. + */ +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + i = z - zonelist->zones; + + set_bit(i, zlc->fullzones); +} + +#else /* CONFIG_NUMA */ + +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + return NULL; +} + +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, + nodemask_t *allowednodes) +{ + return 1; +} + +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +{ +} +#endif /* CONFIG_NUMA */ + +/* + * get_page_from_freelist goes through the zonelist trying to allocate * a page. 
*/ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int alloc_flags) { - struct zone **z = zonelist->zones; + struct zone **z; struct page *page = NULL; - int classzone_idx = zone_idx(*z); + int classzone_idx = zone_idx(zonelist->zones[0]); struct zone *zone; + nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ + int zlc_active = 0; /* set if using zonelist_cache */ + int did_zlc_setup = 0; /* just call zlc_setup() one time */ +zonelist_scan: /* - * Go through the zonelist once, looking for a zone with enough free. + * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ + z = zonelist->zones; + do { + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; zone = *z; if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) break; if ((alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) - continue; + !cpuset_zone_allowed_softwall(zone, gfp_mask)) + goto try_next_zone; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; @@ -959,18 +1180,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, mark = zone->pages_low; else mark = zone->pages_high; - if (!zone_watermark_ok(zone , order, mark, - classzone_idx, alloc_flags)) + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || !zone_reclaim(zone, gfp_mask, order)) - continue; + goto this_zone_full; + } } page = buffered_rmqueue(zonelist, zone, order, gfp_mask); - if (page) { + if (page) break; +this_zone_full: + if (NUMA_BUILD) + zlc_mark_zone_full(zonelist, z); +try_next_zone: + if (NUMA_BUILD && !did_zlc_setup) { + /* we do zlc_setup after the first zone is tried */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; } } while (*(++z) != NULL); + + if 
(unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + goto zonelist_scan; + } return page; } @@ -992,6 +1229,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, might_sleep_if(wait); + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + restart: z = zonelist->zones; /* the list of zones suitable for gfp_mask */ @@ -1005,9 +1245,19 @@ restart: if (page) goto got_pg; - do { + /* + * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and + * __GFP_NOWARN set) should not cause reclaim since the subsystem + * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim + * using a larger set of nodes after it has established that the + * allowed per node queues are empty and that nodes are + * over allocated. + */ + if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + goto nopage; + + for (z = zonelist->zones; *z; z++) wakeup_kswapd(*z, order); - } while (*(++z)); /* * OK, we're below the kswapd watermark and have kicked background @@ -1041,6 +1291,7 @@ restart: /* This allocation should allow future memory freeing. 
*/ +rebalance: if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { @@ -1062,7 +1313,6 @@ nofail_alloc: if (!wait) goto nopage; -rebalance: cond_resched(); /* We now go into synchronous reclaim */ @@ -1194,35 +1444,6 @@ fastcall void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); -/* - * Total amount of free (allocatable) RAM: - */ -unsigned int nr_free_pages(void) -{ - unsigned int sum = 0; - struct zone *zone; - - for_each_zone(zone) - sum += zone->free_pages; - - return sum; -} - -EXPORT_SYMBOL(nr_free_pages); - -#ifdef CONFIG_NUMA -unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) -{ - unsigned int sum = 0; - enum zone_type i; - - for (i = 0; i < MAX_NR_ZONES; i++) - sum += pgdat->node_zones[i].free_pages; - - return sum; -} -#endif - static unsigned int nr_free_zone_pages(int offset) { /* Just pick one node, since fallback list is circular */ @@ -1262,14 +1483,14 @@ unsigned int nr_free_pagecache_pages(void) static inline void show_node(struct zone *zone) { if (NUMA_BUILD) - printk("Node %ld ", zone_to_nid(zone)); + printk("Node %d ", zone_to_nid(zone)); } void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = 0; - val->freeram = nr_free_pages(); + val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); @@ -1284,10 +1505,11 @@ void si_meminfo_node(struct sysinfo *val, int nid) pg_data_t *pgdat = NODE_DATA(nid); val->totalram = pgdat->node_present_pages; - val->freeram = nr_free_pages_pgdat(pgdat); + val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; - val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], + NR_FREE_PAGES); #else val->totalhigh = 0; 
val->freehigh = 0; @@ -1306,9 +1528,6 @@ void si_meminfo_node(struct sysinfo *val, int nid) void show_free_areas(void) { int cpu; - unsigned long active; - unsigned long inactive; - unsigned long free; struct zone *zone; for_each_zone(zone) { @@ -1332,20 +1551,19 @@ void show_free_areas(void) } } - get_zone_counts(&active, &inactive, &free); - - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " - "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", - active, - inactive, + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + global_page_state(NR_ACTIVE), + global_page_state(NR_INACTIVE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - nr_free_pages(), + global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), - global_page_state(NR_PAGETABLE)); + global_page_state(NR_PAGETABLE), + global_page_state(NR_BOUNCE)); for_each_zone(zone) { int i; @@ -1366,12 +1584,12 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone->free_pages), + K(zone_page_state(zone, NR_FREE_PAGES)), K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone->nr_active), - K(zone->nr_inactive), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), K(zone->present_pages), zone->pages_scanned, (zone->all_unreclaimable ? 
"yes" : "no") @@ -1542,6 +1760,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) } } +/* Construct the zonelist performance cache - see further mmzone.h */ +static void __meminit build_zonelist_cache(pg_data_t *pgdat) +{ + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zonelist *zonelist; + struct zonelist_cache *zlc; + struct zone **z; + + zonelist = pgdat->node_zonelists + i; + zonelist->zlcache_ptr = zlc = &zonelist->zlcache; + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + for (z = zonelist->zones; *z; z++) + zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); + } +} + #else /* CONFIG_NUMA */ static void __meminit build_zonelists(pg_data_t *pgdat) @@ -1579,14 +1815,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat) } } +/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ +static void __meminit build_zonelist_cache(pg_data_t *pgdat) +{ + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + pgdat->node_zonelists[i].zlcache_ptr = NULL; +} + #endif /* CONFIG_NUMA */ /* return values int ....just for stop_machine_run() */ static int __meminit __build_all_zonelists(void *dummy) { int nid; - for_each_online_node(nid) + + for_each_online_node(nid) { build_zonelists(NODE_DATA(nid)); + build_zonelist_cache(NODE_DATA(nid)); + } return 0; } @@ -1680,15 +1928,24 @@ static inline unsigned long wait_table_bits(unsigned long size) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn) + unsigned long start_pfn, enum memmap_context context) { struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!early_pfn_valid(pfn)) - continue; + /* + * There can be holes in boot-time mem_map[]s + * handed to this function. They do not + * exist on hotplugged memory. 
+ */ + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + } page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); init_page_count(page); @@ -1713,23 +1970,9 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, } } -#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) -void zonetable_add(struct zone *zone, int nid, enum zone_type zid, - unsigned long pfn, unsigned long size) -{ - unsigned long snum = pfn_to_section_nr(pfn); - unsigned long end = pfn_to_section_nr(pfn + size); - - if (FLAGS_HAS_NODE) - zone_table[ZONETABLE_INDEX(nid, zid)] = zone; - else - for (; snum <= end; snum++) - zone_table[ZONETABLE_INDEX(snum, zid)] = zone; -} - #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn)) + memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif static int __cpuinit zone_batchsize(struct zone *zone) @@ -1879,16 +2122,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, int ret = NOTIFY_OK; switch (action) { - case CPU_UP_PREPARE: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_DEAD: - free_zone_pagesets(cpu); - break; - default: - break; + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: + free_zone_pagesets(cpu); + break; + default: + break; } return ret; } @@ -1975,7 +2218,8 @@ static __meminit void zone_pcp_init(struct zone *zone) __meminit int init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size) + unsigned long size, + enum memmap_context context) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -2259,7 +2503,7 @@ unsigned long __init __absent_pages_in_range(int nid, /* Account for ranges past physical memory on this node */ if (range_end_pfn > prev_end_pfn) - 
hole_pages = range_end_pfn - + hole_pages += range_end_pfn - max(range_start_pfn, prev_end_pfn); return hole_pages; @@ -2379,11 +2623,11 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, " %s zone: %lu pages exceeds realsize %lu\n", zone_names[j], memmap_pages, realsize); - /* Account for reserved DMA pages */ - if (j == ZONE_DMA && realsize > dma_reserve) { + /* Account for reserved pages */ + if (j == 0 && realsize > dma_reserve) { realsize -= dma_reserve; - printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", - dma_reserve); + printk(KERN_DEBUG " %s zone: %lu pages reserved\n", + zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) @@ -2403,24 +2647,21 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; - zone->nr_active = 0; - zone->nr_inactive = 0; zap_zone_vm_stats(zone); atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; - zonetable_add(zone, nid, j, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, + size, MEMMAP_EARLY); BUG_ON(ret); zone_start_pfn += size; } @@ -2605,17 +2846,23 @@ static void __init sort_node_map(void) cmp_node_active_region, NULL); } -/* Find the lowest pfn for a node. 
This depends on a sorted early_node_map */ +/* Find the lowest pfn for a node */ unsigned long __init find_min_pfn_for_node(unsigned long nid) { int i; + unsigned long min_pfn = ULONG_MAX; /* Assuming a sorted map, the first range found has the starting pfn */ for_each_active_range_index_in_nid(i, nid) - return early_node_map[i].start_pfn; + min_pfn = min(min_pfn, early_node_map[i].start_pfn); - printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); - return 0; + if (min_pfn == ULONG_MAX) { + printk(KERN_WARNING + "Could not find start_pfn for node %lu\n", nid); + return 0; + } + + return min_pfn; } /** @@ -2664,6 +2911,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) unsigned long nid; enum zone_type i; + /* Sort early_node_map as initialisation assumes it is sorted */ + sort_node_map(); + /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, sizeof(arch_zone_lowest_possible_pfn)); @@ -2678,9 +2928,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); } - /* Regions in the early_node_map can be in any order */ - sort_node_map(); - /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -2734,7 +2981,6 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#ifdef CONFIG_HOTPLUG_CPU static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -2749,7 +2995,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init page_alloc_init(void) { @@ -3053,7 +3298,7 @@ void *__init alloc_large_system_hash(const char *tablename, /* allow the kernel cmdline to have a say */ if (!numentries) { /* round applicable memory size up to nearest megabyte */ - numentries = (flags & HASH_HIGHMEM) ? 
nr_all_pages : nr_kernel_pages; + numentries = nr_kernel_pages; numentries += (1UL << (20 - PAGE_SHIFT)) - 1; numentries >>= 20 - PAGE_SHIFT; numentries <<= 20 - PAGE_SHIFT; @@ -3063,6 +3308,10 @@ void *__init alloc_large_system_hash(const char *tablename, numentries >>= (scale - PAGE_SHIFT); else numentries <<= (PAGE_SHIFT - scale); + + /* Make sure we've got at least a 0-order allocation.. */ + if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -3075,7 +3324,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (numentries > max) numentries = max; - log2qty = long_log2(numentries); + log2qty = ilog2(numentries); do { size = bucketsize << log2qty; @@ -3097,7 +3346,7 @@ void *__init alloc_large_system_hash(const char *tablename, printk("%s hash table entries: %d (order: %d, %lu bytes)\n", tablename, (1U << log2qty), - long_log2(size) - PAGE_SHIFT, + ilog2(size) - PAGE_SHIFT, size); if (_hash_shift)