Merge git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cd47e8f..5916431 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -72,7 +73,9 @@ static void __free_pages_ok(struct page *page, unsigned int order);
  * don't need any ZONE_NORMAL reservation
  */
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+#ifdef CONFIG_ZONE_DMA
         256,
+#endif
 #ifdef CONFIG_ZONE_DMA32
         256,
 #endif
@@ -83,8 +86,10 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 
 EXPORT_SYMBOL(totalram_pages);
 
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
         "DMA",
+#endif
 #ifdef CONFIG_ZONE_DMA32
         "DMA32",
 #endif
@@ -151,10 +156,8 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 
 static int page_is_consistent(struct zone *zone, struct page *page)
 {
-#ifdef CONFIG_HOLES_IN_ZONE
-       if (!pfn_valid(page_to_pfn(page)))
+       if (!pfn_valid_within(page_to_pfn(page)))
                return 0;
-#endif
        if (zone != page_zone(page))
                return 0;
 
@@ -222,7 +225,7 @@ static void bad_page(struct page *page)
 
 static void free_compound_page(struct page *page)
 {
-       __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+       __free_pages_ok(page, compound_order(page));
 }
 
 static void prep_compound_page(struct page *page, unsigned long order)
@@ -230,13 +233,14 @@ static void prep_compound_page(struct page *page, unsigned long order)
        int i;
        int nr_pages = 1 << order;
 
-       page[1].lru.next = (void *)free_compound_page;  /* set dtor */
-       page[1].lru.prev = (void *)order;
-       for (i = 0; i < nr_pages; i++) {
+       set_compound_page_dtor(page, free_compound_page);
+       set_compound_order(page, order);
+       __SetPageHead(page);
+       for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
 
-               __SetPageCompound(p);
-               set_page_private(p, (unsigned long)page);
+               __SetPageTail(p);
+               p->first_page = page;
        }
 }
 
@@ -245,16 +249,19 @@ static void destroy_compound_page(struct page *page, unsigned long order)
        int i;
        int nr_pages = 1 << order;
 
-       if (unlikely((unsigned long)page[1].lru.prev != order))
+       if (unlikely(compound_order(page) != order))
                bad_page(page);
 
-       for (i = 0; i < nr_pages; i++) {
+       if (unlikely(!PageHead(page)))
+                       bad_page(page);
+       __ClearPageHead(page);
+       for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
 
-               if (unlikely(!PageCompound(p) |
-                               (page_private(p) != (unsigned long)page)))
+               if (unlikely(!PageTail(p) |
+                               (p->first_page != page)))
                        bad_page(page);
-               __ClearPageCompound(p);
+               __ClearPageTail(p);
        }
 }
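
As an illustration (not part of the patch): a minimal user-space model of the new head/tail bookkeeping in the two hunks above. Field names follow the patch; struct page is reduced to stand-in fields plus the dtor/order slots the kernel keeps in page[1].

#include <assert.h>
#include <stddef.h>

/* Stand-in for struct page: only what this sketch needs. */
struct page {
	int head;			/* models PG_head */
	int tail;			/* models PG_tail */
	unsigned long order;		/* kernel keeps this in page[1] */
	void (*dtor)(struct page *);	/* kernel keeps this in page[1] */
	struct page *first_page;	/* tail pages point back at the head */
};

static void prep_compound(struct page *page, unsigned long order)
{
	unsigned long nr_pages = 1UL << order;

	page[1].dtor = NULL;			/* set_compound_page_dtor() */
	page[1].order = order;			/* set_compound_order() */
	page[0].head = 1;			/* __SetPageHead() */
	for (unsigned long i = 1; i < nr_pages; i++) {
		page[i].tail = 1;		/* __SetPageTail() */
		page[i].first_page = page;	/* p->first_page = page */
	}
}

static void destroy_compound(struct page *page, unsigned long order)
{
	unsigned long nr_pages = 1UL << order;

	assert(page[1].order == order);		/* compound_order() check */
	assert(page[0].head);			/* PageHead() check */
	page[0].head = 0;			/* __ClearPageHead() */
	for (unsigned long i = 1; i < nr_pages; i++) {
		assert(page[i].tail && page[i].first_page == page);
		page[i].tail = 0;		/* __ClearPageTail() */
	}
}

int main(void)
{
	struct page pages[8] = { { 0 } };

	prep_compound(pages, 3);
	destroy_compound(pages, 3);
	return 0;
}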
 
@@ -341,10 +348,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
 static inline int page_is_buddy(struct page *page, struct page *buddy,
                                                                int order)
 {
-#ifdef CONFIG_HOLES_IN_ZONE
-       if (!pfn_valid(page_to_pfn(buddy)))
+       if (!pfn_valid_within(page_to_pfn(buddy)))
                return 0;
-#endif
 
        if (page_zone_id(page) != page_zone_id(buddy))
                return 0;
@@ -394,7 +399,7 @@ static inline void __free_one_page(struct page *page,
        VM_BUG_ON(page_idx & (order_size - 1));
        VM_BUG_ON(bad_range(zone, page));
 
-       zone->free_pages += order_size;
+       __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
        while (order < MAX_ORDER-1) {
                unsigned long combined_idx;
                struct free_area *area;
@@ -428,13 +433,18 @@ static inline int free_pages_check(struct page *page)
                        1 << PG_private |
                        1 << PG_locked  |
                        1 << PG_active  |
-                       1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
                        1 << PG_writeback |
                        1 << PG_reserved |
                        1 << PG_buddy ))))
                bad_page(page);
+       /*
+        * PageReclaim == PageTail. It is only an error
+        * for PageReclaim to be set if PageCompound is clear.
+        */
+       if (unlikely(!PageCompound(page) && PageReclaim(page)))
+               bad_page(page);
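
As an illustration (an assumption about the flag encoding, not a quote of page-flags.h): the comment above reads as "a tail page is a compound page with PG_reclaim set", so at free time PG_reclaim is only an error when PG_compound is clear.

#include <assert.h>

#define PG_compound	(1UL << 0)	/* illustrative bit numbers only */
#define PG_reclaim	(1UL << 1)

/* Mirrors the new check: stray PG_reclaim without PG_compound is bad. */
static int reclaim_flag_is_error(unsigned long flags)
{
	return !(flags & PG_compound) && (flags & PG_reclaim) != 0;
}

int main(void)
{
	assert(!reclaim_flag_is_error(PG_compound | PG_reclaim));	/* tail page */
	assert(reclaim_flag_is_error(PG_reclaim));			/* would hit bad_page() */
	return 0;
}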
        if (PageDirty(page))
                __ClearPageDirty(page);
        /*
@@ -595,7 +605,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
-                       1 << PG_checked | 1 << PG_mappedtodisk);
+                       1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
        set_page_private(page, 0);
        set_page_refcounted(page);
 
@@ -630,7 +640,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
                list_del(&page->lru);
                rmv_page_order(page);
                area->nr_free--;
-               zone->free_pages -= 1UL << order;
+               __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
                expand(zone, page, order, current_order, area);
                return page;
        }
@@ -659,6 +669,26 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
        return i;
 }
 
+#if MAX_NUMNODES > 1
+int nr_node_ids __read_mostly = MAX_NUMNODES;
+EXPORT_SYMBOL(nr_node_ids);
+
+/*
+ * Figure out the number of possible node ids.
+ */
+static void __init setup_nr_node_ids(void)
+{
+       unsigned int node;
+       unsigned int highest = 0;
+
+       for_each_node_mask(node, node_possible_map)
+               highest = node;
+       nr_node_ids = highest + 1;
+}
+#else
+static void __init setup_nr_node_ids(void) {}
+#endif
+
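
As an illustration: nr_node_ids replaces the highest_possible_node_id() helper removed at the end of this patch, so callers can size per-node arrays by the number of possible node ids instead of MAX_NUMNODES. A hedged sketch of such a caller (hypothetical, kernel context assumed):

/* Hypothetical caller: one slot per possible node id rather than
 * MAX_NUMNODES slots. */
static void **alloc_per_node_slots(void)
{
	return kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
}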
 #ifdef CONFIG_NUMA
 /*
  * Called from the slab reaper to drain pagesets on a particular node that
@@ -685,9 +715,15 @@ void drain_node_pages(int nodeid)
 
                        pcp = &pset->pcp[i];
                        if (pcp->count) {
+                               int to_drain;
+
                                local_irq_save(flags);
-                               free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-                               pcp->count = 0;
+                               if (pcp->count >= pcp->batch)
+                                       to_drain = pcp->batch;
+                               else
+                                       to_drain = pcp->count;
+                               free_pages_bulk(zone, to_drain, &pcp->list, 0);
+                               pcp->count -= to_drain;
                                local_irq_restore(flags);
                        }
                }
@@ -695,7 +731,6 @@ void drain_node_pages(int nodeid)
 }
 #endif
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
        unsigned long flags;
@@ -705,6 +740,9 @@ static void __drain_pages(unsigned int cpu)
        for_each_zone(zone) {
                struct per_cpu_pageset *pset;
 
+               if (!populated_zone(zone))
+                       continue;
+
                pset = zone_pcp(zone, cpu);
                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                        struct per_cpu_pages *pcp;
@@ -717,7 +755,6 @@ static void __drain_pages(unsigned int cpu)
                }
        }
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM
 
@@ -738,8 +775,8 @@ void mark_free_pages(struct zone *zone)
                if (pfn_valid(pfn)) {
                        struct page *page = pfn_to_page(pfn);
 
-                       if (!PageNosave(page))
-                               ClearPageNosaveFree(page);
+                       if (!swsusp_page_is_forbidden(page))
+                               swsusp_unset_page_free(page);
                }
 
        for (order = MAX_ORDER - 1; order >= 0; --order)
@@ -748,7 +785,7 @@ void mark_free_pages(struct zone *zone)
 
                        pfn = page_to_pfn(list_entry(curr, struct page, lru));
                        for (i = 0; i < (1UL << order); i++)
-                               SetPageNosaveFree(pfn_to_page(pfn + i));
+                               swsusp_set_page_free(pfn_to_page(pfn + i));
                }
 
        spin_unlock_irqrestore(&zone->lock, flags);
@@ -888,6 +925,91 @@ failed:
 #define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
 
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+       struct fault_attr attr;
+
+       u32 ignore_gfp_highmem;
+       u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+       struct dentry *ignore_gfp_highmem_file;
+       struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+       .attr = FAULT_ATTR_INITIALIZER,
+       .ignore_gfp_wait = 1,
+       .ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+       return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+       if (gfp_mask & __GFP_NOFAIL)
+               return 0;
+       if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+               return 0;
+       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+               return 0;
+
+       return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+       struct dentry *dir;
+       int err;
+
+       err = init_fault_attr_dentries(&fail_page_alloc.attr,
+                                      "fail_page_alloc");
+       if (err)
+               return err;
+       dir = fail_page_alloc.attr.dentries.dir;
+
+       fail_page_alloc.ignore_gfp_wait_file =
+               debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                                     &fail_page_alloc.ignore_gfp_wait);
+
+       fail_page_alloc.ignore_gfp_highmem_file =
+               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                                     &fail_page_alloc.ignore_gfp_highmem);
+
+       if (!fail_page_alloc.ignore_gfp_wait_file ||
+                       !fail_page_alloc.ignore_gfp_highmem_file) {
+               err = -ENOMEM;
+               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+       }
+
+       return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+       return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
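
Everything used above (FAULT_ATTR_INITIALIZER, setup_fault_attr(), should_fail(), the optional debugfs dentries) comes from the generic fault-injection framework pulled in by the new <linux/fault-inject.h> include, so other call sites can be wired up the same way. A hedged sketch for a hypothetical fail_foo site (kernel context assumed):

#ifdef CONFIG_FAIL_FOO			/* hypothetical config option */

static struct fault_attr fail_foo = FAULT_ATTR_INITIALIZER;

static int __init setup_fail_foo(char *str)
{
	return setup_fault_attr(&fail_foo, str);
}
__setup("fail_foo=", setup_fail_foo);

/* Nonzero when this attempt should fail artificially. */
static int should_fail_foo(size_t size)
{
	return should_fail(&fail_foo, size);
}

#else

static inline int should_fail_foo(size_t size)
{
	return 0;
}

#endif /* CONFIG_FAIL_FOO */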
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
@@ -896,8 +1018,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                      int classzone_idx, int alloc_flags)
 {
        /* free_pages may go negative - that's OK */
-       unsigned long min = mark;
-       long free_pages = z->free_pages - (1 << order) + 1;
+       long min = mark;
+       long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
        int o;
 
        if (alloc_flags & ALLOC_HIGH)
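
As an illustration of the order-aware part of this check (the remainder of the function is not shown in the hunk; this is a standalone model, not the kernel code): pages below the requested order cannot satisfy the request, so they are subtracted from the free count while the mark is halved for each order stepped over.

#include <stdio.h>

#define MAX_ORDER 11

static int watermark_ok(long free_pages, const long nr_free[MAX_ORDER],
			int order, long mark)
{
	free_pages -= (1L << order) - 1;
	if (free_pages <= mark)
		return 0;
	for (int o = 0; o < order; o++) {
		/* pages of this order cannot satisfy the request */
		free_pages -= nr_free[o] << o;
		mark >>= 1;	/* demand less of the higher orders */
		if (free_pages <= mark)
			return 0;
	}
	return 1;
}

int main(void)
{
	long nr_free[MAX_ORDER] = { 512, 64, 8, 1 };	/* mostly order-0 pages */
	long total = 512 + 64 * 2 + 8 * 4 + 1 * 8;	/* 680 free pages */

	printf("order 0 ok: %d\n", watermark_ok(total, nr_free, 0, 128));	/* 1 */
	printf("order 3 ok: %d\n", watermark_ok(total, nr_free, 3, 128));	/* 0 */
	return 0;
}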
@@ -1072,7 +1194,7 @@ zonelist_scan:
                        zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
                                break;
                if ((alloc_flags & ALLOC_CPUSET) &&
-                       !cpuset_zone_allowed(zone, gfp_mask))
+                       !cpuset_zone_allowed_softwall(zone, gfp_mask))
                                goto try_next_zone;
 
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1132,6 +1254,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 
        might_sleep_if(wait);
 
+       if (should_fail_alloc_page(gfp_mask, order))
+               return NULL;
+
 restart:
        z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -1145,6 +1270,17 @@ restart:
        if (page)
                goto got_pg;
 
+       /*
+        * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+        * __GFP_NOWARN set) should not cause reclaim since the subsystem
+        * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+        * using a larger set of nodes after it has established that the
+        * allowed per node queues are empty and that nodes are
+        * over allocated.
+        */
+       if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+               goto nopage;
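
As an illustration of the test just added: (gfp_mask & GFP_THISNODE) == GFP_THISNODE only matches when all three bits named in the comment are set, so a bare __GFP_THISNODE allocation may still trigger reclaim. Standalone model with illustrative flag values (not the kernel's):

#include <assert.h>

#define __GFP_THISNODE	0x1u		/* illustrative values only */
#define __GFP_NORETRY	0x2u
#define __GFP_NOWARN	0x4u
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

/* True only when every bit of the composite flag is present. */
static int is_gfp_thisnode(unsigned int gfp_mask)
{
	return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
}

int main(void)
{
	assert(is_gfp_thisnode(GFP_THISNODE));
	assert(!is_gfp_thisnode(__GFP_THISNODE));	/* may still reclaim */
	assert(is_gfp_thisnode(GFP_THISNODE | 0x8u));	/* extra bits are fine */
	return 0;
}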
+
        for (z = zonelist->zones; *z; z++)
                wakeup_kswapd(*z, order);
 
@@ -1180,6 +1316,7 @@ restart:
 
        /* This allocation should allow future memory freeing. */
 
+rebalance:
        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                        && !in_interrupt()) {
                if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1201,7 +1338,6 @@ nofail_alloc:
        if (!wait)
                goto nopage;
 
-rebalance:
        cond_resched();
 
        /* We now go into synchronous reclaim */
@@ -1333,35 +1469,6 @@ fastcall void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
-/*
- * Total amount of free (allocatable) RAM:
- */
-unsigned int nr_free_pages(void)
-{
-       unsigned int sum = 0;
-       struct zone *zone;
-
-       for_each_zone(zone)
-               sum += zone->free_pages;
-
-       return sum;
-}
-
-EXPORT_SYMBOL(nr_free_pages);
-
-#ifdef CONFIG_NUMA
-unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
-{
-       unsigned int sum = 0;
-       enum zone_type i;
-
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               sum += pgdat->node_zones[i].free_pages;
-
-       return sum;
-}
-#endif
-
 static unsigned int nr_free_zone_pages(int offset)
 {
        /* Just pick one node, since fallback list is circular */
@@ -1401,14 +1508,14 @@ unsigned int nr_free_pagecache_pages(void)
 static inline void show_node(struct zone *zone)
 {
        if (NUMA_BUILD)
-               printk("Node %ld ", zone_to_nid(zone));
+               printk("Node %d ", zone_to_nid(zone));
 }
 
 void si_meminfo(struct sysinfo *val)
 {
        val->totalram = totalram_pages;
        val->sharedram = 0;
-       val->freeram = nr_free_pages();
+       val->freeram = global_page_state(NR_FREE_PAGES);
        val->bufferram = nr_blockdev_pages();
        val->totalhigh = totalhigh_pages;
        val->freehigh = nr_free_highpages();
@@ -1423,10 +1530,11 @@ void si_meminfo_node(struct sysinfo *val, int nid)
        pg_data_t *pgdat = NODE_DATA(nid);
 
        val->totalram = pgdat->node_present_pages;
-       val->freeram = nr_free_pages_pgdat(pgdat);
+       val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
        val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
-       val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+       val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+                       NR_FREE_PAGES);
 #else
        val->totalhigh = 0;
        val->freehigh = 0;
@@ -1445,9 +1553,6 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 void show_free_areas(void)
 {
        int cpu;
-       unsigned long active;
-       unsigned long inactive;
-       unsigned long free;
        struct zone *zone;
 
        for_each_zone(zone) {
@@ -1471,20 +1576,19 @@ void show_free_areas(void)
                }
        }
 
-       get_zone_counts(&active, &inactive, &free);
-
-       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
-               "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
-               active,
-               inactive,
+       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+               " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
+               global_page_state(NR_ACTIVE),
+               global_page_state(NR_INACTIVE),
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
-               nr_free_pages(),
+               global_page_state(NR_FREE_PAGES),
                global_page_state(NR_SLAB_RECLAIMABLE) +
                        global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
-               global_page_state(NR_PAGETABLE));
+               global_page_state(NR_PAGETABLE),
+               global_page_state(NR_BOUNCE));
 
        for_each_zone(zone) {
                int i;
@@ -1505,12 +1609,12 @@ void show_free_areas(void)
                        " all_unreclaimable? %s"
                        "\n",
                        zone->name,
-                       K(zone->free_pages),
+                       K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(zone->pages_min),
                        K(zone->pages_low),
                        K(zone->pages_high),
-                       K(zone->nr_active),
-                       K(zone->nr_inactive),
+                       K(zone_page_state(zone, NR_ACTIVE)),
+                       K(zone_page_state(zone, NR_INACTIVE)),
                        K(zone->present_pages),
                        zone->pages_scanned,
                        (zone->all_unreclaimable ? "yes" : "no")
@@ -1849,17 +1953,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
  * done. Non-atomic initialization, single-pass.
  */
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-               unsigned long start_pfn)
+               unsigned long start_pfn, enum memmap_context context)
 {
        struct page *page;
        unsigned long end_pfn = start_pfn + size;
        unsigned long pfn;
 
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!early_pfn_valid(pfn))
-                       continue;
-               if (!early_pfn_in_nid(pfn, nid))
-                       continue;
+               /*
+                * There can be holes in boot-time mem_map[]s
+                * handed to this function.  They do not
+                * exist on hotplugged memory.
+                */
+               if (context == MEMMAP_EARLY) {
+                       if (!early_pfn_valid(pfn))
+                               continue;
+                       if (!early_pfn_in_nid(pfn, nid))
+                               continue;
+               }
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
                init_page_count(page);
@@ -1886,7 +1997,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
-       memmap_init_zone((size), (nid), (zone), (start_pfn))
+       memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
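
As an illustration (standalone model; the predicates are stand-ins, and MEMMAP_HOTPLUG is assumed to be the companion value in the same enum used by the memory hotplug path): only MEMMAP_EARLY performs the per-pfn hole filtering.

#include <stdbool.h>
#include <stdio.h>

enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG };

/* Stand-ins for early_pfn_valid()/early_pfn_in_nid(). */
static bool early_pfn_valid(unsigned long pfn) { return pfn != 3; }
static bool early_pfn_in_nid(unsigned long pfn, int nid) { (void)nid; return true; }

static void init_range(unsigned long start, unsigned long nr, int nid,
		       enum memmap_context context)
{
	for (unsigned long pfn = start; pfn < start + nr; pfn++) {
		/* Boot-time mem_map[]s may have holes; hotplugged memory does not. */
		if (context == MEMMAP_EARLY &&
		    (!early_pfn_valid(pfn) || !early_pfn_in_nid(pfn, nid)))
			continue;
		printf("init pfn %lu (%s)\n", pfn,
		       context == MEMMAP_EARLY ? "early" : "hotplug");
	}
}

int main(void)
{
	init_range(0, 5, 0, MEMMAP_EARLY);	/* skips pfn 3 */
	init_range(0, 5, 0, MEMMAP_HOTPLUG);	/* initialises every pfn */
	return 0;
}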
 
 static int __cpuinit zone_batchsize(struct zone *zone)
@@ -2036,16 +2147,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
        int ret = NOTIFY_OK;
 
        switch (action) {
-               case CPU_UP_PREPARE:
-                       if (process_zones(cpu))
-                               ret = NOTIFY_BAD;
-                       break;
-               case CPU_UP_CANCELED:
-               case CPU_DEAD:
-                       free_zone_pagesets(cpu);
-                       break;
-               default:
-                       break;
+       case CPU_UP_PREPARE:
+               if (process_zones(cpu))
+                       ret = NOTIFY_BAD;
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               free_zone_pagesets(cpu);
+               break;
+       default:
+               break;
        }
        return ret;
 }
@@ -2132,7 +2243,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
 
 __meminit int init_currently_empty_zone(struct zone *zone,
                                        unsigned long zone_start_pfn,
-                                       unsigned long size)
+                                       unsigned long size,
+                                       enum memmap_context context)
 {
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret;
@@ -2536,11 +2648,11 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                                "  %s zone: %lu pages exceeds realsize %lu\n",
                                zone_names[j], memmap_pages, realsize);
 
-               /* Account for reserved DMA pages */
-               if (j == ZONE_DMA && realsize > dma_reserve) {
+               /* Account for reserved pages */
+               if (j == 0 && realsize > dma_reserve) {
                        realsize -= dma_reserve;
-                       printk(KERN_DEBUG "  DMA zone: %lu pages reserved\n",
-                                                               dma_reserve);
+                       printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
+                                       zone_names[0], dma_reserve);
                }
 
                if (!is_highmem_idx(j))
@@ -2560,7 +2672,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                spin_lock_init(&zone->lru_lock);
                zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat;
-               zone->free_pages = 0;
 
                zone->prev_priority = DEF_PRIORITY;
 
@@ -2569,14 +2680,13 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                INIT_LIST_HEAD(&zone->inactive_list);
                zone->nr_scan_active = 0;
                zone->nr_scan_inactive = 0;
-               zone->nr_active = 0;
-               zone->nr_inactive = 0;
                zap_zone_vm_stats(zone);
                atomic_set(&zone->reclaim_in_progress, 0);
                if (!size)
                        continue;
 
-               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+               ret = init_currently_empty_zone(zone, zone_start_pfn,
+                                               size, MEMMAP_EARLY);
                BUG_ON(ret);
                zone_start_pfn += size;
        }
@@ -2761,20 +2871,23 @@ static void __init sort_node_map(void)
                        cmp_node_active_region, NULL);
 }
 
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+/* Find the lowest pfn for a node */
 unsigned long __init find_min_pfn_for_node(unsigned long nid)
 {
        int i;
-
-       /* Regions in the early_node_map can be in any order */
-       sort_node_map();
+       unsigned long min_pfn = ULONG_MAX;
 
        /* Assuming a sorted map, the first range found has the starting pfn */
        for_each_active_range_index_in_nid(i, nid)
-               return early_node_map[i].start_pfn;
+               min_pfn = min(min_pfn, early_node_map[i].start_pfn);
 
-       printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-       return 0;
+       if (min_pfn == ULONG_MAX) {
+               printk(KERN_WARNING
+                       "Could not find start_pfn for node %lu\n", nid);
+               return 0;
+       }
+
+       return min_pfn;
 }
 
 /**
@@ -2823,6 +2936,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        unsigned long nid;
        enum zone_type i;
 
+       /* Sort early_node_map as initialisation assumes it is sorted */
+       sort_node_map();
+
        /* Record where the zone boundaries are */
        memset(arch_zone_lowest_possible_pfn, 0,
                                sizeof(arch_zone_lowest_possible_pfn));
@@ -2853,6 +2969,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
                                                early_node_map[i].end_pfn);
 
        /* Initialise every node */
+       setup_nr_node_ids();
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
                free_area_init_node(nid, pgdat, NULL,
@@ -2890,7 +3007,6 @@ void __init free_area_init(unsigned long *zones_size)
                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int page_alloc_cpu_notify(struct notifier_block *self,
                                 unsigned long action, void *hcpu)
 {
@@ -2905,7 +3021,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
        }
        return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
@@ -3093,7 +3208,8 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
        struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
        proc_dointvec(table, write, file, buffer, length, ppos);
-       setup_per_zone_pages_min();
+       if (write)
+               setup_per_zone_pages_min();
        return 0;
 }
 
@@ -3209,7 +3325,7 @@ void *__init alloc_large_system_hash(const char *tablename,
        /* allow the kernel cmdline to have a say */
        if (!numentries) {
                /* round applicable memory size up to nearest megabyte */
-               numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+               numentries = nr_kernel_pages;
                numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
                numentries >>= 20 - PAGE_SHIFT;
                numentries <<= 20 - PAGE_SHIFT;
@@ -3219,6 +3335,10 @@ void *__init alloc_large_system_hash(const char *tablename,
                        numentries >>= (scale - PAGE_SHIFT);
                else
                        numentries <<= (PAGE_SHIFT - scale);
+
+               /* Make sure we've got at least a 0-order allocation.. */
+               if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+                       numentries = PAGE_SIZE / bucketsize;
        }
        numentries = roundup_pow_of_two(numentries);
 
@@ -3231,7 +3351,7 @@ void *__init alloc_large_system_hash(const char *tablename,
        if (numentries > max)
                numentries = max;
 
-       log2qty = long_log2(numentries);
+       log2qty = ilog2(numentries);
 
        do {
                size = bucketsize << log2qty;
@@ -3253,7 +3373,7 @@ void *__init alloc_large_system_hash(const char *tablename,
        printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
               tablename,
               (1U << log2qty),
-              long_log2(size) - PAGE_SHIFT,
+              ilog2(size) - PAGE_SHIFT,
               size);
 
        if (_hash_shift)
@@ -3277,18 +3397,4 @@ EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
 
-#if MAX_NUMNODES > 1
-/*
- * Find the highest possible node id.
- */
-int highest_possible_node_id(void)
-{
-       unsigned int node;
-       unsigned int highest = 0;
 
-       for_each_node_mask(node, node_possible_map)
-               highest = node;
-       return highest;
-}
-EXPORT_SYMBOL(highest_possible_node_id);
-#endif