X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=40db96a655d0c23ad7df245f6dd990b6555c8f4f;hb=0ca9caae2d05ee0c4878aa3e2619cd23f4ad4cb4;hp=cc64830475676088ee056bc94c0f07fd54c14b01;hpb=006d22d9bbb7e66279ba5cc4556b54eeaf8fd556;p=powerpc.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc64830475..40db96a655 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -102,6 +104,38 @@ int min_free_kbytes = 1024; unsigned long __meminitdata nr_kernel_pages; unsigned long __meminitdata nr_all_pages; +static unsigned long __initdata dma_reserve; + +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct + * ranges of memory (RAM) that may be registered with add_active_range(). + * Ranges passed to add_active_range() will be merged if possible + * so the number of times add_active_range() can be called is + * related to the number of nodes and the number of holes + */ + #ifdef CONFIG_MAX_ACTIVE_REGIONS + /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ + #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS + #else + #if MAX_NUMNODES >= 32 + /* If there can be many nodes, allow up to 50 holes per node */ + #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) + #else + /* By default, allow up to 256 distinct regions */ + #define MAX_ACTIVE_REGIONS 256 + #endif + #endif + + struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; + int __initdata nr_nodemap_entries; + unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; + unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -461,17 +495,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) int i; int reserved = 0; - arch_free_page(page, order); - if (!PageHighMem(page)) - debug_check_no_locks_freed(page_address(page), - PAGE_SIZE<node_zones + z; struct per_cpu_pageset *pset; + if (!populated_zone(zone)) + continue; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -691,7 +727,8 @@ static void __drain_pages(unsigned int cpu) void mark_free_pages(struct zone *zone) { - unsigned long zone_pfn, flags; + unsigned long pfn, max_zone_pfn; + unsigned long flags; int order; struct list_head *curr; @@ -699,18 +736,25 @@ void mark_free_pages(struct zone *zone) return; spin_lock_irqsave(&zone->lock, flags); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); + + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (!PageNosave(page)) + ClearPageNosaveFree(page); + } for (order = MAX_ORDER - 1; order >= 0; --order) list_for_each(curr, &zone->free_area[order].free_list) { - unsigned long start_pfn, i; + unsigned long i; - start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + for (i = 0; i < (1UL << order); i++) + 
SetPageNosaveFree(pfn_to_page(pfn + i)); + } - for (i=0; i < (1<lock, flags); } @@ -736,13 +780,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; - arch_free_page(page, 0); - if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) return; + if (!PageHighMem(page)) + debug_check_no_locks_freed(page_address(page), PAGE_SIZE); + arch_free_page(page, 0); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; @@ -855,7 +900,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { /* free_pages my go negative - that's OK */ - long min = mark, free_pages = z->free_pages - (1 << order) + 1; + unsigned long min = mark; + long free_pages = z->free_pages - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -897,7 +943,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, */ do { zone = *z; - if (unlikely((gfp_mask & __GFP_THISNODE) && + if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) break; if ((alloc_flags & ALLOC_CPUSET) && @@ -1211,14 +1257,12 @@ unsigned int nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); } -#ifdef CONFIG_NUMA -static void show_node(struct zone *zone) + +static inline void show_node(struct zone *zone) { - printk("Node %d ", zone->zone_pgdat->node_id); + if (NUMA_BUILD) + printk("Node %ld ", zone_to_nid(zone)); } -#else -#define show_node(zone) do { } while (0) -#endif void si_meminfo(struct sysinfo *val) { @@ -1260,34 +1304,30 @@ void si_meminfo_node(struct sysinfo *val, int nid) */ void show_free_areas(void) { - int cpu, temperature; + int cpu; unsigned long active; unsigned long inactive; unsigned long free; struct zone *zone; for_each_zone(zone) { - show_node(zone); - printk("%s per-cpu:", zone->name); - - if (!populated_zone(zone)) { - printk(" empty\n"); + if (!populated_zone(zone)) continue; - } else - printk("\n"); + + show_node(zone); + printk("%s per-cpu:\n", zone->name); for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; pageset = zone_pcp(zone, cpu); - for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: high %d, batch %d used:%d\n", - cpu, - temperature ? 
"cold" : "hot", - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch, - pageset->pcp[temperature].count); + printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " + "Cold: hi:%5d, btch:%4d usd:%4d\n", + cpu, pageset->pcp[0].high, + pageset->pcp[0].batch, pageset->pcp[0].count, + pageset->pcp[1].high, pageset->pcp[1].batch, + pageset->pcp[1].count); } } @@ -1301,13 +1341,17 @@ void show_free_areas(void) global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), nr_free_pages(), - global_page_state(NR_SLAB), + global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_PAGETABLE)); for_each_zone(zone) { int i; + if (!populated_zone(zone)) + continue; + show_node(zone); printk("%s" " free:%lukB" @@ -1340,12 +1384,11 @@ void show_free_areas(void) for_each_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (!populated_zone(zone)) + continue; + show_node(zone); printk("%s: ", zone->name); - if (!populated_zone(zone)) { - printk("empty\n"); - continue; - } spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -1549,7 +1592,7 @@ static int __meminit __build_all_zonelists(void *dummy) void __meminit build_all_zonelists(void) { if (system_state == SYSTEM_BOOTING) { - __build_all_zonelists(0); + __build_all_zonelists(NULL); cpuset_init_current_mems_allowed(); } else { /* we have to stop all cpus to guaranntee there is no user @@ -1630,25 +1673,6 @@ static inline unsigned long wait_table_bits(unsigned long size) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long realtotalpages, totalpages = 0; - enum zone_type i; - - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); -} - - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -1806,6 +1830,9 @@ static int __cpuinit process_zones(int cpu) for_each_zone(zone) { + if (!populated_zone(zone)) + continue; + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); if (!zone_pcp(zone, cpu)) @@ -1965,6 +1992,349 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * Basic iterator support. Return the first range of PFNs for a node + * Note: nid == MAX_NUMNODES returns first region regardless of node + */ +static int __init first_active_region_index_in_nid(int nid) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the next active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardles of node + */ +static int __init next_active_region_index_in_nid(int index, int nid) +{ + for (index = index + 1; index < nr_nodemap_entries; index++) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* + * Required by SPARSEMEM. 
Given a PFN, return what node the PFN is on. + * Architectures may implement their own version but if add_active_range() + * was used and there are no special requirements, this is a convenient + * alternative + */ +int __init early_pfn_to_nid(unsigned long pfn) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long start_pfn = early_node_map[i].start_pfn; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (start_pfn <= pfn && pfn < end_pfn) + return early_node_map[i].nid; + } + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +/* Basic iterator support to walk early_node_map[] */ +#define for_each_active_range_index_in_nid(i, nid) \ + for (i = first_active_region_index_in_nid(nid); i != -1; \ + i = next_active_region_index_in_nid(i, nid)) + +/** + * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. + * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling free_bootmem() manually. + */ +void __init free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) { + unsigned long size_pages = 0; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (early_node_map[i].start_pfn >= max_low_pfn) + continue; + + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + size_pages = end_pfn - early_node_map[i].start_pfn; + free_bootmem_node(NODE_DATA(early_node_map[i].nid), + PFN_PHYS(early_node_map[i].start_pfn), + size_pages << PAGE_SHIFT); + } +} + +/** + * sparse_memory_present_with_active_regions - Call memory_present for each active range + * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * function may be used instead of calling memory_present() manually. + */ +void __init sparse_memory_present_with_active_regions(int nid) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + memory_present(early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); +} + +/** + * push_node_boundaries - Push node boundaries to at least the requested boundary + * @nid: The nid of the node to push the boundary for + * @start_pfn: The start pfn of the node + * @end_pfn: The end pfn of the node + * + * In reserve-based hot-add, mem_map is allocated that is unused until hotadd + * time. Specifically, on x86_64, SRAT will report ranges that can potentially + * be hotplugged even though no physical memory exists. This function allows + * an arch to push out the node boundaries so mem_map is allocated that can + * be used later. 
+ */ +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", + nid, start_pfn, end_pfn); + + /* Initialise the boundary for this node if necessary */ + if (node_boundary_end_pfn[nid] == 0) + node_boundary_start_pfn[nid] = -1UL; + + /* Update the boundaries */ + if (node_boundary_start_pfn[nid] > start_pfn) + node_boundary_start_pfn[nid] = start_pfn; + if (node_boundary_end_pfn[nid] < end_pfn) + node_boundary_end_pfn[nid] = end_pfn; +} + +/* If necessary, push the node boundary out for reserve hotadd */ +static void __init account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", + nid, *start_pfn, *end_pfn); + + /* Return if boundary information has not been provided */ + if (node_boundary_end_pfn[nid] == 0) + return; + + /* Check the boundaries and update if necessary */ + if (node_boundary_start_pfn[nid] < *start_pfn) + *start_pfn = node_boundary_start_pfn[nid]; + if (node_boundary_end_pfn[nid] > *end_pfn) + *end_pfn = node_boundary_end_pfn[nid]; +} +#else +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) {} + +static void __init account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) {} +#endif + + +/** + * get_pfn_range_for_nid - Return the start and end page frames for a node + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. + * @start_pfn: Passed by reference. On return, it will have the node start_pfn. + * @end_pfn: Passed by reference. On return, it will have the node end_pfn. + * + * It returns the start and end page frame of a node based on information + * provided by an arch calling add_active_range(). If called for a node + * with no available memory, a warning is printed and the start and end + * PFNs will be 0. 
+ */ +void __init get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + int i; + *start_pfn = -1UL; + *end_pfn = 0; + + for_each_active_range_index_in_nid(i, nid) { + *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); + *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + } + + if (*start_pfn == -1UL) { + printk(KERN_WARNING "Node %u active with no memory\n", nid); + *start_pfn = 0; + } + + /* Push the node boundaries out if requested */ + account_node_boundary(nid, start_pfn, end_pfn); +} + +/* + * Return the number of pages a zone spans in a node, including holes + * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() + */ +unsigned long __init zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + /* Get the start and end of the node and zone */ + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + + /* Check that this node has pages within the zone's required range */ + if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + return 0; + + /* Move the zone boundaries inside the node if necessary */ + zone_end_pfn = min(zone_end_pfn, node_end_pfn); + zone_start_pfn = max(zone_start_pfn, node_start_pfn); + + /* Return the spanned pages */ + return zone_end_pfn - zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for. + */ +unsigned long __init __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + int i = 0; + unsigned long prev_end_pfn = 0, hole_pages = 0; + unsigned long start_pfn; + + /* Find the end_pfn of the first active range of pfns in the node */ + i = first_active_region_index_in_nid(nid); + if (i == -1) + return 0; + + /* Account for ranges before physical memory on this node */ + if (early_node_map[i].start_pfn > range_start_pfn) + hole_pages = early_node_map[i].start_pfn - range_start_pfn; + + prev_end_pfn = early_node_map[i].start_pfn; + + /* Find all holes for the zone within the node */ + for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { + + /* No need to continue if prev_end_pfn is outside the zone */ + if (prev_end_pfn >= range_end_pfn) + break; + + /* Make sure the end of the zone is not within the hole */ + start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + prev_end_pfn = max(prev_end_pfn, range_start_pfn); + + /* Update the hole size cound and move on */ + if (start_pfn > range_start_pfn) { + BUG_ON(prev_end_pfn > start_pfn); + hole_pages += start_pfn - prev_end_pfn; + } + prev_end_pfn = early_node_map[i].end_pfn; + } + + /* Account for ranges past physical memory on this node */ + if (range_end_pfn > prev_end_pfn) + hole_pages = range_end_pfn - + max(range_start_pfn, prev_end_pfn); + + return hole_pages; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * It returns the number of pages frames in memory holes within a range. 
+ */ +unsigned long __init absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +unsigned long __init zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], + node_start_pfn); + zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], + node_end_pfn); + + return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); +} + +#else +static inline unsigned long zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zones_size) +{ + return zones_size[zone_type]; +} + +static inline unsigned long zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zholes_size) +{ + if (!zholes_size) + return 0; + + return zholes_size[zone_type]; +} + +#endif + +static void __init calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + enum zone_type i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, + zones_size); + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= + zone_absent_pages_in_node(pgdat->node_id, i, + zholes_size); + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, + realtotalpages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1986,11 +2356,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize; + unsigned long size, realsize, memmap_pages; + + size = zone_spanned_pages_in_node(nid, j, zones_size); + realsize = size - zone_absent_pages_in_node(nid, j, + zholes_size); - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; + /* + * Adjust realsize so that it accounts for how much memory + * is used by this zone for memmap. 
This affects the watermark + * and per-cpu initialisations + */ + memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; + if (realsize >= memmap_pages) { + realsize -= memmap_pages; + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds realsize %lu\n", + zone_names[j], memmap_pages, realsize); + + /* Account for reserved DMA pages */ + if (j == ZONE_DMA && realsize > dma_reserve) { + realsize -= dma_reserve; + printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", + dma_reserve); + } if (!is_highmem_idx(j)) nr_kernel_pages += realsize; @@ -1999,8 +2392,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; #ifdef CONFIG_NUMA - zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + zone->node = nid; + zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100; + zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); @@ -2060,8 +2455,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) /* * With no DISCONTIG, the global mem_map is just set as node 0's */ - if (pgdat == NODE_DATA(0)) + if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= pgdat->node_start_pfn; +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } @@ -2072,13 +2472,254 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, { pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - calculate_zone_totalpages(pgdat, zones_size, zholes_size); + calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/** + * add_active_range - Register a range of PFNs backed by physical memory + * @nid: The node ID the range resides on + * @start_pfn: The start PFN of the available physical memory + * @end_pfn: The end PFN of the available physical memory + * + * These ranges are stored in an early_node_map[] and later used by + * free_area_init_nodes() to calculate zone sizes and holes. If the + * range spans a memory hole, it is up to the architecture to ensure + * the memory is not freed by the bootmem allocator. If possible + * the range being registered will be merged with existing ranges. 
+ */ +void __init add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + + printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + /* Merge with existing active regions if possible */ + for (i = 0; i < nr_nodemap_entries; i++) { + if (early_node_map[i].nid != nid) + continue; + + /* Skip if an existing region covers this new one */ + if (start_pfn >= early_node_map[i].start_pfn && + end_pfn <= early_node_map[i].end_pfn) + return; + + /* Merge forward if suitable */ + if (start_pfn <= early_node_map[i].end_pfn && + end_pfn > early_node_map[i].end_pfn) { + early_node_map[i].end_pfn = end_pfn; + return; + } + + /* Merge backward if suitable */ + if (start_pfn < early_node_map[i].end_pfn && + end_pfn >= early_node_map[i].start_pfn) { + early_node_map[i].start_pfn = start_pfn; + return; + } + } + + /* Check that early_node_map is large enough */ + if (i >= MAX_ACTIVE_REGIONS) { + printk(KERN_CRIT "More than %d memory regions, truncating\n", + MAX_ACTIVE_REGIONS); + return; + } + + early_node_map[i].nid = nid; + early_node_map[i].start_pfn = start_pfn; + early_node_map[i].end_pfn = end_pfn; + nr_nodemap_entries = i + 1; +} + +/** + * shrink_active_range - Shrink an existing registered range of PFNs + * @nid: The node id the range is on that should be shrunk + * @old_end_pfn: The old end PFN of the range + * @new_end_pfn: The new PFN of the range + * + * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. + * The map is kept at the end physical page range that has already been + * registered with add_active_range(). This function allows an arch to shrink + * an existing registered range. + */ +void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, + unsigned long new_end_pfn) +{ + int i; + + /* Find the old active region end and shrink */ + for_each_active_range_index_in_nid(i, nid) + if (early_node_map[i].end_pfn == old_end_pfn) { + early_node_map[i].end_pfn = new_end_pfn; + break; + } +} + +/** + * remove_all_active_ranges - Remove all currently registered regions + * + * During discovery, it may be found that a table like SRAT is invalid + * and an alternative discovery method must be used. This function removes + * all currently registered regions. + */ +void __init remove_all_active_ranges(void) +{ + memset(early_node_map, 0, sizeof(early_node_map)); + nr_nodemap_entries = 0; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); + memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +} + +/* Compare two active node_active_regions */ +static int __init cmp_node_active_region(const void *a, const void *b) +{ + struct node_active_region *arange = (struct node_active_region *)a; + struct node_active_region *brange = (struct node_active_region *)b; + + /* Done this way to avoid overflows */ + if (arange->start_pfn > brange->start_pfn) + return 1; + if (arange->start_pfn < brange->start_pfn) + return -1; + + return 0; +} + +/* sort the node_map by start_pfn */ +static void __init sort_node_map(void) +{ + sort(early_node_map, (size_t)nr_nodemap_entries, + sizeof(struct node_active_region), + cmp_node_active_region, NULL); +} + +/* Find the lowest pfn for a node. 
This depends on a sorted early_node_map */ +unsigned long __init find_min_pfn_for_node(unsigned long nid) +{ + int i; + + /* Assuming a sorted map, the first range found has the starting pfn */ + for_each_active_range_index_in_nid(i, nid) + return early_node_map[i].start_pfn; + + printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); + return 0; +} + +/** + * find_min_pfn_with_active_regions - Find the minimum PFN registered + * + * It returns the minimum PFN based on information provided via + * add_active_range(). + */ +unsigned long __init find_min_pfn_with_active_regions(void) +{ + return find_min_pfn_for_node(MAX_NUMNODES); +} + +/** + * find_max_pfn_with_active_regions - Find the maximum PFN registered + * + * It returns the maximum PFN based on information provided via + * add_active_range(). + */ +unsigned long __init find_max_pfn_with_active_regions(void) +{ + int i; + unsigned long max_pfn = 0; + + for (i = 0; i < nr_nodemap_entries; i++) + max_pfn = max(max_pfn, early_node_map[i].end_pfn); + + return max_pfn; +} + +/** + * free_area_init_nodes - Initialise all pg_data_t and zone data + * @max_zone_pfn: an array of max PFNs for each zone + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by add_active_range(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. For example, ZONE_DMA32 starts + * at arch_max_dma_pfn. + */ +void __init free_area_init_nodes(unsigned long *max_zone_pfn) +{ + unsigned long nid; + enum zone_type i; + + /* Record where the zone boundaries are */ + memset(arch_zone_lowest_possible_pfn, 0, + sizeof(arch_zone_lowest_possible_pfn)); + memset(arch_zone_highest_possible_pfn, 0, + sizeof(arch_zone_highest_possible_pfn)); + arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); + arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; + for (i = 1; i < MAX_NR_ZONES; i++) { + arch_zone_lowest_possible_pfn[i] = + arch_zone_highest_possible_pfn[i-1]; + arch_zone_highest_possible_pfn[i] = + max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + } + + /* Regions in the early_node_map can be in any order */ + sort_node_map(); + + /* Print out the zone ranges */ + printk("Zone PFN ranges:\n"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %-8s %8lu -> %8lu\n", + zone_names[i], + arch_zone_lowest_possible_pfn[i], + arch_zone_highest_possible_pfn[i]); + + /* Print out the early_node_map[] */ + printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); + for (i = 0; i < nr_nodemap_entries; i++) + printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); + + /* Initialise every node */ + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + free_area_init_node(nid, pgdat, NULL, + find_min_pfn_for_node(nid), NULL); + } +} +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +/** + * set_dma_reserve - set the specified number of pages reserved in the first zone + * @new_dma_reserve: The number of pages to mark reserved + * + * The per-cpu batchsize and zone watermarks are determined by present_pages. 
+ * In the DMA zone, a significant percentage may be consumed by kernel image + * and other unfreeable allocations which can skew the watermarks badly. This + * function may optionally be used to account for unfreeable pages in the + * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and + * smaller per-cpu batchsize. + */ +void __init set_dma_reserve(unsigned long new_dma_reserve) +{ + dma_reserve = new_dma_reserve; +} + #ifndef CONFIG_NEED_MULTIPLE_NODES static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; @@ -2185,10 +2826,11 @@ static void setup_per_zone_lowmem_reserve(void) calculate_totalreserve_pages(); } -/* - * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures - * that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. +/** + * setup_per_zone_pages_min - called when min_free_kbytes changes. + * + * Ensures that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. */ void setup_per_zone_pages_min(void) { @@ -2310,10 +2952,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_unmapped_ratio = (zone->present_pages * + zone->min_unmapped_pages = (zone->present_pages * sysctl_min_unmapped_ratio) / 100; return 0; } + +int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_slab_pages = (zone->present_pages * + sysctl_min_slab_ratio) / 100; + return 0; +} #endif /*
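
The hunks above, under CONFIG_ARCH_POPULATES_NODE_MAP, expect an architecture to register each range of usable RAM with add_active_range() during early boot and then pass the per-zone PFN limits to free_area_init_nodes(), which sizes every node and zone from early_node_map[] and calls free_area_init_node() for each online node. A minimal sketch of such an arch setup path follows; the setup_arch_memory() wrapper and the PFN values are hypothetical illustrations, only add_active_range(), free_area_init_nodes() and the zone indices come from this patch and the existing headers, and the snippet is meant to compile only inside a kernel tree with this patch applied.

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/string.h>

static void __init setup_arch_memory(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	/* Register usable RAM; ranges are merged where possible. */
	add_active_range(0, 0x0UL,    0x1000UL);   /* node 0: 0-16MB (4K pages)   */
	add_active_range(0, 0x2000UL, 0x8000UL);   /* node 0: 32-128MB, after a hole */

	/* Upper PFN limit of each zone; free_area_init_nodes() does the rest. */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA]    = 0x1000UL;
	max_zone_pfns[ZONE_NORMAL] = 0x8000UL;
	free_area_init_nodes(max_zone_pfns);
}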
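
The zone-sizing arithmetic added above reduces to: spanned pages are the zone's PFN limits clamped to the node, absent pages are the PFNs inside that span not covered by any registered active range, and present pages are spanned minus absent. The following is a small userspace model of that calculation, a deliberate simplification of __absent_pages_in_range() that assumes the ranges are already sorted and non-overlapping; the range values are made up for illustration.

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* Count PFNs in [start, end) not covered by any active range. */
static unsigned long absent_in(const struct range *map, int n,
			       unsigned long start, unsigned long end)
{
	unsigned long covered = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long s = map[i].start_pfn > start ? map[i].start_pfn : start;
		unsigned long e = map[i].end_pfn < end ? map[i].end_pfn : end;

		if (e > s)
			covered += e - s;
	}
	return (end - start) - covered;
}

int main(void)
{
	/* Two active ranges with a hole between PFN 0x1000 and 0x2000. */
	struct range map[] = { { 0x0, 0x1000 }, { 0x2000, 0x8000 } };
	unsigned long zone_start = 0x0, zone_end = 0x8000;
	unsigned long spanned = zone_end - zone_start;
	unsigned long absent = absent_in(map, 2, zone_start, zone_end);

	printf("spanned=%lu absent=%lu present=%lu\n",
	       spanned, absent, spanned - absent);
	return 0;
}

For this example hole it prints spanned=32768 absent=4096 present=28672, matching the way calculate_node_totalpages() derives node_present_pages from node_spanned_pages minus the per-zone holes.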
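
free_area_init_core() above also subtracts the zone's own memmap from realsize before the watermark and per-cpu calculations, using memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT. A quick back-of-the-envelope check follows; the 4 KiB page size and 56-byte struct page are assumptions, since the real values depend on the architecture and config.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;          /* assumed 4 KiB pages        */
	unsigned long sizeof_struct_page = 56;  /* assumed; config-dependent  */
	unsigned long spanned = 262144;         /* a 1 GiB zone, in pages     */
	unsigned long memmap_pages =
		(spanned * sizeof_struct_page) >> page_shift;

	printf("memmap uses %lu of %lu pages (%.2f%%)\n",
	       memmap_pages, spanned, 100.0 * memmap_pages / spanned);
	return 0;
}

Under those assumptions the memmap consumes 3584 pages, roughly 1.4% of the zone, which is why the patch bases the watermark and per-cpu initialisations on the adjusted realsize rather than the raw spanned size.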