X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=arch%2Fx86%2Fmm%2Fnuma_64.c;h=a920d09b91948dcf8ea42b6c95f09fbbb07a2145;hb=3e01dfce1387f8bec41018f0d7b42fd88ad4163f;hp=6da235522269e856fc000807888db8e345891a90;hpb=df3d80f5a5c74168be42788364d13cf6c83c7b9c;p=powerpc.git diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 6da2355222..a920d09b91 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -1,7 +1,7 @@ -/* +/* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ + */ #include #include #include @@ -11,35 +11,45 @@ #include #include #include +#include #include #include #include #include #include +#include #ifndef Dprintk #define Dprintk(x...) #endif struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { +int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +void *x86_cpu_to_node_map_early_ptr; +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); + +s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_to_cpumask_map); int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size; * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +static int __init populate_memnodemap(const struct bootnode *nodes, + int numnodes, int shift) { - int i; - int res = -1; unsigned long addr, end; + int i, res = -1; - memset(memnodemap, 0xff, memnodemapsize); + memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; @@ -63,37 +72,36 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) if ((end >> shift) >= memnodemapsize) return 0; do { - if (memnodemap[addr >> shift] != 0xff) + if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; memnodemap[addr >> shift] = i; addr += (1UL << shift); } while (addr < end); res = 1; - } + } return res; } static int __init allocate_cachealigned_memnodemap(void) { - unsigned long pad, pad_addr; + unsigned long addr; memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) + if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) return 0; - pad = L1_CACHE_BYTES - 1; - pad_addr = 0x8000; - nodemap_size = pad + memnodemapsize; - nodemap_addr = find_e820_area(pad_addr, end_pfn<> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); + node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, + SMP_CACHE_BYTES); if (node_data[nodeid] == NULL) return; nodedata_phys = __pa(node_data[nodeid]); @@ -200,75 +209,59 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; /* Find a place for the bootmem map */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + /* + * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like + * to use that to align to PAGE_SIZE + */ bootmap = early_node_mem(nodeid, bootmap_start, end, - bootmap_pages<= end) - free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + free_bootmem((unsigned long)node_data[nodeid], + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); - + Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + bootmap_start >> PAGE_SHIFT, + start_pfn, end_pfn); free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<node_mem_map = - __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, - memmapsize, SMP_CACHE_BYTES, - round_down(limit - memmapsize, PAGE_SIZE), - limit); -#endif -} +} +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ void __init numa_init_array(void) { int rr, i; - /* There are unfortunately some poorly designed mainboards around - that only connect memory to a single CPU. This breaks the 1:1 cpu->node - mapping. To avoid this fill in the mapping for all possible - CPUs, as the number of CPUs is not known yet. - We round robin the existing nodes. */ + rr = first_node(node_online_map); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] != NUMA_NO_NODE) + if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; - numa_set_node(i, rr); + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); } - } #ifdef CONFIG_NUMA_EMU @@ -276,15 +269,17 @@ void __init numa_init_array(void) char *cmdline __initdata; /* - * Setups up nid to range from addr to addr + size. If the end boundary is - * greater than max_addr, then max_addr is used instead. The return value is 0 - * if there is additional memory left for allocation past addr and -1 otherwise. - * addr is adjusted to be at the end of the node. + * Setups up nid to range from addr to addr + size. If the end + * boundary is greater than max_addr, then max_addr is used instead. + * The return value is 0 if there is additional memory left for + * allocation past addr and -1 otherwise. addr is adjusted to be at + * the end of the node. */ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, u64 size, u64 max_addr) { int ret = 0; + nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -335,6 +330,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, for (i = node_start; i < num_nodes + node_start; i++) { u64 end = *addr + size; + if (i < big) end += FAKE_NODE_MIN_SIZE; /* @@ -380,14 +376,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { struct bootnode nodes[MAX_NUMNODES]; - u64 addr = start_pfn << PAGE_SHIFT; + u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - int num_nodes = 0; - int coeff_flag; - int coeff = -1; - int num = 0; - u64 size; - int i; + int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); /* @@ -395,8 +386,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) * system RAM into N fake nodes. */ if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, - simple_strtol(cmdline, NULL, 0)); + long n = simple_strtol(cmdline, NULL, 0); + + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); if (num_nodes < 0) return num_nodes; goto out; @@ -483,46 +475,47 @@ out: for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); } acpi_fake_nodes(nodes, num_nodes); - numa_init_array(); - return 0; + numa_init_array(); + return 0; } #endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ +{ int i; nodes_clear(node_possible_map); #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<nodenumber = node; - cpu_to_node[cpu] = node; + + if(cpu_to_node_map) + cpu_to_node_map[cpu] = node; + else if(per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); } -unsigned long __init numa_free_all_bootmem(void) -{ - int i; +unsigned long __init numa_free_all_bootmem(void) +{ unsigned long pages = 0; - for_each_online_node(i) { + int i; + + for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - } + return pages; -} +} void __init paging_init(void) -{ - int i; +{ unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -568,32 +573,27 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); - for_each_online_node(i) { - setup_node_zones(i); - } - free_area_init_nodes(max_zone_pfns); -} +} static __init int numa_setup(char *opt) -{ +{ if (!opt) return -EINVAL; - if (!strncmp(opt,"off",3)) + if (!strncmp(opt, "off", 3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt,"noacpi",6)) - acpi_numa = -1; - if (!strncmp(opt,"hotadd=", 7)) + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; + if (!strncmp(opt, "hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 0; -} - +} early_param("numa", numa_setup); /* @@ -611,38 +611,16 @@ early_param("numa", numa_setup); void __init init_cpu_to_node(void) { int i; - for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid[i]; + + for (i = 0; i < NR_CPUS; i++) { + u16 apicid = x86_cpu_to_apicid_init[i]; + if (apicid == BAD_APICID) continue; if (apicid_to_node[apicid] == NUMA_NO_NODE) continue; - numa_set_node(i,apicid_to_node[apicid]); + numa_set_node(i, apicid_to_node[apicid]); } } -EXPORT_SYMBOL(cpu_to_node); -EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); -EXPORT_SYMBOL(node_data); -#ifdef CONFIG_DISCONTIGMEM -/* - * Functions to convert PFNs from/to per node page addresses. - * These are out of line because they are quite big. - * They could be all tuned by pre caching more state. - * Should do that. - */ - -int pfn_valid(unsigned long pfn) -{ - unsigned nid; - if (pfn >= num_physpages) - return 0; - nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; - return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); -} -EXPORT_SYMBOL(pfn_valid); -#endif