/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002  Andi Kleen <ak@suse.de>
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/blk.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/pda.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>

mmu_gather_t mmu_gathers[NR_CPUS];

static unsigned long totalram_pages;

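/*
 * Trim the per-CPU page-table quicklists tracked in the PDA: once the
 * cache grows past 'high', free cached pgd/pmd/pte pages until it drops
 * back to 'low'.  Returns the number of pages freed.
 */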
int do_check_pgt_cache(int low, int high)
{
        int freed = 0;
        if (read_pda(pgtable_cache_sz) > high) {
                do {
                        if (read_pda(pgd_quick)) {
                                pgd_free_slow(pgd_alloc_one_fast());
                                freed++;
                        }
                        if (read_pda(pmd_quick)) {
                                pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
                                freed++;
                        }
                        if (read_pda(pte_quick)) {
                                pte_free_slow(pte_alloc_one_fast(NULL, 0));
                                freed++;
                        }
                } while (read_pda(pgtable_cache_sz) > low);
        }
        return freed;
}

#ifndef CONFIG_DISCONTIGMEM
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical memory, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

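/* Dump a summary of memory state: free areas, swap, and per-page counters. */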
void show_mem(void)
{
        int i, total = 0, reserved = 0;
        int shared = 0, cached = 0;

        printk("Mem-info:\n");
        show_free_areas();
        printk("Free swap:       %6dkB\n", nr_swap_pages << (PAGE_SHIFT-10));
        i = max_mapnr;
        while (i-- > 0) {
                total++;
                if (PageReserved(mem_map+i))
                        reserved++;
                else if (PageSwapCache(mem_map+i))
                        cached++;
                else if (page_count(mem_map+i))
                        shared += page_count(mem_map+i) - 1;
        }
        printk("%d pages of RAM\n", total);
        printk("%d reserved pages\n", reserved);
        printk("%d pages shared\n", shared);
        printk("%d pages swap cached\n", cached);
        printk("%ld pages in page table cache\n", read_pda(pgtable_cache_sz));
        show_buffers();
}
#endif

/* References to section boundaries */

extern char _text, _etext, _edata, __bss_start, _end;
extern char __init_begin, __init_end;

int after_bootmem;

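/*
 * Allocate a page for building kernel page tables: taken from the bootmem
 * allocator during early boot, from the normal page allocator once
 * mem_init() has set after_bootmem.
 */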
static void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_free_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_low_pages(PAGE_SIZE);
        if (!ptr)
                panic("set_pte_phys: cannot allocate page data %s\n",
                      after_bootmem ? "after bootmem" : "");
        return ptr;
}

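/*
 * Install a single kernel mapping of 'phys' at 'vaddr', allocating any
 * missing intermediate page-table levels on the way.  Used by
 * __set_fixmap() below.
 */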
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pml4_t *level4;
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;

        level4 = pml4_offset_k(vaddr);
        if (pml4_none(*level4)) {
                printk("PML4 FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pgd = level3_offset_k(level4, vaddr);
        if (pgd_none(*pgd)) {
                pmd = (pmd_t *) spp_getpage();
                set_pgd(pgd, __pgd(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pgd, 0)) {
                        printk("PAGETABLE BUG #01!\n");
                        return;
                }
        }
        pmd = pmd_offset(pgd, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        pte = pte_offset(pmd, vaddr);
        set_pte(pte, mk_pte_phys(phys, prot));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

extern pmd_t temp_boot_pmds[];

unsigned long __initdata table_start, table_end;

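/*
 * Temporary mappings used to reach newly allocated page-table pages while
 * the direct mapping of physical memory is still being built.  The pmd
 * slots come from temp_boot_pmds, reserved at early boot.
 */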
static struct temp_map {
        pmd_t *pmd;
        void  *address;
        int    allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};

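/*
 * Take the next free page frame after table_start, map it through one of
 * the temporary pmd slots and return its virtual address.  *index records
 * which temp mapping was used (for unmap_low_page()) and *phys receives
 * the physical address of the page.
 */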
static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (table_end >= end_pfn_map)
                panic("alloc_low_page: ran out of page mappings");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        }
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys  = pfn * PAGE_SIZE;
        return adr;
}

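/* Undo a temporary mapping set up by alloc_low_page(). */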
static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}

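/*
 * Fill one pgd page with the direct mapping for the physical range
 * [address, end), using 2MB (PSE) pmd entries.  PGD-sized chunks that are
 * not covered by any e820 entry are left unmapped.
 */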
static void __init phys_pgd_init(pgd_t *pgd, unsigned long address, unsigned long end)
{
        long i, j;

        i = pgd_index(address);
        pgd = pgd + i;
        for (; i < PTRS_PER_PGD; pgd++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PML4_MASK) + i*PGDIR_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PGD; i++, pgd++)
                                set_pgd(pgd, __pgd(0));
                        break;
                }

                if (!e820_mapped(paddr, paddr+PGDIR_SIZE, 0)) {
                        set_pgd(pgd, __pgd(0));
                        continue;
                }

                pmd = alloc_low_page(&map, &pmd_phys);
                set_pgd(pgd, __pgd(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_PSE | _KERNPG_TABLE | _PAGE_NX | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}

/* Set up the direct mapping of physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and takes pages directly from
   physical memory; to access them they are temporarily mapped. */
void __init init_memory_mapping(void)
{
        unsigned long adr;
        unsigned long end;
        unsigned long next;
        unsigned long pgds, pmds, tables;

        end = end_pfn_map << PAGE_SHIFT;

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables on the local node of the
         * memory being mapped.  Unfortunately this is currently done before
         * the nodes are discovered.
         */

        pgds = (end + PGDIR_SIZE - 1) >> PGDIR_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(pgds * 8, PAGE_SIZE) + round_up(pmds * 8, PAGE_SIZE);

        /* The direct mapping tables must currently fit below the kernel in
           the first MB, because we have no way to tell the later passes not
           to reuse the memory until bootmem is initialised. */
        /* Should limit MAXMEM for this */
        table_start = find_e820_area(/*0*/ 0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;

        end += __PAGE_OFFSET; /* turn virtual */

        for (adr = PAGE_OFFSET; adr < end; adr = next) {
                int map;
                unsigned long pgd_phys;
                pgd_t *pgd = alloc_low_page(&map, &pgd_phys);
                next = adr + PML4_SIZE;
                if (next > end)
                        next = end;

                phys_pgd_init(pgd, adr-PAGE_OFFSET, next-PAGE_OFFSET);
                set_pml4(init_level4_pgt + pml4_index(adr),
                         mk_kernel_pml4(pgd_phys, KERNPG_TABLE));
                unmap_low_page(map);
        }
        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
               table_start << PAGE_SHIFT,
               table_end << PAGE_SHIFT);
}

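/*
 * Clear the low identity mapping (pml4 slot 0) in every CPU's boot page
 * table once it is no longer needed for bringing up the CPUs.
 */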
void __init zap_low_mappings(void)
{
        int i;
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_pda[i].level4_pgt)
                        cpu_pda[i].level4_pgt[0] = 0;
        }

        flush_tlb_all();
}

#ifndef CONFIG_DISCONTIGMEM
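/* Size the DMA and normal zones from end_pfn and hand them to the allocator. */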
void __init paging_init(void)
{
        unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
        unsigned int max_dma;

        max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
        if (end_pfn < max_dma)
                zones_size[ZONE_DMA] = end_pfn;
        else {
                zones_size[ZONE_DMA] = max_dma;
                zones_size[ZONE_NORMAL] = end_pfn - max_dma;
        }
        free_area_init(zones_size);
}

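/* Return 1 if the page frame is covered by an E820_RAM region. */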
static inline int page_is_ram(unsigned long pagenr)
{
        int i;

        for (i = 0; i < e820.nr_map; i++) {
                unsigned long addr, end;

                if (e820.map[i].type != E820_RAM)       /* not usable memory */
                        continue;
                /*
                 *      !!!FIXME!!! Some BIOSen report areas as RAM that
                 *      are not. Notably the 640KB-1MB area. We need a sanity
                 *      check here.
                 */
                addr = (e820.map[i].addr + PAGE_SIZE - 1) >> PAGE_SHIFT;
                end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
                if ((pagenr >= addr) && (pagenr < end))
                        return 1;
        }
        return 0;
}
#endif

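/*
 * Final memory setup: hand the bootmem pages to the page allocator, count
 * the reserved pages and print the memory summary.
 */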
void __init mem_init(void)
{
        unsigned long codesize, reservedpages, datasize, initsize;
        unsigned long tmp;

        max_mapnr = end_pfn;
        num_physpages = end_pfn; /* XXX not true because of holes */
        high_memory = (void *) __va(end_pfn << PAGE_SHIFT);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_DISCONTIGMEM
        totalram_pages += numa_free_all_bootmem();
        tmp = 0;
        /* should count reserved pages here for all nodes */
#else
        if (!mem_map)
                BUG();

        totalram_pages += free_all_bootmem();

        for (tmp = 0; tmp < end_pfn; tmp++)
                /*
                 * Only count reserved RAM pages
                 */
                if (page_is_ram(tmp) && PageReserved(mem_map+tmp))
                        reservedpages++;
#endif

        after_bootmem = 1;

        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                max_mapnr << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

        /*
         * Subtle. SMP is doing its boot stuff late (because it has to
         * fork idle threads) - but it also needs low mappings for the
         * protected-mode entry to work. We zap these entries only after
         * the WP-bit has been tested.
         */
#ifndef CONFIG_SMP
        zap_low_mappings();
#endif
}

/* Unmap a kernel mapping if it exists. This is useful to avoid CPU
   prefetches leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pmd_t *pmd;
                if (!pgd || pgd_none(*pgd))
                        continue;
                pmd = pmd_offset(pgd, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

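/* Free the memory occupied by the kernel's __init sections. */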
void free_initmem(void)
{
        void *addr;

        addr = (&__init_begin);
        for (; addr < (void *)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
#ifdef CONFIG_INIT_DEBUG
                memset((void *)((unsigned long)addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
#endif
                free_page((unsigned long)addr);
                totalram_pages++;
        }
        printk("Freeing unused kernel memory: %luk freed\n",
               (&__init_end - &__init_begin) >> 10);
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif

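/* Report memory totals for the sysinfo() interface. */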
void si_meminfo(struct sysinfo *val)
{
        val->totalram = totalram_pages;
        val->sharedram = 0;
        val->freeram = nr_free_pages();
        val->bufferram = atomic_read(&buffermem_pages);
        val->totalhigh = 0;
        val->freehigh = nr_free_highpages();
        val->mem_unit = PAGE_SIZE;
        return;
}

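/*
 * Wrappers around the bootmem reserve/free interfaces that pick the right
 * node when CONFIG_DISCONTIGMEM is enabled.
 */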
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_DISCONTIGMEM
        reserve_bootmem_node(NODE_DATA(phys_to_nid(phys)), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
}

void free_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_DISCONTIGMEM
        free_bootmem_node(NODE_DATA(phys_to_nid(phys)), phys, len);
#else
        free_bootmem(phys, len);
#endif
}