/*
 * linux/arch/x86-64/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
8 #include <linux/signal.h>
9 #include <linux/sched.h>
10 #include <linux/kernel.h>
11 #include <linux/errno.h>
12 #include <linux/string.h>
13 #include <linux/types.h>
14 #include <linux/ptrace.h>
15 #include <linux/mman.h>
17 #include <linux/smp.h>
18 #include <linux/smp_lock.h>
19 #include <linux/interrupt.h>
20 #include <linux/init.h>
21 #include <linux/tty.h>
22 #include <linux/vt_kern.h> /* For unblank_screen() */
23 #include <linux/compiler.h>
25 #include <asm/system.h>
26 #include <asm/uaccess.h>
27 #include <asm/pgalloc.h>
28 #include <asm/hardirq.h>
30 #include <asm/proto.h>
31 #include <asm/kdebug.h>
33 extern spinlock_t console_lock, timerlist_lock;
35 void bust_spinlocks(int yes)
37 spin_lock_init(&timerlist_lock);
41 global_irq_lock = 0; /* Many serial drivers do __global_cli() */
44 int loglevel_save = console_loglevel;
50 * OK, the message is on the console. Now we call printk()
51 * without oops_in_progress set so that printk will give klogd
52 * a poke. Hold onto your hats...
54 console_loglevel = 15; /* NMI oopser may have shut the console up */
56 console_loglevel = loglevel_save;
60 static int bad_address(void *p)
63 return __get_user(dummy, (unsigned long *)p);
66 void dump_pagetable(unsigned long address)
69 asm("movq %%cr3,%0" : "=r" (pml4));
71 pml4 = __va((unsigned long)pml4 & PHYSICAL_PAGE_MASK);
72 pml4 += pml4_index(address);
73 printk("PML4 %lx ", pml4_val(*pml4));
74 if (bad_address(pml4)) goto bad;
75 if (!pml4_present(*pml4)) goto ret;
77 pgd_t *pgd = __pgd_offset_k((pgd_t *)pml4_page(*pml4), address);
78 if (bad_address(pgd)) goto bad;
79 printk("PGD %lx ", pgd_val(*pgd));
80 if (!pgd_present(*pgd)) goto ret;
82 pmd_t *pmd = pmd_offset(pgd, address);
83 if (bad_address(pmd)) goto bad;
84 printk("PMD %lx ", pmd_val(*pmd));
85 if (!pmd_present(*pmd)) goto ret;
87 pte_t *pte = pte_offset(pmd, address);
88 if (bad_address(pte)) goto bad;
89 printk("PTE %lx", pte_val(*pte));
97 /* Sometimes the CPU reports invalid exceptions on prefetch.
98 Check that here and ignore.
99 Opcode checker based on code by Richard Brunner */
100 static int is_prefetch(struct pt_regs *regs, unsigned long addr)
102 unsigned char *instr = (unsigned char *)(regs->rip);
105 unsigned char *max_instr = instr + 15;
107 /* Avoid recursive faults for this common case */
108 if (regs->rip == addr)
111 if (regs->cs & (1<<2))
114 while (scan_more && instr < max_instr) {
115 unsigned char opcode;
116 unsigned char instr_hi;
117 unsigned char instr_lo;
119 if (__get_user(opcode, instr))
122 instr_hi = opcode & 0xf0;
123 instr_lo = opcode & 0x0f;
129 /* Values 0x26,0x2E,0x36,0x3E are valid x86
130 prefixes. In long mode, the CPU will signal
131 invalid opcode if some of these prefixes are
132 present so we will never get here anyway */
133 scan_more = ((instr_lo & 7) == 0x6);
137 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
138 Need to figure out under what instruction mode the
139 instruction was issued ... */
140 /* Could check the LDT for lm, but for now it's good
141 enough to assume that long mode only uses well known
142 segments or kernel. */
143 scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
147 /* 0x64 thru 0x67 are valid prefixes in all modes. */
148 scan_more = (instr_lo & 0xC) == 0x4;
151 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
152 scan_more = !instr_lo || (instr_lo>>1) == 1;
155 /* Prefetch instruction is 0x0F0D or 0x0F18 */
157 if (__get_user(opcode, instr))
159 prefetch = (instr_lo == 0xF) &&
160 (opcode == 0x0D || opcode == 0x18);
170 printk("%s: prefetch caused page fault at %lx/%lx\n", current->comm,
176 int page_fault_trace;
177 int exception_trace = 1;
/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means fault was an instruction fetch
 */
190 asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
192 struct task_struct *tsk;
193 struct mm_struct *mm;
194 struct vm_area_struct * vma;
195 unsigned long address;
200 /* get the address */
201 __asm__("movq %%cr2,%0":"=r" (address));
203 if (regs->eflags & X86_EFLAGS_IF)
206 #ifdef CONFIG_CHECKING
207 if (page_fault_trace)
208 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
209 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
214 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
215 rdmsrl(MSR_GS_BASE, gs);
216 if (gs != (unsigned long)pda) {
217 wrmsrl(MSR_GS_BASE, pda);
218 printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
225 info.si_code = SEGV_MAPERR;
227 /* 5 => page not present and from supervisor mode */
228 if (unlikely(!(error_code & 5) &&
229 ((address >= VMALLOC_START && address <= VMALLOC_END) ||
230 (address >= MODULES_VADDR && address <= MODULES_END))))
234 * If we're in an interrupt or have no user
235 * context, we must not take the fault..
237 if (in_interrupt() || !mm)
238 goto bad_area_nosemaphore;
241 * Work around K8 errata #100. See the K8 specification update for
242 * details. Any code segment in LDT is compatibility mode.
244 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
249 down_read(&mm->mmap_sem);
251 vma = find_vma(mm, address);
254 if (vma->vm_start <= address)
256 if (!(vma->vm_flags & VM_GROWSDOWN))
258 if (error_code & 4) {
259 // XXX: align red zone size with ABI
260 if (address + 128 < regs->rsp)
263 if (expand_stack(vma, address))
266 * Ok, we have a good vm_area for this memory access, so
270 info.si_code = SEGV_ACCERR;
272 switch (error_code & 3) {
273 default: /* 3: write, present */
275 case 2: /* write, not present */
276 if (!(vma->vm_flags & VM_WRITE))
280 case 1: /* read, present */
282 case 0: /* read, not present */
283 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
288 * If for any reason at all we couldn't handle the fault,
289 * make sure we exit gracefully rather than endlessly redo
292 switch (handle_mm_fault(mm, vma, address, write)) {
305 up_read(&mm->mmap_sem);
309 * Something tried to access memory that isn't in our memory map..
310 * Fix it, but check if it's kernel or user first..
313 up_read(&mm->mmap_sem);
315 bad_area_nosemaphore:
316 /* User mode accesses just cause a SIGSEGV */
317 if (error_code & 4) {
318 if (is_prefetch(regs, address))
321 if (exception_trace && !(tsk->ptrace & PT_PTRACED) &&
322 (tsk->sig->action[SIGSEGV-1].sa.sa_handler == SIG_IGN ||
323 (tsk->sig->action[SIGSEGV-1].sa.sa_handler == SIG_DFL)))
325 "%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
326 tsk->comm, tsk->pid, address, regs->rip,
327 regs->rsp, error_code);
329 tsk->thread.cr2 = address;
330 /* Kernel addresses are always protection faults */
331 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
332 tsk->thread.trap_no = 14;
333 info.si_signo = SIGSEGV;
335 /* info.si_code has been set above */
336 info.si_addr = (void *)address;
337 force_sig_info(SIGSEGV, &info, tsk);
343 /* Are we prepared to handle this kernel fault? */
344 if ((fixup = search_exception_table(regs->rip)) != 0) {
346 if (0 && exception_trace)
348 "%s: fixed kernel exception at %lx address %lx err:%ld\n",
349 tsk->comm, regs->rip, address, error_code);
353 if (is_prefetch(regs, address))
357 * Oops. The kernel tried to access some bad page. We'll have to
358 * terminate things with extreme prejudice.
363 if (address < PAGE_SIZE)
364 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
366 printk(KERN_ALERT "Unable to handle kernel paging request");
367 printk(KERN_ALERT " at %016lx RIP: ", address);
368 printk_address(regs->rip);
369 dump_pagetable(address);
370 __die("Oops", regs, error_code);
371 /* Executive summary in case the oops scrolled away */
372 printk(KERN_EMERG "CR2: %016lx\n", address);
377 * We ran out of memory, or some other thing happened to us that made
378 * us unable to handle the page fault gracefully.
381 up_read(&mm->mmap_sem);
382 if (current->pid == 1) {
383 tsk->policy |= SCHED_YIELD;
387 printk("VM: killing process %s\n", tsk->comm);
393 up_read(&mm->mmap_sem);
395 /* Kernel mode? Handle exceptions or die */
396 if (!(error_code & 4))
399 if (is_prefetch(regs, address))
402 tsk->thread.cr2 = address;
403 tsk->thread.error_code = error_code;
404 tsk->thread.trap_no = 14;
405 info.si_signo = SIGBUS;
407 info.si_code = BUS_ADRERR;
408 info.si_addr = (void *)address;
409 force_sig_info(SIGBUS, &info, tsk);
420 * x86-64 has the same kernel 3rd level pages for all CPUs.
421 * But for vmalloc/modules the TLB synchronization works lazily,
422 * so it can happen that we get a page fault for something
423 * that is really already in the page table. Just check if it
424 * is really there and when yes flush the local TLB.
427 printk("vmalloc fault %lx index %lu\n",address,pml4_index(address));
428 dump_pagetable(address);
431 pgd = pgd_offset_k(address);
432 if (pgd != current_pgd_offset_k(address))
433 goto bad_area_nosemaphore;
434 if (!pgd_present(*pgd))
435 goto bad_area_nosemaphore;
436 pmd = pmd_offset(pgd, address);
437 if (!pmd_present(*pmd))
438 goto bad_area_nosemaphore;
439 pte = pte_offset(pmd, address);
440 if (!pte_present(*pte))
441 goto bad_area_nosemaphore;