2 * linux/arch/x86-64/kernel/process.c
4 * Copyright (C) 1995 Linus Torvalds
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
12 * $Id: process.c,v 1.77 2004/03/22 00:37:29 ak Exp $
16 * This file handles the architecture-dependent parts of process handling..
19 #define __KERNEL_SYSCALLS__
22 #include <linux/compiler.h>
23 #include <linux/errno.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
27 #include <linux/smp.h>
28 #include <linux/smp_lock.h>
29 #include <linux/stddef.h>
30 #include <linux/unistd.h>
31 #include <linux/ptrace.h>
32 #include <linux/slab.h>
33 #include <linux/vmalloc.h>
34 #include <linux/user.h>
35 #include <linux/a.out.h>
36 #include <linux/interrupt.h>
37 #include <linux/config.h>
38 #include <linux/delay.h>
39 #include <linux/reboot.h>
40 #include <linux/init.h>
41 #include <linux/ctype.h>
42 #include <linux/slab.h>
44 #include <asm/uaccess.h>
45 #include <asm/pgtable.h>
46 #include <asm/system.h>
49 #include <asm/processor.h>
52 #include <asm/mmu_context.h>
54 #include <asm/prctl.h>
55 #include <asm/kdebug.h>
56 #include <asm/proto.h>
59 #include <linux/irq.h>
/* First RIP of a newly forked task; installed by copy_thread() below and
 * defined in assembly elsewhere. */
61 asmlinkage extern void ret_from_fork(void);
66 * Power management idle function, if any..
/* Idle hook: selected by select_idle_routine()/idle_setup() below and
 * called from the idle loop. */
68 void (*pm_idle)(void);
71 * Power off function, if any
73 void (*pm_power_off)(void);
/* NOTE(review): this extract appears to be missing lines (bodies and braces
 * are incomplete); the code below is kept byte-identical to what is here. */
75 void disable_hlt(void)
86 * We use this if we don't have any better
89 static void default_idle(void)
/* Only worth halting when no reschedule is pending. */
93 if (!current->need_resched)
101 * On SMP it's slightly faster (but much more power-consuming!)
102 * to poll the ->need_resched flag instead of waiting for the
103 * cross-CPU IPI to arrive. Use this option with caution.
105 static void poll_idle (void)
112 * Deal with another CPU just having chosen a thread to
/* NOTE(review): "¤t" below looks like mojibake for "&current"
 * (HTML entity "&curren;") -- confirm against the upstream tree. */
115 oldval = xchg(¤t->need_resched, -1);
123 : :"m" (current->need_resched));
127 * The idle thread. There's no useful work to be
128 * done, so just try to conserve power and have a
129 * low exit latency (ie sit in a loop waiting for
130 * somebody to say that they'd like to reschedule)
134 /* endless idle loop with no priority at all */
/* -100 presumably pins the idle task at the lowest scheduler priority
 * (see "no priority at all" above) -- confirm against scheduler code. */
137 current->counter = -100;
/* Snapshot pm_idle once per iteration; falls back elsewhere if unset. */
140 void (*idle)(void) = pm_idle;
143 while (!current->need_resched)
151 * This is a kind of hybrid between poll and halt idle routines. This uses new
152 * Monitor/Mwait instructions on P4 processors with PNI. We Monitor
153 * need_resched and go to optimized wait state through Mwait.
154 * Whenever someone changes need_resched, we would be woken up from Mwait
157 static void mwait_idle (void)
162 /* Setting need_resched to -1 skips sending IPI during idle resched */
/* NOTE(review): "¤t" on the next two lines looks like mojibake for
 * "&current" (HTML entity "&curren;") -- confirm against the upstream tree. */
163 oldval = xchg(¤t->need_resched, -1);
/* Arm the monitor on the need_resched word, then mwait until a writer
 * (or the check below) breaks us out. */
166 __monitor((void *)¤t->need_resched, 0, 0);
167 if (current->need_resched != -1)
170 } while (current->need_resched == -1);
/* Pick the idle routine for this CPU: prefer mwait_idle when the CPU
 * advertises MONITOR/MWAIT support. */
174 int __init select_idle_routine(struct cpuinfo_x86 *c)
176 if (cpu_has(c, X86_FEATURE_MWAIT)) {
177 printk("Monitor/Mwait feature present.\n");
179 * Take care of system with asymmetric CPUs.
180 * Use mwait_idle only if all cpus support it.
181 * If not, we fall back to default_idle()
184 pm_idle = mwait_idle;
/* "idle=" boot-parameter handler: "poll" or "halt" override pm_idle. */
192 static int __init idle_setup (char *str)
194 if (!strncmp(str, "poll", 4)) {
195 printk("using polling idle threads.\n");
197 } else if (!strncmp(str, "halt", 4)) {
198 printk("using halt in idle threads.\n");
199 pm_idle = default_idle;
205 __setup("idle=", idle_setup);
/* All-zero IDT descriptor: loading it and executing int3 forces a triple
 * fault (see machine_restart below). */
207 static struct { long x; } no_idt[3];
212 } reboot_type = BOOT_KBD;
/* 0x1234 here presumably is the BIOS warm-reset magic written to the word
 * at physical 0x472 in machine_restart -- confirm. 0 means cold reset. */
213 static int reboot_mode = 0;
215 /* reboot=b[ios] | t[riple] | k[bd] [, [w]arm | [c]old]
216 bios Use the CPU reboot vector for warm reset
217 warm Don't set the cold reboot flag
218 cold Set the cold reboot flag
219 triple Force a triple fault (init)
220 kbd Use the keyboard controller. cold reset (default)
/* "reboot=" boot-parameter handler; parses type and optional warm/cold
 * modifier after a comma. */
222 static int __init reboot_setup(char *str)
227 reboot_mode = 0x1234;
240 if((str = strchr(str,',')) != NULL)
247 __setup("reboot=", reboot_setup);
249 /* overwrites random kernel memory. Should not be kernel .text */
250 #define WARMBOOT_TRAMP 0x1000UL
/* Warm reboot path: copies a trampoline to low memory and jumps to it via
 * a far return into a compatibility-mode code segment. */
252 static void reboot_warm(void)
254 extern unsigned char warm_reboot[], warm_reboot_end[];
255 printk("warm reboot\n");
259 /* restore identity mapping */
/* Flags 7 presumably = present|writable|user on the PML4 entry -- confirm
 * against pgtable flag definitions. */
260 init_level4_pgt[0] = __pml4(__pa(level3_ident_pgt) | 7);
263 memcpy(__va(WARMBOOT_TRAMP), warm_reboot, warm_reboot_end - warm_reboot);
265 asm volatile( " pushq $0\n" /* ss */
266 " pushq $0x2000\n" /* rsp */
267 " pushfq\n" /* eflags */
271 [cs] "i" (__KERNEL_COMPAT32_CS),
272 [target] "b" (WARMBOOT_TRAMP));
/* Busy-wait (bounded at 0x10000 polls) until the keyboard controller's
 * input buffer (bit 1 of status port 0x64) is empty. */
275 static void kb_wait(void)
279 for (i=0; i<0x10000; i++)
280 if ((inb_p(0x64) & 0x02) == 0)
/* Halt secondary CPUs before a restart: first caller IPIs the others into
 * machine_restart; APs then park themselves here. */
286 static void smp_halt(void)
288 int cpuid = safe_smp_processor_id();
289 static int first_entry = 1;
293 smp_call_function((void *)machine_restart, NULL, 1, 0);
298 /* AP calling this. Just halt */
299 if (cpuid != boot_cpu_id) {
300 printk("CPU %d SMP halt\n", cpuid);
305 /* Wait for all other CPUs to have run smp_stop_cpu */
306 while (cpu_online_map)
/* Reboot the machine using the method chosen by reboot_type/reboot_mode. */
311 void machine_restart(char * __unused)
321 disable_local_APIC();
327 /* Tell the BIOS if we want cold or warm reboot */
/* Physical address 0x472 is the BIOS warm-boot flag word. */
328 *((unsigned short *)__va(0x472)) = reboot_mode;
331 /* Could also try the reset bit in the Hammer NB */
332 switch (reboot_type) {
337 /* force cold reboot to reinit all hardware*/
338 for (i=0; i<100; i++) {
/* 0xfe to the keyboard controller command port pulses the CPU reset line. */
341 outb(0xfe,0x64); /* pulse reset low */
346 /* force cold reboot to reinit all hardware*/
347 *((unsigned short *)__va(0x472)) = 0;
/* Load an empty IDT and take an exception: triple fault -> CPU reset. */
349 __asm__ __volatile__("lidt (%0)": :"r" (no_idt));
350 __asm__ __volatile__("int3");
/* If the chosen method returned, fall back to the keyboard controller. */
352 reboot_type = BOOT_KBD;
/* Architecture hooks for system halt and power-off; bodies not visible in
 * this extract. */
358 void machine_halt(void)
362 void machine_power_off(void)
368 extern int printk_address(unsigned long);
370 /* Prints also some state that isn't saved in the pt_regs */
/* Dump the full register state: pt_regs contents plus live segment
 * selectors, fs/gs base MSRs and control registers read off the CPU. */
371 void __show_regs(struct pt_regs * regs)
373 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
374 unsigned int fsindex,gsindex;
375 unsigned int ds,cs,es;
378 printk("Pid: %d, comm: %.20s %s\n", current->pid, current->comm, print_tainted());
379 printk("RIP: %04lx:", regs->cs & 0xffff);
/* printk_address resolves rip to a symbolic form. */
380 printk_address(regs->rip);
381 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
382 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
383 regs->rax, regs->rbx, regs->rcx);
384 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
385 regs->rdx, regs->rsi, regs->rdi);
386 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
387 regs->rbp, regs->r8, regs->r9);
388 printk("R10: %016lx R11: %016lx R12: %016lx\n",
389 regs->r10, regs->r11, regs->r12);
390 printk("R13: %016lx R14: %016lx R15: %016lx\n",
391 regs->r13, regs->r14, regs->r15);
/* Segment selectors are not in pt_regs (except cs/ss); read them live. */
393 asm("movl %%ds,%0" : "=r" (ds));
394 asm("movl %%cs,%0" : "=r" (cs));
395 asm("movl %%es,%0" : "=r" (es));
396 asm("movl %%fs,%0" : "=r" (fsindex));
397 asm("movl %%gs,%0" : "=r" (gsindex));
/* On x86-64 the fs/gs bases live in MSRs, not in the descriptor tables. */
399 rdmsrl(MSR_FS_BASE, fs);
400 rdmsrl(MSR_GS_BASE, gs);
401 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
403 asm("movq %%cr0, %0": "=r" (cr0));
404 asm("movq %%cr2, %0": "=r" (cr2));
405 asm("movq %%cr3, %0": "=r" (cr3));
406 asm("movq %%cr4, %0": "=r" (cr4));
408 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
409 fs,fsindex,gs,gsindex,shadowgs);
410 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
411 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
/* Wrapper around __show_regs that additionally prints a stack trace. */
414 void show_regs(struct pt_regs * regs)
/* NOTE(review): "®s" below looks like mojibake for "&regs"
 * (HTML entity "&reg;") -- confirm against the upstream tree. */
417 show_trace(®s->rsp);
421 * No need to lock the MM as we are the last user
/* Free the per-mm LDT storage, if any, and clear the pointer. */
423 void release_segments(struct mm_struct *mm)
425 void * ldt = mm->context.segments;
431 mm->context.segments = NULL;
438 * Free current thread data structures etc..
440 void exit_thread(void)
442 struct task_struct *me = current;
/* Drop the ioperm() bitmap: invalidate the TSS offset on this CPU first,
 * then free the per-thread copy. */
443 if (me->thread.io_bitmap_ptr) {
444 (init_tss + smp_processor_id())->io_map_base =
445 INVALID_IO_BITMAP_OFFSET;
446 kfree(me->thread.io_bitmap_ptr);
447 me->thread.io_bitmap_ptr = NULL;
/* Reset per-thread state on exec: clear all 8 debug-register slots and
 * discard FPU state. */
451 void flush_thread(void)
453 struct task_struct *tsk = current;
455 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
457 * Forget coprocessor state..
/* Sanity check on task teardown: a dead task should have no LDT left. */
463 void release_thread(struct task_struct *dead_task)
466 void * ldt = dead_task->mm->context.segments;
468 // temporary debugging check
470 printk("WARNING: dead process %8s still has LDT? <%p>\n",
471 dead_task->comm, ldt);
478 * we do not have to muck with descriptors here, that is
479 * done in switch_mm() as needed.
/* On fork: give the child mm its own copy of the parent's LDT, if the
 * parent has one. */
481 void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
483 struct mm_struct * old_mm;
487 old_mm = current->mm;
488 if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
490 * Completely new LDT, we initialize it from the parent:
492 ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
/* Allocation failure is logged; the visible lines don't show the bail-out
 * path -- lines appear to be missing from this extract. */
494 printk(KERN_WARNING "ldt allocation failed\n");
496 memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
498 new_mm->context.segments = ldt;
/* Force per-CPU LDT reload on next use. */
499 new_mm->context.cpuvalid = 0UL;
/* Set up the kernel state of a newly forked task: child pt_regs at the top
 * of its stack, thread fields, segment state and optional I/O bitmap copy. */
503 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
504 unsigned long unused,
505 struct task_struct * p, struct pt_regs * regs)
507 struct pt_regs * childregs;
508 struct task_struct *me = current;
/* Child's pt_regs live just below the top of its THREAD_SIZE stack. */
510 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
/* User thread: stack supplied by caller ... */
515 childregs->rsp = rsp;
/* ... kernel thread: run on the child's own kernel stack. */
517 childregs->rsp = (unsigned long)childregs;
520 p->thread.rsp = (unsigned long) childregs;
521 p->thread.rsp0 = (unsigned long) (childregs+1);
522 p->thread.userrsp = current->thread.userrsp;
/* First switch to the child resumes at ret_from_fork. */
524 p->thread.rip = (unsigned long) ret_from_fork;
/* Inherit fs/gs base values and live selector/segment registers. */
526 p->thread.fs = me->thread.fs;
527 p->thread.gs = me->thread.gs;
529 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
530 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
531 asm("mov %%es,%0" : "=m" (p->thread.es));
532 asm("mov %%ds,%0" : "=m" (p->thread.ds));
535 p->thread.i387 = current->thread.i387;
/* If the parent used ioperm(), the child needs its own bitmap copy. */
537 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
538 p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL);
539 if (!p->thread.io_bitmap_ptr)
541 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
542 (IO_BITMAP_SIZE+1)*4);
549 * This special macro can be used to load a debugging register
551 #define loaddebug(thread,register) \
552 set_debug(thread->debugreg[register], register)
555 * switch_to(x,y) should switch tasks from x to y.
557 * This could still be optimized:
558 * - fold all the options into a flag word and test it with a single test.
559 * - could test fs/gs bitsliced
/* Context switch core: swaps kernel stack pointer, segment state, fs/gs
 * bases, PDA fields, debug registers and the I/O permission bitmap. */
561 struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
563 struct thread_struct *prev = &prev_p->thread,
564 *next = &next_p->thread;
565 struct tss_struct *tss = init_tss + smp_processor_id();
568 * Reload rsp0, LDT and the page table pointer:
570 tss->rsp0 = next->rsp0;
/* Save outgoing es/ds; reload only when either side is non-zero
 * (the OR test is a cheap "both zero -> nothing to do" check). */
575 asm volatile("mov %%es,%0" : "=m" (prev->es));
576 if (unlikely(next->es | prev->es))
577 loadsegment(es, next->es);
579 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
580 if (unlikely(next->ds | prev->ds))
581 loadsegment(ds, next->ds);
584 * Must be after DS reload for AMD workaround.
593 asm volatile("movl %%fs,%0" : "=r" (fsindex));
594 /* segment register != 0 always requires a reload.
595 also reload when it has changed.
596 when prev process used 64bit base always reload
597 to avoid an information leak. */
598 if (unlikely((fsindex | next->fsindex) || prev->fs)) {
599 loadsegment(fs, next->fsindex);
600 /* check if the user use a selector != 0
601 * if yes clear 64bit base, since overloaded base
602 * is always mapped to the Null selector
607 /* when next process has a 64bit base use it */
609 wrmsrl(MSR_FS_BASE, next->fs);
610 prev->fsindex = fsindex;
/* Same dance for gs; the user gs base lives in MSR_KERNEL_GS_BASE
 * while in the kernel. */
614 asm volatile("movl %%gs,%0" : "=r" (gsindex));
615 if (unlikely((gsindex | next->gsindex) || prev->gs)) {
616 load_gs_index(next->gsindex);
621 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
622 prev->gsindex = gsindex;
626 * Switch the PDA context.
628 prev->userrsp = read_pda(oldrsp);
629 write_pda(oldrsp, next->userrsp);
630 write_pda(pcurrent, next_p);
631 write_pda(kernelstack, (unsigned long)next_p + THREAD_SIZE - PDA_STACKOFFSET);
634 * Now maybe reload the debug registers
/* dr7 non-zero means the incoming task has active hardware breakpoints. */
636 if (unlikely(next->debugreg[7])) {
648 * Handle the IO bitmap
650 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
651 if (next->io_bitmap_ptr) {
653 * 4 cachelines copy ... not good, but not that
654 * bad either. Anyone got something better?
655 * This only affects processes which use ioperm().
656 * [Putting the TSSs into 4k-tlb mapped regions
657 * and playing VM tricks to switch the IO bitmap
658 * is not really acceptable.]
660 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
661 IO_BITMAP_SIZE*sizeof(u32));
662 tss->io_map_base = IO_BITMAP_OFFSET;
665 * a bitmap offset pointing outside of the TSS limit
666 * causes a nicely controllable SIGSEGV if a process
667 * tries to use a port IO instruction. The first
668 * sys_ioperm() call sets up the bitmap properly.
670 tss->io_map_base = INVALID_IO_BITMAP_OFFSET;
679 * sys_execve() executes a new program.
/* NOTE(review): "®s" in the bodies below looks like mojibake for "&regs"
 * (HTML entity "&reg;") -- confirm against the upstream tree. */
682 long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
/* Copy the pathname in from user space; getname returns an ERR_PTR
 * on failure. */
687 filename = getname(name);
688 error = PTR_ERR(filename);
689 if (IS_ERR(filename))
691 error = do_execve(filename, argv, envp, ®s);
/* Successful exec clears the single-step-on-exec ptrace flag. */
693 current->ptrace &= ~PT_DTRACE;
698 void set_personality_64bit(void)
700 /* inherit personality from parent */
702 /* Make sure to be in 64bit mode */
703 current->thread.flags = 0;
/* fork: child reuses the parent's current user stack pointer. */
706 asmlinkage long sys_fork(struct pt_regs regs)
708 return do_fork(SIGCHLD, regs.rsp, ®s, 0);
/* clone: caller supplies flags and (optionally) a new stack pointer. */
711 asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, struct pt_regs regs)
715 return do_fork(clone_flags, newsp, ®s, 0);
719 * This is trivial, and on the face of it looks like it
720 * could equally well be done in user mode.
722 * Not so, for quite unobvious reasons - register pressure.
723 * In user mode vfork() cannot have a stack frame, and if
724 * done by calling the "clone()" system call directly, you
725 * do not have enough call-clobbered registers to hold all
726 * the information you need.
728 asmlinkage long sys_vfork(struct pt_regs regs)
730 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, ®s, 0);
734 * These bracket the sleeping functions..
/* Linker-placed markers: a saved rip inside [first_sched, last_sched) means
 * the task is blocked inside a scheduling function. */
736 extern void scheduling_functions_start_here(void);
737 extern void scheduling_functions_end_here(void);
738 #define first_sched ((unsigned long) scheduling_functions_start_here)
739 #define last_sched ((unsigned long) scheduling_functions_end_here)
/* Find where a sleeping task is waiting by walking its saved frame
 * pointers; walk is bounded (16 frames) and range-checked against the
 * task's own stack. */
741 unsigned long get_wchan(struct task_struct *p)
746 if (!p || p == current || p->state==TASK_RUNNING)
748 if (p->thread.rsp < (u64)p || p->thread.rsp > (u64)p + THREAD_SIZE)
750 fp = *(u64 *)(p->thread.rsp);
752 if (fp < (unsigned long)p || fp > (unsigned long)p+THREAD_SIZE)
/* Return address sits just above the saved frame pointer. */
754 rip = *(u64 *)(fp+8);
755 if (rip < first_sched || rip >= last_sched)
758 } while (count++ < 16);
/* arch_prctl(2): set or read the 64-bit fs/gs base of the current task.
 * NOTE(review): this definition appears to continue past the end of the
 * visible extract; only comments added here. */
764 asmlinkage long sys_arch_prctl(int code, unsigned long addr)
/* SET_GS path: reject kernel addresses, zero the selector so the MSR base
 * takes effect, then record and program the new base. */
771 if (addr >= TASK_SIZE)
773 asm volatile("movl %0,%%gs" :: "r" (0));
774 current->thread.gsindex = 0;
775 current->thread.gs = addr;
/* While in the kernel the user gs base lives in MSR_KERNEL_GS_BASE. */
776 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
779 /* Not strictly needed for fs, but do it for symmetry
/* SET_FS path: same sequence using the fs selector and MSR_FS_BASE. */
781 if (addr >= TASK_SIZE)
783 asm volatile("movl %0,%%fs" :: "r" (0));
784 current->thread.fsindex = 0;
785 current->thread.fs = addr;
786 ret = checking_wrmsrl(MSR_FS_BASE, addr);
789 /* Returned value may not be correct when the user changed fs/gs */
/* GET paths: read the live MSR and copy it out to the user pointer. */
791 rdmsrl(MSR_FS_BASE, tmp);
792 ret = put_user(tmp, (unsigned long *)addr);
796 rdmsrl(MSR_KERNEL_GS_BASE, tmp);
797 ret = put_user(tmp, (unsigned long *)addr);