 *  linux/arch/i386/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * This file handles the architecture-dependent parts of process handling..
#define __KERNEL_SYSCALLS__

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/smpboot.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>

#include <linux/irq.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

 * Power management idle function, if any..
void (*pm_idle)(void);

 * Power off function, if any
void (*pm_power_off)(void);

void disable_hlt(void)

 * We use this if we don't have any better
void default_idle(void)
	if (current_cpu_data.hlt_works_ok && !hlt_counter) {
		if (!current->need_resched)
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
static void poll_idle (void)
	 * Deal with another CPU just having chosen a thread to
	oldval = xchg(&current->need_resched, -1);
			: :"m" (current->need_resched));
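	/*
	 * Writing -1 into need_resched lets this CPU notice a remote CPU's
	 * write of 1 (done when the scheduler picks a thread for us) simply
	 * by spinning until the value changes, rather than waiting for the
	 * reschedule IPI to arrive.
	 */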
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
	/* endless idle loop with no priority at all */
	current->counter = -100;
		void (*idle)(void) = pm_idle;
		while (!current->need_resched)
static int __init idle_setup (char *str)
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");

__setup("idle=", idle_setup);
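/*
 * Example: booting with "idle=poll" selects the polling idle loop above in
 * place of the default hlt-based one.
 */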
static int reboot_mode;
int reboot_thru_bios;
static int reboot_cpu = -1;
/* shamelessly grabbed from lib/vsprintf.c for readability */
#define is_digit(c)	((c) >= '0' && (c) <= '9')

static int __init reboot_setup(char *str)
		case 'w': /* "warm" reboot (no memory testing etc) */
			reboot_mode = 0x1234;
		case 'c': /* "cold" reboot (with memory testing etc) */
		case 'b': /* "bios" reboot by jumping through the BIOS */
			reboot_thru_bios = 1;
		case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
			reboot_thru_bios = 0;
		case 's': /* "smp" reboot by executing reset on BSP or other CPU */
			if (is_digit(*(str+1))) {
				reboot_cpu = (int) (*(str+1) - '0');
				if (is_digit(*(str+2)))
					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
			}
			/* we will leave sorting out the final value
			   when we are ready to reboot, since we might not
			   have set up boot_cpu_id or smp_num_cpu */
		if((str = strchr(str,',')) != NULL)

__setup("reboot=", reboot_setup);
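/*
 * Example command lines: "reboot=warm" or "reboot=cold" choose the flag
 * written to 0x472 below, "reboot=bios" or "reboot=hard" choose between the
 * BIOS jump and the hard reset, and options can be combined with commas,
 * e.g. "reboot=warm,s1" to issue the reset from CPU 1 on SMP.
 */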
/* The following code and data reboots the machine by switching to real
   mode and jumping to the BIOS reset entry point, as if the CPU has
   really been reset.  The previous version asked the keyboard
   controller to pulse the CPU reset line, which is more thorough, but
   doesn't work with at least one type of 486 motherboard.  It is easy
   to stop this code working; hence the copious comments. */

static unsigned long long
real_mode_gdt_entries [3] =
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
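/*
 * Each quadword above is an ordinary GDT descriptor: limit 15:0, base 15:0,
 * base 23:16, the access byte (0x9a = present readable code, 0x92 = present
 * writable data), then limit 19:16 plus the flags nibble (zero here, i.e.
 * byte granularity and a 16-bit default operand size), and finally
 * base 31:24.  That is how 0x000092000100ffff decodes to a 64k data segment
 * based at 0x100.
 */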
	unsigned short       size __attribute__ ((packed));
	unsigned long long * base __attribute__ ((packed));
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 },
/* This is 16-bit protected mode code to disable paging and the cache,
   switch to real mode and jump to the BIOS reset code.

   The instruction that switches to real mode by writing to CR0 must be
   followed immediately by a far jump instruction, which sets CS to a
   valid value for real mode, and flushes the prefetch queue to avoid
   running instructions that have already been decoded in protected
   mode.

   Clears all the flags except ET, especially PG (paging), PE
   (protected-mode enable) and TS (task switch for coprocessor state
   save).  Flushes the TLB after paging has been disabled.  Sets CD and
   NW, to disable the cache on a 486, and invalidates the cache.  This
   is more like the state of a 486 after reset.  I don't know if
   something else should be done for other chips.

   More could be done here to set up the registers as if a CPU reset had
   occurred; hopefully real BIOSs don't assume much. */
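/* For reference: in CR0 bit 0 is PE, bit 3 is TS, bit 4 is ET, bit 29 is NW,
   bit 30 is CD and bit 31 is PG.  So the "andl $0x00000011" below keeps only
   PE and ET, the "orl $0x60000000" sets CD and NW, and the final
   "andb $0x10,al" clears PE as well, which is what actually drops the CPU
   back into real mode. */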
static unsigned char real_mode_switch [] =
	0x66, 0x0f, 0x20, 0xc0,			/* movl %cr0,%eax */
	0x66, 0x83, 0xe0, 0x11,			/* andl $0x00000011,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,	/* orl $0x60000000,%eax */
	0x66, 0x0f, 0x22, 0xc0,			/* movl %eax,%cr0 */
	0x66, 0x0f, 0x22, 0xd8,			/* movl %eax,%cr3 */
	0x66, 0x0f, 0x20, 0xc3,			/* movl %cr0,%ebx */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,	/* andl $0x60000000,%ebx */
	0x74, 0x02,				/* jz f */
	0x0f, 0x09,				/* wbinvd */
	0x24, 0x10,				/* f: andb $0x10,al */
	0x66, 0x0f, 0x22, 0xc0			/* movl %eax,%cr0 */

static unsigned char jump_to_bios [] =
	0xea, 0x00, 0x00, 0xff, 0xff	/* ljmp $0xffff,$0x0000 */
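/* Wait until the keyboard controller is ready to accept another byte: bit 1
   of the status port (0x64) is the input-buffer-full flag, and the bounded
   loop below simply polls it rather than waiting forever. */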
static inline void kb_wait(void)
	for (i=0; i<0x10000; i++)
		if ((inb_p(0x64) & 0x02) == 0)
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
void machine_real_restart(unsigned char *code, int length)
	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) */

	spin_lock_irqsave(&rtc_lock, flags);
	CMOS_WRITE(0x00, 0x8f);
	spin_unlock_irqrestore(&rtc_lock, flags);
	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET. */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/* Make sure the first page is mapped to the start of physical memory.
	   It is normally not mapped, to trap kernel NULL pointer dereferences. */

	pg0[0] = _PAGE_RW | _PAGE_PRESENT;

	 * Use `swapper_pg_dir' as our page directory.
	load_cr3(swapper_pg_dir);
	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
	   this on booting to tell it to "Bypass memory test (also warm
	   boot)".  This seems like a fairly standard thing that gets set by
	   REBOOT.COM programs, and the previous reset routine did this too. */

	*((unsigned short *)0x472) = reboot_mode;
	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
		real_mode_switch, sizeof (real_mode_switch));
	memcpy ((void *) (0x1000 - 100), code, length);

	/* Set up the IDT for real mode. */

	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));
	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%eax,%%ds\n"
				"\tmovl %%eax,%%es\n"
				"\tmovl %%eax,%%fs\n"
				"\tmovl %%eax,%%gs\n"
				"\tmovl %%eax,%%ss" : : : "eax");
	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   code. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
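	/* Selector 0x0008 is entry 1 of real_mode_gdt above, the 16-bit code
	   segment based at 0, and the offset operand is the physical address
	   to which real_mode_switch was just copied. */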
void machine_restart(char * __unused)
	cpuid = GET_APIC_ID(apic_read(APIC_ID));

		/* check to see if reboot_cpu is valid
		   if it's not, default to the BSP */
		if ((reboot_cpu == -1) ||
		    (reboot_cpu > (NR_CPUS - 1)) ||
		    !(phys_cpu_present_map & apicid_to_phys_cpu_present(cpuid)))
			reboot_cpu = boot_cpu_physical_apicid;

		reboot_smp = 0;  /* use this as a flag to only go through this once */
		/* re-run this function on the other CPUs
		   it will fall through this section since we have
		   cleared reboot_smp, and do the reboot if it is the
		   correct CPU, otherwise it halts. */
		if (reboot_cpu != cpuid)
			smp_call_function((void *)machine_restart, NULL, 1, 0);

	/* if reboot_cpu is still -1, then we want a traditional reboot,
	   and if we are not running on the reboot_cpu, halt */
	if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
		__asm__ __volatile__ ("hlt");
	 * Stop all CPUs and turn off local APICs and the IO-APIC, so
	 * other OSs see a clean IRQ state.
#elif CONFIG_X86_LOCAL_APIC
		disable_local_APIC();
#ifdef CONFIG_X86_IO_APIC

	if(!reboot_thru_bios) {
		/* rebooting needs to touch the page at absolute addr 0 */
		*((unsigned short *)__va(0x472)) = reboot_mode;
		for (i=0; i<100; i++) {
			outb(0xfe,0x64);	/* pulse reset low */
		/* That didn't work - force a triple fault.. */
		__asm__ __volatile__("lidt %0": :"m" (no_idt));
		__asm__ __volatile__("int3");

	machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
void machine_halt(void)

void machine_power_off(void)

extern void show_trace(unsigned long* esp);

void show_regs(struct pt_regs * regs)
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;

	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
	printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id());
	printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
	printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted());
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
		regs->eax,regs->ebx,regs->ecx,regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
		regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x\n",
		0xffff & regs->xds,0xffff & regs->xes);

	__asm__("movl %%cr0, %0": "=r" (cr0));
	__asm__("movl %%cr2, %0": "=r" (cr2));
	__asm__("movl %%cr3, %0": "=r" (cr3));
	/* This could fault if %cr4 does not exist */
	__asm__("1: movl %%cr4, %0		\n"
		".section __ex_table,\"a\"	\n"
		: "=r" (cr4): "0" (0));
	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
	show_trace(&regs->esp);
 * Create a kernel thread
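/*
 * The inline asm below issues the clone system call directly through
 * int $0x80 with %ebx = flags | CLONE_VM.  Both parent and child return
 * here; the parent still runs on the stack whose pointer was saved in
 * %esi, so the "cmpl %esp,%esi" / "je" pair takes the jump, while the
 * child (on its own, freshly created stack) falls through, calls fn(arg)
 * and then invokes exit().
 */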
int arch_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
	__asm__ __volatile__(
		"movl %%esp,%%esi\n\t"
		"int $0x80\n\t"		/* Linux/i386 system call */
		"cmpl %%esp,%%esi\n\t"	/* child or parent? */
		"je 1f\n\t"		/* parent - jump */
		/* Load the argument into eax, and push it. That way, it does
		 * not matter whether the called function is compiled with
		 * -mregparm or not. */
		"call *%5\n\t"		/* call fn */
		"movl %3,%0\n\t"	/* exit */
		:"=&a" (retval), "=&S" (d0)
		:"0" (__NR_clone), "i" (__NR_exit),
		 "b" (flags | CLONE_VM)
 * Free current thread data structures etc..
void exit_thread(void)
	/* nothing to do ... */

void flush_thread(void)
	struct task_struct *tsk = current;

	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
	 * Forget coprocessor state..

void release_thread(struct task_struct *dead_task)
		// temporary debugging check
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);

	release_x86_irqs(dead_task);
#define savesegment(seg,value) \
	asm volatile("mov %%" #seg ",%0":"=m" (value))

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
	struct pt_regs * childregs;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
	struct_cpy(childregs, regs);
	childregs->esp = esp;

	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);

	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	struct_cpy(&p->thread.i387, &current->thread.i387);
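	/*
	 * The child's pt_regs copy sits at the top of its THREAD_SIZE
	 * task/stack area: thread.esp points at that frame, thread.esp0 just
	 * past it, and thread.eip is set to ret_from_fork so that the first
	 * time the scheduler switches to the new task it "returns" through
	 * the normal fork return path with the register state set up above.
	 */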
 * fill in the user structure for a core dump..
void dump_thread(struct pt_regs * regs, struct user * dump)
/* changed the size calculations - should hopefully work better. lbt */
	dump->magic = CMAGIC;
	dump->start_code = 0;
	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
	dump->u_dsize -= dump->u_tsize;

	for (i = 0; i < 8; i++)
		dump->u_debugreg[i] = current->thread.debugreg[i];

	if (dump->start_stack < TASK_SIZE)
		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

	dump->regs.ebx = regs->ebx;
	dump->regs.ecx = regs->ecx;
	dump->regs.edx = regs->edx;
	dump->regs.esi = regs->esi;
	dump->regs.edi = regs->edi;
	dump->regs.ebp = regs->ebp;
	dump->regs.eax = regs->eax;
	dump->regs.ds = regs->xds;
	dump->regs.es = regs->xes;
	savesegment(fs,dump->regs.fs);
	savesegment(gs,dump->regs.gs);
	dump->regs.orig_eax = regs->orig_eax;
	dump->regs.eip = regs->eip;
	dump->regs.cs = regs->xcs;
	dump->regs.eflags = regs->eflags;
	dump->regs.esp = regs->esp;
	dump->regs.ss = regs->xss;

	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
 * This special macro can be used to load a debugging register
#define loaddebug(thread,register) \
		__asm__("movl %0,%%db" #register  \
			:"r" (thread->debugreg[register]))

 * switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	struct tss_struct *tss = init_tss + smp_processor_id();

	 * Reload esp0, LDT and the page table pointer:
	tss->esp0 = next->esp0;

	 * Save away %fs and %gs. No need to save %es and %ds, as
	 * those are always kernel segments while inside the kernel.
	asm volatile("mov %%fs,%0":"=m" (prev->fs));
	asm volatile("mov %%gs,%0":"=m" (prev->gs));

	 * Restore %fs and %gs.
	loadsegment(fs, next->fs);
	loadsegment(gs, next->gs);

	 * Now maybe reload the debug registers
	if (next->debugreg[7]) {

	if (prev->ioperm || next->ioperm) {
		 * 4 cachelines copy ... not good, but not that
		 * bad either. Anyone got something better?
		 * This only affects processes which use ioperm().
		 * [Putting the TSSs into 4k-tlb mapped regions
		 * and playing VM tricks to switch the IO bitmap
		 * is not really acceptable.]
		memcpy(tss->io_bitmap, next->io_bitmap,
		tss->bitmap = IO_BITMAP_OFFSET;
		 * a bitmap offset pointing outside of the TSS limit
		 * causes a nicely controllable SIGSEGV if a process
		 * tries to use a port IO instruction. The first
		 * sys_ioperm() call sets up the bitmap properly.
		tss->bitmap = INVALID_IO_BITMAP_OFFSET;
asmlinkage int sys_fork(struct pt_regs regs)
	return do_fork(SIGCHLD, regs.esp, &regs, 0);

asmlinkage int sys_clone(struct pt_regs regs)
	unsigned long clone_flags;

	clone_flags = regs.ebx;
	return do_fork(clone_flags, newsp, &regs, 0);

 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
asmlinkage int sys_vfork(struct pt_regs regs)
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);

 * sys_execve() executes a new program.
asmlinkage int sys_execve(struct pt_regs regs)
	filename = getname((char *) regs.ebx);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
	error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, &regs);
		current->ptrace &= ~PT_DTRACE;
 * These bracket the sleeping functions..
extern void scheduling_functions_start_here(void);
extern void scheduling_functions_end_here(void);
#define first_sched	((unsigned long) scheduling_functions_start_here)
#define last_sched	((unsigned long) scheduling_functions_end_here)

unsigned long get_wchan(struct task_struct *p)
	unsigned long ebp, esp, eip;
	unsigned long stack_page;

	if (!p || p == current || p->state == TASK_RUNNING)
	stack_page = (unsigned long)p;
	if (!stack_page || esp < stack_page || esp > 8188+stack_page)
	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
	ebp = *(unsigned long *) esp;
		if (ebp < stack_page || ebp > 8184+stack_page)
		eip = *(unsigned long *) (ebp+4);
		if (eip < first_sched || eip >= last_sched)
		ebp = *(unsigned long *) ebp;
	} while (count++ < 16);
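	/*
	 * The loop above walks the sleeping task's saved frame-pointer chain:
	 * each frame stores the caller's %ebp at (%ebp) and its return
	 * address at 4(%ebp).  The first return address found outside the
	 * scheduling functions (first_sched..last_sched) is reported as the
	 * wait channel; the 16-iteration cap just bounds a corrupt chain.
	 */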