/*
 *  linux/arch/i386/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * This file handles the architecture-dependent parts of process handling..
 */
#define __KERNEL_SYSCALLS__

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif

#include <linux/irq.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);

/*
 * Power off function, if any
 */
void (*pm_power_off)(void);

void disable_hlt(void)
/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
	if (current_cpu_data.hlt_works_ok && !hlt_counter) {
		if (!current->need_resched)

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = xchg(&current->need_resched, -1);
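	/* Writing -1 marks this CPU as a polling idler, so the scheduler
	   can skip sending it a cross-CPU reschedule IPI. */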
		: :"m" (current->need_resched));

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
	/* endless idle loop with no priority at all */
	current->counter = -100;
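	/* A permanently negative counter keeps the idle task from ever
	   winning the scheduler's goodness comparison against real work. */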
		void (*idle)(void) = pm_idle;
		while (!current->need_resched)

static int __init idle_setup (char *str)
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");

__setup("idle=", idle_setup);
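/*
 * An all-zero IDT descriptor: once it is loaded, any interrupt or
 * exception finds no valid handler, so the CPU triple-faults and
 * resets (see the "force a triple fault" path in machine_restart()).
 */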
static long no_idt[2];
static int reboot_mode;
int reboot_thru_bios;

static int reboot_cpu = -1;

/* shamelessly grabbed from lib/vsprintf.c for readability */
#define is_digit(c)	((c) >= '0' && (c) <= '9')

static int __init reboot_setup(char *str)
		case 'w': /* "warm" reboot (no memory testing etc) */
			reboot_mode = 0x1234;
		case 'c': /* "cold" reboot (with memory testing etc) */
		case 'b': /* "bios" reboot by jumping through the BIOS */
			reboot_thru_bios = 1;
		case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
			reboot_thru_bios = 0;
		case 's': /* "smp" reboot by executing reset on BSP or other CPU */
			if (is_digit(*(str+1))) {
				reboot_cpu = (int) (*(str+1) - '0');
				if (is_digit(*(str+2)))
					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
			/* we will leave sorting out the final value
			   when we are ready to reboot, since we might not
			   have set up boot_cpu_id or smp_num_cpu */
		if ((str = strchr(str,',')) != NULL)

__setup("reboot=", reboot_setup);
/* The following code and data reboots the machine by switching to real
   mode and jumping to the BIOS reset entry point, as if the CPU has
   really been reset.  The previous version asked the keyboard
   controller to pulse the CPU reset line, which is more thorough, but
   doesn't work with at least one type of 486 motherboard.  It is easy
   to stop this code working; hence the copious comments. */

static unsigned long long
real_mode_gdt_entries [3] =
{
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
};
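/*
 * A pseudo-descriptor in the format expected by lgdt/lidt:
 * a 16-bit limit followed by a 32-bit linear base address.
 */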
static struct
{
	unsigned short       size __attribute__ ((packed));
	unsigned long long * base __attribute__ ((packed));
}
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 };

/* This is 16-bit protected mode code to disable paging and the cache,
   switch to real mode and jump to the BIOS reset code.

   The instruction that switches to real mode by writing to CR0 must be
   followed immediately by a far jump instruction, which sets CS to a
   valid value for real mode, and flushes the prefetch queue to avoid
   running instructions that have already been decoded in protected
   mode.

   Clears all the flags except ET, especially PG (paging), PE
   (protected-mode enable) and TS (task switch for coprocessor state
   save).  Flushes the TLB after paging has been disabled.  Sets CD and
   NW, to disable the cache on a 486, and invalidates the cache.  This
   is more like the state of a 486 after reset.  I don't know if
   something else should be done for other chips.

   More could be done here to set up the registers as if a CPU reset had
   occurred; hopefully real BIOSs don't assume much. */

static unsigned char real_mode_switch [] =
{
	0x66, 0x0f, 0x20, 0xc0,			/* movl  %cr0,%eax        */
	0x66, 0x83, 0xe0, 0x11,			/* andl  $0x00000011,%eax */
	0x66, 0x0d, 0x00, 0x00, 0x00, 0x60,	/* orl   $0x60000000,%eax */
	0x66, 0x0f, 0x22, 0xc0,			/* movl  %eax,%cr0        */
	0x66, 0x0f, 0x22, 0xd8,			/* movl  %eax,%cr3        */
	0x66, 0x0f, 0x20, 0xc3,			/* movl  %cr0,%ebx        */
	0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60,	/* andl  $0x60000000,%ebx */
	0x74, 0x02,				/* jz    f                */
	0x0f, 0x08,				/* invd                   */
	0x24, 0x10,				/* f: andb $0x10,al       */
	0x66, 0x0f, 0x22, 0xc0			/* movl  %eax,%cr0        */
};

static unsigned char jump_to_bios [] =
{
	0xea, 0x00, 0x00, 0xff, 0xff		/* ljmp $0xffff,$0x0000   */
};
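/*
 * Wait for the keyboard controller's input buffer to drain
 * (bit 1 of status port 0x64), so it is ready to accept a command.
 */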
static inline void kb_wait(void)
	for (i=0; i<0x10000; i++)
		if ((inb_p(0x64) & 0x02) == 0)

/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
 */
void machine_real_restart(unsigned char *code, int length)
	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side.  (Yes, CMOS_WRITE does outb_p's. - Paul G.)
	 */
	spin_lock_irqsave(&rtc_lock, flags);
	CMOS_WRITE(0x00, 0x8f);
	spin_unlock_irqrestore(&rtc_lock, flags);

	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET. */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/* Make sure the first page is mapped to the start of physical memory.
	   It is normally not mapped, to trap kernel NULL pointer dereferences. */

	pg0[0] = _PAGE_RW | _PAGE_PRESENT;

	/*
	 * Use `swapper_pg_dir' as our page directory.
	 */
	load_cr3(swapper_pg_dir);
	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
	   this on booting to tell it to "Bypass memory test (also warm
	   boot)".  This seems like a fairly standard thing that gets set by
	   REBOOT.COM programs, and the previous reset routine did this
	   too. */

	*((unsigned short *)0x472) = reboot_mode;
	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
		real_mode_switch, sizeof (real_mode_switch));
	memcpy ((void *) (0x1000 - 100), code, length);

	/* Set up the IDT for real mode. */

	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));

	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%eax,%%ds\n"
				"\tmovl %%eax,%%es\n"
				"\tmovl %%eax,%%fs\n"
				"\tmovl %%eax,%%gs\n"
				"\tmovl %%eax,%%ss" : : : "eax");
	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   entry point. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				:
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
void machine_restart(char * __unused)

	cpuid = GET_APIC_ID(apic_read(APIC_ID));

		/* check to see if reboot_cpu is valid;
		   if it's not, default to the BSP */
		if ((reboot_cpu == -1) ||
		    (reboot_cpu > (NR_CPUS - 1)) ||
		    !(phys_cpu_present_map & (1<<cpuid)))
			reboot_cpu = boot_cpu_physical_apicid;

		reboot_smp = 0;  /* use this as a flag to only go through this once */
		/* re-run this function on the other CPUs;
		   it will fall through this section since we have
		   cleared reboot_smp, and do the reboot if it is the
		   correct CPU, otherwise it halts. */
		if (reboot_cpu != cpuid)
			smp_call_function((void *)machine_restart, NULL, 1, 0);

	/* if reboot_cpu is still -1, then we want a traditional reboot,
	   and if we are not running on the reboot_cpu, halt */
	if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
		__asm__ __volatile__ ("hlt");

	/*
	 * Stop all CPUs and turn off local APICs and the IO-APIC, so
	 * other OSs see a clean IRQ state.
	 */

	if (!reboot_thru_bios) {
		/* rebooting needs to touch the page at absolute addr 0 */
		*((unsigned short *)__va(0x472)) = reboot_mode;
		for (i=0; i<100; i++) {
				outb(0xfe,0x64); /* pulse reset low */

		/* That didn't work - force a triple fault.. */
		__asm__ __volatile__("lidt %0": :"m" (no_idt));
		__asm__ __volatile__("int3");

	machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
void machine_halt(void)

void machine_power_off(void)

extern void show_trace(unsigned long* esp);

void show_regs(struct pt_regs * regs)
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;

	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
	printk("EIP: %04x:[<%08lx>] CPU: %d", 0xffff & regs->xcs, regs->eip, smp_processor_id());
	printk(" ESP: %04x:%08lx", 0xffff & regs->xss, regs->esp);
	printk(" EFLAGS: %08lx    %s\n", regs->eflags, print_tainted());
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
		regs->eax, regs->ebx, regs->ecx, regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
		regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x\n",
		0xffff & regs->xds, 0xffff & regs->xes);

	__asm__("movl %%cr0, %0": "=r" (cr0));
	__asm__("movl %%cr2, %0": "=r" (cr2));
	__asm__("movl %%cr3, %0": "=r" (cr3));
	/* This could fault if %cr4 does not exist */
	__asm__("1: movl %%cr4, %0		\n"
		"2:				\n"
		".section __ex_table,\"a\"	\n"
		".long 1b,2b			\n"
		".previous			\n"
		: "=r" (cr4): "0" (0));
	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
	show_trace(&regs->esp);
/*
 * No need to lock the MM as we are the last user
 */
void release_segments(struct mm_struct *mm)
	void * ldt = mm->context.segments;
		mm->context.segments = NULL;

/*
 * Create a kernel thread
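 *
 * This clones the current task (sharing its VM) via int $0x80; the
 * child calls fn(arg) and then exits, while the parent gets back
 * clone()'s return value (the new thread's pid, or a negative error).
 */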
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
	__asm__ __volatile__(
		"movl %%esp,%%esi\n\t"
		"int $0x80\n\t"		/* Linux/i386 system call */
		"cmpl %%esp,%%esi\n\t"	/* child or parent? */
		"je 1f\n\t"		/* parent - jump */
		/* Load the argument into eax, and push it.  That way, it does
		 * not matter whether the called function is compiled with
		 * -mregparm or not.  */
		"call *%5\n\t"		/* call fn */
		"movl %3,%0\n\t"	/* exit */
		:"=&a" (retval), "=&S" (d0)
		:"0" (__NR_clone), "i" (__NR_exit),
		 "b" (flags | CLONE_VM)
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
	/* nothing to do ... */

void flush_thread(void)
	struct task_struct *tsk = current;
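	/* Forget any hardware breakpoints the previous program image set up. */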
	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
	/*
	 * Forget coprocessor state..
	 */

void release_thread(struct task_struct *dead_task)
		void * ldt = dead_task->mm->context.segments;

		// temporary debugging check
			printk("WARNING: dead process %8s still has LDT? <%p>\n",
					dead_task->comm, ldt);

	release_x86_irqs(dead_task);

/*
 * we do not have to muck with descriptors here, that is
 * done in switch_mm() as needed.
 */
void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
	struct mm_struct * old_mm;

	old_mm = current->mm;
	if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
		/*
		 * Completely new LDT, we initialize it from the parent:
		 */
		ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
			printk(KERN_WARNING "ldt allocation failed\n");
		memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
		new_mm->context.segments = ldt;
		new_mm->context.cpuvalid = ~0UL;	/* valid on all CPU's - they can't have stale data */
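/*
 * Save the live value of a segment register into a variable,
 * going through memory so no general-purpose register is clobbered.
 */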
#define savesegment(seg,value) \
	asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
	struct pt_regs * childregs;
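	/* The child's register frame sits at the very top of its kernel stack. */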
	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
	struct_cpy(childregs, regs);
	childregs->esp = esp;

	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);
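	/* esp0 is the kernel stack pointer the CPU loads from the TSS when
	   this task re-enters the kernel from user mode. */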
	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	struct_cpy(&p->thread.i387, &current->thread.i387);

/*
 * fill in the user structure for a core dump..
 */
void dump_thread(struct pt_regs * regs, struct user * dump)

/* changed the size calculations - should hopefully work better. lbt */
	dump->magic = CMAGIC;
	dump->start_code = 0;
	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
	dump->u_dsize -= dump->u_tsize;

	for (i = 0; i < 8; i++)
		dump->u_debugreg[i] = current->thread.debugreg[i];

	if (dump->start_stack < TASK_SIZE)
		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;

	dump->regs.ebx = regs->ebx;
	dump->regs.ecx = regs->ecx;
	dump->regs.edx = regs->edx;
	dump->regs.esi = regs->esi;
	dump->regs.edi = regs->edi;
	dump->regs.ebp = regs->ebp;
	dump->regs.eax = regs->eax;
	dump->regs.ds = regs->xds;
	dump->regs.es = regs->xes;
	savesegment(fs,dump->regs.fs);
	savesegment(gs,dump->regs.gs);
	dump->regs.orig_eax = regs->orig_eax;
	dump->regs.eip = regs->eip;
	dump->regs.cs = regs->xcs;
	dump->regs.eflags = regs->eflags;
	dump->regs.esp = regs->esp;
	dump->regs.ss = regs->xss;

	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,register) \
		__asm__("movl %0,%%db" #register  \
			: /* no output */ \
			:"r" (thread->debugreg[register]))
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 */
void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	struct tss_struct *tss = init_tss + smp_processor_id();
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->esp0 = next->esp0;

	/*
	 * Save away %fs and %gs. No need to save %es and %ds, as
	 * those are always kernel segments while inside the kernel.
	 */
	asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
	asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));

	/*
	 * Restore %fs and %gs.
	 */
	loadsegment(fs, next->fs);
	loadsegment(gs, next->gs);

	/*
	 * Now maybe reload the debug registers
	 */
	if (next->debugreg[7]) {

	if (prev->ioperm || next->ioperm) {
		/*
		 * 4 cachelines copy ... not good, but not that
		 * bad either. Anyone got something better?
		 * This only affects processes which use ioperm().
		 * [Putting the TSSs into 4k-tlb mapped regions
		 * and playing VM tricks to switch the IO bitmap
		 * is not really acceptable.]
		 */
		memcpy(tss->io_bitmap, next->io_bitmap,
			IO_BITMAP_SIZE*sizeof(unsigned long));
		tss->bitmap = IO_BITMAP_OFFSET;

		/*
		 * a bitmap offset pointing outside of the TSS limit
		 * causes a nicely controllable SIGSEGV if a process
		 * tries to use a port IO instruction. The first
		 * sys_ioperm() call sets up the bitmap properly.
		 */
		tss->bitmap = INVALID_IO_BITMAP_OFFSET;
asmlinkage int sys_fork(struct pt_regs regs)
	return do_fork(SIGCHLD, regs.esp, &regs, 0);

asmlinkage int sys_clone(struct pt_regs regs)
	unsigned long clone_flags;

	clone_flags = regs.ebx;
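	/* i386 clone ABI: ebx carries the flags, ecx the new stack
	   pointer (0 means keep using the parent's stack). */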
	return do_fork(clone_flags, newsp, &regs, 0);

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);

/*
 * sys_execve() executes a new program.
 */
asmlinkage int sys_execve(struct pt_regs regs)
	filename = getname((char *) regs.ebx);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
	error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, &regs);
		current->ptrace &= ~PT_DTRACE;

/*
 * These bracket the sleeping functions..
 */
extern void scheduling_functions_start_here(void);
extern void scheduling_functions_end_here(void);
#define first_sched	((unsigned long) scheduling_functions_start_here)
#define last_sched	((unsigned long) scheduling_functions_end_here)
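/*
 * get_wchan() walks the sleeping task's saved frame pointers and
 * returns the first return address found outside the scheduler,
 * i.e. the place where the task went to sleep.
 */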
unsigned long get_wchan(struct task_struct *p)
	unsigned long ebp, esp, eip;
	unsigned long stack_page;

	if (!p || p == current || p->state == TASK_RUNNING)
	stack_page = (unsigned long)p;
	if (!stack_page || esp < stack_page || esp > 8188+stack_page)
	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
	ebp = *(unsigned long *) esp;
		if (ebp < stack_page || ebp > 8184+stack_page)
		eip = *(unsigned long *) (ebp+4);
		if (eip < first_sched || eip >= last_sched)
		ebp = *(unsigned long *) ebp;
	} while (count++ < 16);