/*
 *  linux/kernel/timer.c
 *
 *  Kernel internal timers, kernel timekeeping, basic process system calls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/timex.h>
#include <linux/delay.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>

#include <asm/uaccess.h>
/*
 * Timekeeping variables
 */
long tick = (1000000 + HZ/2) / HZ;	/* timer interrupt period */

/* The current time */
struct timeval xtime __attribute__ ((aligned (16)));

/* Don't completely fail for HZ > 500. */
int tickadj = 500/HZ ? : 1;		/* microsecs */

DECLARE_TASK_QUEUE(tq_timer);
DECLARE_TASK_QUEUE(tq_immediate);
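/*
 * Illustrative usage of the timer task queue (a sketch, not part of the
 * original file; the "my_" names are made up).  A driver queues work to
 * be run from the TQUEUE_BH bottom half, which do_timer() below marks
 * whenever tq_timer is non-empty:
 *
 *	static void my_routine(void *data)
 *	{
 *		... deferred work, runs in bottom-half context ...
 *	}
 *
 *	static struct tq_struct my_task = {
 *		routine:	my_routine,
 *		data:		NULL,
 *	};
 *
 *	queue_task(&my_task, &tq_timer);
 */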
/*
 * phase-lock loop variables
 */
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK;		/* clock synchronization status	*/
int time_status = STA_UNSYNC;		/* clock status bits		*/
long time_offset;			/* time adjustment (us)		*/
long time_constant = 2;			/* pll time constant		*/
long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
long time_precision = 1;		/* clock precision (us)		*/
long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
long time_phase;			/* phase offset (scaled us)	*/
long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
					/* frequency offset (scaled ppm) */
long time_adj;				/* tick adjust (scaled 1 / HZ)	*/
long time_reftime;			/* time at last adjustment (s)	*/
long time_adjust;
long time_adjust_step;
extern int do_setitimer(int, struct itimerval *, struct itimerval *);

unsigned long volatile jiffies;

unsigned int * prof_buffer;
unsigned long prof_len;
unsigned long prof_shift;
/*
 * Event timer code
 */
#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
struct timer_vec {
	int index;
	struct list_head vec[TVN_SIZE];
};

struct timer_vec_root {
	int index;
	struct list_head vec[TVR_SIZE];
};

static struct timer_vec tv5;
static struct timer_vec tv4;
static struct timer_vec tv3;
static struct timer_vec tv2;
static struct timer_vec_root tv1;
static struct timer_vec * const tvecs[] = {
	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
};

static struct list_head * run_timer_list_running;

#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
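/*
 * A worked example of the wheel geometry (added for clarity, assuming
 * the TVR_BITS == 8, TVN_BITS == 6 values above): tv1 is the root
 * vector with 256 one-jiffy buckets, so it covers expiries up to 256
 * jiffies away.  Each outer vector multiplies the reach by 64:
 *
 *	tv1: idx <  2^8          (256 jiffies, ~2.5 s at HZ=100)
 *	tv2: idx <  2^14         (~2.7 min)
 *	tv3: idx <  2^20         (~2.9 h)
 *	tv4: idx <  2^26         (~7.8 days)
 *	tv5: idx <= 0xffffffff   (everything else that fits in 32 bits)
 *
 * A timer in an outer vector is never expired directly; it is cascaded
 * one level down each time the lower vector wraps around.
 */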
void init_timervecs (void)
{
	int i;

	for (i = 0; i < TVN_SIZE; i++) {
		INIT_LIST_HEAD(tv5.vec + i);
		INIT_LIST_HEAD(tv4.vec + i);
		INIT_LIST_HEAD(tv3.vec + i);
		INIT_LIST_HEAD(tv2.vec + i);
	}
	for (i = 0; i < TVR_SIZE; i++)
		INIT_LIST_HEAD(tv1.vec + i);
}
static unsigned long timer_jiffies;
static inline void internal_add_timer(struct timer_list *timer)
{
	/*
	 * must be cli-ed when calling this
	 */
	unsigned long expires = timer->expires;
	unsigned long idx = expires - timer_jiffies;
	struct list_head * vec;

	if (run_timer_list_running)
		vec = run_timer_list_running;
	else if (idx < TVR_SIZE) {
		int i = expires & TVR_MASK;
		vec = tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		int i = (expires >> TVR_BITS) & TVN_MASK;
		vec = tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec = tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/* can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		vec = tv1.vec + tv1.index;
	} else if (idx <= 0xffffffffUL) {
		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = tv5.vec + i;
	} else {
		/* Can only get here on architectures with 64-bit jiffies */
		INIT_LIST_HEAD(&timer->list);
		return;
	}
	/*
	 * Timers are FIFO!
	 */
	list_add(&timer->list, vec->prev);
}
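/*
 * A worked example of the bucket arithmetic above (added for clarity):
 * with timer_jiffies == 1000 and timer->expires == 1100, idx is 100,
 * which is < TVR_SIZE (256), so the timer lands in the root vector at
 * slot 1100 & 255 == 76.  With expires == 10000, idx is 9000, which
 * falls in the tv2 range, at slot (10000 >> 8) & 63 == 39.
 */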
/* Initialize both explicitly - let's try to have them in the same cache line */
spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
#ifdef CONFIG_SMP
volatile struct timer_list * volatile running_timer;
#define timer_enter(t) do { running_timer = t; mb(); } while (0)
#define timer_exit() do { running_timer = NULL; } while (0)
#define timer_is_running(t) (running_timer == t)
#define timer_synchronize(t) while (timer_is_running(t)) barrier()
#else
#define timer_enter(t)		do { } while (0)
#define timer_exit()		do { } while (0)
#endif
void add_timer(struct timer_list *timer)
{
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	if (timer_pending(timer))
		goto bug;
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return;
bug:
	spin_unlock_irqrestore(&timerlist_lock, flags);
	printk("bug: kernel timer added twice at %p.\n",
	       __builtin_return_address(0));
}
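/*
 * Illustrative usage (a sketch, not part of the original file; the
 * "my_" names are made up).  A one-shot timer that fires in one second:
 *
 *	static void my_timeout(unsigned long data)
 *	{
 *		printk("timed out, data=%lu\n", data);
 *	}
 *
 *	static struct timer_list my_timer;
 *
 *	init_timer(&my_timer);
 *	my_timer.function = my_timeout;
 *	my_timer.data = 0;
 *	my_timer.expires = jiffies + HZ;
 *	add_timer(&my_timer);
 */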
static inline int detach_timer (struct timer_list *timer)
{
	if (!timer_pending(timer))
		return 0;
	list_del(&timer->list);
	return 1;
}

int mod_timer(struct timer_list *timer, unsigned long expires)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	timer->expires = expires;
	ret = detach_timer(timer);
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
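/*
 * Illustrative usage (a sketch; "my_timer" as above): mod_timer() is the
 * usual way to kick a watchdog-style timer, and is equivalent to
 * del_timer() followed by updating expires and calling add_timer(), but
 * done atomically under the timer lock:
 *
 *	mod_timer(&my_timer, jiffies + 30 * HZ);
 *
 * The return value says whether a still-pending timer was re-armed (1)
 * or a new/already-expired one was started (0).
 */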
int del_timer(struct timer_list * timer)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	ret = detach_timer(timer);
	timer->list.next = timer->list.prev = NULL;
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
#ifdef CONFIG_SMP
void sync_timers(void)
{
	spin_unlock_wait(&global_bh_lock);
}

/*
 * SMP specific function to delete a periodic timer.  The caller must by
 * some means prevent the timer from being restarted; upon exit the timer
 * is not queued and the handler is not running on any CPU.  Returns the
 * number of times the timer was deleted (for reference counting).
 */
int del_timer_sync(struct timer_list * timer)
{
	int ret = 0;

	for (;;) {
		unsigned long flags;
		int running;

		spin_lock_irqsave(&timerlist_lock, flags);
		ret += detach_timer(timer);
		timer->list.next = timer->list.prev = NULL;
		running = timer_is_running(timer);
		spin_unlock_irqrestore(&timerlist_lock, flags);
		if (!running)
			break;
		timer_synchronize(timer);
	}
	return ret;
}
#endif
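/*
 * Illustrative usage (a sketch; "my_timer" as above).  On module unload,
 * a self-rearming timer must be torn down with del_timer_sync() rather
 * than plain del_timer(), so that its handler cannot still be running on
 * another CPU when the module text is freed:
 *
 *	my_timer_stop_rearming();	-- made-up helper: prevents re-add
 *	del_timer_sync(&my_timer);
 */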
static inline void cascade_timers(struct timer_vec *tv)
{
	/* cascade all the timers from tv up one level */
	struct list_head *head, *curr, *next;

	head = tv->vec + tv->index;
	curr = head->next;
	/*
	 * We are removing _all_ timers from the list, so we don't have to
	 * detach them individually, just clear the list afterwards.
	 */
	while (curr != head) {
		struct timer_list *tmp;

		tmp = list_entry(curr, struct timer_list, list);
		next = curr->next;
		list_del(curr); /* not strictly needed, list is cleared below */
		internal_add_timer(tmp);
		curr = next;
	}
	INIT_LIST_HEAD(head);
	tv->index = (tv->index + 1) & TVN_MASK;
}
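/*
 * A worked example of the cascade cadence (added for clarity): tv1.index
 * wraps every TVR_SIZE == 256 ticks, and that is when tv2's current
 * bucket is spilled down into tv1.  tv2.index in turn wraps every 64 of
 * those cascades, i.e. every 2^14 ticks, which triggers a tv3 cascade,
 * and so on.  A timer is therefore re-hashed at most four times before
 * it finally expires out of tv1.
 */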
static inline void run_timer_list(void)
{
	spin_lock_irq(&timerlist_lock);
	while ((long)(jiffies - timer_jiffies) >= 0) {
		LIST_HEAD(queued);
		struct list_head *head, *curr;
		if (!tv1.index) {
			int n = 1;
			do {
				cascade_timers(tvecs[n]);
			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
		}
		run_timer_list_running = &queued;
repeat:
		head = tv1.vec + tv1.index;
		curr = head->next;
		if (curr != head) {
			struct timer_list *timer;
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_entry(curr, struct timer_list, list);
			fn = timer->function;
			data = timer->data;

			detach_timer(timer);
			timer->list.next = timer->list.prev = NULL;
			timer_enter(timer);
			spin_unlock_irq(&timerlist_lock);
			fn(data);
			spin_lock_irq(&timerlist_lock);
			timer_exit();
			goto repeat;
		}
		run_timer_list_running = NULL;
		++timer_jiffies;
		tv1.index = (tv1.index + 1) & TVR_MASK;

		curr = queued.next;
		while (curr != &queued) {
			struct timer_list *timer;

			timer = list_entry(curr, struct timer_list, list);
			curr = curr->next;
			internal_add_timer(timer);
		}
	}
	spin_unlock_irq(&timerlist_lock);
}
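/*
 * Illustrative sketch (not part of the original file): a handler that
 * re-arms its own timer, the case the "queued" list above exists for.
 * While run_timer_list_running is set, internal_add_timer() parks any
 * newly added timer on that local list instead of hashing it, so a
 * timer re-armed with expires == jiffies cannot land in the bucket
 * currently being drained; it is re-hashed only after timer_jiffies
 * has been advanced:
 *
 *	static void my_periodic(unsigned long data)
 *	{
 *		my_do_work(data);		-- made-up helper
 *		mod_timer(&my_timer, jiffies + HZ);
 *	}
 */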
spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;

void tqueue_bh(void)
{
	run_task_queue(&tq_timer);
}

void immediate_bh(void)
{
	run_task_queue(&tq_immediate);
}
/*
 * this routine handles the overflow of the microsecond field
 *
 * The tricky bits of code to handle the accurate clock support
 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 * They were originally developed for SUN and DEC kernels.
 * All the kudos should go to Dave for this stuff.
 */
static void second_overflow(void)
{
	long ltemp;

	/* Bump the maxerror field */
	time_maxerror += time_tolerance >> SHIFT_USEC;
	if (time_maxerror > NTP_PHASE_LIMIT) {
		time_maxerror = NTP_PHASE_LIMIT;
		time_status |= STA_UNSYNC;
	}
	/*
	 * Leap second processing. If in leap-insert state at
	 * the end of the day, the system clock is set back one
	 * second; if in leap-delete state, the system clock is
	 * set ahead one second. The microtime() routine or
	 * external clock driver will insure that reported time
	 * is always monotonic. The ugly divides should be
	 * replaced.
	 */
	switch (time_state) {
	case TIME_OK:
		if (time_status & STA_INS)
			time_state = TIME_INS;
		else if (time_status & STA_DEL)
			time_state = TIME_DEL;
		break;
	case TIME_INS:
		if (xtime.tv_sec % 86400 == 0) {
			xtime.tv_sec--;
			time_state = TIME_OOP;
			printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
		}
		break;
	case TIME_DEL:
		if ((xtime.tv_sec + 1) % 86400 == 0) {
			xtime.tv_sec++;
			time_state = TIME_WAIT;
			printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
		}
		break;
	case TIME_OOP:
		time_state = TIME_WAIT;
		break;
	case TIME_WAIT:
		if (!(time_status & (STA_INS | STA_DEL)))
			time_state = TIME_OK;
	}
	/*
	 * Compute the phase adjustment for the next second. In
	 * PLL mode, the offset is reduced by a fixed factor
	 * times the time constant. In FLL mode the offset is
	 * used directly. In either mode, the maximum phase
	 * adjustment for each second is clamped so as to spread
	 * the adjustment over not more than the number of
	 * seconds between updates.
	 */
	if (time_offset < 0) {
		ltemp = -time_offset;
		if (!(time_status & STA_FLL))
			ltemp >>= SHIFT_KG + time_constant;
		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
		time_offset += ltemp;
		time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	} else {
		ltemp = time_offset;
		if (!(time_status & STA_FLL))
			ltemp >>= SHIFT_KG + time_constant;
		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
		time_offset -= ltemp;
		time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	}
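/*
 * Worked example (added for clarity; assumes the usual timex.h values
 * MAXPHASE = 512000 us, MINSEC = 16 s, SHIFT_KG = 6): the per-second
 * phase slew is clamped to MAXPHASE/MINSEC = 32000 us, i.e. the clock
 * is never skewed by more than 3.2% while amortizing a large offset.
 * With the default time_constant = 2, PLL mode applies offset >> 8, so
 * a 256 us offset is corrected at about 1 us per second.
 */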
	/*
	 * Compute the frequency estimate and additional phase
	 * adjustment due to frequency error for the next
	 * second. When the PPS signal is engaged, gnaw on the
	 * watchdog counter and update the frequency computed by
	 * the pll and the PPS signal.
	 */
	pps_valid++;
	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
		pps_jitter = MAXTIME;
		pps_stabil = MAXFREQ;
		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
				 STA_PPSWANDER | STA_PPSERROR);
	}
	ltemp = time_freq + pps_freq;
	if (ltemp < 0)
		time_adj -= -ltemp >>
			(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
	else
		time_adj += ltemp >>
			(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
#if HZ == 100
	/*
	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).
	 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
	 */
	if (time_adj < 0)
		time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
	else
		time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
}
/* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void)
{
	if ((time_adjust_step = time_adjust) != 0) {
		/* We are doing an adjtime thing.
		 *
		 * Prepare time_adjust_step to be within bounds.
		 * Note that a positive time_adjust means we want the clock
		 * to run faster.
		 *
		 * Limit the amount of the step to be in the range
		 * -tickadj .. +tickadj
		 */
		if (time_adjust > tickadj)
			time_adjust_step = tickadj;
		else if (time_adjust < -tickadj)
			time_adjust_step = -tickadj;

		/* Reduce by this step the amount of time left */
		time_adjust -= time_adjust_step;
	}
	xtime.tv_usec += tick + time_adjust_step;
	/*
	 * Advance the phase, once it gets to one microsecond, then
	 * advance the tick more.
	 */
	time_phase += time_adj;
	if (time_phase <= -FINEUSEC) {
		long ltemp = -time_phase >> SHIFT_SCALE;
		time_phase += ltemp << SHIFT_SCALE;
		xtime.tv_usec -= ltemp;
	}
	else if (time_phase >= FINEUSEC) {
		long ltemp = time_phase >> SHIFT_SCALE;
		time_phase -= ltemp << SHIFT_SCALE;
		xtime.tv_usec += ltemp;
	}
}
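/*
 * Worked example (added for clarity): after adjtime() requests +5000 us,
 * time_adjust is 5000.  At HZ=100, tick is 10000 us and tickadj is 5, so
 * each tick adds at most 5 extra microseconds: the clock gains 500 us
 * per second, and the full 5000 us adjustment is amortized over 10
 * seconds rather than being applied as one visible step.
 */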
/*
 * Using a loop looks inefficient, but "ticks" is
 * usually just one (we shouldn't be losing ticks,
 * we're doing this this way mainly for interrupt
 * latency reasons, not because we think we'll
 * have lots of lost timer ticks)
 */
static void update_wall_time(unsigned long ticks)
{
	do {
		ticks--;
		update_wall_time_one_tick();
	} while (ticks);

	if (xtime.tv_usec >= 1000000) {
		xtime.tv_usec -= 1000000;
		xtime.tv_sec++;
		second_overflow();
	}
}
static inline void do_process_times(struct task_struct *p,
	unsigned long user, unsigned long system)
{
	unsigned long psecs;

	psecs = (p->times.tms_utime += user);
	psecs += (p->times.tms_stime += system);
	if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
		/* Send SIGXCPU every second.. */
		if (!(psecs % HZ))
			send_sig(SIGXCPU, p, 1);
		/* and SIGKILL when we go over max.. */
		if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
			send_sig(SIGKILL, p, 1);
	}
}
static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
{
	unsigned long it_virt = p->it_virt_value;

	if (it_virt) {
		it_virt -= ticks;
		if (!it_virt) {
			it_virt = p->it_virt_incr;
			send_sig(SIGVTALRM, p, 1);
		}
		p->it_virt_value = it_virt;
	}
}
static inline void do_it_prof(struct task_struct *p)
{
	unsigned long it_prof = p->it_prof_value;

	if (it_prof) {
		if (--it_prof == 0) {
			it_prof = p->it_prof_incr;
			send_sig(SIGPROF, p, 1);
		}
		p->it_prof_value = it_prof;
	}
}
void update_one_process(struct task_struct *p, unsigned long user,
			unsigned long system, int cpu)
{
	p->per_cpu_utime[cpu] += user;
	p->per_cpu_stime[cpu] += system;
	do_process_times(p, user, system);
	do_it_virt(p, user);
	do_it_prof(p);
}
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id(), system = user_tick ^ 1;

	update_one_process(p, user_tick, system, cpu);
	if (p->pid) {
		if (--p->counter <= 0) {
			p->counter = 0;
			/*
			 * SCHED_FIFO is priority preemption, so this is
			 * not the place to decide whether to reschedule a
			 * SCHED_FIFO task or not - Bhavesh Davda
			 */
			if (p->policy != SCHED_FIFO) {
				p->need_resched = 1;
			}
		}
		if (p->nice > 0)
			kstat.per_cpu_nice[cpu] += user_tick;
		else
			kstat.per_cpu_user[cpu] += user_tick;
		kstat.per_cpu_system[cpu] += system;
	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
		kstat.per_cpu_system[cpu] += system;
}
/*
 * Nr of active tasks - counted in fixed-point numbers
 */
static unsigned long count_active_tasks(void)
{
	struct task_struct *p;
	unsigned long nr = 0;

	read_lock(&tasklist_lock);
	for_each_task(p) {
		if ((p->state == TASK_RUNNING ||
		     (p->state & TASK_UNINTERRUPTIBLE)))
			nr += FIXED_1;
	}
	read_unlock(&tasklist_lock);
	return nr;
}
/*
 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
 * imply that avenrun[] is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc.
 * all seem to differ on different machines.
 */
unsigned long avenrun[3];
static inline void calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;
	if (count < 0) {
		count += LOAD_FREQ;
		active_tasks = count_active_tasks();
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}
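/*
 * Worked example (added for clarity; assumes the usual sched.h values
 * FSHIFT = 11, FIXED_1 = 1 << 11 = 2048, EXP_1 = 1884):  CALC_LOAD is
 * the fixed-point exponential moving average
 *
 *	load = (load * EXP_n + active * (FIXED_1 - EXP_n)) >> FSHIFT
 *
 * evaluated every LOAD_FREQ (5 s).  Starting from load 0 with one task
 * permanently runnable (active = 2048), the 1-minute average after the
 * first sample is (0 * 1884 + 2048 * 164) >> 11 = 164, i.e. 0.08, and
 * it converges toward 2048 (a displayed load of 1.00) over the next
 * minute or so.
 */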
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies;

/*
 * This rwlock protects us from races in SMP while playing with xtime. -arca
 */
rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
static inline void update_times(void)
{
	unsigned long ticks;

	/*
	 * update_times() is run from the raw timer_bh handler so we
	 * just know that the irqs are locally enabled and so we don't
	 * need to save/restore the flags of the local CPU here. -arca
	 */
	write_lock_irq(&xtime_lock);

	ticks = jiffies - wall_jiffies;
	if (ticks) {
		wall_jiffies += ticks;
		update_wall_time(ticks);
	}
	write_unlock_irq(&xtime_lock);
	calc_load(ticks);
}
void timer_bh(void)
{
	update_times();
	run_timer_list();
}

void do_timer(struct pt_regs *regs)
{
	(*(unsigned long *)&jiffies)++;
#ifndef CONFIG_SMP
	/* SMP process accounting uses the local APIC timer */
	update_process_times(user_mode(regs));
#endif
	mark_bh(TIMER_BH);
	if (TQ_ACTIVE(tq_timer))
		mark_bh(TQUEUE_BH);
}
#if !defined(__alpha__) && !defined(__ia64__)

/*
 * For backwards compatibility?  This can be done in libc so Alpha
 * and all newer ports shouldn't need it.
 */
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
	struct itimerval it_new, it_old;
	unsigned int oldalarm;

	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
	it_new.it_value.tv_sec = seconds;
	it_new.it_value.tv_usec = 0;
	do_setitimer(ITIMER_REAL, &it_new, &it_old);
	oldalarm = it_old.it_value.tv_sec;
	/* ehhh.. We can't return 0 if we have an alarm pending.. */
	/* And we'd better return too much than too little anyway */
	if (it_old.it_value.tv_usec)
		oldalarm++;
	return oldalarm;
}

#endif
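/*
 * Illustrative behaviour (a sketch, not part of the original file): with
 * an alarm due in 2.5 s, a second call rounds the remainder up, never
 * down, and never reports 0 while an alarm is still pending:
 *
 *	alarm(10);	-- returns 0, nothing was pending
 *	... 7.5 seconds later ...
 *	alarm(0);	-- cancels, returns 3 (2.5 s rounded up), not 2
 */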
#ifndef __alpha__

/*
 * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
 * should be moved into arch/i386 instead?
 */
/**
 * sys_getpid - return the thread group id of the current process
 *
 * Note, despite the name, this returns the tgid not the pid.  The tgid and
 * the pid are identical unless CLONE_THREAD was specified on clone() in
 * which case the tgid is the same in all threads of the same group.
 *
 * This is SMP safe as current->tgid does not change.
 */
asmlinkage long sys_getpid(void)
{
	return current->tgid;
}
/*
 * This is not strictly SMP safe: p_opptr could change
 * from under us. However, rather than getting any lock
 * we can use an optimistic algorithm: get the parent
 * pid, and go back and check that the parent is still
 * the same. If it has changed (which is extremely unlikely
 * indeed), we just try again..
 *
 * NOTE! This depends on the fact that even if we _do_
 * get an old value of "parent", we can happily dereference
 * the pointer: we just can't necessarily trust the result
 * until we know that the parent pointer is valid.
 *
 * The "mb()" macro is a memory barrier - a synchronizing
 * event. It also makes sure that gcc doesn't optimize
 * away the necessary memory references.. The barrier doesn't
 * have to have all that strong semantics: on x86 we don't
 * really require a synchronizing instruction, for example.
 * The barrier is more important for code generation than
 * for any real memory ordering semantics (even if there is
 * a small window for a race, using the old pointer is
 * harmless for a while).
 */
asmlinkage long sys_getppid(void)
{
	int pid;
	struct task_struct * me = current;
	struct task_struct * parent;

	parent = me->p_opptr;
	for (;;) {
		pid = parent->pid;
#if CONFIG_SMP
		{
			struct task_struct *old = parent;

			mb();
			parent = me->p_opptr;
			if (old != parent)
				continue;
		}
#endif
		break;
	}
	return pid;
}
asmlinkage long sys_getuid(void)
{
	/* Only we change this so SMP safe */
	return current->uid;
}

asmlinkage long sys_geteuid(void)
{
	/* Only we change this so SMP safe */
	return current->euid;
}

asmlinkage long sys_getgid(void)
{
	/* Only we change this so SMP safe */
	return current->gid;
}

asmlinkage long sys_getegid(void)
{
	/* Only we change this so SMP safe */
	return current->egid;
}

#endif
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
	return current->pid;
}
asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
	struct timespec t;
	unsigned long expire;

	if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
		return -EFAULT;

	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
		return -EINVAL;

	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
	    current->policy != SCHED_OTHER)
	{
		/*
		 * Short delay requests up to 2 ms will be handled with
		 * high precision by a busy wait for all real-time processes.
		 *
		 * It's important on SMP not to do this holding locks.
		 */
		udelay((t.tv_nsec + 999) / 1000);
		return 0;
	}

	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);

	current->state = TASK_INTERRUPTIBLE;
	expire = schedule_timeout(expire);

	if (expire) {
		if (rmtp) {
			jiffies_to_timespec(expire, &t);
			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
				return -EFAULT;
		}
		return -EINTR;
	}
	return 0;
}
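/*
 * Illustrative userspace behaviour (a sketch, not part of the original
 * file): the "+ (t.tv_sec || t.tv_nsec)" above rounds any nonzero
 * request up by one jiffy, so the sleep is never shorter than asked
 * for.  If a signal interrupts the sleep, the unslept remainder is
 * written back and -EINTR is returned, so a caller can resume:
 *
 *	struct timespec req = { 0, 500000000 };		-- 500 ms
 *	struct timespec rem;
 *
 *	if (nanosleep(&req, &rem) == -1 && errno == EINTR)
 *		nanosleep(&rem, NULL);			-- sleep the rest
 */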