[PATCH] Fix cascade lookup of next_timer_interrupt

[powerpc.git] / kernel / timer.c
diff --git a/kernel/timer.c b/kernel/timer.c

index c2a8ccf..201bee0 100644 (file)
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -85,7 +85,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
   * @j: the time in (absolute) jiffies that should be rounded
   * @cpu: the processor number on which the timeout will happen
   *
- * __round_jiffies rounds an absolute time in the future (in jiffies)
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
   * up or down to (approximately) full seconds. This is useful for timers
   * for which the exact time they fire does not matter too much, as long as
   * they fire approximately every X seconds.
@@ -98,7 +98,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
   * processors firing at the exact same time, which could lead
   * to lock contention or spurious cache line bouncing.
   *
- * The return value is the rounded version of the "j" parameter.
+ * The return value is the rounded version of the @j parameter.
   */
  unsigned long __round_jiffies(unsigned long j, int cpu)
  {
@@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
   * @j: the time in (relative) jiffies that should be rounded
   * @cpu: the processor number on which the timeout will happen
   *
- * __round_jiffies_relative rounds a time delta  in the future (in jiffies)
+ * __round_jiffies_relative() rounds a time delta  in the future (in jiffies)
   * up or down to (approximately) full seconds. This is useful for timers
   * for which the exact time they fire does not matter too much, as long as
   * they fire approximately every X seconds.
@@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
   * processors firing at the exact same time, which could lead
   * to lock contention or spurious cache line bouncing.
   *
- * The return value is the rounded version of the "j" parameter.
+ * The return value is the rounded version of the @j parameter.
   */
  unsigned long __round_jiffies_relative(unsigned long j, int cpu)
  {
@@ -173,7 +173,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
   * round_jiffies - function to round jiffies to a full second
   * @j: the time in (absolute) jiffies that should be rounded
   *
- * round_jiffies rounds an absolute time in the future (in jiffies)
+ * round_jiffies() rounds an absolute time in the future (in jiffies)
   * up or down to (approximately) full seconds. This is useful for timers
   * for which the exact time they fire does not matter too much, as long as
   * they fire approximately every X seconds.
@@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
   * at the same time, rather than at various times spread out. The goal
   * of this is to have the CPU wake up less, which saves power.
   *
- * The return value is the rounded version of the "j" parameter.
+ * The return value is the rounded version of the @j parameter.
   */
  unsigned long round_jiffies(unsigned long j)
  {
@@ -194,7 +194,7 @@ EXPORT_SYMBOL_GPL(round_jiffies);
   * round_jiffies_relative - function to round jiffies to a full second
   * @j: the time in (relative) jiffies that should be rounded
   *
- * round_jiffies_relative rounds a time delta  in the future (in jiffies)
+ * round_jiffies_relative() rounds a time delta  in the future (in jiffies)
   * up or down to (approximately) full seconds. This is useful for timers
   * for which the exact time they fire does not matter too much, as long as
   * they fire approximately every X seconds.
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(round_jiffies);
   * at the same time, rather than at various times spread out. The goal
   * of this is to have the CPU wake up less, which saves power.
   *
- * The return value is the rounded version of the "j" parameter.
+ * The return value is the rounded version of the @j parameter.
   */
  unsigned long round_jiffies_relative(unsigned long j)
  {
@@ -387,7 +387,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
   * @timer: the timer to be modified
   * @expires: new timeout in jiffies
   *
- * mod_timer is a more efficient way to update the expire field of an
+ * mod_timer() is a more efficient way to update the expire field of an
   * active timer (if the timer is inactive it will be activated)
   *
   * mod_timer(timer, expires) is equivalent to:
@@ -490,7 +490,7 @@ out:
   * the timer it also makes sure the handler has finished executing on other
   * CPUs.
   *
- * Synchronization rules: callers must prevent restarting of the timer,
+ * Synchronization rules: Callers must prevent restarting of the timer,
   * otherwise this function is meaningless. It must not be called from
   * interrupt contexts. The caller must not hold locks which would prevent
   * completion of the timer's handler. The timer's handler must not call
@@ -597,99 +597,110 @@ static inline void __run_timers(tvec_base_t *base)
   * is used on S/390 to stop all activity when a cpus is idle.
   * This functions needs to be called disabled.
   */
-unsigned long next_timer_interrupt(void)
+static unsigned long __next_timer_interrupt(tvec_base_t *base)
  {
-       tvec_base_t *base;
-       struct list_head *list;
+       unsigned long timer_jiffies = base->timer_jiffies;
+       unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
+       int index, slot, array, found = 0;
         struct timer_list *nte;
-       unsigned long expires;
-       unsigned long hr_expires = MAX_JIFFY_OFFSET;
-       ktime_t hr_delta;
         tvec_t *varray[4];
-       int i, j;
-
-       hr_delta = hrtimer_get_next_event();
-       if (hr_delta.tv64 != KTIME_MAX) {
-               struct timespec tsdelta;
-               tsdelta = ktime_to_timespec(hr_delta);
-               hr_expires = timespec_to_jiffies(&tsdelta);
-               if (hr_expires < 3)
-                       return hr_expires + jiffies;
-       }
-       hr_expires += jiffies;
-
-       base = __get_cpu_var(tvec_bases);
-       spin_lock(&base->lock);
-       expires = base->timer_jiffies + (LONG_MAX >> 1);
-       list = NULL;
  
         /* Look for timer events in tv1. */
-       j = base->timer_jiffies & TVR_MASK;
+       index = slot = timer_jiffies & TVR_MASK;
         do {
-               list_for_each_entry(nte, base->tv1.vec + j, entry) {
+               list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+                       found = 1;
                         expires = nte->expires;
-                       if (j < (base->timer_jiffies & TVR_MASK))
-                               list = base->tv2.vec + (INDEX(0));
-                       goto found;
+                       /* Look at the cascade bucket(s)? */
+                       if (!index || slot < index)
+                               goto cascade;
+                       return expires;
                 }
-               j = (j + 1) & TVR_MASK;
-       } while (j != (base->timer_jiffies & TVR_MASK));
+               slot = (slot + 1) & TVR_MASK;
+       } while (slot != index);
+
+cascade:
+       /* Calculate the next cascade event */
+       if (index)
+               timer_jiffies += TVR_SIZE - index;
+       timer_jiffies >>= TVR_BITS;
  
         /* Check tv2-tv5. */
         varray[0] = &base->tv2;
         varray[1] = &base->tv3;
         varray[2] = &base->tv4;
         varray[3] = &base->tv5;
-       for (i = 0; i < 4; i++) {
-               j = INDEX(i);
+
+       for (array = 0; array < 4; array++) {
+               tvec_t *varp = varray[array];
+
+               index = slot = timer_jiffies & TVN_MASK;
                 do {
-                       if (list_empty(varray[i]->vec + j)) {
-                               j = (j + 1) & TVN_MASK;
-                               continue;
-                       }
-                       list_for_each_entry(nte, varray[i]->vec + j, entry)
+                       list_for_each_entry(nte, varp->vec + slot, entry) {
+                               found = 1;
                                 if (time_before(nte->expires, expires))
                                         expires = nte->expires;
-                       if (j < (INDEX(i)) && i < 3)
-                               list = varray[i + 1]->vec + (INDEX(i + 1));
-                       goto found;
-               } while (j != (INDEX(i)));
-       }
-found:
-       if (list) {
-               /*
-                * The search wrapped. We need to look at the next list
-                * from next tv element that would cascade into tv element
-                * where we found the timer element.
-                */
-               list_for_each_entry(nte, list, entry) {
-                       if (time_before(nte->expires, expires))
-                               expires = nte->expires;
-               }
+                       }
+                       /*
+                        * Are we still searching for the first timer, or are
+                        * we looking up the cascade buckets?
+                        */
+                       if (found) {
+                               /* Look at the cascade bucket(s)? */
+                               if (!index || slot < index)
+                                       break;
+                               return expires;
+                       }
+                       slot = (slot + 1) & TVN_MASK;
+               } while (slot != index);
+
+               if (index)
+                       timer_jiffies += TVN_SIZE - index;
+               timer_jiffies >>= TVN_BITS;
         }
-       spin_unlock(&base->lock);
+       return expires;
+}
  
-       /*
-        * It can happen that other CPUs service timer IRQs and increment
-        * jiffies, but we have not yet got a local timer tick to process
-        * the timer wheels.  In that case, the expiry time can be before
-        * jiffies, but since the high-resolution timer here is relative to
-        * jiffies, the default expression when high-resolution timers are
-        * not active,
-        *
-        *   time_before(MAX_JIFFY_OFFSET + jiffies, expires)
-        *
-        * would falsely evaluate to true.  If that is the case, just
-        * return jiffies so that we can immediately fire the local timer
-        */
-       if (time_before(expires, jiffies))
-               return jiffies;
+/*
+ * Check whether the next hrtimer event is due before the next timer
+ * wheel event and, if so, return that earlier expiry (in jiffies):
+ */
+static unsigned long cmp_next_hrtimer_event(unsigned long now,
+                                           unsigned long expires)
+{
+       ktime_t hr_delta = hrtimer_get_next_event();
+       struct timespec tsdelta;
  
-       if (time_before(hr_expires, expires))
-               return hr_expires;
+       if (hr_delta.tv64 == KTIME_MAX)
+               return expires;
  
+       if (hr_delta.tv64 <= TICK_NSEC)
+               return now;
+
+       tsdelta = ktime_to_timespec(hr_delta);
+       now += timespec_to_jiffies(&tsdelta);
+       if (time_before(now, expires))
+               return now;
         return expires;
  }
+
+/**
+ * next_timer_interrupt - return the jiffy of the next pending timer
+ */
+unsigned long next_timer_interrupt(void)
+{
+       tvec_base_t *base = __get_cpu_var(tvec_bases);
+       unsigned long expires, now = jiffies;
+
+       spin_lock(&base->lock);
+       expires = __next_timer_interrupt(base);
+       spin_unlock(&base->lock);
+
+       if (time_before_eq(expires, now))
+               return now;
+
+       return cmp_next_hrtimer_event(now, expires);
+}
  #endif
  
  /******************************************************************/
@@ -832,32 +843,33 @@ EXPORT_SYMBOL(do_settimeofday);
   *
   * Accumulates current time interval and initializes new clocksource
   */
-static int change_clocksource(void)
+static void change_clocksource(void)
  {
         struct clocksource *new;
         cycle_t now;
         u64 nsec;
+
         new = clocksource_get_next();
-       if (clock != new) {
-               now = clocksource_read(new);
-               nsec =  __get_nsec_offset();
-               timespec_add_ns(&xtime, nsec);
-
-               clock = new;
-               clock->cycle_last = now;
-               printk(KERN_INFO "Time: %s clocksource has been installed.\n",
-                      clock->name);
-               return 1;
-       } else if (clock->update_callback) {
-               return clock->update_callback();
-       }
-       return 0;
+
+       if (clock == new)
+               return;
+
+       now = clocksource_read(new);
+       nsec =  __get_nsec_offset();
+       timespec_add_ns(&xtime, nsec);
+
+       clock = new;
+       clock->cycle_last = now;
+
+       clock->error = 0;
+       clock->xtime_nsec = 0;
+       clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+       printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+              clock->name);
  }
  #else
-static inline int change_clocksource(void)
-{
-       return 0;
-}
+static inline void change_clocksource(void) { }
  #endif
  
  /**
@@ -871,33 +883,57 @@ int timekeeping_is_continuous(void)
         do {
                 seq = read_seqbegin(&xtime_lock);
  
-               ret = clock->is_continuous;
+               ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
  
         } while (read_seqretry(&xtime_lock, seq));
  
         return ret;
  }
  
+/**
+ * read_persistent_clock - Return time in seconds from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it; this default
+ * always returns zero ("unsupported"). An arch implementation returns
+ * seconds from epoch using the battery-backed persistent clock.
+ *
+ *  XXX - Do be sure to remove it once all arches implement it.
+ */
+unsigned long __attribute__((weak)) read_persistent_clock(void)
+{
+       return 0;
+}
+
  /*
   * timekeeping_init - Initializes the clocksource and common timekeeping values
   */
  void __init timekeeping_init(void)
  {
         unsigned long flags;
+       unsigned long sec = read_persistent_clock();
  
         write_seqlock_irqsave(&xtime_lock, flags);
  
         ntp_clear();
  
         clock = clocksource_get_next();
-       clocksource_calculate_interval(clock, tick_nsec);
+       clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
         clock->cycle_last = clocksource_read(clock);
  
+       xtime.tv_sec = sec;
+       xtime.tv_nsec = 0;
+       set_normalized_timespec(&wall_to_monotonic,
+               -xtime.tv_sec, -xtime.tv_nsec);
+
         write_sequnlock_irqrestore(&xtime_lock, flags);
  }
  
  
+/* flag indicating whether timekeeping is suspended */
  static int timekeeping_suspended;
+/* time in seconds when suspend began */
+static unsigned long timekeeping_suspend_time;
+
  /**
   * timekeeping_resume - Resumes the generic timekeeping subsystem.
   * @dev:       unused
@@ -909,13 +945,25 @@ static int timekeeping_suspended;
  static int timekeeping_resume(struct sys_device *dev)
  {
         unsigned long flags;
+       unsigned long now = read_persistent_clock();
  
         write_seqlock_irqsave(&xtime_lock, flags);
-       /* restart the last cycle value */
+
+       if (now && (now > timekeeping_suspend_time)) {
+               unsigned long sleep_length = now - timekeeping_suspend_time;
+
+               xtime.tv_sec += sleep_length;
+               wall_to_monotonic.tv_sec -= sleep_length;
+       }
+       /* re-base the last cycle value */
         clock->cycle_last = clocksource_read(clock);
         clock->error = 0;
         timekeeping_suspended = 0;
         write_sequnlock_irqrestore(&xtime_lock, flags);
+
+       touch_softlockup_watchdog();
+       hrtimer_notify_resume();
+
         return 0;
  }
  
@@ -925,6 +973,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
  
         write_seqlock_irqsave(&xtime_lock, flags);
         timekeeping_suspended = 1;
+       timekeeping_suspend_time = read_persistent_clock();
         write_sequnlock_irqrestore(&xtime_lock, flags);
         return 0;
  }
@@ -1089,11 +1138,7 @@ static void update_wall_time(void)
         clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
  
         /* check to see if there is a new clocksource to use */
-       if (change_clocksource()) {
-               clock->error = 0;
-               clock->xtime_nsec = 0;
-               clocksource_calculate_interval(clock, tick_nsec);
-       }
+       change_clocksource();
  }
  
  /*
@@ -1162,11 +1207,9 @@ static inline void calc_load(unsigned long ticks)
   * This read-write spinlock protects us from races in SMP while
   * playing with xtime and avenrun.
   */
-#ifndef ARCH_HAVE_XTIME_LOCK
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
  
  EXPORT_SYMBOL(xtime_lock);
-#endif
  
  /*
   * This function runs timers and the timer-tq in bottom half context.
@@ -1392,17 +1435,16 @@ asmlinkage long sys_gettid(void)
  }
  
  /**
- * sys_sysinfo - fill in sysinfo struct
+ * do_sysinfo - fill in sysinfo struct
   * @info: pointer to buffer to fill
   */ 
-asmlinkage long sys_sysinfo(struct sysinfo __user *info)
+int do_sysinfo(struct sysinfo *info)
  {
-       struct sysinfo val;
         unsigned long mem_total, sav_total;
         unsigned int mem_unit, bitcount;
         unsigned long seq;
  
-       memset((char *)&val, 0, sizeof(struct sysinfo));
+       memset(info, 0, sizeof(struct sysinfo));
  
         do {
                 struct timespec tp;
@@ -1422,17 +1464,17 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
                         tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
                         tp.tv_sec++;
                 }
-               val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+               info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
  
-               val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-               val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-               val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+               info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+               info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+               info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
  
-               val.procs = nr_threads;
+               info->procs = nr_threads;
         } while (read_seqretry(&xtime_lock, seq));
  
-       si_meminfo(&val);
-       si_swapinfo(&val);
+       si_meminfo(info);
+       si_swapinfo(info);
  
         /*
          * If the sum of all the available memory (i.e. ram + swap)
@@ -1443,11 +1485,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
          *  -Erik Andersen <andersee@debian.org>
          */
  
-       mem_total = val.totalram + val.totalswap;
-       if (mem_total < val.totalram || mem_total < val.totalswap)
+       mem_total = info->totalram + info->totalswap;
+       if (mem_total < info->totalram || mem_total < info->totalswap)
                 goto out;
         bitcount = 0;
-       mem_unit = val.mem_unit;
+       mem_unit = info->mem_unit;
         while (mem_unit > 1) {
                 bitcount++;
                 mem_unit >>= 1;
@@ -1459,22 +1501,31 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
  
         /*
          * If mem_total did not overflow, multiply all memory values by
-        * val.mem_unit and set it to 1.  This leaves things compatible
+        * info->mem_unit and set it to 1.  This leaves things compatible
          * with 2.2.x, and also retains compatibility with earlier 2.4.x
          * kernels...
          */
  
-       val.mem_unit = 1;
-       val.totalram <<= bitcount;
-       val.freeram <<= bitcount;
-       val.sharedram <<= bitcount;
-       val.bufferram <<= bitcount;
-       val.totalswap <<= bitcount;
-       val.freeswap <<= bitcount;
-       val.totalhigh <<= bitcount;
-       val.freehigh <<= bitcount;
+       info->mem_unit = 1;
+       info->totalram <<= bitcount;
+       info->freeram <<= bitcount;
+       info->sharedram <<= bitcount;
+       info->bufferram <<= bitcount;
+       info->totalswap <<= bitcount;
+       info->freeswap <<= bitcount;
+       info->totalhigh <<= bitcount;
+       info->freehigh <<= bitcount;
+
+out:
+       return 0;
+}
+
+asmlinkage long sys_sysinfo(struct sysinfo __user *info)
+{
+       struct sysinfo val;
+
+       do_sysinfo(&val);
  
- out:
         if (copy_to_user(info, &val, sizeof(struct sysinfo)))
                 return -EFAULT;
  
@@ -1624,7 +1675,7 @@ struct time_interpolator *time_interpolator __read_mostly;
  static struct time_interpolator *time_interpolator_list __read_mostly;
  static DEFINE_SPINLOCK(time_interpolator_lock);
  
-static inline u64 time_interpolator_get_cycles(unsigned int src)
+static inline cycles_t time_interpolator_get_cycles(unsigned int src)
  {
         unsigned long (*x)(void);
  
@@ -1650,8 +1701,8 @@ static inline u64 time_interpolator_get_counter(int writelock)
  
         if (time_interpolator->jitter)
         {
-               u64 lcycle;
-               u64 now;
+               cycles_t lcycle;
+               cycles_t now;
  
                 do {
                         lcycle = time_interpolator->last_cycle;