/*
 *  drivers/cpufreq/cpufreq_ondemand.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/ctype.h>
#include <linux/cpufreq.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/sched.h>
#include <linux/kmod.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/mutex.h>

/*
 * dbs is used in this file as a short form for demand-based switching.
 * It helps to keep variable names smaller and simpler.
 */

#define DEF_FREQUENCY_UP_THRESHOLD              (80)
#define MIN_FREQUENCY_UP_THRESHOLD              (11)
#define MAX_FREQUENCY_UP_THRESHOLD              (100)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. The default polling frequency is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with
 * CPUFREQ_ETERNAL) this governor will not work.
 * All times here are in us.
 */
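/*
 * Illustrative example (not part of the original source): a CPU with a
 * 100 us transition latency gets a default sampling rate of
 * 100 us * DEF_SAMPLING_RATE_LATENCY_MULTIPLIER = 100000 us (100 ms),
 * i.e. the load is re-evaluated roughly ten times per second, unless that
 * value is below MIN_STAT_SAMPLING_RATE, in which case it is clamped up.
 */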
static unsigned int def_sampling_rate;
#define MIN_SAMPLING_RATE_RATIO                 (2)
/* for correct statistics, we need at least 10 ticks between each measure */
#define MIN_STAT_SAMPLING_RATE                  (MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10))
#define MIN_SAMPLING_RATE                       (def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
#define MAX_SAMPLING_RATE                       (500 * def_sampling_rate)
#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER    (1000)
#define DEF_SAMPLING_DOWN_FACTOR                (1)
#define MAX_SAMPLING_DOWN_FACTOR                (10)
#define TRANSITION_LATENCY_LIMIT                (10 * 1000)

static void do_dbs_timer(void *data);

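/*
 * Per-CPU governor state:
 *   cur_policy         - cpufreq policy this CPU is currently governed by
 *   prev_cpu_idle_up   - cumulative idle time at the last frequency-increase
 *                        evaluation (baseline for the "up" check)
 *   prev_cpu_idle_down - cumulative idle time at the last frequency-decrease
 *                        evaluation (baseline for the "down" check)
 *   enable             - nonzero while the governor is active on this CPU
 */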
struct cpu_dbs_info_s {
        struct cpufreq_policy *cur_policy;
        unsigned int prev_cpu_idle_up;
        unsigned int prev_cpu_idle_down;
        unsigned int enable;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);

static unsigned int dbs_enable; /* number of CPUs using this policy */

/*
 * DEADLOCK ALERT! There is an ordering requirement between the cpu_hotplug
 * lock and dbs_mutex. The cpu_hotplug lock should always be held before
 * dbs_mutex. If any function that can potentially take the cpu_hotplug lock
 * (like __cpufreq_driver_target()) is called with dbs_mutex taken, then
 * the cpu_hotplug lock should be taken before that. Note that the cpu_hotplug
 * lock is recursive for the same process. -Venki
 */
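/*
 * Lock ordering as used in do_dbs_timer() and in the CPUFREQ_GOV_LIMITS
 * handler below (shown here for illustration):
 *
 *      lock_cpu_hotplug();
 *      mutex_lock(&dbs_mutex);
 *      ...                     may call __cpufreq_driver_target()
 *      mutex_unlock(&dbs_mutex);
 *      unlock_cpu_hotplug();
 */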
static DEFINE_MUTEX(dbs_mutex);
static DECLARE_WORK(dbs_work, do_dbs_timer, NULL);

static struct workqueue_struct *dbs_workq;

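/*
 * Governor tunables, exposed through sysfs:
 *   sampling_rate         - interval between load evaluations, in us
 *   sampling_down_factor  - frequency decreases are only considered every
 *                           sampling_rate * sampling_down_factor us
 *   up_threshold          - busy percentage above which we jump to the
 *                           maximum frequency
 *   ignore_nice           - if set, time spent running niced tasks is
 *                           counted as idle time
 */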
struct dbs_tuners {
        unsigned int sampling_rate;
        unsigned int sampling_down_factor;
        unsigned int up_threshold;
        unsigned int ignore_nice;
};

static struct dbs_tuners dbs_tuners_ins = {
        .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
        .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR,
        .ignore_nice = 0,
};

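/*
 * Cumulative idle time of @cpu, in cputime ticks: idle + iowait, plus nice
 * time when ignore_nice is set (niced load is then treated as idle).
 */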
static inline unsigned int get_cpu_idle_time(unsigned int cpu)
{
        return  kstat_cpu(cpu).cpustat.idle +
                kstat_cpu(cpu).cpustat.iowait +
                (dbs_tuners_ins.ignore_nice ?
                 kstat_cpu(cpu).cpustat.nice :
                 0);
}

/************************** sysfs interface ************************/
static ssize_t show_sampling_rate_max(struct cpufreq_policy *policy, char *buf)
{
        return sprintf(buf, "%u\n", MAX_SAMPLING_RATE);
}

static ssize_t show_sampling_rate_min(struct cpufreq_policy *policy, char *buf)
{
        return sprintf(buf, "%u\n", MIN_SAMPLING_RATE);
}

#define define_one_ro(_name)            \
static struct freq_attr _name =         \
__ATTR(_name, 0444, show_##_name, NULL)

define_one_ro(sampling_rate_max);
define_one_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
#define show_one(file_name, object)                                     \
static ssize_t show_##file_name                                         \
(struct cpufreq_policy *unused, char *buf)                              \
{                                                                       \
        return sprintf(buf, "%u\n", dbs_tuners_ins.object);             \
}
show_one(sampling_rate, sampling_rate);
show_one(sampling_down_factor, sampling_down_factor);
show_one(up_threshold, up_threshold);
show_one(ignore_nice_load, ignore_nice);

static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        if (input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
                return -EINVAL;

        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.sampling_down_factor = input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        mutex_lock(&dbs_mutex);
        if (ret != 1 || input > MAX_SAMPLING_RATE || input < MIN_SAMPLING_RATE) {
                mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }

        dbs_tuners_ins.sampling_rate = input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_up_threshold(struct cpufreq_policy *unused,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        mutex_lock(&dbs_mutex);
        if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
                        input < MIN_FREQUENCY_UP_THRESHOLD) {
                mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }

        dbs_tuners_ins.up_threshold = input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        unsigned int j;

        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        if (input > 1)
                input = 1;

        mutex_lock(&dbs_mutex);
        if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
                mutex_unlock(&dbs_mutex);
                return count;
        }
        dbs_tuners_ins.ignore_nice = input;

        /* we need to re-evaluate prev_cpu_idle_up and prev_cpu_idle_down */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *j_dbs_info;
                j_dbs_info = &per_cpu(cpu_dbs_info, j);
                j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j);
                j_dbs_info->prev_cpu_idle_down = j_dbs_info->prev_cpu_idle_up;
        }
        mutex_unlock(&dbs_mutex);

        return count;
}

#define define_one_rw(_name) \
static struct freq_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)

define_one_rw(sampling_rate);
define_one_rw(sampling_down_factor);
define_one_rw(up_threshold);
define_one_rw(ignore_nice_load);

static struct attribute *dbs_attributes[] = {
        &sampling_rate_max.attr,
        &sampling_rate_min.attr,
        &sampling_rate.attr,
        &sampling_down_factor.attr,
        &up_threshold.attr,
        &ignore_nice_load.attr,
        NULL
};

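/*
 * The group is created on the policy kobject in cpufreq_governor_dbs(), so
 * the tunables typically show up under
 * /sys/devices/system/cpu/cpu<n>/cpufreq/ondemand/.
 */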
static struct attribute_group dbs_attr_group = {
        .attrs = dbs_attributes,
        .name = "ondemand",
};

/************************** sysfs end ************************/

static void dbs_check_cpu(int cpu)
{
        unsigned int idle_ticks, up_idle_ticks, total_ticks;
        unsigned int freq_next;
        unsigned int freq_down_sampling_rate;
        static int down_skip[NR_CPUS];
        struct cpu_dbs_info_s *this_dbs_info;

        struct cpufreq_policy *policy;
        unsigned int j;

        this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
        if (!this_dbs_info->enable)
                return;

        policy = this_dbs_info->cur_policy;
        /*
         * Every sampling_rate, we check if the current idle time is less
         * than 20% (default). If it is, we try to increase the frequency.
         * Every sampling_rate*sampling_down_factor, we look for the lowest
         * frequency which can sustain the load while keeping idle time over
         * 30%. If such a frequency exists, we try to decrease to it.
         *
         * Any frequency increase takes it to the maximum frequency.
         * Frequency reduction happens at minimum steps of
         * 5% (default) of the current frequency.
         */

        /* Check for frequency increase */
        idle_ticks = UINT_MAX;
        for_each_cpu_mask(j, policy->cpus) {
                unsigned int tmp_idle_ticks, total_idle_ticks;
                struct cpu_dbs_info_s *j_dbs_info;

                j_dbs_info = &per_cpu(cpu_dbs_info, j);
                total_idle_ticks = get_cpu_idle_time(j);
                tmp_idle_ticks = total_idle_ticks -
                        j_dbs_info->prev_cpu_idle_up;
                j_dbs_info->prev_cpu_idle_up = total_idle_ticks;

                if (tmp_idle_ticks < idle_ticks)
                        idle_ticks = tmp_idle_ticks;
        }

        /* Scale idle ticks by 100 and compare with up and down ticks */
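        /*
         * Illustrative example: with the default up_threshold of 80, a
         * frequency increase is requested whenever the measured idle time
         * is below 20% of the sampling window.
         */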
        idle_ticks *= 100;
        up_idle_ticks = (100 - dbs_tuners_ins.up_threshold) *
                        usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

        if (idle_ticks < up_idle_ticks) {
                down_skip[cpu] = 0;
                for_each_cpu_mask(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;

                        j_dbs_info = &per_cpu(cpu_dbs_info, j);
                        j_dbs_info->prev_cpu_idle_down =
                                        j_dbs_info->prev_cpu_idle_up;
                }
                /* if we are already at full speed then break out early */
                if (policy->cur == policy->max)
                        return;

                __cpufreq_driver_target(policy, policy->max,
                        CPUFREQ_RELATION_H);
                return;
        }

        /* Check for frequency decrease */
        down_skip[cpu]++;
        if (down_skip[cpu] < dbs_tuners_ins.sampling_down_factor)
                return;

        idle_ticks = UINT_MAX;
        for_each_cpu_mask(j, policy->cpus) {
                unsigned int tmp_idle_ticks, total_idle_ticks;
                struct cpu_dbs_info_s *j_dbs_info;

                j_dbs_info = &per_cpu(cpu_dbs_info, j);
                /* Check for frequency decrease */
                total_idle_ticks = j_dbs_info->prev_cpu_idle_up;
                tmp_idle_ticks = total_idle_ticks -
                        j_dbs_info->prev_cpu_idle_down;
                j_dbs_info->prev_cpu_idle_down = total_idle_ticks;

                if (tmp_idle_ticks < idle_ticks)
                        idle_ticks = tmp_idle_ticks;
        }

        down_skip[cpu] = 0;
        /* if we cannot reduce the frequency anymore, break out early */
        if (policy->cur == policy->min)
                return;

        /* Compute how many ticks there are between two measurements */
        freq_down_sampling_rate = dbs_tuners_ins.sampling_rate *
                dbs_tuners_ins.sampling_down_factor;
        total_ticks = usecs_to_jiffies(freq_down_sampling_rate);

        /*
         * The optimal frequency is the lowest frequency that can support
         * the current CPU usage without triggering the up policy. To be
         * safe, we stay 10 points under the threshold.
         */
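        /*
         * Worked example (illustrative, not from the original source): with
         * the default up_threshold of 80 and 50% busy time over the window,
         * a CPU currently at 2000000 kHz gets
         * freq_next = 50 * 2000000 / (80 - 10) ~= 1428571 kHz. That is below
         * 95% of the current frequency, so a downward transition is requested
         * with CPUFREQ_RELATION_L (assuming the result is above policy->min).
         */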
        freq_next = ((total_ticks - idle_ticks) * 100) / total_ticks;
        freq_next = (freq_next * policy->cur) /
                        (dbs_tuners_ins.up_threshold - 10);

        if (freq_next < policy->min)
                freq_next = policy->min;

        if (freq_next <= ((policy->cur * 95) / 100))
                __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L);
}

static void do_dbs_timer(void *data)
{
        int i;
        lock_cpu_hotplug();
        mutex_lock(&dbs_mutex);
        for_each_online_cpu(i)
                dbs_check_cpu(i);
        queue_delayed_work(dbs_workq, &dbs_work,
                           usecs_to_jiffies(dbs_tuners_ins.sampling_rate));
        mutex_unlock(&dbs_mutex);
        unlock_cpu_hotplug();
}

static inline void dbs_timer_init(void)
{
        INIT_WORK(&dbs_work, do_dbs_timer, NULL);
        if (!dbs_workq)
                dbs_workq = create_singlethread_workqueue("ondemand");
        if (!dbs_workq) {
                printk(KERN_ERR "ondemand: Cannot initialize kernel thread\n");
                return;
        }
        queue_delayed_work(dbs_workq, &dbs_work,
                           usecs_to_jiffies(dbs_tuners_ins.sampling_rate));
        return;
}

static inline void dbs_timer_exit(void)
{
        if (dbs_workq)
                cancel_rearming_delayed_workqueue(dbs_workq, &dbs_work);
}

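/*
 * Governor callback invoked by the cpufreq core: CPUFREQ_GOV_START when the
 * governor is attached to a policy, CPUFREQ_GOV_STOP when it is detached,
 * and CPUFREQ_GOV_LIMITS when policy->min or policy->max change.
 */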
static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                   unsigned int event)
{
        unsigned int cpu = policy->cpu;
        struct cpu_dbs_info_s *this_dbs_info;
        unsigned int j;

        this_dbs_info = &per_cpu(cpu_dbs_info, cpu);

        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) ||
                    (!policy->cur))
                        return -EINVAL;

                if (policy->cpuinfo.transition_latency >
                                (TRANSITION_LATENCY_LIMIT * 1000)) {
                        printk(KERN_WARNING "ondemand governor failed to load "
                               "due to too long transition latency\n");
                        return -EINVAL;
                }
                if (this_dbs_info->enable) /* Already enabled */
                        break;

                mutex_lock(&dbs_mutex);
                for_each_cpu_mask(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
                        j_dbs_info = &per_cpu(cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;

                        j_dbs_info->prev_cpu_idle_up = get_cpu_idle_time(j);
                        j_dbs_info->prev_cpu_idle_down
                                = j_dbs_info->prev_cpu_idle_up;
                }
                this_dbs_info->enable = 1;
                sysfs_create_group(&policy->kobj, &dbs_attr_group);
                dbs_enable++;
                /*
                 * Start the timer/schedule the work when this governor
                 * is used for the first time
                 */
                if (dbs_enable == 1) {
                        unsigned int latency;
                        /* policy latency is in ns. Convert it to us first */
                        latency = policy->cpuinfo.transition_latency / 1000;
                        if (latency == 0)
                                latency = 1;

                        def_sampling_rate = latency *
                                        DEF_SAMPLING_RATE_LATENCY_MULTIPLIER;

                        if (def_sampling_rate < MIN_STAT_SAMPLING_RATE)
                                def_sampling_rate = MIN_STAT_SAMPLING_RATE;

                        dbs_tuners_ins.sampling_rate = def_sampling_rate;
                        dbs_timer_init();
                }

                mutex_unlock(&dbs_mutex);
                break;

        case CPUFREQ_GOV_STOP:
                mutex_lock(&dbs_mutex);
                this_dbs_info->enable = 0;
                sysfs_remove_group(&policy->kobj, &dbs_attr_group);
                dbs_enable--;
                /*
                 * Stop the timer/cancel the work when this governor is
                 * no longer used by any CPU
                 */
                if (dbs_enable == 0)
                        dbs_timer_exit();

                mutex_unlock(&dbs_mutex);

                break;

        case CPUFREQ_GOV_LIMITS:
                lock_cpu_hotplug();
                mutex_lock(&dbs_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                                        policy->max, CPUFREQ_RELATION_H);
                else if (policy->min > this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                                        policy->min, CPUFREQ_RELATION_L);
                mutex_unlock(&dbs_mutex);
                unlock_cpu_hotplug();
                break;
        }
        return 0;
}

static struct cpufreq_governor cpufreq_gov_dbs = {
        .name           = "ondemand",
        .governor       = cpufreq_governor_dbs,
        .owner          = THIS_MODULE,
};

static int __init cpufreq_gov_dbs_init(void)
{
        return cpufreq_register_governor(&cpufreq_gov_dbs);
}

static void __exit cpufreq_gov_dbs_exit(void)
{
        /* Make sure that the scheduled work is indeed not running.
           Assumes the timer has been cancelled first. */
        if (dbs_workq) {
                flush_workqueue(dbs_workq);
                destroy_workqueue(dbs_workq);
        }

        cpufreq_unregister_governor(&cpufreq_gov_dbs);
}

MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
                "Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

module_init(cpufreq_gov_dbs_init);
module_exit(cpufreq_gov_dbs_exit);