Merge powerclamp driver updates (that depend on cpuidle material) for v4.10.
drivers/thermal/intel_powerclamp.c
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *      TODO:
 *           1. better handle wakeup from external interrupts, currently a fixed
 *              compensation is added to clamping duration when an excessive
 *              number of wakeups is observed during idle time. the reason is
 *              that in case of external interrupts without need for ack,
 *              clamping down the cpu in non-irq context does not reduce irq.
 *              for the majority of cases, clamping down the cpu does help
 *              reduce irq as well, we should be able to differentiate the two
 *              cases and give a quantitative solution for the irqs that we
 *              can control. perhaps based on get_cpu_iowait_time_us()
 *
 *           2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
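/*
 * Worked example, assuming HZ=1000 (one jiffy = 1 ms): the default
 * injection period is jiffies_to_msecs(6) = 6 ms per attempt, within
 * the 6-25 ms range enforced by duration_set() below.
 */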

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
                                  * update control parameters. Defaults to
                                  * the BSP, but the BSP can be offlined.
                                  */
static bool clamping;

static const struct sched_param sparam = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};
struct powerclamp_worker_data {
        struct kthread_worker *worker;
        struct kthread_work balancing_work;
        struct kthread_delayed_work idle_injection_work;
        unsigned int cpu;
        unsigned int count;
        unsigned int guard;
        unsigned int window_size_now;
        unsigned int target_ratio;
        unsigned int duration_jiffies;
        bool clamping;
};

static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping kthread worker
                                           */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_duration;

        ret = kstrtoul(arg, 10, &new_duration);
        if (ret)
                goto exit;
        if (new_duration > 25 || new_duration < 6) {
                pr_err("Out of recommended range %lu, between 6-25ms\n",
                        new_duration);
                ret = -EINVAL;
        }

        duration = clamp(new_duration, 6ul, 25ul);
        smp_mb();

exit:

        return ret;
}

static const struct kernel_param_ops duration_ops = {
        .set = duration_set,
        .get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
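/*
 * Example (typical module-parameter sysfs path; assumes the module is
 * loaded as intel_powerclamp):
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 */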

struct powerclamp_calibration_data {
        unsigned long confidence;  /* used for calibration: a counter that is
                                    * incremented each time a clamping period
                                    * completes without extra wakeups. Once the
                                    * counter reaches a given level, the
                                    * compensation is deemed usable.
                                    */
        unsigned long steady_comp; /* steady state compensation used when
                                    * no extra wakeups occurred.
                                    */
        unsigned long dynamic_comp; /* compensate excessive wakeup from idle
                                     * mostly from external interrupts.
                                     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_window_size;

        ret = kstrtoul(arg, 10, &new_window_size);
        if (ret)
                goto exit_win;
        if (new_window_size > 10 || new_window_size < 2) {
                pr_err("Out of recommended window size %lu, between 2-10\n",
                        new_window_size);
                ret = -EINVAL;
        }

        window_size = clamp(new_window_size, 2ul, 10ul);
        smp_mb();

exit_win:

        return ret;
}

static const struct kernel_param_ops window_size_ops = {
        .set = window_size_set,
        .get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
        "\tpowerclamp controls idle ratio within this window. larger\n"
        "\twindow size results in slower response time but smoother\n"
        "\tclamping results. defaults to 2.");

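/*
 * Sketch of the enumeration below: pick the deepest MWAIT C-state
 * advertised by CPUID leaf 5. After the C0 nibble is shifted out, each
 * 4-bit field of EDX holds the number of sub-states of a successively
 * deeper C-state. The resulting hint packs the deepest state index
 * into bits 7:4 and its deepest sub-state into bits 3:0.
 */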
static void find_target_mwait(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int highest_cstate = 0;
        unsigned int highest_subcstate = 0;
        int i;

        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
                return;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
                return;

        edx >>= MWAIT_SUBSTATE_SIZE;
        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
                (highest_subcstate - 1);

}

struct pkg_cstate_info {
        bool skip;
        int msr_index;
        int cstate_id;
};

#define PKG_CSTATE_INIT(id) {                           \
                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
                .cstate_id = id                         \
                        }

static struct pkg_cstate_info pkg_cstates[] = {
        PKG_CSTATE_INIT(2),
        PKG_CSTATE_INIT(3),
        PKG_CSTATE_INIT(6),
        PKG_CSTATE_INIT(7),
        PKG_CSTATE_INIT(8),
        PKG_CSTATE_INIT(9),
        PKG_CSTATE_INIT(10),
        {NULL},
};

static bool has_pkg_state_counter(void)
{
        u64 val;
        struct pkg_cstate_info *info = pkg_cstates;

        /* check if any one of the counter msrs exists */
        while (info->msr_index) {
                if (!rdmsrl_safe(info->msr_index, &val))
                        return true;
                info++;
        }

        return false;
}

static u64 pkg_state_counter(void)
{
        u64 val;
        u64 count = 0;
        struct pkg_cstate_info *info = pkg_cstates;

        while (info->msr_index) {
                if (!info->skip) {
                        if (!rdmsrl_safe(info->msr_index, &val))
                                count += val;
                        else
                                info->skip = true;
                }
                info++;
        }

        return count;
}

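/*
 * Compensation sketch: for an interior target ratio, average the
 * steady-state compensation of the ratio and both neighbors, but only
 * when all three have been calibrated (confidence >= CONFIDENCE_OK).
 * E.g. for ratio 20 with steady_comp[19..21] = {2, 3, 4}, comp = 3.
 * The edge ratios (1 and MAX_TARGET_RATIO - 1) fall back to their two
 * nearest in-range neighbors.
 */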
static unsigned int get_compensation(int ratio)
{
        unsigned int comp = 0;

        /* we only use compensation if all adjacent ones are good */
        if (ratio == 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio + 1].steady_comp +
                        cal_data[ratio + 2].steady_comp) / 3;
        } else if (ratio == MAX_TARGET_RATIO - 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio - 2].steady_comp) / 3;
        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio + 1].steady_comp) / 3;
        }

        /* REVISIT: simple penalty of double idle injection */
        if (reduce_irq)
                comp = ratio;
        /* do not exceed limit */
        if (comp + ratio >= MAX_TARGET_RATIO)
                comp = MAX_TARGET_RATIO - ratio - 1;

        return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
        int delta;
        struct powerclamp_calibration_data *d = &cal_data[target_ratio];

        /*
         * Skip adjusting compensation once the confidence level has been
         * reached, or when there were too many wakeups during the last idle
         * injection period, in which case the data cannot be trusted for
         * compensation.
         */
        if (d->confidence >= CONFIDENCE_OK ||
                atomic_read(&idle_wakeup_counter) >
                win * num_online_cpus())
                return;

        delta = set_target_ratio - current_ratio;
        /* filter out bad data */
        if (delta >= 0 && delta <= (1+target_ratio/10)) {
                if (d->steady_comp)
                        d->steady_comp =
                                roundup(delta+d->steady_comp, 2)/2;
                else
                        d->steady_comp = delta;
                d->confidence++;
        }
}

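/*
 * The measured idle ratio below is
 *   current_ratio = 100 * delta(pkg C-state residency) / delta(TSC)
 * over the last control window. E.g. if the summed package residency
 * counters advanced by 250M while the TSC advanced by 1000M, the
 * package spent 25% of the window in a C-state.
 */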
static bool powerclamp_adjust_controls(unsigned int target_ratio,
                                unsigned int guard, unsigned int win)
{
        static u64 msr_last, tsc_last;
        u64 msr_now, tsc_now;
        u64 val64;

        /* check result for the last window */
        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                current_ratio = 1;
        else if (tsc_now-tsc_last) {
                val64 = 100*(msr_now-msr_last);
                do_div(val64, (tsc_now-tsc_last));
                current_ratio = val64;
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        adjust_compensation(target_ratio, win);
        /*
         * too many external interrupts, set the flag so that we can
         * take measures later.
         */
        reduce_irq = atomic_read(&idle_wakeup_counter) >=
                2 * win * num_online_cpus();

        atomic_set(&idle_wakeup_counter, 0);
        /* if we are above target+guard, skip */
        return set_target_ratio + guard <= current_ratio;
}

static void clamp_balancing_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        int sleeptime;
        unsigned long target_jiffies;
        unsigned int compensated_ratio;
        int interval; /* jiffies to sleep for each attempt */

        w_data = container_of(work, struct powerclamp_worker_data,
                              balancing_work);

        /*
         * make sure the user-selected ratio does not take effect until
         * the next round. adjust target_ratio if the user has changed
         * the target, so that we can converge quickly.
         */
        w_data->target_ratio = READ_ONCE(set_target_ratio);
        w_data->guard = 1 + w_data->target_ratio / 20;
        w_data->window_size_now = window_size;
        w_data->duration_jiffies = msecs_to_jiffies(duration);
        w_data->count++;

        /*
         * systems may have different ability to enter package level
         * c-states, thus we need to compensate the injected idle ratio
         * to achieve the actual target reported by the HW.
         */
        compensated_ratio = w_data->target_ratio +
                get_compensation(w_data->target_ratio);
        if (compensated_ratio == 0)
                compensated_ratio = 1;
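        /*
         * E.g. injecting 6 jiffies of idle at a compensated ratio of
         * 25% yields interval = 6 * 100 / 25 = 24 jiffies between
         * injections.
         */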
        interval = w_data->duration_jiffies * 100 / compensated_ratio;

        /* align idle time */
        target_jiffies = roundup(jiffies, interval);
        sleeptime = target_jiffies - jiffies;
        if (sleeptime <= 0)
                sleeptime = 1;

        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_delayed_work(w_data->worker,
                                           &w_data->idle_injection_work,
                                           sleeptime);
}

static void clamp_idle_injection_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;

        w_data = container_of(work, struct powerclamp_worker_data,
                              idle_injection_work.work);

        /*
         * only elected controlling cpu can collect stats and update
         * control parameters.
         */
        if (w_data->cpu == control_cpu &&
            !(w_data->count % w_data->window_size_now)) {
                should_skip =
                        powerclamp_adjust_controls(w_data->target_ratio,
                                                   w_data->guard,
                                                   w_data->window_size_now);
                smp_mb();
        }

        if (should_skip)
                goto balance;

        play_idle(jiffies_to_msecs(w_data->duration_jiffies));

balance:
        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
        static u64 msr_last;
        static u64 tsc_last;
        static unsigned long jiffies_last;

        u64 msr_now;
        unsigned long jiffies_now;
        u64 tsc_now;
        u64 val64;

        msr_now = pkg_state_counter();
        tsc_now = rdtsc();
        jiffies_now = jiffies;

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                pkg_cstate_ratio_cur = 1;
        else {
                if (tsc_now - tsc_last) {
                        val64 = 100 * (msr_now - msr_last);
                        do_div(val64, (tsc_now - tsc_last));
                        pkg_cstate_ratio_cur = val64;
                }
        }

        /* update record */
        msr_last = msr_now;
        jiffies_last = jiffies_now;
        tsc_last = tsc_now;

        if (clamping)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}


static void start_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
        if (IS_ERR(worker))
                return;

        w_data->worker = worker;
        w_data->count = 0;
        w_data->cpu = cpu;
        w_data->clamping = true;
        set_bit(cpu, cpu_clamping_mask);
        sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
        kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
        kthread_init_delayed_work(&w_data->idle_injection_work,
                                  clamp_idle_injection_func);
        kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

        if (!w_data->worker)
                return;

        w_data->clamping = false;
        /*
         * Make sure that all works that get queued after this point see
         * the clamping disabled. The counterpart barrier is not needed
         * because there is an implicit memory barrier when the queued
         * work is processed.
         */
        smp_wmb();
        kthread_cancel_work_sync(&w_data->balancing_work);
        kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
        /*
         * The balancing work still might be queued here because
         * the handling of the "clamping" variable, cancel, and queue
         * operations are not synchronized via a lock. But it is not
         * a big deal. The balancing work is fast and destroying the
         * kthread will wait for it.
         */
        clear_bit(w_data->cpu, cpu_clamping_mask);
        kthread_destroy_worker(w_data->worker);

        w_data->worker = NULL;
}

static int start_power_clamp(void)
{
        unsigned long cpu;

        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();

        /* prefer BSP */
        control_cpu = 0;
        if (!cpu_online(control_cpu))
                control_cpu = smp_processor_id();

        clamping = true;
        schedule_delayed_work(&poll_pkg_cstate_work, 0);

        /* start one kthread worker per online cpu */
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
        put_online_cpus();

        return 0;
}

static void end_power_clamp(void)
{
        int i;

        /*
         * Block requeuing in all the kthread workers. They will flush and
         * stop faster.
         */
        clamping = false;
        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
                        pr_debug("clamping worker for cpu %d alive, destroy\n",
                                 i);
                        stop_power_clamp_worker(i);
                }
        }
}

static int powerclamp_cpu_online(unsigned int cpu)
{
        if (!clamping)
                return 0;
        start_power_clamp_worker(cpu);
        /* prefer BSP as controlling CPU */
        if (cpu == 0) {
                control_cpu = 0;
                smp_mb();
        }
        return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
        if (!clamping)
                return 0;

        stop_power_clamp_worker(cpu);
        if (cpu != control_cpu)
                return 0;

        control_cpu = cpumask_first(cpu_online_mask);
        if (control_cpu == cpu)
                control_cpu = cpumask_next(cpu, cpu_online_mask);
        smp_mb();
        return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        *state = MAX_TARGET_RATIO;

        return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        if (clamping)
                *state = pkg_cstate_ratio_cur;
        else
                /* to save power, do not poll idle ratio while not clamping */
                *state = -1; /* indicates invalid state */

        return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long new_target_ratio)
{
        int ret = 0;

        new_target_ratio = clamp(new_target_ratio, 0UL,
                                (unsigned long) (MAX_TARGET_RATIO-1));
        if (set_target_ratio == 0 && new_target_ratio > 0) {
                pr_info("Start idle injection to reduce power\n");
                set_target_ratio = new_target_ratio;
                ret = start_power_clamp();
                goto exit_set;
        } else if (set_target_ratio > 0 && new_target_ratio == 0) {
                pr_info("Stop forced idle injection\n");
                end_power_clamp();
                set_target_ratio = 0;
        } else /* adjust currently running */ {
                set_target_ratio = new_target_ratio;
                /* make new set_target_ratio visible to other cpus */
                smp_mb();
        }

exit_set:
        return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};
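
/*
 * Example userspace interaction via the generic thermal sysfs layer
 * (the cooling_deviceX index varies per system):
 *   cat /sys/class/thermal/cooling_deviceX/type   # "intel_powerclamp"
 *   echo 25 > /sys/class/thermal/cooling_deviceX/cur_state  # ~25% idle
 *   echo 0  > /sys/class/thermal/cooling_deviceX/cur_state  # stop clamping
 */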

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{

        if (!x86_match_cpu(intel_powerclamp_ids)) {
                pr_err("CPU does not support MWAIT\n");
                return -ENODEV;
        }

        /* The goal of idle time alignment is to achieve package cstates. */
        if (!has_pkg_state_counter()) {
                pr_info("No package C-state available\n");
                return -ENODEV;
        }

        /* find the deepest mwait value */
        find_target_mwait();

        return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
        int i = 0;

        seq_printf(m, "controlling cpu: %d\n", control_cpu);
        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
        for (i = 0; i < MAX_TARGET_RATIO; i++) {
                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
                        i,
                        cal_data[i].confidence,
                        cal_data[i].steady_comp,
                        cal_data[i].dynamic_comp);
        }

        return 0;
}

static int powerclamp_debug_open(struct inode *inode,
                        struct file *file)
{
        return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
        .open           = powerclamp_debug_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
        .owner          = THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
        if (!debug_dir)
                return;

        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
                                        cal_data, &powerclamp_debug_fops))
                goto file_error;

        return;

file_error:
        debugfs_remove_recursive(debug_dir);
}
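
/*
 * Example (assuming debugfs is mounted at /sys/kernel/debug):
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 * prints the controlling cpu plus one calibration row (confidence,
 * steady and dynamic compensation) per target ratio percentage.
 */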

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
        int retval;
        int bitmap_size;

        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
        if (!cpu_clamping_mask)
                return -ENOMEM;

        /* probe cpu features and ids here */
        retval = powerclamp_probe();
        if (retval)
                goto exit_free;

        /* set default limit, may be adjusted during runtime based on feedback */
        window_size = 2;
        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "thermal/intel_powerclamp:online",
                                           powerclamp_cpu_online,
                                           powerclamp_cpu_predown);
        if (retval < 0)
                goto exit_free;

        hp_state = retval;

        worker_data = alloc_percpu(struct powerclamp_worker_data);
        if (!worker_data) {
                retval = -ENOMEM;
                goto exit_unregister;
        }

        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
                goto exit_free_thread;
        }

        if (!duration)
                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

        powerclamp_create_debug_files();

        return 0;

exit_free_thread:
        free_percpu(worker_data);
exit_unregister:
        cpuhp_remove_state_nocalls(hp_state);
exit_free:
        kfree(cpu_clamping_mask);
        return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
        end_power_clamp();
        cpuhp_remove_state_nocalls(hp_state);
        free_percpu(worker_data);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);

        cancel_delayed_work_sync(&poll_pkg_cstate_work);
        debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");