aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorShaohua Li <shaohua.li@intel.com>2009-07-27 18:11:02 -0400
committerLen Brown <len.brown@intel.com>2009-07-31 18:23:34 -0400
commit8e0af5141ab950b78b3ebbfaded5439dcf8b3a8d (patch)
treee5935fb5e4fe6bc03123f5734ba80ca2230f13e2 /drivers
parent4be3bd7849165e7efa6b0b35a23d6a3598d97465 (diff)
ACPI: create Processor Aggregator Device driver
ACPI 4.0 created the logical "processor aggregator device" as a mechinism for platforms to ask the OS to force otherwise busy processors to enter (power saving) idle. The intent is to lower power consumption to ride-out transient electrical and thermal emergencies, rather than powering off the server. On platforms that can save more power/performance via P-states, the platform will first exhaust P-states before forcing idle. However, the relative benefit of P-states vs. idle states is platform dependent, and thus this driver need not know or care about it. This driver does not use the kernel's CPU hot-plug mechanism because after the transient emergency is over, the system must be returned to its normal state, and hotplug would permanently break both cpusets and binding. So to force idle, the driver creates a power saving thread. The scheduler will migrate the thread to the preferred CPU. The thread has max priority and has SCHED_RR policy, so it can occupy one CPU. To save power, the thread will invoke the deep C-state entry instructions. To avoid starvation, the thread will sleep 5% of the time time for every second (current RT scheduler has threshold to avoid starvation, but if other CPUs are idle, the CPU can borrow CPU timer from other, which makes the mechanism not work here) Vaidyanathan Srinivasan has proposed scheduler enhancements to allow injecting idle time into the system. This driver doesn't depend on those enhancements, but could cut over to them when they are available. Peter Z. does not favor upstreaming this driver until the those scheduler enhancements are in place. However, we favor upstreaming this driver now because it is useful now, and can be enhanced over time. Signed-off-by: Shaohua Li <shaohua.li@intel.com> NACKed-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/Kconfig11
-rw-r--r--drivers/acpi/Makefile2
-rw-r--r--drivers/acpi/acpi_pad.c514
3 files changed, 527 insertions, 0 deletions
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 7ec7d88c5999..13531ed3cbd3 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -196,6 +196,17 @@ config ACPI_HOTPLUG_CPU
196 select ACPI_CONTAINER 196 select ACPI_CONTAINER
197 default y 197 default y
198 198
199config ACPI_PROCESSOR_AGGREGATOR
200 tristate "Processor Aggregator"
201 depends on ACPI_PROCESSOR
202 depends on EXPERIMENTAL
203 help
204 ACPI 4.0 defines processor Aggregator, which enables OS to perform
205 specfic processor configuration and control that applies to all
206 processors in the platform. Currently only logical processor idling
207 is defined, which is to reduce power consumption. This driver
208 support the new device.
209
199config ACPI_THERMAL 210config ACPI_THERMAL
200 tristate "Thermal Zone" 211 tristate "Thermal Zone"
201 depends on ACPI_PROCESSOR 212 depends on ACPI_PROCESSOR
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 03a985be3fe3..8dae61c8a176 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -61,3 +61,5 @@ obj-$(CONFIG_ACPI_SBS) += sbs.o
61processor-y := processor_core.o processor_throttling.o 61processor-y := processor_core.o processor_throttling.o
62processor-y += processor_idle.o processor_thermal.o 62processor-y += processor_idle.o processor_thermal.o
63processor-$(CONFIG_CPU_FREQ) += processor_perflib.o 63processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
64
65obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
new file mode 100644
index 000000000000..0d2cdb86158b
--- /dev/null
+++ b/drivers/acpi/acpi_pad.c
@@ -0,0 +1,514 @@
1/*
2 * acpi_pad.c ACPI Processor Aggregator Driver
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#include <linux/kernel.h>
22#include <linux/cpumask.h>
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/kthread.h>
27#include <linux/freezer.h>
28#include <linux/cpu.h>
29#include <linux/clockchips.h>
30#include <acpi/acpi_bus.h>
31#include <acpi/acpi_drivers.h>
32
33#define ACPI_PROCESSOR_AGGREGATOR_CLASS "processor_aggregator"
34#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
35#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
36static DEFINE_MUTEX(isolated_cpus_lock);
37
38#define MWAIT_SUBSTATE_MASK (0xf)
39#define MWAIT_CSTATE_MASK (0xf)
40#define MWAIT_SUBSTATE_SIZE (4)
41#define CPUID_MWAIT_LEAF (5)
42#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
43#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
44static unsigned long power_saving_mwait_eax;
45static void power_saving_mwait_init(void)
46{
47 unsigned int eax, ebx, ecx, edx;
48 unsigned int highest_cstate = 0;
49 unsigned int highest_subcstate = 0;
50 int i;
51
52 if (!boot_cpu_has(X86_FEATURE_MWAIT))
53 return;
54 if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
55 return;
56
57 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
58
59 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
60 !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
61 return;
62
63 edx >>= MWAIT_SUBSTATE_SIZE;
64 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
65 if (edx & MWAIT_SUBSTATE_MASK) {
66 highest_cstate = i;
67 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
68 }
69 }
70 power_saving_mwait_eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
71 (highest_subcstate - 1);
72
73 for_each_online_cpu(i)
74 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ON, &i);
75
76#if defined(CONFIG_GENERIC_TIME) && defined(CONFIG_X86)
77 switch (boot_cpu_data.x86_vendor) {
78 case X86_VENDOR_AMD:
79 case X86_VENDOR_INTEL:
80 /*
81 * AMD Fam10h TSC will tick in all
82 * C/P/S0/S1 states when this bit is set.
83 */
84 if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
85 return;
86
87 /*FALL THROUGH*/
88 default:
89 /* TSC could halt in idle, so notify users */
90 mark_tsc_unstable("TSC halts in idle");
91 }
92#endif
93}
94
95static unsigned long cpu_weight[NR_CPUS];
96static int tsk_in_cpu[NR_CPUS] = {[0 ... NR_CPUS-1] = -1};
97static DECLARE_BITMAP(pad_busy_cpus_bits, NR_CPUS);
98static void round_robin_cpu(unsigned int tsk_index)
99{
100 struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits);
101 cpumask_var_t tmp;
102 int cpu;
103 unsigned long min_weight = -1, preferred_cpu;
104
105 if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
106 return;
107
108 mutex_lock(&isolated_cpus_lock);
109 cpumask_clear(tmp);
110 for_each_cpu(cpu, pad_busy_cpus)
111 cpumask_or(tmp, tmp, topology_thread_cpumask(cpu));
112 cpumask_andnot(tmp, cpu_online_mask, tmp);
113 /* avoid HT sibilings if possible */
114 if (cpumask_empty(tmp))
115 cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus);
116 if (cpumask_empty(tmp)) {
117 mutex_unlock(&isolated_cpus_lock);
118 return;
119 }
120 for_each_cpu(cpu, tmp) {
121 if (cpu_weight[cpu] < min_weight) {
122 min_weight = cpu_weight[cpu];
123 preferred_cpu = cpu;
124 }
125 }
126
127 if (tsk_in_cpu[tsk_index] != -1)
128 cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus);
129 tsk_in_cpu[tsk_index] = preferred_cpu;
130 cpumask_set_cpu(preferred_cpu, pad_busy_cpus);
131 cpu_weight[preferred_cpu]++;
132 mutex_unlock(&isolated_cpus_lock);
133
134 set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu));
135}
136
137static void exit_round_robin(unsigned int tsk_index)
138{
139 struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits);
140 cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus);
141 tsk_in_cpu[tsk_index] = -1;
142}
143
144static unsigned int idle_pct = 5; /* percentage */
145static unsigned int round_robin_time = 10; /* second */
146static int power_saving_thread(void *data)
147{
148 struct sched_param param = {.sched_priority = 1};
149 int do_sleep;
150 unsigned int tsk_index = (unsigned long)data;
151 u64 last_jiffies = 0;
152
153 sched_setscheduler(current, SCHED_RR, &param);
154
155 while (!kthread_should_stop()) {
156 int cpu;
157 u64 expire_time;
158
159 try_to_freeze();
160
161 /* round robin to cpus */
162 if (last_jiffies + round_robin_time * HZ < jiffies) {
163 last_jiffies = jiffies;
164 round_robin_cpu(tsk_index);
165 }
166
167 do_sleep = 0;
168
169 current_thread_info()->status &= ~TS_POLLING;
170 /*
171 * TS_POLLING-cleared state must be visible before we test
172 * NEED_RESCHED:
173 */
174 smp_mb();
175
176 expire_time = jiffies + HZ * (100 - idle_pct) / 100;
177
178 while (!need_resched()) {
179 local_irq_disable();
180 cpu = smp_processor_id();
181 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
182 &cpu);
183 stop_critical_timings();
184
185 __monitor((void *)&current_thread_info()->flags, 0, 0);
186 smp_mb();
187 if (!need_resched())
188 __mwait(power_saving_mwait_eax, 1);
189
190 start_critical_timings();
191 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
192 &cpu);
193 local_irq_enable();
194
195 if (jiffies > expire_time) {
196 do_sleep = 1;
197 break;
198 }
199 }
200
201 current_thread_info()->status |= TS_POLLING;
202
203 /*
204 * current sched_rt has threshold for rt task running time.
205 * When a rt task uses 95% CPU time, the rt thread will be
206 * scheduled out for 5% CPU time to not starve other tasks. But
207 * the mechanism only works when all CPUs have RT task running,
208 * as if one CPU hasn't RT task, RT task from other CPUs will
209 * borrow CPU time from this CPU and cause RT task use > 95%
210 * CPU time. To make 'avoid staration' work, takes a nap here.
211 */
212 if (do_sleep)
213 schedule_timeout_killable(HZ * idle_pct / 100);
214 }
215
216 exit_round_robin(tsk_index);
217 return 0;
218}
219
220static struct task_struct *ps_tsks[NR_CPUS];
221static unsigned int ps_tsk_num;
222static int create_power_saving_task(void)
223{
224 ps_tsks[ps_tsk_num] = kthread_run(power_saving_thread,
225 (void *)(unsigned long)ps_tsk_num,
226 "power_saving/%d", ps_tsk_num);
227 if (ps_tsks[ps_tsk_num]) {
228 ps_tsk_num++;
229 return 0;
230 }
231 return -EINVAL;
232}
233
234static void destroy_power_saving_task(void)
235{
236 if (ps_tsk_num > 0) {
237 ps_tsk_num--;
238 kthread_stop(ps_tsks[ps_tsk_num]);
239 }
240}
241
242static void set_power_saving_task_num(unsigned int num)
243{
244 if (num > ps_tsk_num) {
245 while (ps_tsk_num < num) {
246 if (create_power_saving_task())
247 return;
248 }
249 } else if (num < ps_tsk_num) {
250 while (ps_tsk_num > num)
251 destroy_power_saving_task();
252 }
253}
254
255static int acpi_pad_idle_cpus(unsigned int num_cpus)
256{
257 get_online_cpus();
258
259 num_cpus = min_t(unsigned int, num_cpus, num_online_cpus());
260 set_power_saving_task_num(num_cpus);
261
262 put_online_cpus();
263 return 0;
264}
265
266static uint32_t acpi_pad_idle_cpus_num(void)
267{
268 return ps_tsk_num;
269}
270
271static ssize_t acpi_pad_rrtime_store(struct device *dev,
272 struct device_attribute *attr, const char *buf, size_t count)
273{
274 unsigned long num;
275 if (strict_strtoul(buf, 0, &num))
276 return -EINVAL;
277 if (num < 1 || num >= 100)
278 return -EINVAL;
279 mutex_lock(&isolated_cpus_lock);
280 round_robin_time = num;
281 mutex_unlock(&isolated_cpus_lock);
282 return count;
283}
284
285static ssize_t acpi_pad_rrtime_show(struct device *dev,
286 struct device_attribute *attr, char *buf)
287{
288 return scnprintf(buf, PAGE_SIZE, "%d", round_robin_time);
289}
290static DEVICE_ATTR(rrtime, S_IRUGO|S_IWUSR,
291 acpi_pad_rrtime_show,
292 acpi_pad_rrtime_store);
293
294static ssize_t acpi_pad_idlepct_store(struct device *dev,
295 struct device_attribute *attr, const char *buf, size_t count)
296{
297 unsigned long num;
298 if (strict_strtoul(buf, 0, &num))
299 return -EINVAL;
300 if (num < 1 || num >= 100)
301 return -EINVAL;
302 mutex_lock(&isolated_cpus_lock);
303 idle_pct = num;
304 mutex_unlock(&isolated_cpus_lock);
305 return count;
306}
307
308static ssize_t acpi_pad_idlepct_show(struct device *dev,
309 struct device_attribute *attr, char *buf)
310{
311 return scnprintf(buf, PAGE_SIZE, "%d", idle_pct);
312}
313static DEVICE_ATTR(idlepct, S_IRUGO|S_IWUSR,
314 acpi_pad_idlepct_show,
315 acpi_pad_idlepct_store);
316
317static ssize_t acpi_pad_idlecpus_store(struct device *dev,
318 struct device_attribute *attr, const char *buf, size_t count)
319{
320 unsigned long num;
321 if (strict_strtoul(buf, 0, &num))
322 return -EINVAL;
323 mutex_lock(&isolated_cpus_lock);
324 acpi_pad_idle_cpus(num);
325 mutex_unlock(&isolated_cpus_lock);
326 return count;
327}
328
329static ssize_t acpi_pad_idlecpus_show(struct device *dev,
330 struct device_attribute *attr, char *buf)
331{
332 return cpumask_scnprintf(buf, PAGE_SIZE,
333 to_cpumask(pad_busy_cpus_bits));
334}
335static DEVICE_ATTR(idlecpus, S_IRUGO|S_IWUSR,
336 acpi_pad_idlecpus_show,
337 acpi_pad_idlecpus_store);
338
339static int acpi_pad_add_sysfs(struct acpi_device *device)
340{
341 int result;
342
343 result = device_create_file(&device->dev, &dev_attr_idlecpus);
344 if (result)
345 return -ENODEV;
346 result = device_create_file(&device->dev, &dev_attr_idlepct);
347 if (result) {
348 device_remove_file(&device->dev, &dev_attr_idlecpus);
349 return -ENODEV;
350 }
351 result = device_create_file(&device->dev, &dev_attr_rrtime);
352 if (result) {
353 device_remove_file(&device->dev, &dev_attr_idlecpus);
354 device_remove_file(&device->dev, &dev_attr_idlepct);
355 return -ENODEV;
356 }
357 return 0;
358}
359
360static void acpi_pad_remove_sysfs(struct acpi_device *device)
361{
362 device_remove_file(&device->dev, &dev_attr_idlecpus);
363 device_remove_file(&device->dev, &dev_attr_idlepct);
364 device_remove_file(&device->dev, &dev_attr_rrtime);
365}
366
367/* Query firmware how many CPUs should be idle */
368static int acpi_pad_pur(acpi_handle handle, int *num_cpus)
369{
370 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
371 acpi_status status;
372 union acpi_object *package;
373 int rev, num, ret = -EINVAL;
374
375 status = acpi_evaluate_object(handle, "_PUR", NULL, &buffer);
376 if (ACPI_FAILURE(status))
377 return -EINVAL;
378 package = buffer.pointer;
379 if (package->type != ACPI_TYPE_PACKAGE || package->package.count != 2)
380 goto out;
381 rev = package->package.elements[0].integer.value;
382 num = package->package.elements[1].integer.value;
383 if (rev != 1)
384 goto out;
385 *num_cpus = num;
386 ret = 0;
387out:
388 kfree(buffer.pointer);
389 return ret;
390}
391
392/* Notify firmware how many CPUs are idle */
393static void acpi_pad_ost(acpi_handle handle, int stat,
394 uint32_t idle_cpus)
395{
396 union acpi_object params[3] = {
397 {.type = ACPI_TYPE_INTEGER,},
398 {.type = ACPI_TYPE_INTEGER,},
399 {.type = ACPI_TYPE_BUFFER,},
400 };
401 struct acpi_object_list arg_list = {3, params};
402
403 params[0].integer.value = ACPI_PROCESSOR_AGGREGATOR_NOTIFY;
404 params[1].integer.value = stat;
405 params[2].buffer.length = 4;
406 params[2].buffer.pointer = (void *)&idle_cpus;
407 acpi_evaluate_object(handle, "_OST", &arg_list, NULL);
408}
409
410static void acpi_pad_handle_notify(acpi_handle handle)
411{
412 int num_cpus, ret;
413 uint32_t idle_cpus;
414
415 mutex_lock(&isolated_cpus_lock);
416 if (acpi_pad_pur(handle, &num_cpus)) {
417 mutex_unlock(&isolated_cpus_lock);
418 return;
419 }
420 ret = acpi_pad_idle_cpus(num_cpus);
421 idle_cpus = acpi_pad_idle_cpus_num();
422 if (!ret)
423 acpi_pad_ost(handle, 0, idle_cpus);
424 else
425 acpi_pad_ost(handle, 1, 0);
426 mutex_unlock(&isolated_cpus_lock);
427}
428
429static void acpi_pad_notify(acpi_handle handle, u32 event,
430 void *data)
431{
432 struct acpi_device *device = data;
433
434 switch (event) {
435 case ACPI_PROCESSOR_AGGREGATOR_NOTIFY:
436 acpi_pad_handle_notify(handle);
437 acpi_bus_generate_proc_event(device, event, 0);
438 acpi_bus_generate_netlink_event(device->pnp.device_class,
439 dev_name(&device->dev), event, 0);
440 break;
441 default:
442 printk(KERN_WARNING"Unsupported event [0x%x]\n", event);
443 break;
444 }
445}
446
447static int acpi_pad_add(struct acpi_device *device)
448{
449 acpi_status status;
450
451 strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME);
452 strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS);
453
454 if (acpi_pad_add_sysfs(device))
455 return -ENODEV;
456
457 status = acpi_install_notify_handler(device->handle,
458 ACPI_DEVICE_NOTIFY, acpi_pad_notify, device);
459 if (ACPI_FAILURE(status)) {
460 acpi_pad_remove_sysfs(device);
461 return -ENODEV;
462 }
463
464 return 0;
465}
466
467static int acpi_pad_remove(struct acpi_device *device,
468 int type)
469{
470 mutex_lock(&isolated_cpus_lock);
471 acpi_pad_idle_cpus(0);
472 mutex_unlock(&isolated_cpus_lock);
473
474 acpi_remove_notify_handler(device->handle,
475 ACPI_DEVICE_NOTIFY, acpi_pad_notify);
476 acpi_pad_remove_sysfs(device);
477 return 0;
478}
479
480static const struct acpi_device_id pad_device_ids[] = {
481 {"ACPI000C", 0},
482 {"", 0},
483};
484MODULE_DEVICE_TABLE(acpi, pad_device_ids);
485
486static struct acpi_driver acpi_pad_driver = {
487 .name = "processor_aggregator",
488 .class = ACPI_PROCESSOR_AGGREGATOR_CLASS,
489 .ids = pad_device_ids,
490 .ops = {
491 .add = acpi_pad_add,
492 .remove = acpi_pad_remove,
493 },
494};
495
496static int __init acpi_pad_init(void)
497{
498 power_saving_mwait_init();
499 if (power_saving_mwait_eax == 0)
500 return -EINVAL;
501
502 return acpi_bus_register_driver(&acpi_pad_driver);
503}
504
505static void __exit acpi_pad_exit(void)
506{
507 acpi_bus_unregister_driver(&acpi_pad_driver);
508}
509
510module_init(acpi_pad_init);
511module_exit(acpi_pad_exit);
512MODULE_AUTHOR("Shaohua Li<shaohua.li@intel.com>");
513MODULE_DESCRIPTION("ACPI Processor Aggregator Driver");
514MODULE_LICENSE("GPL");