aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-09-19 05:27:32 -0400
committerIngo Molnar <mingo@elte.hu>2009-09-19 05:28:41 -0400
commit929bf0d0156562ce631728b6fa53d68004d456d2 (patch)
tree739063990a8077b29ef97e69d73bce94573daae4 /kernel
parentdef0a9b2573e00ab0b486cb5382625203ab4c4a6 (diff)
parent202c4675c55ddf6b443c7e057d2dff6b42ef71aa (diff)
Merge branch 'linus' into perfcounters/core
Merge reason: Bring in tracing changes we depend on. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/cpu.c15
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/dma-coherent.c176
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c2
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/perf_counter.c7
-rw-r--r--kernel/power/Kconfig14
-rw-r--r--kernel/power/hibernate.c21
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/snapshot.c412
-rw-r--r--kernel/printk.c6
-rw-r--r--kernel/sched.c448
-rw-r--r--kernel/sched_debug.c1
-rw-r--r--kernel/sched_fair.c414
-rw-r--r--kernel/sched_features.h122
-rw-r--r--kernel/sched_idletask.c4
-rw-r--r--kernel/sched_rt.c7
-rw-r--r--kernel/smp.c40
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sysctl.c14
-rw-r--r--kernel/taskstats.c10
-rw-r--r--kernel/trace/Kconfig28
-rw-r--r--kernel/trace/ftrace.c158
-rw-r--r--kernel/trace/ring_buffer.c17
-rw-r--r--kernel/trace/trace.c127
-rw-r--r--kernel/trace/trace.h276
-rw-r--r--kernel/trace/trace_boot.c8
-rw-r--r--kernel/trace/trace_clock.c24
-rw-r--r--kernel/trace/trace_entries.h383
-rw-r--r--kernel/trace/trace_event_profile.c5
-rw-r--r--kernel/trace/trace_event_types.h178
-rw-r--r--kernel/trace/trace_events.c85
-rw-r--r--kernel/trace/trace_events_filter.c41
-rw-r--r--kernel/trace/trace_export.c284
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c66
-rw-r--r--kernel/trace/trace_irqsoff.c16
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_output.c42
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_sched_wakeup.c52
44 files changed, 2009 insertions, 1543 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b833bd5cc127..3d9c7e27e3f9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -90,7 +90,6 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
90obj-$(CONFIG_MARKERS) += marker.o 90obj-$(CONFIG_MARKERS) += marker.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 92obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 93obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 94obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 95obj-$(CONFIG_X86_DS) += trace/
@@ -117,7 +116,7 @@ $(obj)/config_data.gz: .config FORCE
117 $(call if_changed,gzip) 116 $(call if_changed,gzip)
118 117
119quiet_cmd_ikconfiggz = IKCFG $@ 118quiet_cmd_ikconfiggz = IKCFG $@
120 cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ 119 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
121targets += config_data.h 120targets += config_data.h
122$(obj)/config_data.h: $(obj)/config_data.gz FORCE 121$(obj)/config_data.h: $(obj)/config_data.gz FORCE
123 $(call if_changed,ikconfiggz) 122 $(call if_changed,ikconfiggz)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
401 break; 401 break;
402 } 402 }
403 } 403 }
404
404 if (!error) { 405 if (!error) {
405 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
406 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
413 return error; 414 return error;
414} 415}
415 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
416void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
417{ 426{
418 int cpu, error; 427 int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
424 goto out; 433 goto out;
425 434
426 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
427 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
428 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
429 if (!error) { 441 if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
432 } 444 }
433 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
434 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
435 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
436out: 451out:
437 cpu_maps_update_done(); 452 cpu_maps_update_done();
diff --git a/kernel/cred.c b/kernel/cred.c
index 006fcab009d5..d7f7a01082eb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -147,7 +147,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
147 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
148 key_put(cred->request_key_auth); 148 key_put(cred->request_key_auth);
149 release_tgcred(cred); 149 release_tgcred(cred);
150 put_group_info(cred->group_info); 150 if (cred->group_info)
151 put_group_info(cred->group_info);
151 free_uid(cred->user); 152 free_uid(cred->user);
152 kmem_cache_free(cred_jar, cred); 153 kmem_cache_free(cred_jar, cred);
153} 154}
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..654efd09f6a9 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 37 depends on S390 || X86 || (PPC && EXPERIMENTAL)
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab8486..05071bf6a37b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -485,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
485 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 485 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
486 __hrtimer_init(timer, clock_id, mode); 486 __hrtimer_init(timer, clock_id, mode);
487} 487}
488EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
488 489
489void destroy_hrtimer_on_stack(struct hrtimer *timer) 490void destroy_hrtimer_on_stack(struct hrtimer *timer)
490{ 491{
@@ -1477,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1477 sl->timer.function = hrtimer_wakeup; 1478 sl->timer.function = hrtimer_wakeup;
1478 sl->task = task; 1479 sl->task = task;
1479} 1480}
1481EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1480 1482
1481static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1483static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1482{ 1484{
diff --git a/kernel/module.c b/kernel/module.c
index 46580edff0cb..05ce49ced8f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -369,7 +369,7 @@ EXPORT_SYMBOL_GPL(find_module);
369 369
370#ifdef CONFIG_SMP 370#ifdef CONFIG_SMP
371 371
372#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 372#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
373 373
374static void *percpu_modalloc(unsigned long size, unsigned long align, 374static void *percpu_modalloc(unsigned long size, unsigned long align,
375 const char *name) 375 const char *name)
@@ -394,7 +394,7 @@ static void percpu_modfree(void *freeme)
394 free_percpu(freeme); 394 free_percpu(freeme);
395} 395}
396 396
397#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 397#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
398 398
399/* Number of blocks used and allocated. */ 399/* Number of blocks used and allocated. */
400static unsigned int pcpu_num_used, pcpu_num_allocated; 400static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -540,7 +540,7 @@ static int percpu_modinit(void)
540} 540}
541__initcall(percpu_modinit); 541__initcall(percpu_modinit);
542 542
543#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 543#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
544 544
545static unsigned int find_pcpusec(Elf_Ehdr *hdr, 545static unsigned int find_pcpusec(Elf_Ehdr *hdr,
546 Elf_Shdr *sechdrs, 546 Elf_Shdr *sechdrs,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06d233a06da5..d013f4e89e9c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -106,16 +106,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
106 106
107void __weak perf_counter_print_debug(void) { } 107void __weak perf_counter_print_debug(void) { }
108 108
109static DEFINE_PER_CPU(int, disable_count); 109static DEFINE_PER_CPU(int, perf_disable_count);
110 110
111void __perf_disable(void) 111void __perf_disable(void)
112{ 112{
113 __get_cpu_var(disable_count)++; 113 __get_cpu_var(perf_disable_count)++;
114} 114}
115 115
116bool __perf_enable(void) 116bool __perf_enable(void)
117{ 117{
118 return !--__get_cpu_var(disable_count); 118 return !--__get_cpu_var(perf_disable_count);
119} 119}
120 120
121void perf_disable(void) 121void perf_disable(void)
@@ -4246,6 +4246,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4246 if (val) 4246 if (val)
4247 goto err_size; 4247 goto err_size;
4248 } 4248 }
4249 size = sizeof(*attr);
4249 } 4250 }
4250 4251
4251 ret = copy_from_user(attr, uattr, size); 4252 ret = copy_from_user(attr, uattr, size);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04b3a83d686f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..97955b0e44f4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid attempting to free too much memory too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
1219 * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
diff --git a/kernel/printk.c b/kernel/printk.c
index e10d193a833a..602033acd6c7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1075,12 +1075,6 @@ void __sched console_conditional_schedule(void)
1075} 1075}
1076EXPORT_SYMBOL(console_conditional_schedule); 1076EXPORT_SYMBOL(console_conditional_schedule);
1077 1077
1078void console_print(const char *s)
1079{
1080 printk(KERN_EMERG "%s", s);
1081}
1082EXPORT_SYMBOL(console_print);
1083
1084void console_unblank(void) 1078void console_unblank(void)
1085{ 1079{
1086 struct console *c; 1080 struct console *c;
diff --git a/kernel/sched.c b/kernel/sched.c
index e27a53685ed9..faf4d463bbff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -119,8 +119,6 @@
119 */ 119 */
120#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
121 121
122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
123
124static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
125{ 123{
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -295,12 +293,12 @@ struct task_group root_task_group;
295/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
296static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
297/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
298static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
299#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
300 298
301#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
302static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
303static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
304#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
305#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
306#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
378 376
379#else 377#else
380 378
381#ifdef CONFIG_SMP
382static int root_task_group_empty(void)
383{
384 return 1;
385}
386#endif
387
388static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
389static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
390{ 381{
@@ -514,14 +505,6 @@ struct root_domain {
514#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
515 struct cpupri cpupri; 506 struct cpupri cpupri;
516#endif 507#endif
517#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
518 /*
519 * Preferred wake up cpu nominated by sched_mc balance that will be
520 * used when most cpus are idle in the system indicating overall very
521 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
522 */
523 unsigned int sched_mc_preferred_wakeup_cpu;
524#endif
525}; 508};
526 509
527/* 510/*
@@ -646,9 +629,10 @@ struct rq {
646 629
647static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
648 631
649static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
650{ 634{
651 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
652} 636}
653 637
654static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -1509,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)
1509#endif 1493#endif
1510 1494
1511#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
1512static unsigned long source_load(int cpu, int type); 1496/* Used instead of source_load when we know the type == 0 */
1513static unsigned long target_load(int cpu, int type); 1497static unsigned long weighted_cpuload(const int cpu)
1498{
1499 return cpu_rq(cpu)->load.weight;
1500}
1501
1502/*
1503 * Return a low guess at the load of a migration-source cpu weighted
1504 * according to the scheduling class and "nice" value.
1505 *
1506 * We want to under-estimate the load of migration sources, to
1507 * balance conservatively.
1508 */
1509static unsigned long source_load(int cpu, int type)
1510{
1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long total = weighted_cpuload(cpu);
1513
1514 if (type == 0 || !sched_feat(LB_BIAS))
1515 return total;
1516
1517 return min(rq->cpu_load[type-1], total);
1518}
1519
1520/*
1521 * Return a high guess at the load of a migration-target cpu weighted
1522 * according to the scheduling class and "nice" value.
1523 */
1524static unsigned long target_load(int cpu, int type)
1525{
1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long total = weighted_cpuload(cpu);
1528
1529 if (type == 0 || !sched_feat(LB_BIAS))
1530 return total;
1531
1532 return max(rq->cpu_load[type-1], total);
1533}
1534
1535static struct sched_group *group_of(int cpu)
1536{
1537 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1538
1539 if (!sd)
1540 return NULL;
1541
1542 return sd->groups;
1543}
1544
1545static unsigned long power_of(int cpu)
1546{
1547 struct sched_group *group = group_of(cpu);
1548
1549 if (!group)
1550 return SCHED_LOAD_SCALE;
1551
1552 return group->cpu_power;
1553}
1554
1514static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1515 1556
1516static unsigned long cpu_avg_load_per_task(int cpu) 1557static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1695,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1695 1736
1696#ifdef CONFIG_PREEMPT 1737#ifdef CONFIG_PREEMPT
1697 1738
1739static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1740
1698/* 1741/*
1699 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1742 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1700 * way at the expense of forcing extra atomic operations in all 1743 * way at the expense of forcing extra atomic operations in all
@@ -1959,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1959} 2002}
1960 2003
1961#ifdef CONFIG_SMP 2004#ifdef CONFIG_SMP
1962
1963/* Used instead of source_load when we know the type == 0 */
1964static unsigned long weighted_cpuload(const int cpu)
1965{
1966 return cpu_rq(cpu)->load.weight;
1967}
1968
1969/* 2005/*
1970 * Is this task likely cache-hot: 2006 * Is this task likely cache-hot:
1971 */ 2007 */
@@ -2239,185 +2275,6 @@ void kick_process(struct task_struct *p)
2239 preempt_enable(); 2275 preempt_enable();
2240} 2276}
2241EXPORT_SYMBOL_GPL(kick_process); 2277EXPORT_SYMBOL_GPL(kick_process);
2242
2243/*
2244 * Return a low guess at the load of a migration-source cpu weighted
2245 * according to the scheduling class and "nice" value.
2246 *
2247 * We want to under-estimate the load of migration sources, to
2248 * balance conservatively.
2249 */
2250static unsigned long source_load(int cpu, int type)
2251{
2252 struct rq *rq = cpu_rq(cpu);
2253 unsigned long total = weighted_cpuload(cpu);
2254
2255 if (type == 0 || !sched_feat(LB_BIAS))
2256 return total;
2257
2258 return min(rq->cpu_load[type-1], total);
2259}
2260
2261/*
2262 * Return a high guess at the load of a migration-target cpu weighted
2263 * according to the scheduling class and "nice" value.
2264 */
2265static unsigned long target_load(int cpu, int type)
2266{
2267 struct rq *rq = cpu_rq(cpu);
2268 unsigned long total = weighted_cpuload(cpu);
2269
2270 if (type == 0 || !sched_feat(LB_BIAS))
2271 return total;
2272
2273 return max(rq->cpu_load[type-1], total);
2274}
2275
2276/*
2277 * find_idlest_group finds and returns the least busy CPU group within the
2278 * domain.
2279 */
2280static struct sched_group *
2281find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2282{
2283 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2284 unsigned long min_load = ULONG_MAX, this_load = 0;
2285 int load_idx = sd->forkexec_idx;
2286 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2287
2288 do {
2289 unsigned long load, avg_load;
2290 int local_group;
2291 int i;
2292
2293 /* Skip over this group if it has no CPUs allowed */
2294 if (!cpumask_intersects(sched_group_cpus(group),
2295 &p->cpus_allowed))
2296 continue;
2297
2298 local_group = cpumask_test_cpu(this_cpu,
2299 sched_group_cpus(group));
2300
2301 /* Tally up the load of all CPUs in the group */
2302 avg_load = 0;
2303
2304 for_each_cpu(i, sched_group_cpus(group)) {
2305 /* Bias balancing toward cpus of our domain */
2306 if (local_group)
2307 load = source_load(i, load_idx);
2308 else
2309 load = target_load(i, load_idx);
2310
2311 avg_load += load;
2312 }
2313
2314 /* Adjust by relative CPU power of the group */
2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2316
2317 if (local_group) {
2318 this_load = avg_load;
2319 this = group;
2320 } else if (avg_load < min_load) {
2321 min_load = avg_load;
2322 idlest = group;
2323 }
2324 } while (group = group->next, group != sd->groups);
2325
2326 if (!idlest || 100*this_load < imbalance*min_load)
2327 return NULL;
2328 return idlest;
2329}
2330
2331/*
2332 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2333 */
2334static int
2335find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2336{
2337 unsigned long load, min_load = ULONG_MAX;
2338 int idlest = -1;
2339 int i;
2340
2341 /* Traverse only the allowed CPUs */
2342 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2343 load = weighted_cpuload(i);
2344
2345 if (load < min_load || (load == min_load && i == this_cpu)) {
2346 min_load = load;
2347 idlest = i;
2348 }
2349 }
2350
2351 return idlest;
2352}
2353
2354/*
2355 * sched_balance_self: balance the current task (running on cpu) in domains
2356 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2357 * SD_BALANCE_EXEC.
2358 *
2359 * Balance, ie. select the least loaded group.
2360 *
2361 * Returns the target CPU number, or the same CPU if no balancing is needed.
2362 *
2363 * preempt must be disabled.
2364 */
2365static int sched_balance_self(int cpu, int flag)
2366{
2367 struct task_struct *t = current;
2368 struct sched_domain *tmp, *sd = NULL;
2369
2370 for_each_domain(cpu, tmp) {
2371 /*
2372 * If power savings logic is enabled for a domain, stop there.
2373 */
2374 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2375 break;
2376 if (tmp->flags & flag)
2377 sd = tmp;
2378 }
2379
2380 if (sd)
2381 update_shares(sd);
2382
2383 while (sd) {
2384 struct sched_group *group;
2385 int new_cpu, weight;
2386
2387 if (!(sd->flags & flag)) {
2388 sd = sd->child;
2389 continue;
2390 }
2391
2392 group = find_idlest_group(sd, t, cpu);
2393 if (!group) {
2394 sd = sd->child;
2395 continue;
2396 }
2397
2398 new_cpu = find_idlest_cpu(group, t, cpu);
2399 if (new_cpu == -1 || new_cpu == cpu) {
2400 /* Now try balancing at a lower domain level of cpu */
2401 sd = sd->child;
2402 continue;
2403 }
2404
2405 /* Now try balancing at a lower domain level of new_cpu */
2406 cpu = new_cpu;
2407 weight = cpumask_weight(sched_domain_span(sd));
2408 sd = NULL;
2409 for_each_domain(cpu, tmp) {
2410 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2411 break;
2412 if (tmp->flags & flag)
2413 sd = tmp;
2414 }
2415 /* while loop will break here if sd == NULL */
2416 }
2417
2418 return cpu;
2419}
2420
2421#endif /* CONFIG_SMP */ 2278#endif /* CONFIG_SMP */
2422 2279
2423/** 2280/**
@@ -2455,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p,
2455 * 2312 *
2456 * returns failure only if the task is already active. 2313 * returns failure only if the task is already active.
2457 */ 2314 */
2458static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2315static int try_to_wake_up(struct task_struct *p, unsigned int state,
2316 int wake_flags)
2459{ 2317{
2460 int cpu, orig_cpu, this_cpu, success = 0; 2318 int cpu, orig_cpu, this_cpu, success = 0;
2461 unsigned long flags; 2319 unsigned long flags;
2462 long old_state;
2463 struct rq *rq; 2320 struct rq *rq;
2464 2321
2465 if (!sched_feat(SYNC_WAKEUPS)) 2322 if (!sched_feat(SYNC_WAKEUPS))
2466 sync = 0; 2323 wake_flags &= ~WF_SYNC;
2467
2468#ifdef CONFIG_SMP
2469 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2470 struct sched_domain *sd;
2471 2324
2472 this_cpu = raw_smp_processor_id(); 2325 this_cpu = get_cpu();
2473 cpu = task_cpu(p);
2474
2475 for_each_domain(this_cpu, sd) {
2476 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2477 update_shares(sd);
2478 break;
2479 }
2480 }
2481 }
2482#endif
2483 2326
2484 smp_wmb(); 2327 smp_wmb();
2485 rq = task_rq_lock(p, &flags); 2328 rq = task_rq_lock(p, &flags);
2486 update_rq_clock(rq); 2329 update_rq_clock(rq);
2487 old_state = p->state; 2330 if (!(p->state & state))
2488 if (!(old_state & state))
2489 goto out; 2331 goto out;
2490 2332
2491 if (p->se.on_rq) 2333 if (p->se.on_rq)
@@ -2493,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2493 2335
2494 cpu = task_cpu(p); 2336 cpu = task_cpu(p);
2495 orig_cpu = cpu; 2337 orig_cpu = cpu;
2496 this_cpu = smp_processor_id();
2497 2338
2498#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2499 if (unlikely(task_running(rq, p))) 2340 if (unlikely(task_running(rq, p)))
2500 goto out_activate; 2341 goto out_activate;
2501 2342
2502 cpu = p->sched_class->select_task_rq(p, sync); 2343 /*
2503 if (cpu != orig_cpu) { 2344 * In order to handle concurrent wakeups and release the rq->lock
2345 * we put the task in TASK_WAKING state.
2346 *
2347 * First fix up the nr_uninterruptible count:
2348 */
2349 if (task_contributes_to_load(p))
2350 rq->nr_uninterruptible--;
2351 p->state = TASK_WAKING;
2352 task_rq_unlock(rq, &flags);
2353
2354 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2355 if (cpu != orig_cpu)
2504 set_task_cpu(p, cpu); 2356 set_task_cpu(p, cpu);
2505 task_rq_unlock(rq, &flags);
2506 /* might preempt at this point */
2507 rq = task_rq_lock(p, &flags);
2508 old_state = p->state;
2509 if (!(old_state & state))
2510 goto out;
2511 if (p->se.on_rq)
2512 goto out_running;
2513 2357
2514 this_cpu = smp_processor_id(); 2358 rq = task_rq_lock(p, &flags);
2515 cpu = task_cpu(p); 2359 WARN_ON(p->state != TASK_WAKING);
2516 } 2360 cpu = task_cpu(p);
2517 2361
2518#ifdef CONFIG_SCHEDSTATS 2362#ifdef CONFIG_SCHEDSTATS
2519 schedstat_inc(rq, ttwu_count); 2363 schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2533out_activate: 2377out_activate:
2534#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2535 schedstat_inc(p, se.nr_wakeups); 2379 schedstat_inc(p, se.nr_wakeups);
2536 if (sync) 2380 if (wake_flags & WF_SYNC)
2537 schedstat_inc(p, se.nr_wakeups_sync); 2381 schedstat_inc(p, se.nr_wakeups_sync);
2538 if (orig_cpu != cpu) 2382 if (orig_cpu != cpu)
2539 schedstat_inc(p, se.nr_wakeups_migrate); 2383 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,7 +2406,7 @@ out_activate:
2562 2406
2563out_running: 2407out_running:
2564 trace_sched_wakeup(rq, p, success); 2408 trace_sched_wakeup(rq, p, success);
2565 check_preempt_curr(rq, p, sync); 2409 check_preempt_curr(rq, p, wake_flags);
2566 2410
2567 p->state = TASK_RUNNING; 2411 p->state = TASK_RUNNING;
2568#ifdef CONFIG_SMP 2412#ifdef CONFIG_SMP
@@ -2571,6 +2415,7 @@ out_running:
2571#endif 2415#endif
2572out: 2416out:
2573 task_rq_unlock(rq, &flags); 2417 task_rq_unlock(rq, &flags);
2418 put_cpu();
2574 2419
2575 return success; 2420 return success;
2576} 2421}
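
The hunk above replaces try_to_wake_up()'s lone `sync` integer with a `wake_flags` bitmask (WF_SYNC, WF_FORK), so SYNC_WAKEUPS can simply clear one bit while any other hints survive. A minimal standalone sketch of that calling convention follows; the flag values, the feature toggle and the printf are illustrative assumptions, not the kernel code.

/* Illustrative sketch of a wake_flags bitmask (values are assumptions). */
#include <stdio.h>

#define WF_SYNC 0x01	/* waker will sleep right after this wakeup */
#define WF_FORK 0x02	/* wakeup of a freshly forked child */

static int feature_sync_wakeups = 1;	/* stands in for sched_feat(SYNC_WAKEUPS) */

static int try_to_wake_up_sketch(const char *task, unsigned int wake_flags)
{
	if (!feature_sync_wakeups)
		wake_flags &= ~WF_SYNC;	/* drop the hint, keep the other bits */

	printf("%s: sync=%d fork=%d\n", task,
	       !!(wake_flags & WF_SYNC), !!(wake_flags & WF_FORK));
	return 1;			/* "success" */
}

int main(void)
{
	try_to_wake_up_sketch("pipe reader", WF_SYNC);
	try_to_wake_up_sketch("forked child", WF_FORK);
	return 0;
}
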
@@ -2613,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)
2613 p->se.avg_overlap = 0; 2458 p->se.avg_overlap = 0;
2614 p->se.start_runtime = 0; 2459 p->se.start_runtime = 0;
2615 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2460 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2461 p->se.avg_running = 0;
2616 2462
2617#ifdef CONFIG_SCHEDSTATS 2463#ifdef CONFIG_SCHEDSTATS
2618 p->se.wait_start = 0; 2464 p->se.wait_start = 0;
@@ -2674,11 +2520,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2674 2520
2675 __sched_fork(p); 2521 __sched_fork(p);
2676 2522
2677#ifdef CONFIG_SMP
2678 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2679#endif
2680 set_task_cpu(p, cpu);
2681
2682 /* 2523 /*
2683 * Make sure we do not leak PI boosting priority to the child. 2524 * Make sure we do not leak PI boosting priority to the child.
2684 */ 2525 */
@@ -2709,6 +2550,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2709 if (!rt_prio(p->prio)) 2550 if (!rt_prio(p->prio))
2710 p->sched_class = &fair_sched_class; 2551 p->sched_class = &fair_sched_class;
2711 2552
2553#ifdef CONFIG_SMP
2554 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2555#endif
2556 set_task_cpu(p, cpu);
2557
2712#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2558#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2713 if (likely(sched_info_on())) 2559 if (likely(sched_info_on()))
2714 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2560 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2754,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2754 inc_nr_running(rq); 2600 inc_nr_running(rq);
2755 } 2601 }
2756 trace_sched_wakeup_new(rq, p, 1); 2602 trace_sched_wakeup_new(rq, p, 1);
2757 check_preempt_curr(rq, p, 0); 2603 check_preempt_curr(rq, p, WF_FORK);
2758#ifdef CONFIG_SMP 2604#ifdef CONFIG_SMP
2759 if (p->sched_class->task_wake_up) 2605 if (p->sched_class->task_wake_up)
2760 p->sched_class->task_wake_up(rq, p); 2606 p->sched_class->task_wake_up(rq, p);
@@ -3263,7 +3109,7 @@ out:
3263void sched_exec(void) 3109void sched_exec(void)
3264{ 3110{
3265 int new_cpu, this_cpu = get_cpu(); 3111 int new_cpu, this_cpu = get_cpu();
3266 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3112 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3267 put_cpu(); 3113 put_cpu();
3268 if (new_cpu != this_cpu) 3114 if (new_cpu != this_cpu)
3269 sched_migrate_task(current, new_cpu); 3115 sched_migrate_task(current, new_cpu);
@@ -3683,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3683 *imbalance = sds->min_load_per_task; 3529 *imbalance = sds->min_load_per_task;
3684 sds->busiest = sds->group_min; 3530 sds->busiest = sds->group_min;
3685 3531
3686 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3687 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3688 group_first_cpu(sds->group_leader);
3689 }
3690
3691 return 1; 3532 return 1;
3692 3533
3693} 3534}
@@ -3711,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3711} 3552}
3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3553#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3713 3554
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3555
3556unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3557{
3558 return SCHED_LOAD_SCALE;
3559}
3560
3561unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3562{
3563 return default_scale_freq_power(sd, cpu);
3564}
3565
3566unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3715{ 3567{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3568 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain; 3569 unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3721 return smt_gain; 3573 return smt_gain;
3722} 3574}
3723 3575
3576unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3577{
3578 return default_scale_smt_power(sd, cpu);
3579}
3580
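
default_scale_freq_power()/default_scale_smt_power() plus the __weak arch_* wrappers above are the usual weak-symbol override pattern: an architecture that provides a strong symbol wins at link time, everyone else falls back to the generic default. Below is a self-contained userspace sketch of that pattern; arch_scale_power() and the 1024 constant are invented stand-ins.

/* Weak-symbol default pattern: a strong arch_scale_power() elsewhere
 * replaces this weak one at link time; otherwise the default is used. */
#include <stdio.h>

unsigned long default_scale_power(int cpu)
{
	(void)cpu;
	return 1024;			/* stands in for SCHED_LOAD_SCALE */
}

__attribute__((weak)) unsigned long arch_scale_power(int cpu)
{
	return default_scale_power(cpu);
}

int main(void)
{
	printf("cpu0 power scale: %lu\n", arch_scale_power(0));
	return 0;
}
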
3724unsigned long scale_rt_power(int cpu) 3581unsigned long scale_rt_power(int cpu)
3725{ 3582{
3726 struct rq *rq = cpu_rq(cpu); 3583 struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3745 unsigned long power = SCHED_LOAD_SCALE; 3602 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups; 3603 struct sched_group *sdg = sd->groups;
3747 3604
3748 /* here we could scale based on cpufreq */ 3605 if (sched_feat(ARCH_POWER))
3606 power *= arch_scale_freq_power(sd, cpu);
3607 else
3608 power *= default_scale_freq_power(sd, cpu);
3609
3610 power >>= SCHED_LOAD_SHIFT;
3749 3611
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3612 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu); 3613 if (sched_feat(ARCH_POWER))
3614 power *= arch_scale_smt_power(sd, cpu);
3615 else
3616 power *= default_scale_smt_power(sd, cpu);
3617
3752 power >>= SCHED_LOAD_SHIFT; 3618 power >>= SCHED_LOAD_SHIFT;
3753 } 3619 }
3754 3620
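
update_cpu_power() now folds the frequency and SMT factors into cpu_power with fixed-point multiply-and-shift steps relative to SCHED_LOAD_SCALE. A worked standalone example with made-up scaling values:

/* Fixed-point scaling as in update_cpu_power(): each factor is given
 * relative to SCHED_LOAD_SCALE and folded in by multiply + shift. */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;	/* nominal 1024 */
	unsigned long freq_scale = 768;		/* assume ~75% of full frequency */
	unsigned long smt_scale = 589;		/* assume smt_gain 1178 over 2 siblings */

	power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
	power = (power * smt_scale) >> SCHED_LOAD_SHIFT;

	printf("effective cpu_power: %lu of %lu\n", power, SCHED_LOAD_SCALE);
	return 0;
}
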
@@ -4161,26 +4027,6 @@ ret:
4161 return NULL; 4027 return NULL;
4162} 4028}
4163 4029
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
4184/* 4030/*
4185 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4031 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4186 */ 4032 */
@@ -5465,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
5465#endif 5311#endif
5466} 5312}
5467 5313
5468static void put_prev_task(struct rq *rq, struct task_struct *prev) 5314static void put_prev_task(struct rq *rq, struct task_struct *p)
5469{ 5315{
5470 if (prev->state == TASK_RUNNING) { 5316 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5471 u64 runtime = prev->se.sum_exec_runtime;
5472 5317
5473 runtime -= prev->se.prev_sum_exec_runtime; 5318 update_avg(&p->se.avg_running, runtime);
5474 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5475 5319
5320 if (p->state == TASK_RUNNING) {
5476 /* 5321 /*
5477 * In order to avoid avg_overlap growing stale when we are 5322 * In order to avoid avg_overlap growing stale when we are
5478 * indeed overlapping and hence not getting put to sleep, grow 5323 * indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5482 * correlates to the amount of cache footprint a task can 5327 * correlates to the amount of cache footprint a task can
5483 * build up. 5328 * build up.
5484 */ 5329 */
5485 update_avg(&prev->se.avg_overlap, runtime); 5330 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5331 update_avg(&p->se.avg_overlap, runtime);
5332 } else {
5333 update_avg(&p->se.avg_running, 0);
5486 } 5334 }
5487 prev->sched_class->put_prev_task(rq, prev); 5335 p->sched_class->put_prev_task(rq, p);
5488} 5336}
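
The rewritten put_prev_task() feeds every slice into se.avg_running via update_avg(), a decaying average of recent runtime that is pulled back toward zero when the task stops running. The sketch below assumes a 1/8 smoothing step purely for illustration; the real step size is not shown in this diff.

/* Decaying average in the style of avg_running/avg_overlap: each new
 * sample moves the average by a fraction of the difference. */
#include <stdio.h>
#include <stdint.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff / 8;		/* assumed smoothing step */
}

int main(void)
{
	uint64_t avg_running = 0;
	uint64_t samples[] = { 800000, 900000, 100000, 0, 0 };	/* ns, made up */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_running, samples[i]);
		printf("sample %u: avg_running = %llu ns\n", i,
		       (unsigned long long)avg_running);
	}
	return 0;
}
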
5489 5337
5490/* 5338/*
@@ -5716,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5716 5564
5717#endif /* CONFIG_PREEMPT */ 5565#endif /* CONFIG_PREEMPT */
5718 5566
5719int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5567int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5720 void *key) 5568 void *key)
5721{ 5569{
5722 return try_to_wake_up(curr->private, mode, sync); 5570 return try_to_wake_up(curr->private, mode, wake_flags);
5723} 5571}
5724EXPORT_SYMBOL(default_wake_function); 5572EXPORT_SYMBOL(default_wake_function);
5725 5573
@@ -5733,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
5733 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5581 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5734 */ 5582 */
5735static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5583static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5736 int nr_exclusive, int sync, void *key) 5584 int nr_exclusive, int wake_flags, void *key)
5737{ 5585{
5738 wait_queue_t *curr, *next; 5586 wait_queue_t *curr, *next;
5739 5587
5740 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5588 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5741 unsigned flags = curr->flags; 5589 unsigned flags = curr->flags;
5742 5590
5743 if (curr->func(curr, mode, sync, key) && 5591 if (curr->func(curr, mode, wake_flags, key) &&
5744 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5745 break; 5593 break;
5746 } 5594 }
@@ -5801,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5801 int nr_exclusive, void *key) 5649 int nr_exclusive, void *key)
5802{ 5650{
5803 unsigned long flags; 5651 unsigned long flags;
5804 int sync = 1; 5652 int wake_flags = WF_SYNC;
5805 5653
5806 if (unlikely(!q)) 5654 if (unlikely(!q))
5807 return; 5655 return;
5808 5656
5809 if (unlikely(!nr_exclusive)) 5657 if (unlikely(!nr_exclusive))
5810 sync = 0; 5658 wake_flags = 0;
5811 5659
5812 spin_lock_irqsave(&q->lock, flags); 5660 spin_lock_irqsave(&q->lock, flags);
5813 __wake_up_common(q, mode, nr_exclusive, sync, key); 5661 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5814 spin_unlock_irqrestore(&q->lock, flags); 5662 spin_unlock_irqrestore(&q->lock, flags);
5815} 5663}
5816EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5664EXPORT_SYMBOL_GPL(__wake_up_sync_key);
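
default_wake_function() and __wake_up_common() now merely thread wake_flags through to try_to_wake_up(); the exclusive-waiter accounting is unchanged. The standalone imitation below (structures and flag values invented) shows why all non-exclusive waiters are woken but only nr_exclusive exclusive ones are.

/* Imitation of the __wake_up_common() loop: wake everyone, but stop
 * after nr_exclusive waiters that carry the exclusive flag. */
#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
	const char *name;
	unsigned int flags;
};

static int wake(struct waiter *w, unsigned int mode, int wake_flags)
{
	(void)mode;
	printf("waking %s (wake_flags=%#x)\n", w->name, wake_flags);
	return 1;			/* report a successful wakeup */
}

static void wake_up_common(struct waiter *q, int n, int nr_exclusive,
			   int wake_flags)
{
	for (int i = 0; i < n; i++) {
		if (wake(&q[i], 0, wake_flags) &&
		    (q[i].flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}

int main(void)
{
	struct waiter q[] = {
		{ "poller", 0 },
		{ "worker1", WQ_FLAG_EXCLUSIVE },
		{ "worker2", WQ_FLAG_EXCLUSIVE },	/* not woken: nr_exclusive = 1 */
	};

	wake_up_common(q, 3, 1, 0x01 /* assume this means WF_SYNC */);
	return 0;
}
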
@@ -8000,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd)
8000 } 7848 }
8001 7849
8002 /* Following flags don't use groups */ 7850 /* Following flags don't use groups */
8003 if (sd->flags & (SD_WAKE_IDLE | 7851 if (sd->flags & (SD_WAKE_AFFINE))
8004 SD_WAKE_AFFINE |
8005 SD_WAKE_BALANCE))
8006 return 0; 7852 return 0;
8007 7853
8008 return 1; 7854 return 1;
@@ -8019,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
8019 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7865 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
8020 return 0; 7866 return 0;
8021 7867
8022 /* Does parent contain flags not in child? */
8023 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
8024 if (cflags & SD_WAKE_AFFINE)
8025 pflags &= ~SD_WAKE_BALANCE;
8026 /* Flags needing groups don't count if only 1 group in parent */ 7868 /* Flags needing groups don't count if only 1 group in parent */
8027 if (parent->groups == parent->groups->next) { 7869 if (parent->groups == parent->groups->next) {
8028 pflags &= ~(SD_LOAD_BALANCE | 7870 pflags &= ~(SD_LOAD_BALANCE |
@@ -8708,10 +8550,10 @@ static void set_domain_attribute(struct sched_domain *sd,
8708 request = attr->relax_domain_level; 8550 request = attr->relax_domain_level;
8709 if (request < sd->level) { 8551 if (request < sd->level) {
8710 /* turn off idle balance on this domain */ 8552 /* turn off idle balance on this domain */
8711 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8553 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8712 } else { 8554 } else {
8713 /* turn on idle balance on this domain */ 8555 /* turn on idle balance on this domain */
8714 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8556 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8715 } 8557 }
8716} 8558}
8717 8559
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd0891267..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a097e909e80f..990b188803ce 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -712,7 +712,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
712 712
713 if (!initial) { 713 if (!initial) {
714 /* sleeps up to a single latency don't count. */ 714
715 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 if (sched_feat(FAIR_SLEEPERS)) {
716 unsigned long thresh = sysctl_sched_latency; 716 unsigned long thresh = sysctl_sched_latency;
717 717
718 /* 718 /*
@@ -726,6 +726,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
726 task_of(se)->policy != SCHED_IDLE)) 726 task_of(se)->policy != SCHED_IDLE))
727 thresh = calc_delta_fair(thresh, se); 727 thresh = calc_delta_fair(thresh, se);
728 728
729 /*
730 * Halve their sleep time's effect, to allow
731 * for a gentler effect of sleepers:
732 */
733 if (sched_feat(GENTLE_FAIR_SLEEPERS))
734 thresh >>= 1;
735
729 vruntime -= thresh; 736 vruntime -= thresh;
730 } 737 }
731 } 738 }
@@ -758,10 +765,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
758 765
759static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 766static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
760{ 767{
761 if (cfs_rq->last == se) 768 if (!se || cfs_rq->last == se)
762 cfs_rq->last = NULL; 769 cfs_rq->last = NULL;
763 770
764 if (cfs_rq->next == se) 771 if (!se || cfs_rq->next == se)
765 cfs_rq->next = NULL; 772 cfs_rq->next = NULL;
766} 773}
767 774
@@ -1063,83 +1070,6 @@ static void yield_task_fair(struct rq *rq)
1063 se->vruntime = rightmost->vruntime + 1; 1070 se->vruntime = rightmost->vruntime + 1;
1064} 1071}
1065 1072
1066/*
1067 * wake_idle() will wake a task on an idle cpu if task->cpu is
1068 * not idle and an idle cpu is available. The span of cpus to
1069 * search starts with cpus closest then further out as needed,
1070 * so we always favor a closer, idle cpu.
1071 * Domains may include CPUs that are not usable for migration,
1072 * hence we need to mask them out (rq->rd->online)
1073 *
1074 * Returns the CPU we should wake onto.
1075 */
1076#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1077
1078#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1079
1080static int wake_idle(int cpu, struct task_struct *p)
1081{
1082 struct sched_domain *sd;
1083 int i;
1084 unsigned int chosen_wakeup_cpu;
1085 int this_cpu;
1086 struct rq *task_rq = task_rq(p);
1087
1088 /*
1089 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1090 * are idle and this is not a kernel thread and this task's affinity
1091 * allows it to be moved to preferred cpu, then just move!
1092 */
1093
1094 this_cpu = smp_processor_id();
1095 chosen_wakeup_cpu =
1096 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1097
1098 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1099 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1100 p->mm && !(p->flags & PF_KTHREAD) &&
1101 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1102 return chosen_wakeup_cpu;
1103
1104 /*
1105 * If it is idle, then it is the best cpu to run this task.
1106 *
1107 * This cpu is also the best, if it has more than one task already.
1108 * Siblings must also be busy (in most cases) as they didn't already
1109 * pick up the extra load from this cpu, and hence we need not check
1110 * sibling runqueue info. This avoids the checks and cache-miss
1111 * penalties associated with that.
1112 */
1113 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1114 return cpu;
1115
1116 for_each_domain(cpu, sd) {
1117 if ((sd->flags & SD_WAKE_IDLE)
1118 || ((sd->flags & SD_WAKE_IDLE_FAR)
1119 && !task_hot(p, task_rq->clock, sd))) {
1120 for_each_cpu_and(i, sched_domain_span(sd),
1121 &p->cpus_allowed) {
1122 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1123 if (i != task_cpu(p)) {
1124 schedstat_inc(p,
1125 se.nr_wakeups_idle);
1126 }
1127 return i;
1128 }
1129 }
1130 } else {
1131 break;
1132 }
1133 }
1134 return cpu;
1135}
1136#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1137static inline int wake_idle(int cpu, struct task_struct *p)
1138{
1139 return cpu;
1140}
1141#endif
1142
1143#ifdef CONFIG_SMP 1073#ifdef CONFIG_SMP
1144 1074
1145#ifdef CONFIG_FAIR_GROUP_SCHED 1075#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1226,25 +1156,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1226 1156
1227#endif 1157#endif
1228 1158
1229static int 1159static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1230wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1231 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1232 int idx, unsigned long load, unsigned long this_load,
1233 unsigned int imbalance)
1234{ 1160{
1235 struct task_struct *curr = this_rq->curr; 1161 struct task_struct *curr = current;
1236 struct task_group *tg; 1162 unsigned long this_load, load;
1237 unsigned long tl = this_load; 1163 int idx, this_cpu, prev_cpu;
1238 unsigned long tl_per_task; 1164 unsigned long tl_per_task;
1165 unsigned int imbalance;
1166 struct task_group *tg;
1239 unsigned long weight; 1167 unsigned long weight;
1240 int balanced; 1168 int balanced;
1241 1169
1242 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1170 idx = sd->wake_idx;
1243 return 0; 1171 this_cpu = smp_processor_id();
1172 prev_cpu = task_cpu(p);
1173 load = source_load(prev_cpu, idx);
1174 this_load = target_load(this_cpu, idx);
1244 1175
1245 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1176 if (sync) {
1246 p->se.avg_overlap > sysctl_sched_migration_cost)) 1177 if (sched_feat(SYNC_LESS) &&
1247 sync = 0; 1178 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1179 p->se.avg_overlap > sysctl_sched_migration_cost))
1180 sync = 0;
1181 } else {
1182 if (sched_feat(SYNC_MORE) &&
1183 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1184 p->se.avg_overlap < sysctl_sched_migration_cost))
1185 sync = 1;
1186 }
1248 1187
1249 /* 1188 /*
1250 * If sync wakeup then subtract the (maximum possible) 1189 * If sync wakeup then subtract the (maximum possible)
@@ -1255,24 +1194,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1255 tg = task_group(current); 1194 tg = task_group(current);
1256 weight = current->se.load.weight; 1195 weight = current->se.load.weight;
1257 1196
1258 tl += effective_load(tg, this_cpu, -weight, -weight); 1197 this_load += effective_load(tg, this_cpu, -weight, -weight);
1259 load += effective_load(tg, prev_cpu, 0, -weight); 1198 load += effective_load(tg, prev_cpu, 0, -weight);
1260 } 1199 }
1261 1200
1262 tg = task_group(p); 1201 tg = task_group(p);
1263 weight = p->se.load.weight; 1202 weight = p->se.load.weight;
1264 1203
1204 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1205
1265 /* 1206 /*
1266 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1207 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1267 * due to the sync cause above having dropped tl to 0, we'll always have 1208 * due to the sync cause above having dropped this_load to 0, we'll
1268 * an imbalance, but there's really nothing you can do about that, so 1209 * always have an imbalance, but there's really nothing you can do
1269 * that's good too. 1210 * about that, so that's good too.
1270 * 1211 *
1271 * Otherwise check if either cpus are near enough in load to allow this 1212 * Otherwise check if either cpus are near enough in load to allow this
1272 * task to be woken on this_cpu. 1213 * task to be woken on this_cpu.
1273 */ 1214 */
1274 balanced = !tl || 1215 balanced = !this_load ||
1275 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1216 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1276 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1217 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1277 1218
1278 /* 1219 /*
@@ -1286,14 +1227,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1286 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1227 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1287 tl_per_task = cpu_avg_load_per_task(this_cpu); 1228 tl_per_task = cpu_avg_load_per_task(this_cpu);
1288 1229
1289 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1230 if (balanced ||
1290 tl_per_task)) { 1231 (this_load <= load &&
1232 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1291 /* 1233 /*
1292 * This domain has SD_WAKE_AFFINE and 1234 * This domain has SD_WAKE_AFFINE and
1293 * p is cache cold in this domain, and 1235 * p is cache cold in this domain, and
1294 * there is no bad imbalance. 1236 * there is no bad imbalance.
1295 */ 1237 */
1296 schedstat_inc(this_sd, ttwu_move_affine); 1238 schedstat_inc(sd, ttwu_move_affine);
1297 schedstat_inc(p, se.nr_wakeups_affine); 1239 schedstat_inc(p, se.nr_wakeups_affine);
1298 1240
1299 return 1; 1241 return 1;
@@ -1301,65 +1243,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1301 return 0; 1243 return 0;
1302} 1244}
1303 1245
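
The core of the new wake_affine() is the `balanced` test: wake onto this_cpu only when its load, with the woken task added, stays within the domain's imbalance percentage of prev_cpu's load. The numeric example below is simplified; it drops the effective_load() group-scheduling terms and uses invented load figures.

/* Simplified wake_affine() balance check with made-up numbers. */
#include <stdio.h>

int main(void)
{
	unsigned long this_load = 1024;		/* load already on this_cpu */
	unsigned long prev_load = 2048;		/* load on the task's previous cpu */
	unsigned long task_weight = 1024;	/* weight the woken task would add */
	unsigned int imbalance_pct = 125;	/* typical sd->imbalance_pct */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	int balanced = !this_load ||
		100 * (this_load + task_weight) <= imbalance * prev_load;

	printf("imbalance=%u -> %s affine wakeup\n",
	       imbalance, balanced ? "allow" : "reject");
	return 0;
}
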
1304static int select_task_rq_fair(struct task_struct *p, int sync) 1246/*
1247 * find_idlest_group finds and returns the least busy CPU group within the
1248 * domain.
1249 */
1250static struct sched_group *
1251find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1252 int this_cpu, int load_idx)
1305{ 1253{
1306 struct sched_domain *sd, *this_sd = NULL; 1254 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1307 int prev_cpu, this_cpu, new_cpu; 1255 unsigned long min_load = ULONG_MAX, this_load = 0;
1308 unsigned long load, this_load; 1256 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1309 struct rq *this_rq;
1310 unsigned int imbalance;
1311 int idx;
1312 1257
1313 prev_cpu = task_cpu(p); 1258 do {
1314 this_cpu = smp_processor_id(); 1259 unsigned long load, avg_load;
1315 this_rq = cpu_rq(this_cpu); 1260 int local_group;
1316 new_cpu = prev_cpu; 1261 int i;
1317 1262
1318 /* 1263 /* Skip over this group if it has no CPUs allowed */
1319 * 'this_sd' is the first domain that both 1264 if (!cpumask_intersects(sched_group_cpus(group),
1320 * this_cpu and prev_cpu are present in: 1265 &p->cpus_allowed))
1321 */ 1266 continue;
1322 for_each_domain(this_cpu, sd) { 1267
1323 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1268 local_group = cpumask_test_cpu(this_cpu,
1324 this_sd = sd; 1269 sched_group_cpus(group));
1325 break; 1270
1271 /* Tally up the load of all CPUs in the group */
1272 avg_load = 0;
1273
1274 for_each_cpu(i, sched_group_cpus(group)) {
1275 /* Bias balancing toward cpus of our domain */
1276 if (local_group)
1277 load = source_load(i, load_idx);
1278 else
1279 load = target_load(i, load_idx);
1280
1281 avg_load += load;
1282 }
1283
1284 /* Adjust by relative CPU power of the group */
1285 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1286
1287 if (local_group) {
1288 this_load = avg_load;
1289 this = group;
1290 } else if (avg_load < min_load) {
1291 min_load = avg_load;
1292 idlest = group;
1293 }
1294 } while (group = group->next, group != sd->groups);
1295
1296 if (!idlest || 100*this_load < imbalance*min_load)
1297 return NULL;
1298 return idlest;
1299}
1300
1301/*
1302 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1303 */
1304static int
1305find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1306{
1307 unsigned long load, min_load = ULONG_MAX;
1308 int idlest = -1;
1309 int i;
1310
1311 /* Traverse only the allowed CPUs */
1312 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1313 load = weighted_cpuload(i);
1314
1315 if (load < min_load || (load == min_load && i == this_cpu)) {
1316 min_load = load;
1317 idlest = i;
1326 } 1318 }
1327 } 1319 }
1328 1320
1329 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1321 return idlest;
1330 goto out; 1322}
1331 1323
1332 /* 1324/*
1333 * Check for affine wakeup and passive balancing possibilities. 1325 * sched_balance_self: balance the current task (running on cpu) in domains
1334 */ 1326 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1335 if (!this_sd) 1327 * SD_BALANCE_EXEC.
1328 *
1329 * Balance, ie. select the least loaded group.
1330 *
1331 * Returns the target CPU number, or the same CPU if no balancing is needed.
1332 *
1333 * preempt must be disabled.
1334 */
1335static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1336{
1337 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1338 int cpu = smp_processor_id();
1339 int prev_cpu = task_cpu(p);
1340 int new_cpu = cpu;
1341 int want_affine = 0;
1342 int want_sd = 1;
1343 int sync = wake_flags & WF_SYNC;
1344
1345 if (sd_flag & SD_BALANCE_WAKE) {
1346 if (sched_feat(AFFINE_WAKEUPS))
1347 want_affine = 1;
1348 new_cpu = prev_cpu;
1349 }
1350
1351 rcu_read_lock();
1352 for_each_domain(cpu, tmp) {
1353 /*
1354 * If power savings logic is enabled for a domain, see if we
1355 * are not overloaded, if so, don't balance wider.
1356 */
1357 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1358 unsigned long power = 0;
1359 unsigned long nr_running = 0;
1360 unsigned long capacity;
1361 int i;
1362
1363 for_each_cpu(i, sched_domain_span(tmp)) {
1364 power += power_of(i);
1365 nr_running += cpu_rq(i)->cfs.nr_running;
1366 }
1367
1368 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1369
1370 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1371 nr_running /= 2;
1372
1373 if (nr_running < capacity)
1374 want_sd = 0;
1375 }
1376
1377 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1378 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1379
1380 affine_sd = tmp;
1381 want_affine = 0;
1382 }
1383
1384 if (!want_sd && !want_affine)
1385 break;
1386
1387 if (!(tmp->flags & sd_flag))
1388 continue;
1389
1390 if (want_sd)
1391 sd = tmp;
1392 }
1393
1394 if (sched_feat(LB_SHARES_UPDATE)) {
1395 /*
1396 * Pick the largest domain to update shares over
1397 */
1398 tmp = sd;
1399 if (affine_sd && (!tmp ||
1400 cpumask_weight(sched_domain_span(affine_sd)) >
1401 cpumask_weight(sched_domain_span(sd))))
1402 tmp = affine_sd;
1403
1404 if (tmp)
1405 update_shares(tmp);
1406 }
1407
1408 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1409 new_cpu = cpu;
1336 goto out; 1410 goto out;
1411 }
1337 1412
1338 idx = this_sd->wake_idx; 1413 while (sd) {
1414 int load_idx = sd->forkexec_idx;
1415 struct sched_group *group;
1416 int weight;
1339 1417
1340 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1418 if (!(sd->flags & sd_flag)) {
1419 sd = sd->child;
1420 continue;
1421 }
1341 1422
1342 load = source_load(prev_cpu, idx); 1423 if (sd_flag & SD_BALANCE_WAKE)
1343 this_load = target_load(this_cpu, idx); 1424 load_idx = sd->wake_idx;
1344 1425
1345 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1426 group = find_idlest_group(sd, p, cpu, load_idx);
1346 load, this_load, imbalance)) 1427 if (!group) {
1347 return this_cpu; 1428 sd = sd->child;
1429 continue;
1430 }
1348 1431
1349 /* 1432 new_cpu = find_idlest_cpu(group, p, cpu);
1350 * Start passive balancing when half the imbalance_pct 1433 if (new_cpu == -1 || new_cpu == cpu) {
1351 * limit is reached. 1434 /* Now try balancing at a lower domain level of cpu */
1352 */ 1435 sd = sd->child;
1353 if (this_sd->flags & SD_WAKE_BALANCE) { 1436 continue;
1354 if (imbalance*this_load <= 100*load) {
1355 schedstat_inc(this_sd, ttwu_move_balance);
1356 schedstat_inc(p, se.nr_wakeups_passive);
1357 return this_cpu;
1358 } 1437 }
1438
1439 /* Now try balancing at a lower domain level of new_cpu */
1440 cpu = new_cpu;
1441 weight = cpumask_weight(sched_domain_span(sd));
1442 sd = NULL;
1443 for_each_domain(cpu, tmp) {
1444 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1445 break;
1446 if (tmp->flags & sd_flag)
1447 sd = tmp;
1448 }
1449 /* while loop will break here if sd == NULL */
1359 } 1450 }
1360 1451
1361out: 1452out:
1362 return wake_idle(new_cpu, p); 1453 rcu_read_unlock();
1454 return new_cpu;
1363} 1455}
1364#endif /* CONFIG_SMP */ 1456#endif /* CONFIG_SMP */
1365 1457
@@ -1472,11 +1564,12 @@ static void set_next_buddy(struct sched_entity *se)
1472/* 1564/*
1473 * Preempt the current task with a newly woken task if needed: 1565 * Preempt the current task with a newly woken task if needed:
1474 */ 1566 */
1475static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1567static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1476{ 1568{
1477 struct task_struct *curr = rq->curr; 1569 struct task_struct *curr = rq->curr;
1478 struct sched_entity *se = &curr->se, *pse = &p->se; 1570 struct sched_entity *se = &curr->se, *pse = &p->se;
1479 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1571 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1572 int sync = wake_flags & WF_SYNC;
1480 1573
1481 update_curr(cfs_rq); 1574 update_curr(cfs_rq);
1482 1575
@@ -1502,7 +1595,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1502 */ 1595 */
1503 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1596 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1504 set_last_buddy(se); 1597 set_last_buddy(se);
1505 set_next_buddy(pse); 1598 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1599 set_next_buddy(pse);
1506 1600
1507 /* 1601 /*
1508 * We can come here with TIF_NEED_RESCHED already set from new task 1602 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1524,16 +1618,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1524 return; 1618 return;
1525 } 1619 }
1526 1620
1527 if (!sched_feat(WAKEUP_PREEMPT)) 1621 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1528 return; 1622 (sched_feat(WAKEUP_OVERLAP) &&
1529 1623 (se->avg_overlap < sysctl_sched_migration_cost &&
1530 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1624 pse->avg_overlap < sysctl_sched_migration_cost))) {
1531 (se->avg_overlap < sysctl_sched_migration_cost &&
1532 pse->avg_overlap < sysctl_sched_migration_cost))) {
1533 resched_task(curr); 1625 resched_task(curr);
1534 return; 1626 return;
1535 } 1627 }
1536 1628
1629 if (sched_feat(WAKEUP_RUNNING)) {
1630 if (pse->avg_running < se->avg_running) {
1631 set_next_buddy(pse);
1632 resched_task(curr);
1633 return;
1634 }
1635 }
1636
1637 if (!sched_feat(WAKEUP_PREEMPT))
1638 return;
1639
1537 find_matching_se(&se, &pse); 1640 find_matching_se(&se, &pse);
1538 1641
1539 BUG_ON(!pse); 1642 BUG_ON(!pse);
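
check_preempt_wakeup() now tries the WAKEUP_SYNC, WAKEUP_OVERLAP and new WAKEUP_RUNNING heuristics before the generic WAKEUP_PREEMPT gate. A compact decision-function sketch follows; the feature settings, threshold and sample values are assumptions for illustration.

/* Sketch of the reordered wakeup-preemption checks. */
#include <stdio.h>

struct ent { unsigned long avg_overlap, avg_running; };

static int feat_wakeup_sync, feat_wakeup_overlap;
static int feat_wakeup_running = 1, feat_wakeup_preempt = 1;
static unsigned long migration_cost = 500000;	/* ns, assumed */

static int should_preempt(struct ent *curr, struct ent *woken, int sync)
{
	if ((feat_wakeup_sync && sync) ||
	    (feat_wakeup_overlap &&
	     curr->avg_overlap < migration_cost &&
	     woken->avg_overlap < migration_cost))
		return 1;

	if (feat_wakeup_running && woken->avg_running < curr->avg_running)
		return 1;			/* favour tasks that run short */

	if (!feat_wakeup_preempt)
		return 0;

	return 1;				/* fall through to the vruntime check */
}

int main(void)
{
	struct ent curr = { 900000, 4000000 }, woken = { 100000, 200000 };

	printf("preempt: %d\n", should_preempt(&curr, &woken, 0));
	return 0;
}
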
@@ -1556,8 +1659,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1556 /* 1659 /*
1557 * If se was a buddy, clear it so that it will have to earn 1660 * If se was a buddy, clear it so that it will have to earn
1558 * the favour again. 1661 * the favour again.
1662 *
1663 * If se was not a buddy, clear the buddies because neither
1664 * was eligible to run, let them earn it again.
1665 *
1666 * IOW. unconditionally clear buddies.
1559 */ 1667 */
1560 __clear_buddies(cfs_rq, se); 1668 __clear_buddies(cfs_rq, NULL);
1561 set_next_entity(cfs_rq, se); 1669 set_next_entity(cfs_rq, se);
1562 cfs_rq = group_cfs_rq(se); 1670 cfs_rq = group_cfs_rq(se);
1563 } while (cfs_rq); 1671 } while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e2dc63a5815d..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint; pipes and the like use this to indicate that
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore gets a cache benefit from being placed on the same cpu; see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, which increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot; this decreases the likelihood of a
103 * cache buddy being migrated away and increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
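
sched_features.h is consumed as an X-macro: the SCHED_FEAT() list above is expanded once into enum bits and once into the default feature mask, and sched_feat() then tests a bit at runtime. The sketch below imitates that expansion with a cut-down list; it is not the kernel's exact macro set.

/* X-macro imitation of the SCHED_FEAT() mechanism. */
#include <stdio.h>

#define FEATURES(F)			\
	F(FAIR_SLEEPERS, 1)		\
	F(GENTLE_FAIR_SLEEPERS, 1)	\
	F(ARCH_POWER, 0)

#define AS_ENUM(name, enabled) FEAT_##name,
enum { FEATURES(AS_ENUM) NR_FEATURES };

#define AS_DEFAULT(name, enabled) ((enabled) << FEAT_##name) |
static unsigned int sysctl_features = FEATURES(AS_DEFAULT) 0;

#define feat(name) (!!(sysctl_features & (1U << FEAT_##name)))

int main(void)
{
	printf("FAIR_SLEEPERS=%d ARCH_POWER=%d\n",
	       feat(FAIR_SLEEPERS), feat(ARCH_POWER));
	return 0;
}
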
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..a8b448af004b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
11 return task_cpu(p); /* IDLE tasks are never migrated */ 11
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526c..13de7126a6ab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
940 940
941static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
942{ 942{
943 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
944 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
945 /* 948 /*
946 * If the current task is an RT task, then 949 * If the current task is an RT task, then
947 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
999/* 1002/*
1000 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
1001 */ 1004 */
1002static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1003{ 1006{
1004 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
1005 resched_task(rq->curr); 1008 resched_task(rq->curr);
diff --git a/kernel/smp.c b/kernel/smp.c
index 94188b8ecc33..8e218500ab14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -177,6 +177,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 177 int cpu = get_cpu();
178 178
179 /* 179 /*
180 * Shouldn't receive this interrupt on a cpu that is not yet online.
181 */
182 WARN_ON_ONCE(!cpu_online(cpu));
183
184 /*
180 * Ensure entry is visible on call_function_queue after we have 185 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 186 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 187 * If we don't have this, then we may miss an entry on the list
@@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 235 unsigned int data_flags;
231 LIST_HEAD(list); 236 LIST_HEAD(list);
232 237
238 /*
239 * Shouldn't receive this interrupt on a cpu that is not yet online.
240 */
241 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
242
233 spin_lock(&q->lock); 243 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 244 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 245 spin_unlock(&q->lock);
@@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 295 */
286 this_cpu = get_cpu(); 296 this_cpu = get_cpu();
287 297
288 /* Can deadlock when called with interrupts disabled */ 298 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 299 * Can deadlock when called with interrupts disabled.
300 * We allow CPUs that are not yet online though, as no one else can
301 * send smp call function interrupt to this cpu and as such deadlocks
302 * can't happen.
303 */
304 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
305 && !oops_in_progress);
290 306
291 if (cpu == this_cpu) { 307 if (cpu == this_cpu) {
292 local_irq_save(flags); 308 local_irq_save(flags);
@@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 345{
330 csd_lock(data); 346 csd_lock(data);
331 347
332 /* Can deadlock when called with interrupts disabled */ 348 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 349 * Can deadlock when called with interrupts disabled.
350 * We allow CPUs that are not yet online though, as no one else can
351 * send smp call function interrupt to this cpu and as such deadlocks
352 * can't happen.
353 */
354 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
355 && !oops_in_progress);
334 356
335 generic_exec_single(cpu, data, wait); 357 generic_exec_single(cpu, data, wait);
336} 358}
@@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 387 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 388 int cpu, next_cpu, this_cpu = smp_processor_id();
367 389
368 /* Can deadlock when called with interrupts disabled */ 390 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 391 * Can deadlock when called with interrupts disabled.
392 * We allow CPUs that are not yet online though, as no one else can
393 * send smp call function interrupt to this cpu and as such deadlocks
394 * can't happen.
395 */
396 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
397 && !oops_in_progress);
370 398
371 /* So, what's a CPU they want? Ignoring this one. */ 399 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 400 cpu = cpumask_first_and(mask, cpu_online_mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7db25067cd2d..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3125cff1c570..1a631ba684a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -91,6 +91,9 @@ extern int sysctl_nr_trim_pages;
91#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable; 92extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled;
96#endif
94 97
95/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
96#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -997,7 +1000,16 @@ static struct ctl_table kern_table[] = {
997 .proc_handler = &proc_dointvec, 1000 .proc_handler = &proc_dointvec,
998 }, 1001 },
999#endif 1002#endif
1000 1003#ifdef CONFIG_BLOCK
1004 {
1005 .ctl_name = CTL_UNNUMBERED,
1006 .procname = "blk_iopoll",
1007 .data = &blk_iopoll_enabled,
1008 .maxlen = sizeof(int),
1009 .mode = 0644,
1010 .proc_handler = &proc_dointvec,
1011 },
1012#endif
1001/* 1013/*
1002 * NOTE: do not add new entries to this table unless you have read 1014 * NOTE: do not add new entries to this table unless you have read
1003 * Documentation/sysctl/ctl_unnumbered.txt 1015 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1ea0d1234f4a..e71634604400 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_SYSCALL_TRACEPOINTS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -469,6 +479,18 @@ config FTRACE_STARTUP_TEST
469 functioning properly. It will do tests on all the configured 479 functioning properly. It will do tests on all the configured
470 tracers of ftrace. 480 tracers of ftrace.
471 481
482config EVENT_TRACE_TEST_SYSCALLS
483 bool "Run selftest on syscall events"
484 depends on FTRACE_STARTUP_TEST
485 help
486 This option will also enable testing every syscall event.
487 It enables each event, runs various loads with the event enabled,
488 and then disables it again. This adds a bit more time to kernel boot
489 since it is done for every system call defined.
490
491 TBD - enable a way to actually call the syscalls as we test their
492 events
493
472config MMIOTRACE 494config MMIOTRACE
473 bool "Memory mapped IO tracing" 495 bool "Memory mapped IO tracing"
474 depends on HAVE_MMIOTRACE_SUPPORT && PCI 496 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c804e24f96f..cc615f84751b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1323 1323
1324enum { 1324enum {
1325 FTRACE_ITER_FILTER = (1 << 0), 1325 FTRACE_ITER_FILTER = (1 << 0),
1326 FTRACE_ITER_CONT = (1 << 1), 1326 FTRACE_ITER_NOTRACE = (1 << 1),
1327 FTRACE_ITER_NOTRACE = (1 << 2), 1327 FTRACE_ITER_FAILURES = (1 << 2),
1328 FTRACE_ITER_FAILURES = (1 << 3), 1328 FTRACE_ITER_PRINTALL = (1 << 3),
1329 FTRACE_ITER_PRINTALL = (1 << 4), 1329 FTRACE_ITER_HASH = (1 << 4),
1330 FTRACE_ITER_HASH = (1 << 5),
1331}; 1330};
1332 1331
1333#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1332#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1337,8 +1336,7 @@ struct ftrace_iterator {
1337 int hidx; 1336 int hidx;
1338 int idx; 1337 int idx;
1339 unsigned flags; 1338 unsigned flags;
1340 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1339 struct trace_parser parser;
1341 unsigned buffer_idx;
1342}; 1340};
1343 1341
1344static void * 1342static void *
@@ -1407,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v)
1407 if (rec->ops->print) 1405 if (rec->ops->print)
1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1406 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1409 1407
1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); 1408 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1411 1409
1412 if (rec->data) 1410 if (rec->data)
1413 seq_printf(m, ":%p", rec->data); 1411 seq_printf(m, ":%p", rec->data);
@@ -1517,7 +1515,7 @@ static int t_show(struct seq_file *m, void *v)
1517 if (!rec) 1515 if (!rec)
1518 return 0; 1516 return 0;
1519 1517
1520 seq_printf(m, "%pf\n", (void *)rec->ip); 1518 seq_printf(m, "%ps\n", (void *)rec->ip);
1521 1519
1522 return 0; 1520 return 0;
1523} 1521}
@@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1604 if (!iter) 1602 if (!iter)
1605 return -ENOMEM; 1603 return -ENOMEM;
1606 1604
1605 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1606 kfree(iter);
1607 return -ENOMEM;
1608 }
1609
1607 mutex_lock(&ftrace_regex_lock); 1610 mutex_lock(&ftrace_regex_lock);
1608 if ((file->f_mode & FMODE_WRITE) && 1611 if ((file->f_mode & FMODE_WRITE) &&
1609 (file->f_flags & O_TRUNC)) 1612 (file->f_flags & O_TRUNC))
@@ -2059,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2059 int i, len = 0; 2062 int i, len = 0;
2060 char *search; 2063 char *search;
2061 2064
2062 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2065 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2063 glob = NULL; 2066 glob = NULL;
2064 else { 2067 else if (glob) {
2065 int not; 2068 int not;
2066 2069
2067 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos, int enable) 2199 size_t cnt, loff_t *ppos, int enable)
2197{ 2200{
2198 struct ftrace_iterator *iter; 2201 struct ftrace_iterator *iter;
2199 char ch; 2202 struct trace_parser *parser;
2200 size_t read = 0; 2203 ssize_t ret, read;
2201 ssize_t ret;
2202 2204
2203 if (!cnt || cnt < 0) 2205 if (!cnt || cnt < 0)
2204 return 0; 2206 return 0;
@@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2211 } else 2213 } else
2212 iter = file->private_data; 2214 iter = file->private_data;
2213 2215
2214 if (!*ppos) { 2216 parser = &iter->parser;
2215 iter->flags &= ~FTRACE_ITER_CONT; 2217 read = trace_get_user(parser, ubuf, cnt, ppos);
2216 iter->buffer_idx = 0;
2217 }
2218
2219 ret = get_user(ch, ubuf++);
2220 if (ret)
2221 goto out;
2222 read++;
2223 cnt--;
2224 2218
2225 /* 2219 if (trace_parser_loaded(parser) &&
2226 * If the parser haven't finished with the last write, 2220 !trace_parser_cont(parser)) {
2227 * continue reading the user input without skipping spaces. 2221 ret = ftrace_process_regex(parser->buffer,
2228 */ 2222 parser->idx, enable);
2229 if (!(iter->flags & FTRACE_ITER_CONT)) {
2230 /* skip white space */
2231 while (cnt && isspace(ch)) {
2232 ret = get_user(ch, ubuf++);
2233 if (ret)
2234 goto out;
2235 read++;
2236 cnt--;
2237 }
2238
2239 /* only spaces were written */
2240 if (isspace(ch)) {
2241 *ppos += read;
2242 ret = read;
2243 goto out;
2244 }
2245
2246 iter->buffer_idx = 0;
2247 }
2248
2249 while (cnt && !isspace(ch)) {
2250 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2251 iter->buffer[iter->buffer_idx++] = ch;
2252 else {
2253 ret = -EINVAL;
2254 goto out;
2255 }
2256 ret = get_user(ch, ubuf++);
2257 if (ret) 2223 if (ret)
2258 goto out; 2224 goto out;
2259 read++;
2260 cnt--;
2261 }
2262 2225
2263 if (isspace(ch)) { 2226 trace_parser_clear(parser);
2264 iter->buffer[iter->buffer_idx] = 0;
2265 ret = ftrace_process_regex(iter->buffer,
2266 iter->buffer_idx, enable);
2267 if (ret)
2268 goto out;
2269 iter->buffer_idx = 0;
2270 } else {
2271 iter->flags |= FTRACE_ITER_CONT;
2272 iter->buffer[iter->buffer_idx++] = ch;
2273 } 2227 }
2274 2228
2275 *ppos += read;
2276 ret = read; 2229 ret = read;
2277 out:
2278 mutex_unlock(&ftrace_regex_lock);
2279 2230
2231 mutex_unlock(&ftrace_regex_lock);
2232out:
2280 return ret; 2233 return ret;
2281} 2234}
2282 2235
@@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2381{ 2334{
2382 struct seq_file *m = (struct seq_file *)file->private_data; 2335 struct seq_file *m = (struct seq_file *)file->private_data;
2383 struct ftrace_iterator *iter; 2336 struct ftrace_iterator *iter;
2337 struct trace_parser *parser;
2384 2338
2385 mutex_lock(&ftrace_regex_lock); 2339 mutex_lock(&ftrace_regex_lock);
2386 if (file->f_mode & FMODE_READ) { 2340 if (file->f_mode & FMODE_READ) {
@@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2390 } else 2344 } else
2391 iter = file->private_data; 2345 iter = file->private_data;
2392 2346
2393 if (iter->buffer_idx) { 2347 parser = &iter->parser;
2394 iter->buffer[iter->buffer_idx] = 0; 2348 if (trace_parser_loaded(parser)) {
2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2349 parser->buffer[parser->idx] = 0;
2350 ftrace_match_records(parser->buffer, parser->idx, enable);
2396 } 2351 }
2397 2352
2398 mutex_lock(&ftrace_lock); 2353 mutex_lock(&ftrace_lock);
@@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2400 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2355 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2401 mutex_unlock(&ftrace_lock); 2356 mutex_unlock(&ftrace_lock);
2402 2357
2358 trace_parser_put(parser);
2403 kfree(iter); 2359 kfree(iter);
2360
2404 mutex_unlock(&ftrace_regex_lock); 2361 mutex_unlock(&ftrace_regex_lock);
2405 return 0; 2362 return 0;
2406} 2363}
@@ -2499,7 +2456,7 @@ static int g_show(struct seq_file *m, void *v)
2499 return 0; 2456 return 0;
2500 } 2457 }
2501 2458
2502 seq_printf(m, "%pf\n", v); 2459 seq_printf(m, "%ps\n", (void *)*ptr);
2503 2460
2504 return 0; 2461 return 0;
2505} 2462}
@@ -2602,12 +2559,10 @@ static ssize_t
2602ftrace_graph_write(struct file *file, const char __user *ubuf, 2559ftrace_graph_write(struct file *file, const char __user *ubuf,
2603 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2604{ 2561{
2605 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2562 struct trace_parser parser;
2606 unsigned long *array; 2563 unsigned long *array;
2607 size_t read = 0; 2564 size_t read = 0;
2608 ssize_t ret; 2565 ssize_t ret;
2609 int index = 0;
2610 char ch;
2611 2566
2612 if (!cnt || cnt < 0) 2567 if (!cnt || cnt < 0)
2613 return 0; 2568 return 0;
@@ -2625,51 +2580,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2625 } else 2580 } else
2626 array = file->private_data; 2581 array = file->private_data;
2627 2582
2628 ret = get_user(ch, ubuf++); 2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2629 if (ret) 2584 ret = -ENOMEM;
2630 goto out; 2585 goto out;
2631 read++;
2632 cnt--;
2633
2634 /* skip white space */
2635 while (cnt && isspace(ch)) {
2636 ret = get_user(ch, ubuf++);
2637 if (ret)
2638 goto out;
2639 read++;
2640 cnt--;
2641 } 2586 }
2642 2587
2643 if (isspace(ch)) { 2588 read = trace_get_user(&parser, ubuf, cnt, ppos);
2644 *ppos += read;
2645 ret = read;
2646 goto out;
2647 }
2648 2589
2649 while (cnt && !isspace(ch)) { 2590 if (trace_parser_loaded((&parser))) {
2650 if (index < FTRACE_BUFF_MAX) 2591 parser.buffer[parser.idx] = 0;
2651 buffer[index++] = ch; 2592
2652 else { 2593 /* we allow only one expression at a time */
2653 ret = -EINVAL; 2594 ret = ftrace_set_func(array, &ftrace_graph_count,
2654 goto out; 2595 parser.buffer);
2655 }
2656 ret = get_user(ch, ubuf++);
2657 if (ret) 2596 if (ret)
2658 goto out; 2597 goto out;
2659 read++;
2660 cnt--;
2661 } 2598 }
2662 buffer[index] = 0;
2663
2664 /* we allow only one expression at a time */
2665 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2666 if (ret)
2667 goto out;
2668
2669 file->f_pos += read;
2670 2599
2671 ret = read; 2600 ret = read;
2672 out: 2601 out:
2602 trace_parser_put(&parser);
2673 mutex_unlock(&graph_lock); 2603 mutex_unlock(&graph_lock);
2674 2604
2675 return ret; 2605 return ret;
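
The rewritten ftrace_regex_write() and ftrace_graph_write() above delegate the byte-by-byte user-input handling to the new trace_parser helpers. A minimal sketch of the calling pattern, using only the helpers introduced in this series; process_token() is a hypothetical stand-in for ftrace_process_regex()/ftrace_set_func(), and error handling is trimmed:

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read, ret;

	/* allocate parser.buffer (FTRACE_BUFF_MAX bytes) */
	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
		return -ENOMEM;

	/* copy one whitespace-delimited token from user space */
	read = trace_get_user(&parser, ubuf, cnt, ppos);

	/* act only once a complete token (trailing space seen) is buffered */
	if (trace_parser_loaded(&parser) && !trace_parser_cont(&parser)) {
		parser.buffer[parser.idx] = 0;
		ret = process_token(parser.buffer);	/* hypothetical consumer */
		if (ret < 0)
			read = ret;
	}

	trace_parser_put(&parser);	/* free parser.buffer */
	return read;
}

ftrace_regex_write() keeps its parser inside struct ftrace_iterator so partial writes survive across calls; the sketch uses an on-stack parser, as ftrace_graph_write() does.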
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 454e74e718cf..6eef38923b07 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -701,8 +701,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
701 701
702 val &= ~RB_FLAG_MASK; 702 val &= ~RB_FLAG_MASK;
703 703
704 ret = (unsigned long)cmpxchg(&list->next, 704 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag); 705 val | old_flag, val | new_flag);
706 706
707 /* check if the reader took the page */ 707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val) 708 if ((ret & ~RB_FLAG_MASK) != val)
@@ -794,7 +794,7 @@ static int rb_head_page_replace(struct buffer_page *old,
794 val = *ptr & ~RB_FLAG_MASK; 794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD; 795 val |= RB_PAGE_HEAD;
796 796
797 ret = cmpxchg(ptr, val, &new->list); 797 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798 798
799 return ret == val; 799 return ret == val;
800} 800}
@@ -2997,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2997} 2997}
2998 2998
2999static struct ring_buffer_event * 2999static struct ring_buffer_event *
3000rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3000rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3001{ 3001{
3002 struct ring_buffer_per_cpu *cpu_buffer;
3003 struct ring_buffer_event *event; 3002 struct ring_buffer_event *event;
3004 struct buffer_page *reader; 3003 struct buffer_page *reader;
3005 int nr_loops = 0; 3004 int nr_loops = 0;
3006 3005
3007 cpu_buffer = buffer->buffers[cpu];
3008
3009 again: 3006 again:
3010 /* 3007 /*
3011 * We repeat when a timestamp is encountered. It is possible 3008 * We repeat when a timestamp is encountered. It is possible
@@ -3049,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3049 case RINGBUF_TYPE_DATA: 3046 case RINGBUF_TYPE_DATA:
3050 if (ts) { 3047 if (ts) {
3051 *ts = cpu_buffer->read_stamp + event->time_delta; 3048 *ts = cpu_buffer->read_stamp + event->time_delta;
3052 ring_buffer_normalize_time_stamp(buffer, 3049 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3053 cpu_buffer->cpu, ts); 3050 cpu_buffer->cpu, ts);
3054 } 3051 }
3055 return event; 3052 return event;
@@ -3168,7 +3165,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3168 local_irq_save(flags); 3165 local_irq_save(flags);
3169 if (dolock) 3166 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock); 3167 spin_lock(&cpu_buffer->reader_lock);
3171 event = rb_buffer_peek(buffer, cpu, ts); 3168 event = rb_buffer_peek(cpu_buffer, ts);
3172 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3169 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer); 3170 rb_advance_reader(cpu_buffer);
3174 if (dolock) 3171 if (dolock)
@@ -3237,7 +3234,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3237 if (dolock) 3234 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock); 3235 spin_lock(&cpu_buffer->reader_lock);
3239 3236
3240 event = rb_buffer_peek(buffer, cpu, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts);
3241 if (event) 3238 if (event)
3242 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3243 3240
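
The two cmpxchg() fixes above matter because the ring buffer keeps its head/reader flags in the low bits of the list pointers (masked by RB_FLAG_MASK), so the compare-and-swap has to operate on unsigned long values rather than on pointers. A small userspace illustration of that tagged-pointer update, assuming a GCC __sync builtin in place of the kernel's cmpxchg(); TAG_MASK and set_tag() are illustrative names only:

#include <stdint.h>
#include <stdio.h>

#define TAG_MASK 0x3UL   /* low bits of an aligned pointer are free for flags */

/* atomically replace the tag on *slot, keeping the pointer part intact */
static int set_tag(uintptr_t *slot, uintptr_t old_tag, uintptr_t new_tag)
{
	uintptr_t ptr = *slot & ~TAG_MASK;
	uintptr_t ret;

	ret = __sync_val_compare_and_swap(slot, ptr | old_tag, ptr | new_tag);
	return (ret & ~TAG_MASK) == ptr;   /* 0 if someone moved the pointer */
}

int main(void)
{
	static long page;                       /* stands in for a buffer page */
	uintptr_t slot = (uintptr_t)&page | 1;  /* pointer tagged as "head" */

	printf("retagged: %d, tag now %lu\n",
	       set_tag(&slot, 1, 2), (unsigned long)(slot & TAG_MASK));
	return 0;
}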
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe30..fd52a19dd172 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -339,6 +339,112 @@ static struct {
339 339
340int trace_clock_id; 340int trace_clock_id;
341 341
342/*
343 * trace_parser_get_init - gets the buffer for trace parser
344 */
345int trace_parser_get_init(struct trace_parser *parser, int size)
346{
347 memset(parser, 0, sizeof(*parser));
348
349 parser->buffer = kmalloc(size, GFP_KERNEL);
350 if (!parser->buffer)
351 return 1;
352
353 parser->size = size;
354 return 0;
355}
356
357/*
358 * trace_parser_put - frees the buffer for trace parser
359 */
360void trace_parser_put(struct trace_parser *parser)
361{
362 kfree(parser->buffer);
363}
364
365/*
366 * trace_get_user - reads the user input string separated by space
367 * (matched by isspace(ch))
368 *
369 * For each string found the 'struct trace_parser' is updated,
370 * and the function returns.
371 *
372 * Returns number of bytes read.
373 *
374 * See kernel/trace/trace.h for 'struct trace_parser' details.
375 */
376int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
377 size_t cnt, loff_t *ppos)
378{
379 char ch;
380 size_t read = 0;
381 ssize_t ret;
382
383 if (!*ppos)
384 trace_parser_clear(parser);
385
386 ret = get_user(ch, ubuf++);
387 if (ret)
388 goto out;
389
390 read++;
391 cnt--;
392
393 /*
394 * If the parser has not finished with the last write,
395 * continue reading the user input without skipping spaces.
396 */
397 if (!parser->cont) {
398 /* skip white space */
399 while (cnt && isspace(ch)) {
400 ret = get_user(ch, ubuf++);
401 if (ret)
402 goto out;
403 read++;
404 cnt--;
405 }
406
407 /* only spaces were written */
408 if (isspace(ch)) {
409 *ppos += read;
410 ret = read;
411 goto out;
412 }
413
414 parser->idx = 0;
415 }
416
417 /* read the non-space input */
418 while (cnt && !isspace(ch)) {
419 if (parser->idx < parser->size)
420 parser->buffer[parser->idx++] = ch;
421 else {
422 ret = -EINVAL;
423 goto out;
424 }
425 ret = get_user(ch, ubuf++);
426 if (ret)
427 goto out;
428 read++;
429 cnt--;
430 }
431
432 /* We either got finished input or we have to wait for another call. */
433 if (isspace(ch)) {
434 parser->buffer[parser->idx] = 0;
435 parser->cont = false;
436 } else {
437 parser->cont = true;
438 parser->buffer[parser->idx++] = ch;
439 }
440
441 *ppos += read;
442 ret = read;
443
444out:
445 return ret;
446}
447
342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 448ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
343{ 449{
344 int len; 450 int len;
@@ -719,6 +825,11 @@ static void trace_init_cmdlines(void)
719 cmdline_idx = 0; 825 cmdline_idx = 0;
720} 826}
721 827
828int is_tracing_stopped(void)
829{
830 return trace_stop_count;
831}
832
722/** 833/**
723 * ftrace_off_permanent - disable all ftrace code permanently 834 * ftrace_off_permanent - disable all ftrace code permanently
724 * 835 *
@@ -886,7 +997,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
886 997
887 entry->preempt_count = pc & 0xff; 998 entry->preempt_count = pc & 0xff;
888 entry->pid = (tsk) ? tsk->pid : 0; 999 entry->pid = (tsk) ? tsk->pid : 0;
889 entry->tgid = (tsk) ? tsk->tgid : 0; 1000 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
890 entry->flags = 1001 entry->flags =
891#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1002#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
892 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1003 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1179,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1068 return; 1179 return;
1069 entry = ring_buffer_event_data(event); 1180 entry = ring_buffer_event_data(event);
1070 1181
1182 entry->tgid = current->tgid;
1071 memset(&entry->caller, 0, sizeof(entry->caller)); 1183 memset(&entry->caller, 0, sizeof(entry->caller));
1072 1184
1073 trace.nr_entries = 0; 1185 trace.nr_entries = 0;
@@ -1094,6 +1206,7 @@ ftrace_trace_special(void *__tr,
1094 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1206 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1095 int pc) 1207 int pc)
1096{ 1208{
1209 struct ftrace_event_call *call = &event_special;
1097 struct ring_buffer_event *event; 1210 struct ring_buffer_event *event;
1098 struct trace_array *tr = __tr; 1211 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer; 1212 struct ring_buffer *buffer = tr->buffer;
@@ -1107,7 +1220,9 @@ ftrace_trace_special(void *__tr,
1107 entry->arg1 = arg1; 1220 entry->arg1 = arg1;
1108 entry->arg2 = arg2; 1221 entry->arg2 = arg2;
1109 entry->arg3 = arg3; 1222 entry->arg3 = arg3;
1110 trace_buffer_unlock_commit(buffer, event, 0, pc); 1223
1224 if (!filter_check_discard(call, entry, buffer, event))
1225 trace_buffer_unlock_commit(buffer, event, 0, pc);
1111} 1226}
1112 1227
1113void 1228void
@@ -1530,10 +1645,10 @@ static void print_lat_help_header(struct seq_file *m)
1530 seq_puts(m, "# | / _----=> need-resched \n"); 1645 seq_puts(m, "# | / _----=> need-resched \n");
1531 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1646 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1532 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1647 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1533 seq_puts(m, "# |||| / \n"); 1648 seq_puts(m, "# |||| /_--=> lock-depth \n");
1534 seq_puts(m, "# ||||| delay \n"); 1649 seq_puts(m, "# |||||/ delay \n");
1535 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1650 seq_puts(m, "# cmd pid |||||| time | caller \n");
1536 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1651 seq_puts(m, "# \\ / |||||| \\ | / \n");
1537} 1652}
1538 1653
1539static void print_func_help_header(struct seq_file *m) 1654static void print_func_help_header(struct seq_file *m)
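
trace_get_user() above is the heart of the refactoring: whitespace ends a token, and parser->cont records that a write() stopped mid-token so the next call appends instead of restarting. A standalone sketch of that continuation rule, assuming plain buffers instead of get_user() and omitting the kernel's overflow check (-EINVAL); struct parser and feed() are illustrative names, not kernel code:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

struct parser { bool cont; char buf[64]; unsigned idx; };

/* feed one "write" worth of bytes; returns true when a full token is ready */
static bool feed(struct parser *p, const char *data, size_t len)
{
	size_t i = 0;

	if (!p->cont) {				/* new token: skip leading spaces */
		while (i < len && isspace((unsigned char)data[i]))
			i++;
		if (i == len)			/* only spaces in this write */
			return false;
		p->idx = 0;
	}

	while (i < len && !isspace((unsigned char)data[i]))
		p->buf[p->idx++] = data[i++];

	p->cont = (i == len);			/* ran out of input mid-token? */
	if (!p->cont)
		p->buf[p->idx] = 0;
	return !p->cont && p->idx;
}

int main(void)
{
	struct parser p = { 0 };

	feed(&p, "  do_sys", 8);		/* partial write, no delimiter yet */
	if (feed(&p, "_open ", 6))		/* continuation + trailing space */
		printf("token: %s\n", p.buf);	/* -> "do_sys_open" */
	return 0;
}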
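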
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fa1dccb579d5..86bcff94791a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,6 +7,7 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
@@ -42,157 +43,54 @@ enum trace_type {
42 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
43}; 44};
44 45
45/* 46enum kmemtrace_type_id {
46 * Function trace entry - function address and parent function addres: 47 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 */ 48 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48struct ftrace_entry { 49 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49 struct trace_entry ent;
50 unsigned long ip;
51 unsigned long parent_ip;
52};
53
54/* Function call entry */
55struct ftrace_graph_ent_entry {
56 struct trace_entry ent;
57 struct ftrace_graph_ent graph_ent;
58}; 50};
59 51
60/* Function return entry */
61struct ftrace_graph_ret_entry {
62 struct trace_entry ent;
63 struct ftrace_graph_ret ret;
64};
65extern struct tracer boot_tracer; 52extern struct tracer boot_tracer;
66 53
67/* 54#undef __field
68 * Context switch trace entry - which task (and prio) we switched from/to: 55#define __field(type, item) type item;
69 */
70struct ctx_switch_entry {
71 struct trace_entry ent;
72 unsigned int prev_pid;
73 unsigned char prev_prio;
74 unsigned char prev_state;
75 unsigned int next_pid;
76 unsigned char next_prio;
77 unsigned char next_state;
78 unsigned int next_cpu;
79};
80
81/*
82 * Special (free-form) trace entry:
83 */
84struct special_entry {
85 struct trace_entry ent;
86 unsigned long arg1;
87 unsigned long arg2;
88 unsigned long arg3;
89};
90
91/*
92 * Stack-trace entry:
93 */
94
95#define FTRACE_STACK_ENTRIES 8
96
97struct stack_entry {
98 struct trace_entry ent;
99 unsigned long caller[FTRACE_STACK_ENTRIES];
100};
101
102struct userstack_entry {
103 struct trace_entry ent;
104 unsigned long caller[FTRACE_STACK_ENTRIES];
105};
106
107/*
108 * trace_printk entry:
109 */
110struct bprint_entry {
111 struct trace_entry ent;
112 unsigned long ip;
113 const char *fmt;
114 u32 buf[];
115};
116 56
117struct print_entry { 57#undef __field_struct
118 struct trace_entry ent; 58#define __field_struct(type, item) __field(type, item)
119 unsigned long ip;
120 char buf[];
121};
122 59
123#define TRACE_OLD_SIZE 88 60#undef __field_desc
61#define __field_desc(type, container, item)
124 62
125struct trace_field_cont { 63#undef __array
126 unsigned char type; 64#define __array(type, item, size) type item[size];
127 /* Temporary till we get rid of this completely */
128 char buf[TRACE_OLD_SIZE - 1];
129};
130 65
131struct trace_mmiotrace_rw { 66#undef __array_desc
132 struct trace_entry ent; 67#define __array_desc(type, container, item, size)
133 struct mmiotrace_rw rw;
134};
135 68
136struct trace_mmiotrace_map { 69#undef __dynamic_array
137 struct trace_entry ent; 70#define __dynamic_array(type, item) type item[];
138 struct mmiotrace_map map;
139};
140 71
141struct trace_boot_call { 72#undef F_STRUCT
142 struct trace_entry ent; 73#define F_STRUCT(args...) args
143 struct boot_trace_call boot_call;
144};
145 74
146struct trace_boot_ret { 75#undef FTRACE_ENTRY
147 struct trace_entry ent; 76#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
148 struct boot_trace_ret boot_ret; 77 struct struct_name { \
149}; 78 struct trace_entry ent; \
150 79 tstruct \
151#define TRACE_FUNC_SIZE 30 80 }
152#define TRACE_FILE_SIZE 20
153struct trace_branch {
154 struct trace_entry ent;
155 unsigned line;
156 char func[TRACE_FUNC_SIZE+1];
157 char file[TRACE_FILE_SIZE+1];
158 char correct;
159};
160
161struct hw_branch_entry {
162 struct trace_entry ent;
163 u64 from;
164 u64 to;
165};
166
167struct trace_power {
168 struct trace_entry ent;
169 struct power_trace state_data;
170};
171 81
172enum kmemtrace_type_id { 82#undef TP_ARGS
173 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 83#define TP_ARGS(args...) args
174 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
175 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
176};
177 84
178struct kmemtrace_alloc_entry { 85#undef FTRACE_ENTRY_DUP
179 struct trace_entry ent; 86#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
180 enum kmemtrace_type_id type_id;
181 unsigned long call_site;
182 const void *ptr;
183 size_t bytes_req;
184 size_t bytes_alloc;
185 gfp_t gfp_flags;
186 int node;
187};
188 87
189struct kmemtrace_free_entry { 88#include "trace_entries.h"
190 struct trace_entry ent;
191 enum kmemtrace_type_id type_id;
192 unsigned long call_site;
193 const void *ptr;
194};
195 89
90/*
91 * syscalls are special, and need special handling, this is why
92 * they are not included in trace_entries.h
93 */
196struct syscall_trace_enter { 94struct syscall_trace_enter {
197 struct trace_entry ent; 95 struct trace_entry ent;
198 int nr; 96 int nr;
@@ -205,13 +103,12 @@ struct syscall_trace_exit {
205 unsigned long ret; 103 unsigned long ret;
206}; 104};
207 105
208
209/* 106/*
210 * trace_flag_type is an enumeration that holds different 107 * trace_flag_type is an enumeration that holds different
211 * states when a trace occurs. These are: 108 * states when a trace occurs. These are:
212 * IRQS_OFF - interrupts were disabled 109 * IRQS_OFF - interrupts were disabled
213 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 110 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
214 * NEED_RESCED - reschedule is requested 111 * NEED_RESCHED - reschedule is requested
215 * HARDIRQ - inside an interrupt handler 112 * HARDIRQ - inside an interrupt handler
216 * SOFTIRQ - inside a softirq handler 113 * SOFTIRQ - inside a softirq handler
217 */ 114 */
@@ -390,7 +287,6 @@ struct tracer {
390 struct tracer *next; 287 struct tracer *next;
391 int print_max; 288 int print_max;
392 struct tracer_flags *flags; 289 struct tracer_flags *flags;
393 struct tracer_stat *stats;
394}; 290};
395 291
396 292
@@ -469,6 +365,7 @@ void tracing_stop_sched_switch_record(void);
469void tracing_start_sched_switch_record(void); 365void tracing_start_sched_switch_record(void);
470int register_tracer(struct tracer *type); 366int register_tracer(struct tracer *type);
471void unregister_tracer(struct tracer *type); 367void unregister_tracer(struct tracer *type);
368int is_tracing_stopped(void);
472 369
473extern unsigned long nsecs_to_usecs(unsigned long nsecs); 370extern unsigned long nsecs_to_usecs(unsigned long nsecs);
474 371
@@ -509,20 +406,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
509 406
510extern cycle_t ftrace_now(int cpu); 407extern cycle_t ftrace_now(int cpu);
511 408
512#ifdef CONFIG_CONTEXT_SWITCH_TRACER
513typedef void
514(*tracer_switch_func_t)(void *private,
515 void *__rq,
516 struct task_struct *prev,
517 struct task_struct *next);
518
519struct tracer_switch_ops {
520 tracer_switch_func_t func;
521 void *private;
522 struct tracer_switch_ops *next;
523};
524#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
525
526extern void trace_find_cmdline(int pid, char comm[]); 409extern void trace_find_cmdline(int pid, char comm[]);
527 410
528#ifdef CONFIG_DYNAMIC_FTRACE 411#ifdef CONFIG_DYNAMIC_FTRACE
@@ -638,6 +521,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
638#endif 521#endif
639 522
640/* 523/*
524 * struct trace_parser - serves for reading the user input separated by spaces
525 * @cont: set if the input is not complete - no final space char was found
526 * @buffer: holds the parsed user input
527 * @idx: user input length
528 * @size: buffer size
529 */
530struct trace_parser {
531 bool cont;
532 char *buffer;
533 unsigned idx;
534 unsigned size;
535};
536
537static inline bool trace_parser_loaded(struct trace_parser *parser)
538{
539 return (parser->idx != 0);
540}
541
542static inline bool trace_parser_cont(struct trace_parser *parser)
543{
544 return parser->cont;
545}
546
547static inline void trace_parser_clear(struct trace_parser *parser)
548{
549 parser->cont = false;
550 parser->idx = 0;
551}
552
553extern int trace_parser_get_init(struct trace_parser *parser, int size);
554extern void trace_parser_put(struct trace_parser *parser);
555extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
556 size_t cnt, loff_t *ppos);
557
558/*
641 * trace_iterator_flags is an enumeration that defines bit 559 * trace_iterator_flags is an enumeration that defines bit
642 * positions into trace_flags that controls the output. 560 * positions into trace_flags that controls the output.
643 * 561 *
@@ -823,58 +741,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
823 return 0; 741 return 0;
824} 742}
825 743
826#define DEFINE_COMPARISON_PRED(type) \
827static int filter_pred_##type(struct filter_pred *pred, void *event, \
828 int val1, int val2) \
829{ \
830 type *addr = (type *)(event + pred->offset); \
831 type val = (type)pred->val; \
832 int match = 0; \
833 \
834 switch (pred->op) { \
835 case OP_LT: \
836 match = (*addr < val); \
837 break; \
838 case OP_LE: \
839 match = (*addr <= val); \
840 break; \
841 case OP_GT: \
842 match = (*addr > val); \
843 break; \
844 case OP_GE: \
845 match = (*addr >= val); \
846 break; \
847 default: \
848 break; \
849 } \
850 \
851 return match; \
852}
853
854#define DEFINE_EQUALITY_PRED(size) \
855static int filter_pred_##size(struct filter_pred *pred, void *event, \
856 int val1, int val2) \
857{ \
858 u##size *addr = (u##size *)(event + pred->offset); \
859 u##size val = (u##size)pred->val; \
860 int match; \
861 \
862 match = (val == *addr) ^ pred->not; \
863 \
864 return match; \
865}
866
867extern struct mutex event_mutex; 744extern struct mutex event_mutex;
868extern struct list_head ftrace_events; 745extern struct list_head ftrace_events;
869 746
870extern const char *__start___trace_bprintk_fmt[]; 747extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 748extern const char *__stop___trace_bprintk_fmt[];
872 749
873#undef TRACE_EVENT_FORMAT 750#undef FTRACE_ENTRY
874#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 751#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
875 extern struct ftrace_event_call event_##call; 752 extern struct ftrace_event_call event_##call;
876#undef TRACE_EVENT_FORMAT_NOFILTER 753#undef FTRACE_ENTRY_DUP
877#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 754#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
878#include "trace_event_types.h" 755 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
756#include "trace_entries.h"
879 757
880#endif /* _LINUX_KERNEL_TRACE_H */ 758#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 19bfc75d467e..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =
129 129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
132 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
133 struct ring_buffer *buffer; 134 struct ring_buffer *buffer;
134 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
@@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(buffer, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
161 struct ring_buffer *buffer; 164 struct ring_buffer *buffer;
162 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
@@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
175 goto out; 178 goto out;
176 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
177 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
178 trace_buffer_unlock_commit(buffer, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
179 out: 183 out:
180 preempt_enable(); 184 preempt_enable();
181} 185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
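
The trace_clock.c change replaces two separate globals with one ____cacheline_aligned_in_smp struct so prev_time shares a cache line with the lock that protects it, rather than bouncing a second line between CPUs. A minimal userspace sketch of the same grouping, assuming a 64-byte cache line (the kernel macro uses the architecture's real value) and an int standing in for raw_spinlock_t:

#include <stdio.h>

/* keep the data and the lock that guards it in one cache line */
struct clock_state {
	unsigned long long prev_time;
	int lock;				/* stand-in for raw_spinlock_t */
} __attribute__((aligned(64)));

static struct clock_state clock_state;

int main(void)
{
	printf("sizeof=%zu align=%zu\n",
	       sizeof(clock_state), _Alignof(struct clock_state));
	return 0;
}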
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..a431748ddd6e
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,383 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(power, trace_power,
334
335 TRACE_POWER,
336
337 F_STRUCT(
338 __field_struct( struct power_trace, state_data )
339 __field_desc( s64, state_data, stamp )
340 __field_desc( s64, state_data, end )
341 __field_desc( int, state_data, type )
342 __field_desc( int, state_data, state )
343 ),
344
345 F_printk("%llx->%llx type:%u state:%u",
346 __entry->stamp, __entry->end,
347 __entry->type, __entry->state)
348);
349
350FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
351
352 TRACE_KMEM_ALLOC,
353
354 F_STRUCT(
355 __field( enum kmemtrace_type_id, type_id )
356 __field( unsigned long, call_site )
357 __field( const void *, ptr )
358 __field( size_t, bytes_req )
359 __field( size_t, bytes_alloc )
360 __field( gfp_t, gfp_flags )
361 __field( int, node )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
365 " flags:%x node:%d",
366 __entry->type_id, __entry->call_site, __entry->ptr,
367 __entry->bytes_req, __entry->bytes_alloc,
368 __entry->gfp_flags, __entry->node)
369);
370
371FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
372
373 TRACE_KMEM_FREE,
374
375 F_STRUCT(
376 __field( enum kmemtrace_type_id, type_id )
377 __field( unsigned long, call_site )
378 __field( const void *, ptr )
379 ),
380
381 F_printk("type:%u call_site:%lx ptr:%p",
382 __entry->type_id, __entry->call_site, __entry->ptr)
383);
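
Each FTRACE_ENTRY() in this file is expanded more than once: the FTRACE_ENTRY definition added to trace.h earlier in this diff turns it into the entry structure, while trace_export.c (below) re-expands the same text into format-file and field-definition code. With the trace.h definitions, the first entry above expands to roughly:

struct ftrace_entry {
	struct trace_entry	ent;
	unsigned long		ip;		/* from __field(unsigned long, ip) */
	unsigned long		parent_ip;	/* from __field(unsigned long, parent_ip) */
};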
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..55a25c933d15 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,6 +5,7 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
10int ftrace_profile_enable(int event_id) 11int ftrace_profile_enable(int event_id)
@@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id)
14 15
15 mutex_lock(&event_mutex); 16 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 17 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 18 if (event->id == event_id && event->profile_enable &&
19 try_module_get(event->mod)) {
18 ret = event->profile_enable(event); 20 ret = event->profile_enable(event);
19 break; 21 break;
20 } 22 }
@@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id)
32 list_for_each_entry(event, &ftrace_events, list) { 34 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 35 if (event->id == event_id) {
34 event->profile_disable(event); 36 event->profile_disable(event);
37 module_put(event->mod);
35 break; 38 break;
36 } 39 }
37 } 40 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e12487..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO_CHAR(buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO_CHAR(buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 78b1ed230177..56c260b83a9c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,6 +21,7 @@
21 21
22#include "trace_output.h" 22#include "trace_output.h"
23 23
24#undef TRACE_SYSTEM
24#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
25 26
26DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
@@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
86 __common_field(unsigned char, flags); 87 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count); 88 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid); 89 __common_field(int, pid);
89 __common_field(int, tgid); 90 __common_field(int, lock_depth);
90 91
91 return ret; 92 return ret;
92} 93}
@@ -230,11 +231,9 @@ static ssize_t
230ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
231 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
232{ 233{
234 struct trace_parser parser;
233 size_t read = 0; 235 size_t read = 0;
234 int i, set = 1;
235 ssize_t ret; 236 ssize_t ret;
236 char *buf;
237 char ch;
238 237
239 if (!cnt || cnt < 0) 238 if (!cnt || cnt < 0)
240 return 0; 239 return 0;
@@ -243,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
243 if (ret < 0) 242 if (ret < 0)
244 return ret; 243 return ret;
245 244
246 ret = get_user(ch, ubuf++); 245 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
247 if (ret)
248 return ret;
249 read++;
250 cnt--;
251
252 /* skip white space */
253 while (cnt && isspace(ch)) {
254 ret = get_user(ch, ubuf++);
255 if (ret)
256 return ret;
257 read++;
258 cnt--;
259 }
260
261 /* Only white space found? */
262 if (isspace(ch)) {
263 file->f_pos += read;
264 ret = read;
265 return ret;
266 }
267
268 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
269 if (!buf)
270 return -ENOMEM; 246 return -ENOMEM;
271 247
272 if (cnt > EVENT_BUF_SIZE) 248 read = trace_get_user(&parser, ubuf, cnt, ppos);
273 cnt = EVENT_BUF_SIZE; 249
250 if (trace_parser_loaded((&parser))) {
251 int set = 1;
274 252
275 i = 0; 253 if (*parser.buffer == '!')
276 while (cnt && !isspace(ch)) {
277 if (!i && ch == '!')
278 set = 0; 254 set = 0;
279 else
280 buf[i++] = ch;
281 255
282 ret = get_user(ch, ubuf++); 256 parser.buffer[parser.idx] = 0;
257
258 ret = ftrace_set_clr_event(parser.buffer + !set, set);
283 if (ret) 259 if (ret)
284 goto out_free; 260 goto out_put;
285 read++;
286 cnt--;
287 } 261 }
288 buf[i] = 0;
289
290 file->f_pos += read;
291
292 ret = ftrace_set_clr_event(buf, set);
293 if (ret)
294 goto out_free;
295 262
296 ret = read; 263 ret = read;
297 264
298 out_free: 265 out_put:
299 kfree(buf); 266 trace_parser_put(&parser);
300 267
301 return ret; 268 return ret;
302} 269}
@@ -578,7 +545,7 @@ static int trace_write_header(struct trace_seq *s)
578 FIELD(unsigned char, flags), 545 FIELD(unsigned char, flags),
579 FIELD(unsigned char, preempt_count), 546 FIELD(unsigned char, preempt_count),
580 FIELD(int, pid), 547 FIELD(int, pid),
581 FIELD(int, tgid)); 548 FIELD(int, lock_depth));
582} 549}
583 550
584static ssize_t 551static ssize_t
@@ -1187,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self,
1187} 1154}
1188#endif /* CONFIG_MODULES */ 1155#endif /* CONFIG_MODULES */
1189 1156
1190struct notifier_block trace_module_nb = { 1157static struct notifier_block trace_module_nb = {
1191 .notifier_call = trace_module_notify, 1158 .notifier_call = trace_module_notify,
1192 .priority = 0, 1159 .priority = 0,
1193}; 1160};
@@ -1359,6 +1326,18 @@ static __init void event_trace_self_tests(void)
1359 if (!call->regfunc) 1326 if (!call->regfunc)
1360 continue; 1327 continue;
1361 1328
1329/*
1330 * Testing syscall events here is pretty useless, but
1331 * we still do it if configured. But this is time consuming.
1332 * What we really need is a user thread to perform the
1333 * syscalls as we test.
1334 */
1335#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1336 if (call->system &&
1337 strcmp(call->system, "syscalls") == 0)
1338 continue;
1339#endif
1340
1362 pr_info("Testing event %s: ", call->name); 1341 pr_info("Testing event %s: ", call->name);
1363 1342
1364 /* 1343 /*
@@ -1432,7 +1411,7 @@ static __init void event_trace_self_tests(void)
1432 1411
1433#ifdef CONFIG_FUNCTION_TRACER 1412#ifdef CONFIG_FUNCTION_TRACER
1434 1413
1435static DEFINE_PER_CPU(atomic_t, test_event_disable); 1414static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1436 1415
1437static void 1416static void
1438function_test_events_call(unsigned long ip, unsigned long parent_ip) 1417function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1449,7 +1428,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1449 pc = preempt_count(); 1428 pc = preempt_count();
1450 resched = ftrace_preempt_disable(); 1429 resched = ftrace_preempt_disable();
1451 cpu = raw_smp_processor_id(); 1430 cpu = raw_smp_processor_id();
1452 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1431 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1453 1432
1454 if (disabled != 1) 1433 if (disabled != 1)
1455 goto out; 1434 goto out;
@@ -1468,7 +1447,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1468 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1447 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1469 1448
1470 out: 1449 out:
1471 atomic_dec(&per_cpu(test_event_disable, cpu)); 1450 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1472 ftrace_preempt_enable(resched); 1451 ftrace_preempt_enable(resched);
1473} 1452}
1474 1453
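
ftrace_event_write() now also goes through trace_parser; the only extra twist is the leading '!', which turns a set into a clear before the token reaches ftrace_set_clr_event(parser.buffer + !set, set). A tiny standalone sketch of that prefix handling (apply_token() is an illustrative name, not a kernel function):

#include <stdio.h>

/* interpret a set_event token: a leading '!' means "clear" */
static void apply_token(const char *tok)
{
	int set = (tok[0] != '!');

	/* tok + !set skips the '!' when clearing, mirroring the kernel call */
	printf("%s %s\n", set ? "enable" : "disable", tok + !set);
}

int main(void)
{
	apply_token("sched:sched_switch");	/* enable  sched:sched_switch */
	apply_token("!sched:sched_switch");	/* disable sched:sched_switch */
	return 0;
}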
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fbbf629..23245785927f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
121 } operand; 121 } operand;
122}; 122};
123 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
124DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
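
The comparison and equality predicate macros were moved here from trace.h unchanged; each invocation stamps out one filter callback. For instance, DEFINE_COMPARISON_PRED(s64) above expands to roughly:

static int filter_pred_s64(struct filter_pred *pred, void *event,
			   int val1, int val2)
{
	s64 *addr = (s64 *)(event + pred->offset);
	s64 val = (s64)pred->val;
	int match = 0;

	switch (pred->op) {
	case OP_LT: match = (*addr < val);  break;
	case OP_LE: match = (*addr <= val); break;
	case OP_GT: match = (*addr > val);  break;
	case OP_GE: match = (*addr >= val); break;
	default:    break;
	}

	return match;
}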
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index df1bf6e48bb9..9753fcc61bc5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,146 +15,125 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
35 83
36#undef TRACE_FIELD_SPECIAL 84#undef __array
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 85#define __array(type, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 87 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
41 (unsigned int)sizeof(field.item)); \ 89 sizeof(field.item)); \
42 if (!ret) \ 90 if (!ret) \
43 return 0; 91 return 0;
44 92
45#undef TRACE_FIELD_ZERO_CHAR 93#undef __array_desc
46#define TRACE_FIELD_ZERO_CHAR(item) \ 94#define __array_desc(type, container, item, len) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
48 "offset:%u;\tsize:0;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 97 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \
50 if (!ret) \ 99 if (!ret) \
51 return 0; 100 return 0;
52 101
53#undef TRACE_FIELD_SIGN 102#undef __dynamic_array
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 103#define __dynamic_array(type, item) \
55 TRACE_FIELD(type, item, assign) 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \
106 offsetof(typeof(field), item)); \
107 if (!ret) \
108 return 0;
56 109
57#undef TP_RAW_FMT 110#undef F_printk
58#define TP_RAW_FMT(args...) args 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
59 112
60#undef TRACE_EVENT_FORMAT 113#undef __entry
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 114#define __entry REC
62static int \
63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
65{ \
66 struct args field; \
67 int ret; \
68 \
69 tstruct; \
70 \
71 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
72 \
73 return ret; \
74}
75 115
76#undef TRACE_EVENT_FORMAT_NOFILTER 116#undef FTRACE_ENTRY
77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
78 tpfmt) \
79static int \ 118static int \
80ftrace_format_##call(struct ftrace_event_call *unused, \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \ 120 struct trace_seq *s) \
82{ \ 121{ \
83 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
84 int ret; \ 123 int ret = 0; \
85 \ 124 \
86 tstruct; \ 125 tstruct; \
87 \ 126 \
88 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
89 \ 128 \
90 return ret; \ 129 return ret; \
91} 130}
92 131
93#include "trace_event_types.h" 132#include "trace_entries.h"
94
95#undef TRACE_ZERO_CHAR
96#define TRACE_ZERO_CHAR(arg)
97
98#undef TRACE_FIELD
99#define TRACE_FIELD(type, item, assign)\
100 entry->item = assign;
101
102#undef TRACE_FIELD
103#define TRACE_FIELD(type, item, assign)\
104 entry->item = assign;
105
106#undef TRACE_FIELD_SIGN
107#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
108 TRACE_FIELD(type, item, assign)
109
110#undef TP_CMD
111#define TP_CMD(cmd...) cmd
112
113#undef TRACE_ENTRY
114#define TRACE_ENTRY entry
115
116#undef TRACE_FIELD_SPECIAL
117#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
118 cmd;
119
120#undef TRACE_EVENT_FORMAT
121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
123static int ftrace_raw_init_event_##call(void); \
124 \
125struct ftrace_event_call __used \
126__attribute__((__aligned__(4))) \
127__attribute__((section("_ftrace_events"))) event_##call = { \
128 .name = #call, \
129 .id = proto, \
130 .system = __stringify(TRACE_SYSTEM), \
131 .raw_init = ftrace_raw_init_event_##call, \
132 .show_format = ftrace_format_##call, \
133 .define_fields = ftrace_define_fields_##call, \
134}; \
135static int ftrace_raw_init_event_##call(void) \
136{ \
137 INIT_LIST_HEAD(&event_##call.fields); \
138 return 0; \
139} \
140
141#undef TRACE_EVENT_FORMAT_NOFILTER
142#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
143 tpfmt) \
144 \
145struct ftrace_event_call __used \
146__attribute__((__aligned__(4))) \
147__attribute__((section("_ftrace_events"))) event_##call = { \
148 .name = #call, \
149 .id = proto, \
150 .system = __stringify(TRACE_SYSTEM), \
151 .show_format = ftrace_format_##call, \
152};
153 133
154#include "trace_event_types.h"
155 134
156#undef TRACE_FIELD 135#undef __field
157#define TRACE_FIELD(type, item, assign) \ 136#define __field(type, item) \
158 ret = trace_define_field(event_call, #type, #item, \ 137 ret = trace_define_field(event_call, #type, #item, \
159 offsetof(typeof(field), item), \ 138 offsetof(typeof(field), item), \
160 sizeof(field.item), \ 139 sizeof(field.item), \
@@ -162,32 +141,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
162 if (ret) \ 141 if (ret) \
163 return ret; 142 return ret;
164 143
165#undef TRACE_FIELD_SPECIAL 144#undef __field_desc
166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 145#define __field_desc(type, container, item) \
146 ret = trace_define_field(event_call, #type, #item, \
147 offsetof(typeof(field), \
148 container.item), \
149 sizeof(field.container.item), \
150 is_signed_type(type), FILTER_OTHER); \
151 if (ret) \
152 return ret;
153
154#undef __array
155#define __array(type, item, len) \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
168 offsetof(typeof(field), item), \ 158 offsetof(typeof(field), item), \
169 sizeof(field.item), 0, FILTER_OTHER); \ 159 sizeof(field.item), 0, FILTER_OTHER); \
170 if (ret) \ 160 if (ret) \
171 return ret; 161 return ret;
172 162
173#undef TRACE_FIELD_SIGN 163#undef __array_desc
174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 164#define __array_desc(type, container, item, len) \
175 ret = trace_define_field(event_call, #type, #item, \ 165 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
176 offsetof(typeof(field), item), \ 166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
177 sizeof(field.item), is_signed, \ 167 offsetof(typeof(field), \
168 container.item), \
169 sizeof(field.container.item), 0, \
178 FILTER_OTHER); \ 170 FILTER_OTHER); \
179 if (ret) \ 171 if (ret) \
180 return ret; 172 return ret;
181 173
182#undef TRACE_FIELD_ZERO_CHAR 174#undef __dynamic_array
183#define TRACE_FIELD_ZERO_CHAR(item) 175#define __dynamic_array(type, item)
184 176
185#undef TRACE_EVENT_FORMAT 177#undef FTRACE_ENTRY
186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
187int \ 179int \
188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ 180ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
189{ \ 181{ \
190 struct args field; \ 182 struct struct_name field; \
191 int ret; \ 183 int ret; \
192 \ 184 \
193 ret = trace_define_common_fields(event_call); \ 185 ret = trace_define_common_fields(event_call); \
@@ -199,8 +191,42 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
199 return ret; \ 191 return ret; \
200} 192}
201 193
202#undef TRACE_EVENT_FORMAT_NOFILTER 194#include "trace_entries.h"
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 195
204 tpfmt) 196
197#undef __field
198#define __field(type, item)
199
200#undef __field_desc
201#define __field_desc(type, container, item)
202
203#undef __array
204#define __array(type, item, len)
205
206#undef __array_desc
207#define __array_desc(type, container, item, len)
208
209#undef __dynamic_array
210#define __dynamic_array(type, item)
211
212#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \
216struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \
218__attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \
220 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \
223 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \
225}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
205 231
206#include "trace_event_types.h" 232#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b01b94518fc..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
290{ 290{
291 long count = (long)data; 291 long count = (long)data;
292 292
293 seq_printf(m, "%pf:", (void *)ip); 293 seq_printf(m, "%ps:", (void *)ip);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
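
The one-character change above, %pf to %ps, switches the printk symbol specifier: both print a symbol name without an offset, but %pf first treats the pointer as a function descriptor on architectures that use descriptors, while the ip values the tracer records are already plain text addresses. A two-line fragment for comparison only, not compilable on its own; names follow the hunk:

        seq_printf(m, "%pf:", (void *)ip);      /* old: descriptor lookup first */
        seq_printf(m, "%ps:", (void *)ip);      /* new: resolve the raw text address */
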
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b3749a2c3132..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
364} 364}
365 365
366 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
367/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
368static enum print_line_t 377static enum print_line_t
369verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
521 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
522 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
523 } 532 }
533
524 /* Proc */ 534 /* Proc */
525 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
526 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
659 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
660 } 670 }
661 671
662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
663 if (!ret) 673 if (!ret)
664 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
665 675
@@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
702 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
703 } 713 }
704 714
705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
706 if (!ret) 716 if (!ret)
707 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
708 718
@@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
758 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
759 } 769 }
760 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
761 return 0; 778 return 0;
762} 779}
763 780
@@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
952 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
953} 970}
954 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
955static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
956{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
957 /* 1st line */ 1001 /* 1st line */
958 seq_printf(s, "# "); 1002 seq_printf(s, "#");
959 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
960 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
961 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
962 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
963 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
964 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
965 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
966 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
967 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
968 1014
969 /* 2nd line */ 1015 /* 2nd line */
970 seq_printf(s, "# "); 1016 seq_printf(s, "#");
971 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
972 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
973 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
974 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
975 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
976 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
977 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
978 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
979 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
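
print_lat_header() above indents the latency legend with one fixed padding string and a "%.*s" conversion, where the computed precision equals the combined width of whichever optional TIME, CPU and TASK/PID columns are enabled. A small standalone sketch of that formatting trick, with made-up flag variables standing in for tracer_flags.val:

#include <stdio.h>

int main(void)
{
        static const char spaces[] =
                "          " "          "
                "          " "          ";     /* 40 blanks, >= 37 needed */
        int abs_time = 1, cpu = 1, proc = 0;    /* pretend tracer flags */
        int size = 0;

        if (abs_time)
                size += 16;     /* width of the TIME column */
        if (cpu)
                size += 4;      /* width of the CPU column */
        if (proc)
                size += 17;     /* width of the TASK/PID column */

        /* Precision caps how much of the padding string is printed. */
        printf("#%.*s _-----=> irqs-off\n", size, spaces);
        printf("#%.*s / _----=> need-resched\n", size, spaces);
        printf("#%.*s| / _---=> hardirq/softirq\n", size, spaces);
        return 0;
}
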
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d12..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
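
The surviving logic in check_critical_timing() above only publishes a new maximum, tracing_max_latency plus the update_max_tr_single() snapshot, while is_tracing_stopped() is false; the unused usecs conversions are simply dropped. A standalone sketch of that guard, with stand-in names noted in the comments:

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustration only: is_stopped, max_latency and snapshot() are stand-ins
 * for is_tracing_stopped(), tracing_max_latency and update_max_tr_single().
 */
static bool is_stopped;
static unsigned long long max_latency;

static void snapshot(unsigned long long delta)
{
        printf("new max latency: %llu ns\n", delta);
}

static void report_latency_hit(unsigned long long delta)
{
        if (delta <= max_latency)
                return;                 /* the report_latency() check */

        if (!is_stopped) {              /* is_tracing_stopped() equivalent */
                max_latency = delta;
                snapshot(delta);        /* keep the frozen snapshot otherwise */
        }
}

int main(void)
{
        report_latency_hit(1000);       /* recorded */
        is_stopped = true;
        report_latency_hit(5000);       /* ignored: snapshot stays frozen */
        return 0;
}
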
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d3..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
310 struct ring_buffer *buffer = tr->buffer; 311 struct ring_buffer *buffer = tr->buffer;
311 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
312 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
320 } 321 }
321 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
322 entry->rw = *rw; 323 entry->rw = *rw;
323 trace_buffer_unlock_commit(buffer, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
324} 327}
325 328
326void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
334 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
335 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
336{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
337 struct ring_buffer *buffer = tr->buffer; 341 struct ring_buffer *buffer = tr->buffer;
338 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
339 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
347 } 351 }
348 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
349 entry->map = *map; 353 entry->map = *map;
350 trace_buffer_unlock_commit(buffer, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
351} 357}
352 358
353void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
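
Both mmiotrace helpers above now pass the filled-in entry through filter_check_discard() and only call trace_buffer_unlock_commit() when the filter has not already discarded the event. A standalone sketch of that reserve, fill, filter, commit shape, with the ring buffer and per-event filter reduced to stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct event { int payload; };

static bool filter_says_drop;   /* stand-in for the per-event filter state */

/* Returns nonzero when the event has been discarded, as the call pattern in
 * the hunk implies for filter_check_discard(). */
static int filter_check_discard_stub(struct event *ev)
{
        (void)ev;
        return filter_says_drop;
}

static void commit_stub(struct event *ev)
{
        printf("committed payload %d\n", ev->payload);
}

static void trace_one(int payload)
{
        struct event ev = { .payload = payload };       /* reserve + fill */

        if (!filter_check_discard_stub(&ev))            /* filter */
                commit_stub(&ev);                       /* commit */
}

int main(void)
{
        trace_one(1);
        filter_says_drop = true;
        trace_one(2);           /* dropped, nothing printed */
        return 0;
}
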
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..f572f44c6e1e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 407 * since individual threads might have already quit!
408 */ 408 */
409 rcu_read_lock(); 409 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
411 if (task) 411 if (task)
412 mm = get_task_mm(task); 412 mm = get_task_mm(task);
413 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 460 return ret;
461} 461}
462 462
463static int 463/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 472{
466 int hardirq, softirq; 473 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 474 int ret;
468 475
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 478
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 482 'X' : '.',
@@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
481 hardirq ? 'h' : softirq ? 's' : '.')) 486 hardirq ? 'h' : softirq ? 's' : '.'))
482 return 0; 487 return 0;
483 488
489 if (entry->lock_depth < 0)
490 ret = trace_seq_putc(s, '.');
491 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth);
493 if (!ret)
494 return 0;
495
484 if (entry->preempt_count) 496 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 497 return trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 498 return trace_seq_putc(s, '.');
499}
500
501static int
502lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
503{
504 char comm[TASK_COMM_LEN];
505
506 trace_find_cmdline(entry->pid, comm);
507
508 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
509 comm, entry->pid, cpu))
510 return 0;
511
512 return trace_print_lat_fmt(s, entry);
487} 513}
488 514
489static unsigned long preempt_mark_thresh = 100; 515static unsigned long preempt_mark_thresh = 100;
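
lat_print_generic() above is split so that the generic flag columns live in the exported trace_print_lat_fmt(), which the graph tracer reuses through print_graph_lat_fmt(); the new lock-depth column prints '.' when entry->lock_depth is negative. A standalone sketch of the flag-to-character mapping, simplified to the specifiers visible in the hunk and using made-up FLAG_* constants in place of TRACE_FLAG_*:

#include <stdio.h>

#define FLAG_IRQS_OFF   0x01
#define FLAG_HARDIRQ    0x02
#define FLAG_SOFTIRQ    0x04

static char irq_char(unsigned flags)
{
        return (flags & FLAG_IRQS_OFF) ? 'd' : '.';
}

static char ctx_char(unsigned flags)
{
        if (flags & FLAG_HARDIRQ)
                return 'h';
        return (flags & FLAG_SOFTIRQ) ? 's' : '.';
}

int main(void)
{
        unsigned flags = FLAG_IRQS_OFF | FLAG_SOFTIRQ;
        int lock_depth = -1, preempt_count = 1;

        /* irq flag, context flag, lock depth ('.' when negative), preempt count */
        printf("%c%c", irq_char(flags), ctx_char(flags));
        if (lock_depth < 0)
                printf(".");
        else
                printf("%d", lock_depth);
        if (preempt_count)
                printf("%x", preempt_count);
        else
                printf(".");
        printf("\n");           /* prints "ds.1" for these values */
        return 0;
}
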
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c6..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
244 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
245 236
246 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
247 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
248 240
249 wakeup_task = p; 241 wakeup_task = p;
@@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
293 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
294 } 286 }
295 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
296 wakeup_reset(tr); 295 wakeup_reset(tr);
297 296
298 /* 297 /*
@@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
328} 328}
329 329
330static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
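
The wakeup tracer changes above register probe_wakeup_migrate_task() on the sched_migrate_task tracepoint and mirror the traced task's CPU into wakeup_current_cpu, which lets wakeup_tracer_call() replace the wakeup_lock/task_cpu() check with a single CPU comparison. A standalone sketch of that fast path, with userspace stand-ins for the kernel types and the tracepoint registration elided:

#include <stdio.h>

struct task { int id; };

static struct task *wakeup_task;        /* task whose wakeup latency is traced */
static int wakeup_current_cpu;          /* kept up to date by the migrate probe */

/* Probe attached to the (stand-in) sched_migrate_task tracepoint. */
static void probe_wakeup_migrate_task(struct task *t, int cpu)
{
        if (t != wakeup_task)
                return;
        wakeup_current_cpu = cpu;       /* follow the task as it moves */
}

/* Per-function callback: cheap early exit on every other CPU. */
static void wakeup_tracer_call(int this_cpu, const char *func)
{
        if (this_cpu != wakeup_current_cpu)
                return;                 /* no lock, no task_cpu() lookup */
        printf("cpu%d: traced %s\n", this_cpu, func);
}

int main(void)
{
        struct task woken = { .id = 42 };

        wakeup_task = &woken;
        wakeup_current_cpu = 0;

        wakeup_tracer_call(0, "do_something");  /* traced */
        probe_wakeup_migrate_task(&woken, 2);   /* task migrates to CPU 2 */
        wakeup_tracer_call(0, "do_something");  /* now skipped */
        wakeup_tracer_call(2, "do_something");  /* traced on the new CPU */
        return 0;
}
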