Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                      |    2
-rw-r--r--  kernel/audit.c                       |    2
-rw-r--r--  kernel/cgroup.c                      |   23
-rw-r--r--  kernel/cpu.c                         |    6
-rw-r--r--  kernel/cpuset.c                      |   23
-rw-r--r--  kernel/cred.c                        |   25
-rw-r--r--  kernel/debug/debug_core.c            |    4
-rw-r--r--  kernel/debug/gdbstub.c               |  191
-rw-r--r--  kernel/debug/kdb/kdb_main.c          |  132
-rw-r--r--  kernel/debug/kdb/kdb_private.h       |    2
-rw-r--r--  kernel/fork.c                        |    2
-rw-r--r--  kernel/hrtimer.c                     |    8
-rw-r--r--  kernel/hw_breakpoint.c               |   90
-rw-r--r--  kernel/lockdep.c                     |    2
-rw-r--r--  kernel/module.c                      | 1092
-rw-r--r--  kernel/padata.c                      |  755
-rw-r--r--  kernel/perf_event.c                  |  460
-rw-r--r--  kernel/pm_qos_params.c               |  215
-rw-r--r--  kernel/posix-cpu-timers.c            |   36
-rw-r--r--  kernel/power/hibernate.c             |   26
-rw-r--r--  kernel/power/main.c                  |   55
-rw-r--r--  kernel/power/snapshot.c              |    2
-rw-r--r--  kernel/power/suspend.c               |   13
-rw-r--r--  kernel/power/swap.c                  |    6
-rw-r--r--  kernel/printk.c                      |   33
-rw-r--r--  kernel/rcupdate.c                    |  160
-rw-r--r--  kernel/rcutiny.c                     |    2
-rw-r--r--  kernel/rcutorture.c                  |    3
-rw-r--r--  kernel/rcutree.c                     |    2
-rw-r--r--  kernel/sched.c                       |  397
-rw-r--r--  kernel/sched_clock.c                 |   95
-rw-r--r--  kernel/sched_cpupri.c                |    8
-rw-r--r--  kernel/sched_cpupri.h                |    2
-rw-r--r--  kernel/sched_debug.c                 |    2
-rw-r--r--  kernel/sched_fair.c                  |  532
-rw-r--r--  kernel/sched_rt.c                    |    3
-rw-r--r--  kernel/sched_stats.h                 |   27
-rw-r--r--  kernel/signal.c                      |    9
-rw-r--r--  kernel/softlockup.c                  |  293
-rw-r--r--  kernel/sysctl.c                      |   55
-rw-r--r--  kernel/time/tick-broadcast.c         |    2
-rw-r--r--  kernel/time/tick-sched.c             |   10
-rw-r--r--  kernel/timer.c                       |   22
-rw-r--r--  kernel/trace/Kconfig                 |   68
-rw-r--r--  kernel/trace/Makefile                |    7
-rw-r--r--  kernel/trace/ftrace.c                |    5
-rw-r--r--  kernel/trace/kmemtrace.c             |  529
-rw-r--r--  kernel/trace/ring_buffer.c           |   40
-rw-r--r--  kernel/trace/trace.c                 |  170
-rw-r--r--  kernel/trace/trace.h                 |  107
-rw-r--r--  kernel/trace/trace_boot.c            |  185
-rw-r--r--  kernel/trace/trace_clock.c           |    7
-rw-r--r--  kernel/trace/trace_entries.h         |   94
-rw-r--r--  kernel/trace/trace_event_perf.c      |   27
-rw-r--r--  kernel/trace/trace_events.c          |  299
-rw-r--r--  kernel/trace/trace_events_filter.c   |   27
-rw-r--r--  kernel/trace/trace_export.c          |    8
-rw-r--r--  kernel/trace/trace_functions.c       |    6
-rw-r--r--  kernel/trace/trace_functions_graph.c |    3
-rw-r--r--  kernel/trace/trace_irqsoff.c         |    3
-rw-r--r--  kernel/trace/trace_kdb.c             |  136
-rw-r--r--  kernel/trace/trace_kprobe.c          |  383
-rw-r--r--  kernel/trace/trace_ksym.c            |  508
-rw-r--r--  kernel/trace/trace_output.c          |   69
-rw-r--r--  kernel/trace/trace_sched_wakeup.c    |    7
-rw-r--r--  kernel/trace/trace_selftest.c        |   87
-rw-r--r--  kernel/trace/trace_stack.c           |    6
-rw-r--r--  kernel/trace/trace_syscalls.c        |    7
-rw-r--r--  kernel/trace/trace_sysprof.c         |  329
-rw-r--r--  kernel/user_namespace.c              |   44
-rw-r--r--  kernel/watchdog.c                    |  567
-rw-r--r--  kernel/workqueue.c                   |   15
-rw-r--r--  kernel/workqueue_sched.h             |   16
73 files changed, 4364 insertions(+), 4224 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..ce53fb2bd1d9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -76,8 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..8296aa516c5a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,7 +407,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 		audit_hold_skb(skb);
 	} else
 		/* drop the extra reference if sent ok */
-		kfree_skb(skb);
+		consume_skb(skb);
 }
 
 static int kauditd_thread(void *dummy)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b0a64b..a8ce09954404 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1788,6 +1788,29 @@ out:
 	return retval;
 }
 
+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	struct cgroup *cur_cg;
+	int retval = 0;
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		cur_cg = task_cgroup_from_root(current, root);
+		retval = cgroup_attach_task(cur_cg, tsk);
+		if (retval)
+			break;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
 /*
  * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
  * held. May take task_lock of task
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone. Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	set_cpu_active(cpu, true);
-
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* used for walking a cpuset heirarchy */
+	/* used for walking a cpuset hierarchy */
 	struct list_head stack_list;
 };
 
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus(). Needs to call cgroup_lock()
  * before calling generate_sched_domains().
  */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				unsigned long phase, void *unused_cpu)
+void cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/cred.c b/kernel/cred.c
index a2d5504fbcc2..60bc8b1e32e6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
 	}
 }
 
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away. Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+	const struct cred *cred;
+
+	rcu_read_lock();
+
+	do {
+		cred = __task_cred((task));
+		BUG_ON(!cred);
+	} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+	rcu_read_unlock();
+	return cred;
+}
+
 /*
  * Allocate blank credentials, such that the credentials can be filled in at a
  * later date without risk of ENOMEM.
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 8bc5eeffec8a..3c2d4972d235 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2000-2001 VERITAS Software Corporation.
  * Copyright (C) 2002-2004 Timesys Corporation
  * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
  * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
  * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
  * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -605,6 +605,8 @@ cpu_master_loop:
 	if (dbg_kdb_mode) {
 		kgdb_connected = 1;
 		error = kdb_stub(ks);
+		if (error == -1)
+			continue;
 		kgdb_connected = 0;
 	} else {
 		error = gdb_serial_stub(ks);
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e8fd6868682d..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2000-2001 VERITAS Software Corporation.
  * Copyright (C) 2002-2004 Timesys Corporation
  * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
  * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
  * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
  * Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
  * GDB remote protocol parser:
  */
 
-static int hex(char ch)
-{
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
 #ifdef CONFIG_KGDB_KDB
 static int gdbstub_read_wait(void)
 {
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
 	buffer[count] = 0;
 
 	if (ch == '#') {
-		xmitcsum = hex(gdbstub_read_wait()) << 4;
-		xmitcsum += hex(gdbstub_read_wait());
+		xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
+		xmitcsum += hex_to_bin(gdbstub_read_wait());
 
 		if (checksum != xmitcsum)
 			/* failed checksum */
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
  * buf. Return a pointer to the last char put in buf (null). May
  * return an error.
  */
-int kgdb_mem2hex(char *mem, char *buf, int count)
+char *kgdb_mem2hex(char *mem, char *buf, int count)
 {
 	char *tmp;
 	int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
 	tmp = buf + count;
 
 	err = probe_kernel_read(tmp, mem, count);
-	if (!err) {
-		while (count > 0) {
-			buf = pack_hex_byte(buf, *tmp);
-			tmp++;
-			count--;
-		}
-
-		*buf = 0;
+	if (err)
+		return NULL;
+	while (count > 0) {
+		buf = pack_hex_byte(buf, *tmp);
+		tmp++;
+		count--;
 	}
+	*buf = 0;
 
-	return err;
+	return buf;
 }
 
 /*
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
 	tmp_hex = tmp_raw - 1;
 	while (tmp_hex >= buf) {
 		tmp_raw--;
-		*tmp_raw = hex(*tmp_hex--);
-		*tmp_raw |= hex(*tmp_hex--) << 4;
+		*tmp_raw = hex_to_bin(*tmp_hex--);
+		*tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
 	}
 
 	return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
 		(*ptr)++;
 	}
 	while (**ptr) {
-		hex_val = hex(**ptr);
+		hex_val = hex_to_bin(**ptr);
 		if (hex_val < 0)
 			break;
 
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
 	return probe_kernel_write(mem, c, size);
 }
 
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+	int i;
+	int idx = 0;
+	char *ptr = (char *)gdb_regs;
+
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		dbg_get_reg(i, ptr + idx, regs);
+		idx += dbg_reg_def[i].size;
+	}
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+	int i;
+	int idx = 0;
+	char *ptr = (char *)gdb_regs;
+
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		dbg_set_reg(i, ptr + idx, regs);
+		idx += dbg_reg_def[i].size;
+	}
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
 /* Write memory due to an 'M' or 'X' packet. */
 static int write_mem_msg(int binary)
 {
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
  * remapped to negative TIDs.
  */
 
-#define BUF_THREAD_ID_SIZE	16
+#define BUF_THREAD_ID_SIZE	8
 
 static char *pack_threadid(char *pkt, unsigned char *id)
 {
-	char *limit;
+	unsigned char *limit;
+	int lzero = 1;
+
+	limit = id + (BUF_THREAD_ID_SIZE / 2);
+	while (id < limit) {
+		if (!lzero || *id != 0) {
+			pkt = pack_hex_byte(pkt, *id);
+			lzero = 0;
+		}
+		id++;
+	}
 
-	limit = pkt + BUF_THREAD_ID_SIZE;
-	while (pkt < limit)
-		pkt = pack_hex_byte(pkt, *id++);
+	if (lzero)
+		pkt = pack_hex_byte(pkt, 0);
 
 	return pkt;
 }
 
 static void int_to_threadref(unsigned char *id, int value)
 {
-	unsigned char *scan;
-	int i = 4;
-
-	scan = (unsigned char *)id;
-	while (i--)
-		*scan++ = 0;
-	put_unaligned_be32(value, scan);
+	put_unaligned_be32(value, id);
 }
 
 static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
 	pack_hex_byte(&remcom_out_buffer[1], ks->signo);
 }
 
-/* Handle the 'g' get registers request */
-static void gdb_cmd_getregs(struct kgdb_state *ks)
+static void gdb_get_regs_helper(struct kgdb_state *ks)
 {
 	struct task_struct *thread;
 	void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 		 */
 		sleeping_thread_to_gdb_regs(gdb_regs, thread);
 	}
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+	gdb_get_regs_helper(ks);
 	kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
 }
 
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
 	char *ptr = &remcom_in_buffer[1];
 	unsigned long length;
 	unsigned long addr;
-	int err;
+	char *err;
 
 	if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
 					kgdb_hex2long(&ptr, &length) > 0) {
 		err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
-		if (err)
-			error_packet(remcom_out_buffer, err);
+		if (!err)
+			error_packet(remcom_out_buffer, -EINVAL);
 	} else {
 		error_packet(remcom_out_buffer, -EINVAL);
 	}
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
 		strcpy(remcom_out_buffer, "OK");
 }
 
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+	int i;
+	int offset = 0;
+
+	for (i = 0; i < regnum; i++)
+		offset += dbg_reg_def[i].size;
+	return kgdb_mem2hex((char *)gdb_regs + offset, out,
+			    dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+	unsigned long regnum;
+	char *ptr = &remcom_in_buffer[1];
+
+	kgdb_hex2long(&ptr, &regnum);
+	if (regnum >= DBG_MAX_REG_NUM) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+	gdb_get_regs_helper(ks);
+	gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+	unsigned long regnum;
+	char *ptr = &remcom_in_buffer[1];
+	int i = 0;
+
+	kgdb_hex2long(&ptr, &regnum);
+	if (*ptr++ != '=' ||
+	    !(!kgdb_usethread || kgdb_usethread == current) ||
+	    !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+	memset(gdb_regs, 0, sizeof(gdb_regs));
+	while (i < sizeof(gdb_regs) * 2)
+		if (hex_to_bin(ptr[i]) >= 0)
+			i++;
+		else
+			break;
+	i = i / 2;
+	kgdb_hex2mem(ptr, (char *)gdb_regs, i);
+	dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+	strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
 /* Handle the 'X' memory binary write bytes */
 static void gdb_cmd_binwrite(struct kgdb_state *ks)
 {
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 {
 	struct task_struct *g;
 	struct task_struct *p;
-	unsigned char thref[8];
+	unsigned char thref[BUF_THREAD_ID_SIZE];
 	char *ptr;
 	int i;
 	int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 		for_each_online_cpu(cpu) {
 			ks->thr_query = 0;
 			int_to_threadref(thref, -cpu - 2);
-			pack_threadid(ptr, thref);
-			ptr += BUF_THREAD_ID_SIZE;
+			ptr = pack_threadid(ptr, thref);
 			*(ptr++) = ',';
 			i++;
 		}
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 		do_each_thread(g, p) {
 			if (i >= ks->thr_query && !finished) {
 				int_to_threadref(thref, p->pid);
-				pack_threadid(ptr, thref);
-				ptr += BUF_THREAD_ID_SIZE;
+				ptr = pack_threadid(ptr, thref);
 				*(ptr++) = ',';
 				ks->thr_query++;
 				if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
 	int error = 0;
 	int tmp;
 
-	/* Clear the out buffer. */
+	/* Initialize comm buffer and globals. */
 	memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+	kgdb_usethread = kgdb_info[ks->cpu].task;
+	ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+	ks->pass_exception = 0;
 
 	if (kgdb_connected) {
-		unsigned char thref[8];
+		unsigned char thref[BUF_THREAD_ID_SIZE];
 		char *ptr;
 
 		/* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		put_packet(remcom_out_buffer);
 	}
 
-	kgdb_usethread = kgdb_info[ks->cpu].task;
-	ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
-	ks->pass_exception = 0;
-
 	while (1) {
 		error = 0;
 
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
 			gdb_cmd_memwrite(ks);
 			break;
+#if DBG_MAX_REG_NUM > 0
+		case 'p': /* pXX Return gdb register XX (in hex) */
+			gdb_cmd_reg_get(ks);
+			break;
+		case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+			gdb_cmd_reg_set(ks);
+			break;
+#endif /* DBG_MAX_REG_NUM > 0 */
 		case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
 			gdb_cmd_binwrite(ks);
 			break;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..8577e45a9a58 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
 
 	if (endp == arg) {
 		/*
-		 * Try base 16, for us folks too lazy to type the
+		 * Also try base 16, for us folks too lazy to type the
 		 * leading 0x...
 		 */
 		val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
 	return 0;
 }
 
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+	char *endp;
+	u64 val;
+
+	val = simple_strtoull(arg, &endp, 0);
+
+	if (endp == arg) {
+
+		val = simple_strtoull(arg, &endp, 16);
+		if (endp == arg)
+			return KDB_BADINT;
+	}
+
+	*value = val;
+
+	return 0;
+}
+
 /*
  * kdb_set - This function implements the 'set' command. Alter an
  * existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
  */
 static int kdb_rd(int argc, const char **argv)
 {
-	int diag = kdb_check_regs();
-	if (diag)
-		return diag;
+	int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+	int i;
+	char *rname;
+	int rsize;
+	u64 reg64;
+	u32 reg32;
+	u16 reg16;
+	u8 reg8;
+
+	if (len)
+		return len;
+
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		rsize = dbg_reg_def[i].size * 2;
+		if (rsize > 16)
+			rsize = 2;
+		if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+			len = 0;
+			kdb_printf("\n");
+		}
+		if (len)
+			len += kdb_printf(" ");
+		switch(dbg_reg_def[i].size * 8) {
+		case 8:
+			rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %02x", rname, reg8);
+			break;
+		case 16:
+			rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %04x", rname, reg16);
+			break;
+		case 32:
+			rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %08x", rname, reg32);
+			break;
+		case 64:
+			rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %016llx", rname, reg64);
+			break;
+		default:
+			len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+		}
+	}
+	kdb_printf("\n");
+#else
+	if (len)
+		return len;
 
 	kdb_dumpregs(kdb_current_regs);
+#endif
 	return 0;
 }
 
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
  * kdb_rm - This function implements the 'rm' (register modify) command.
  *	rm register-name new-contents
  * Remarks:
- *	Currently doesn't allow modification of control or
- *	debug registers.
+ *	Allows register modification with the same restrictions as gdb
  */
 static int kdb_rm(int argc, const char **argv)
 {
+#if DBG_MAX_REG_NUM > 0
 	int diag;
-	int ind = 0;
-	unsigned long contents;
+	const char *rname;
+	int i;
+	u64 reg64;
+	u32 reg32;
+	u16 reg16;
+	u8 reg8;
 
 	if (argc != 2)
 		return KDB_ARGCOUNT;
 	/*
 	 * Allow presence or absence of leading '%' symbol.
 	 */
-	if (argv[1][0] == '%')
-		ind = 1;
+	rname = argv[1];
+	if (*rname == '%')
+		rname++;
 
-	diag = kdbgetularg(argv[2], &contents);
+	diag = kdbgetu64arg(argv[2], &reg64);
 	if (diag)
 		return diag;
 
 	diag = kdb_check_regs();
 	if (diag)
 		return diag;
+
+	diag = KDB_BADREG;
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+			diag = 0;
+			break;
+		}
+	}
+	if (!diag) {
+		switch(dbg_reg_def[i].size * 8) {
+		case 8:
+			reg8 = reg64;
+			dbg_set_reg(i, &reg8, kdb_current_regs);
+			break;
+		case 16:
+			reg16 = reg64;
+			dbg_set_reg(i, &reg16, kdb_current_regs);
+			break;
+		case 32:
+			reg32 = reg64;
+			dbg_set_reg(i, &reg32, kdb_current_regs);
+			break;
+		case 64:
+			dbg_set_reg(i, &reg64, kdb_current_regs);
			break;
+		}
+	}
+	return diag;
+#else
 	kdb_printf("ERROR: Register set currently not implemented\n");
 	return 0;
+#endif
 }
 
 #if defined(CONFIG_MAGIC_SYSRQ)
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..c438f545a321 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
 extern int kdb_putword(unsigned long, unsigned long, size_t);
 
 extern int kdbgetularg(const char *, unsigned long *);
-extern int kdb_set(int, const char **);
 extern char *kdbgetenv(const char *);
-extern int kdbgetintenv(const char *, int *);
 extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
 			 long *, char **);
 extern int kdbgetsymval(const char *, kdb_symtab_t *);
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..a82a65cef741 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~PF_SUPERPRIV;
+	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
 	new_flags |= PF_FORKNOEXEC;
 	new_flags |= PF_STARTING;
 	p->flags = new_flags;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..e934339fbbef 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			return preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+		return get_nohz_timer_target();
 #endif
 	return this_cpu;
 }
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..d71a987fd2bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
 
 static int nr_slots[TYPE_MAX];
 
+/* Keep track of the breakpoints attached to tasks */
+static LIST_HEAD(bp_task_head);
+
 static int constraints_initialized;
 
 /* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 	return 0;
 }
 
-static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
+/*
+ * Count the number of breakpoints of the same type and same task.
+ * The given event must be not on the list.
+ */
+static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-	struct perf_event_context *ctx = tsk->perf_event_ctxp;
-	struct list_head *list;
-	struct perf_event *bp;
-	unsigned long flags;
+	struct perf_event_context *ctx = bp->ctx;
+	struct perf_event *iter;
 	int count = 0;
 
-	if (WARN_ONCE(!ctx, "No perf context for this task"))
-		return 0;
-
-	list = &ctx->event_list;
-
-	raw_spin_lock_irqsave(&ctx->lock, flags);
-
-	/*
-	 * The current breakpoint counter is not included in the list
-	 * at the open() callback time
-	 */
-	list_for_each_entry(bp, list, event_entry) {
-		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-			if (find_slot_idx(bp) == type)
-				count += hw_breakpoint_weight(bp);
+	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
+		if (iter->ctx == ctx && find_slot_idx(iter) == type)
+			count += hw_breakpoint_weight(iter);
 	}
 
-	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-
 	return count;
 }
 
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (!tsk)
 			slots->pinned += max_task_bp_pinned(cpu, type);
 		else
-			slots->pinned += task_bp_pinned(tsk, type);
+			slots->pinned += task_bp_pinned(bp, type);
 		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
 
 		return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (!tsk)
 			nr += max_task_bp_pinned(cpu, type);
 		else
-			nr += task_bp_pinned(tsk, type);
+			nr += task_bp_pinned(bp, type);
 
 		if (nr > slots->pinned)
 			slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
 /*
  * Add a pinned breakpoint for the given task in our constraint table
 */
-static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
 				enum bp_type_idx type, int weight)
 {
 	unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
 	int old_idx = 0;
 	int idx = 0;
 
-	old_count = task_bp_pinned(tsk, type);
+	old_count = task_bp_pinned(bp, type);
 	old_idx = old_count - 1;
 	idx = old_idx + weight;
 
+	/* tsk_pinned[n] is the number of tasks having n breakpoints */
 	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 	if (enable) {
 		tsk_pinned[idx]++;
@@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
 
+	/* Pinned counter cpu profiling */
+	if (!tsk) {
+
+		if (enable)
+			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
+		else
+			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+		return;
+	}
+
 	/* Pinned counter task profiling */
-	if (tsk) {
-		if (cpu >= 0) {
-			toggle_bp_task_slot(tsk, cpu, enable, type, weight);
-			return;
-		}
 
+	if (!enable)
+		list_del(&bp->hw.bp_list);
+
+	if (cpu >= 0) {
+		toggle_bp_task_slot(bp, cpu, enable, type, weight);
+	} else {
 		for_each_online_cpu(cpu)
-			toggle_bp_task_slot(tsk, cpu, enable, type, weight);
-		return;
+			toggle_bp_task_slot(bp, cpu, enable, type, weight);
 	}
 
-	/* Pinned counter cpu profiling */
 	if (enable)
-		per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
-	else
-		per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+		list_add_tail(&bp->hw.bp_list, &bp_task_head);
+}
+
+/*
+ * Function to perform processor-specific cleanup during unregistration
+ */
+__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+	/*
+	 * A weak stub function here for those archs that don't define
+	 * it inside arch/.../kernel/hw_breakpoint.c
+	 */
 }
 
 /*
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	weight = hw_breakpoint_weight(bp);
 
 	fetch_bp_busy_slots(&slots, bp, type);
+	/*
+	 * Simulate the addition of this breakpoint to the constraints
+	 * and see the result.
+	 */
 	fetch_this_slot(&slots, weight);
 
 	/* Flexible counters need to keep at least one slot */
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
 {
 	mutex_lock(&nr_bp_mutex);
 
+	arch_unregister_hw_breakpoint(bp);
 	__release_bp_slot(bp);
 
 	mutex_unlock(&nr_bp_mutex);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
 
 static inline u64 lockstat_clock(void)
 {
-	return cpu_clock(smp_processor_id());
+	return local_clock();
 }
 
 static int lock_point(unsigned long points[], unsigned long ip)
diff --git a/kernel/module.c b/kernel/module.c
index 5d2d28197c82..d0b5f8db11b4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
 /*
    Copyright (C) 2002 Richard Henderson
-   Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+   Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
 }
 EXPORT_SYMBOL(unregister_module_notifier);
 
+struct load_info {
+	Elf_Ehdr *hdr;
+	unsigned long len;
+	Elf_Shdr *sechdrs;
+	char *secstrings, *strtab;
+	unsigned long *strmap;
+	unsigned long symoffs, stroffs;
+	struct _ddebug *debug;
+	unsigned int num_debug;
+	struct {
+		unsigned int sym, str, mod, vers, info, pcpu;
+	} index;
+};
+
 /* We require a truly strong try_module_get(): 0 means failure due to
    ongoing or failed initialization etc. */
 static inline int strong_try_module_get(struct module *mod)
@@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
 EXPORT_SYMBOL(__module_put_and_exit);
 
 /* Find a module section: 0 means not found. */
-static unsigned int find_sec(Elf_Ehdr *hdr,
-			     Elf_Shdr *sechdrs,
-			     const char *secstrings,
-			     const char *name)
+static unsigned int find_sec(const struct load_info *info, const char *name)
 {
 	unsigned int i;
 
-	for (i = 1; i < hdr->e_shnum; i++)
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *shdr = &info->sechdrs[i];
 		/* Alloc bit cleared means "ignore it." */
-		if ((sechdrs[i].sh_flags & SHF_ALLOC)
-		    && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+		if ((shdr->sh_flags & SHF_ALLOC)
+		    && strcmp(info->secstrings + shdr->sh_name, name) == 0)
 			return i;
+	}
 	return 0;
 }
 
 /* Find a module section, or NULL. */
-static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
-			  const char *secstrings, const char *name)
+static void *section_addr(const struct load_info *info, const char *name)
 {
 	/* Section 0 has sh_addr 0. */
-	return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+	return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
 }
 
 /* Find a module section, or NULL. Fill in number of "objects" in section. */
-static void *section_objs(Elf_Ehdr *hdr,
-			  Elf_Shdr *sechdrs,
-			  const char *secstrings,
+static void *section_objs(const struct load_info *info,
 			  const char *name,
 			  size_t object_size,
 			  unsigned int *num)
 {
-	unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+	unsigned int sec = find_sec(info, name);
 
 	/* Section 0 has sh_addr 0 and sh_size 0. */
-	*num = sechdrs[sec].sh_size / object_size;
-	return (void *)sechdrs[sec].sh_addr;
+	*num = info->sechdrs[sec].sh_size / object_size;
+	return (void *)info->sechdrs[sec].sh_addr;
 }
 
 /* Provided by the linker */
@@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
 			    unsigned int symnum, void *data), void *data)
 {
 	struct module *mod;
-	const struct symsearch arr[] = {
+	static const struct symsearch arr[] = {
 		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
 		  NOT_GPL_ONLY, false },
 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
 	mod->percpu = __alloc_reserved_percpu(size, align);
 	if (!mod->percpu) {
 		printk(KERN_WARNING
-		       "Could not allocate %lu bytes percpu data\n", size);
+		       "%s: Could not allocate %lu bytes percpu data\n",
+		       mod->name, size);
 		return -ENOMEM;
 	}
 	mod->percpu_size = size;
@@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod)
 	free_percpu(mod->percpu);
 }
 
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-				 Elf_Shdr *sechdrs,
-				 const char *secstrings)
+static unsigned int find_pcpusec(struct load_info *info)
 {
-	return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
+	return find_sec(info, ".data..percpu");
 }
 
 static void percpu_modcopy(struct module *mod,
@@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
 static inline void percpu_modfree(struct module *mod)
 {
 }
-static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
-					Elf_Shdr *sechdrs,
-					const char *secstrings)
+static unsigned int find_pcpusec(struct load_info *info)
 {
 	return 0;
 }
@@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
 EXPORT_TRACEPOINT_SYMBOL(module_get);
 
 /* Init the unload section of the module. */
-static void module_unload_init(struct module *mod)
+static int module_unload_init(struct module *mod)
 {
-	int cpu;
+	mod->refptr = alloc_percpu(struct module_ref);
+	if (!mod->refptr)
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&mod->source_list);
 	INIT_LIST_HEAD(&mod->target_list);
-	for_each_possible_cpu(cpu) {
-		per_cpu_ptr(mod->refptr, cpu)->incs = 0;
-		per_cpu_ptr(mod->refptr, cpu)->decs = 0;
-	}
 
 	/* Hold reference count during initialization. */
 	__this_cpu_write(mod->refptr->incs, 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
+
+	return 0;
 }
 
 /* Does a already use b? */
@@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod)
 		kfree(use);
 	}
 	mutex_unlock(&module_mutex);
+
+	free_percpu(mod->refptr);
 }
 
 #ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -787,7 +796,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-	ddebug_remove_module(mod->name);
 
 	free_module(mod);
 	return 0;
@@ -892,8 +900,9 @@ int ref_module(struct module *a, struct module *b)
 }
 EXPORT_SYMBOL_GPL(ref_module);
 
-static inline void module_unload_init(struct module *mod)
+static inline int module_unload_init(struct module *mod)
 {
+	return 0;
 }
 #endif /* CONFIG_MODULE_UNLOAD */
 
@@ -1052,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
 #endif /* CONFIG_MODVERSIONS */
 
 /* Resolve a symbol for this module. I.e. if we find one, record usage. */
-static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
-						  unsigned int versindex,
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+						  const struct load_info *info,
 						  const char *name,
-						  struct module *mod,
 						  char ownername[])
 {
 	struct module *owner;
@@ -1069,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
 	if (!sym)
 		goto unlock;
 
-	if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
+	if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
+			   owner)) {
 		sym = ERR_PTR(-EINVAL);
 		goto getname;
 	}
@@ -1088,21 +1097,20 @@ unlock:
 	return sym;
 }
 
-static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
-							unsigned int versindex,
-							const char *name,
-							struct module *mod)
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+		    const struct load_info *info,
		    const char *name)
 {
 	const struct kernel_symbol *ksym;
-	char ownername[MODULE_NAME_LEN];
+	char owner[MODULE_NAME_LEN];
 
 	if (wait_event_interruptible_timeout(module_wq,
-			!IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
-						      mod, ownername)) ||
-			PTR_ERR(ksym) != -EBUSY,
+			!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+			|| PTR_ERR(ksym) != -EBUSY,
 			30 * HZ) <= 0) {
 		printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
-		       mod->name, ownername);
+		       mod->name, owner);
 	}
 	return ksym;
 }
@@ -1111,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
  * /sys/module/foo/sections stuff
  * J. Corbet <corbet@lwn.net>
  */
-#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
+#ifdef CONFIG_SYSFS
 
+#ifdef CONFIG_KALLSYMS
 static inline bool sect_empty(const Elf_Shdr *sect)
 {
 	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1149,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
 	kfree(sect_attrs);
 }
 
-static void add_sect_attrs(struct module *mod, unsigned int nsect,
-		char *secstrings, Elf_Shdr *sechdrs)
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
 {
 	unsigned int nloaded = 0, i, size[2];
 	struct module_sect_attrs *sect_attrs;
@@ -1158,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	struct attribute **gattr;
 
 	/* Count loaded sections and allocate structures */
-	for (i = 0; i < nsect; i++)
-		if (!sect_empty(&sechdrs[i]))
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]))
 			nloaded++;
 	size[0] = ALIGN(sizeof(*sect_attrs)
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1176,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	sect_attrs->nsections = 0;
 	sattr = &sect_attrs->attrs[0];
 	gattr = &sect_attrs->grp.attrs[0];
-	for (i = 0; i < nsect; i++) {
-		if (sect_empty(&sechdrs[i]))
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *sec = &info->sechdrs[i];
+		if (sect_empty(sec))
 			continue;
-		sattr->address = sechdrs[i].sh_addr;
-		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+		sattr->address = sec->sh_addr;
+		sattr->name = kstrdup(info->secstrings + sec->sh_name,
 				      GFP_KERNEL);
 		if (sattr->name == NULL)
 			goto out;
@@ -1248,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
 	kfree(notes_attrs);
 }
 
-static void add_notes_attrs(struct module *mod, unsigned int nsect,
-			    char *secstrings, Elf_Shdr *sechdrs)
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
 {
 	unsigned int notes, loaded, i;
 	struct module_notes_attrs *notes_attrs;
@@ -1261,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
 
 	/* Count notes sections and allocate structures. */
 	notes = 0;
-	for (i = 0; i < nsect; i++)
-		if (!sect_empty(&sechdrs[i]) &&
-		    (sechdrs[i].sh_type == SHT_NOTE))
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]) &&
+		    (info->sechdrs[i].sh_type == SHT_NOTE))
 			++notes;
 
 	if (notes == 0)
@@ -1277,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
 
 	notes_attrs->notes = notes;
 	nattr = &notes_attrs->attrs[0];
-	for (loaded = i = 0; i < nsect; ++i) {
-		if (sect_empty(&sechdrs[i]))
+	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+		if (sect_empty(&info->sechdrs[i]))
 			continue;
-		if (sechdrs[i].sh_type == SHT_NOTE) {
+		if (info->sechdrs[i].sh_type == SHT_NOTE) {
 			sysfs_bin_attr_init(nattr);
 			nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
 			nattr->attr.mode = S_IRUGO;
-			nattr->size = sechdrs[i].sh_size;
-			nattr->private = (void *) sechdrs[i].sh_addr;
+			nattr->size = info->sechdrs[i].sh_size;
+			nattr->private = (void *) info->sechdrs[i].sh_addr;
 			nattr->read = module_notes_read;
 			++nattr;
 		}
@@ -1316,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
 
 #else
 
-static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
-		char *sectstrings, Elf_Shdr *sechdrs)
+static inline void add_sect_attrs(struct module *mod,
+				  const struct load_info *info)
 {
 }
 
@@ -1325,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod)
 {
 }
 
-static inline void add_notes_attrs(struct module *mod, unsigned int nsect,
-		char *sectstrings, Elf_Shdr *sechdrs)
+static inline void add_notes_attrs(struct module *mod,
+				   const struct load_info *info)
 {
 }
 
 static inline void remove_notes_attrs(struct module *mod)
 {
 }
-#endif
+#endif /* CONFIG_KALLSYMS */
 
-#ifdef CONFIG_SYSFS
 static void add_usage_links(struct module *mod)
 {
 #ifdef CONFIG_MODULE_UNLOAD
@@ -1440,6 +1447,7 @@ out:
 }
 
 static int mod_sysfs_setup(struct module *mod,
+			   const struct load_info *info,
 			   struct kernel_param *kparam,
 			   unsigned int num_params)
 {
@@ -1464,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod,
 		goto out_unreg_param;
 
 	add_usage_links(mod);
+	add_sect_attrs(mod, info);
+	add_notes_attrs(mod, info);
 
 	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
 	return 0;
@@ -1480,33 +1490,26 @@ out:
 
 static void mod_sysfs_fini(struct module *mod)
 {
+	remove_notes_attrs(mod);
+	remove_sect_attrs(mod);
 	kobject_put(&mod->mkobj.kobj);
 }
 
-#else /* CONFIG_SYSFS */
-
-static inline int mod_sysfs_init(struct module *mod)
-{
-	return 0;
-}
+#else /* !CONFIG_SYSFS */
 
-static inline int mod_sysfs_setup(struct module *mod,
+static int mod_sysfs_setup(struct module *mod,
+			   const struct load_info *info,
 			   struct kernel_param *kparam,
 			   unsigned int num_params)
 {
 	return 0;
 }
 
-static inline int module_add_modinfo_attrs(struct module *mod)
-{
-	return 0;
-}
-
-static inline void module_remove_modinfo_attrs(struct module *mod)
+static void mod_sysfs_fini(struct module *mod)
 {
 }
 
-static void mod_sysfs_fini(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod)
 {
 }
 
@@ -1516,7 +1519,7 @@ static void del_usage_links(struct module *mod)
 
 #endif /* CONFIG_SYSFS */
 
-static void mod_kobject_remove(struct module *mod)
+static void mod_sysfs_teardown(struct module *mod)
 {
 	del_usage_links(mod);
 	module_remove_modinfo_attrs(mod);
@@ -1546,9 +1549,10 @@ static void free_module(struct module *mod)
 	mutex_lock(&module_mutex);
 	stop_machine(__unlink_module, mod, NULL);
 	mutex_unlock(&module_mutex);
-	remove_notes_attrs(mod);
-	remove_sect_attrs(mod);
-	mod_kobject_remove(mod);
+	mod_sysfs_teardown(mod);
+
+	/* Remove dynamic debug info */
+	ddebug_remove_module(mod->name);
 
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
@@ -1563,10 +1567,7 @@ static void free_module(struct module *mod)
 	module_free(mod, mod->module_init);
 	kfree(mod->args);
1565 percpu_modfree(mod); 1569 percpu_modfree(mod);
1566#if defined(CONFIG_MODULE_UNLOAD) 1570
1567 if (mod->refptr)
1568 free_percpu(mod->refptr);
1569#endif
1570 /* Free lock-classes: */ 1571 /* Free lock-classes: */
1571 lockdep_free_key_range(mod->module_core, mod->core_size); 1572 lockdep_free_key_range(mod->module_core, mod->core_size);
1572 1573
@@ -1632,25 +1633,23 @@ static int verify_export_symbols(struct module *mod)
1632} 1633}
1633 1634
1634/* Change all symbols so that st_value encodes the pointer directly. */ 1635/* Change all symbols so that st_value encodes the pointer directly. */
1635static int simplify_symbols(Elf_Shdr *sechdrs, 1636static int simplify_symbols(struct module *mod, const struct load_info *info)
1636 unsigned int symindex, 1637{
1637 const char *strtab, 1638 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
1638 unsigned int versindex, 1639 Elf_Sym *sym = (void *)symsec->sh_addr;
1639 unsigned int pcpuindex,
1640 struct module *mod)
1641{
1642 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1643 unsigned long secbase; 1640 unsigned long secbase;
1644 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1641 unsigned int i;
1645 int ret = 0; 1642 int ret = 0;
1646 const struct kernel_symbol *ksym; 1643 const struct kernel_symbol *ksym;
1647 1644
1648 for (i = 1; i < n; i++) { 1645 for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
1646 const char *name = info->strtab + sym[i].st_name;
1647
1649 switch (sym[i].st_shndx) { 1648 switch (sym[i].st_shndx) {
1650 case SHN_COMMON: 1649 case SHN_COMMON:
1651 /* We compiled with -fno-common. These are not 1650 /* We compiled with -fno-common. These are not
1652 supposed to happen. */ 1651 supposed to happen. */
1653 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1652 DEBUGP("Common symbol: %s\n", name);
1654 printk("%s: please compile with -fno-common\n", 1653 printk("%s: please compile with -fno-common\n",
1655 mod->name); 1654 mod->name);
1656 ret = -ENOEXEC; 1655 ret = -ENOEXEC;
@@ -1663,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1663 break; 1662 break;
1664 1663
1665 case SHN_UNDEF: 1664 case SHN_UNDEF:
1666 ksym = resolve_symbol_wait(sechdrs, versindex, 1665 ksym = resolve_symbol_wait(mod, info, name);
1667 strtab + sym[i].st_name,
1668 mod);
1669 /* Ok if resolved. */ 1666 /* Ok if resolved. */
1670 if (ksym && !IS_ERR(ksym)) { 1667 if (ksym && !IS_ERR(ksym)) {
1671 sym[i].st_value = ksym->value; 1668 sym[i].st_value = ksym->value;
@@ -1677,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1677 break; 1674 break;
1678 1675
1679 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", 1676 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1680 mod->name, strtab + sym[i].st_name, 1677 mod->name, name, PTR_ERR(ksym));
1681 PTR_ERR(ksym));
1682 ret = PTR_ERR(ksym) ?: -ENOENT; 1678 ret = PTR_ERR(ksym) ?: -ENOENT;
1683 break; 1679 break;
1684 1680
1685 default: 1681 default:
1686 /* Divert to percpu allocation if a percpu var. */ 1682 /* Divert to percpu allocation if a percpu var. */
1687 if (sym[i].st_shndx == pcpuindex) 1683 if (sym[i].st_shndx == info->index.pcpu)
1688 secbase = (unsigned long)mod_percpu(mod); 1684 secbase = (unsigned long)mod_percpu(mod);
1689 else 1685 else
1690 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1686 secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
1691 sym[i].st_value += secbase; 1687 sym[i].st_value += secbase;
1692 break; 1688 break;
1693 } 1689 }
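Editor's note: what "st_value encodes the pointer directly" means in practice is that, for an ordinary defined symbol, simplify_symbols() adds the final address of the symbol's section to the section-relative offset the compiler stored. A minimal userspace illustration; the address and offset are made up.

#include <elf.h>
#include <stdio.h>

/* Once the sections sit at their final location, sh_addr of the symbol's
 * section plus the original st_value gives the absolute address. */
static unsigned long resolve_value(const Elf64_Sym *sym, unsigned long sec_addr)
{
	return sec_addr + sym->st_value;		/* offset -> pointer */
}

int main(void)
{
	Elf64_Sym s = { .st_value = 0x40 };		/* 0x40 into its section */
	unsigned long sec = 0xffffffffa0000000UL;	/* hypothetical sh_addr  */

	printf("resolved st_value: %#lx\n", resolve_value(&s, sec));
	return 0;
}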
@@ -1696,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1696 return ret; 1692 return ret;
1697} 1693}
1698 1694
1695static int apply_relocations(struct module *mod, const struct load_info *info)
1696{
1697 unsigned int i;
1698 int err = 0;
1699
1700 /* Now do relocations. */
1701 for (i = 1; i < info->hdr->e_shnum; i++) {
1702 unsigned int infosec = info->sechdrs[i].sh_info;
1703
1704 /* Not a valid relocation section? */
1705 if (infosec >= info->hdr->e_shnum)
1706 continue;
1707
1708 /* Don't bother with non-allocated sections */
1709 if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
1710 continue;
1711
1712 if (info->sechdrs[i].sh_type == SHT_REL)
1713 err = apply_relocate(info->sechdrs, info->strtab,
1714 info->index.sym, i, mod);
1715 else if (info->sechdrs[i].sh_type == SHT_RELA)
1716 err = apply_relocate_add(info->sechdrs, info->strtab,
1717 info->index.sym, i, mod);
1718 if (err < 0)
1719 break;
1720 }
1721 return err;
1722}
1723
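Editor's note: the loop above leans on two ELF conventions: sh_info of a SHT_REL or SHT_RELA section is the index of the section its entries patch, and RELA entries carry an explicit addend while REL entries keep it in the patched location. A small self-contained sketch of the same target-selection logic; the two-section table is invented for the example.

#include <elf.h>
#include <stdio.h>
#include <string.h>

static void show_reloc_targets(const Elf64_Shdr *shdrs, unsigned int shnum)
{
	unsigned int i;

	for (i = 1; i < shnum; i++) {
		if (shdrs[i].sh_type != SHT_REL && shdrs[i].sh_type != SHT_RELA)
			continue;
		if (shdrs[i].sh_info >= shnum)		/* bogus target index */
			continue;
		if (!(shdrs[shdrs[i].sh_info].sh_flags & SHF_ALLOC))
			continue;			/* target not loaded  */
		printf("section %u patches section %u (%s addend)\n",
		       i, (unsigned int)shdrs[i].sh_info,
		       shdrs[i].sh_type == SHT_RELA ? "explicit" : "in-place");
	}
}

int main(void)
{
	Elf64_Shdr shdrs[3];

	memset(shdrs, 0, sizeof(shdrs));
	shdrs[1].sh_flags = SHF_ALLOC;			/* say, .text      */
	shdrs[2].sh_type = SHT_RELA;			/* say, .rela.text */
	shdrs[2].sh_info = 1;

	show_reloc_targets(shdrs, 3);
	return 0;
}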
1699/* Additional bytes needed by arch in front of individual sections */ 1724/* Additional bytes needed by arch in front of individual sections */
1700unsigned int __weak arch_mod_section_prepend(struct module *mod, 1725unsigned int __weak arch_mod_section_prepend(struct module *mod,
1701 unsigned int section) 1726 unsigned int section)
@@ -1720,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size,
1720 might -- code, read-only data, read-write data, small data. Tally 1745 might -- code, read-only data, read-write data, small data. Tally
1721 sizes, and place the offsets into sh_entsize fields: high bit means it 1746 sizes, and place the offsets into sh_entsize fields: high bit means it
1722 belongs in init. */ 1747 belongs in init. */
1723static void layout_sections(struct module *mod, 1748static void layout_sections(struct module *mod, struct load_info *info)
1724 const Elf_Ehdr *hdr,
1725 Elf_Shdr *sechdrs,
1726 const char *secstrings)
1727{ 1749{
1728 static unsigned long const masks[][2] = { 1750 static unsigned long const masks[][2] = {
1729 /* NOTE: all executable code must be the first section 1751 /* NOTE: all executable code must be the first section
@@ -1736,21 +1758,22 @@ static void layout_sections(struct module *mod,
1736 }; 1758 };
1737 unsigned int m, i; 1759 unsigned int m, i;
1738 1760
1739 for (i = 0; i < hdr->e_shnum; i++) 1761 for (i = 0; i < info->hdr->e_shnum; i++)
1740 sechdrs[i].sh_entsize = ~0UL; 1762 info->sechdrs[i].sh_entsize = ~0UL;
1741 1763
1742 DEBUGP("Core section allocation order:\n"); 1764 DEBUGP("Core section allocation order:\n");
1743 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1765 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1744 for (i = 0; i < hdr->e_shnum; ++i) { 1766 for (i = 0; i < info->hdr->e_shnum; ++i) {
1745 Elf_Shdr *s = &sechdrs[i]; 1767 Elf_Shdr *s = &info->sechdrs[i];
1768 const char *sname = info->secstrings + s->sh_name;
1746 1769
1747 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1770 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1748 || (s->sh_flags & masks[m][1]) 1771 || (s->sh_flags & masks[m][1])
1749 || s->sh_entsize != ~0UL 1772 || s->sh_entsize != ~0UL
1750 || strstarts(secstrings + s->sh_name, ".init")) 1773 || strstarts(sname, ".init"))
1751 continue; 1774 continue;
1752 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1775 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1753 DEBUGP("\t%s\n", secstrings + s->sh_name); 1776 DEBUGP("\t%s\n", name);
1754 } 1777 }
1755 if (m == 0) 1778 if (m == 0)
1756 mod->core_text_size = mod->core_size; 1779 mod->core_text_size = mod->core_size;
@@ -1758,17 +1781,18 @@ static void layout_sections(struct module *mod,
1758 1781
1759 DEBUGP("Init section allocation order:\n"); 1782 DEBUGP("Init section allocation order:\n");
1760 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1783 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1761 for (i = 0; i < hdr->e_shnum; ++i) { 1784 for (i = 0; i < info->hdr->e_shnum; ++i) {
1762 Elf_Shdr *s = &sechdrs[i]; 1785 Elf_Shdr *s = &info->sechdrs[i];
1786 const char *sname = info->secstrings + s->sh_name;
1763 1787
1764 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1788 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1765 || (s->sh_flags & masks[m][1]) 1789 || (s->sh_flags & masks[m][1])
1766 || s->sh_entsize != ~0UL 1790 || s->sh_entsize != ~0UL
1767 || !strstarts(secstrings + s->sh_name, ".init")) 1791 || !strstarts(sname, ".init"))
1768 continue; 1792 continue;
1769 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1793 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1770 | INIT_OFFSET_MASK); 1794 | INIT_OFFSET_MASK);
1771 DEBUGP("\t%s\n", secstrings + s->sh_name); 1795 DEBUGP("\t%s\n", sname);
1772 } 1796 }
1773 if (m == 0) 1797 if (m == 0)
1774 mod->init_text_size = mod->init_size; 1798 mod->init_text_size = mod->init_size;
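Editor's note: the "high bit means it belongs in init" trick mentioned in the comment above is easy to miss. Each section's offset inside the module image is parked in sh_entsize, and the top bit distinguishes the init region from the core region, so move_module() later knows which base address to add. A tiny standalone demonstration; the mask here is simply the most significant bit of an unsigned long, mirroring what module.c calls INIT_OFFSET_MASK.

#include <stdio.h>

#define INIT_OFFSET_MASK (1UL << (sizeof(unsigned long) * 8 - 1))

int main(void)
{
	unsigned long encoded[] = {
		0x1000,				/* core section at offset 0x1000 */
		0x200 | INIT_OFFSET_MASK,	/* init section at offset 0x200  */
	};

	for (int i = 0; i < 2; i++) {
		unsigned long e = encoded[i];

		if (e & INIT_OFFSET_MASK)
			printf("module_init + %#lx\n", e & ~INIT_OFFSET_MASK);
		else
			printf("module_core + %#lx\n", e);
	}
	return 0;
}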
@@ -1807,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize)
1807 return string; 1831 return string;
1808} 1832}
1809 1833
1810static char *get_modinfo(Elf_Shdr *sechdrs, 1834static char *get_modinfo(struct load_info *info, const char *tag)
1811 unsigned int info,
1812 const char *tag)
1813{ 1835{
1814 char *p; 1836 char *p;
1815 unsigned int taglen = strlen(tag); 1837 unsigned int taglen = strlen(tag);
1816 unsigned long size = sechdrs[info].sh_size; 1838 Elf_Shdr *infosec = &info->sechdrs[info->index.info];
1839 unsigned long size = infosec->sh_size;
1817 1840
1818 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1841 for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
1819 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1842 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1820 return p + taglen + 1; 1843 return p + taglen + 1;
1821 } 1844 }
1822 return NULL; 1845 return NULL;
1823} 1846}
1824 1847
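Editor's note: get_modinfo() works because the .modinfo section is nothing more than NUL-terminated "tag=value" strings packed back to back. A userspace sketch of the same walk; the blob and the vermagic string below are made up, not taken from a real module.

#include <stdio.h>
#include <string.h>

static const char *find_tag(const char *blob, unsigned long size, const char *tag)
{
	unsigned long taglen = strlen(tag);
	const char *p = blob;

	while (size) {
		unsigned long len = strnlen(p, size);

		if (len > taglen && strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
			return p + taglen + 1;
		if (len == size)
			break;
		p += len + 1;			/* skip the string and its NUL */
		size -= len + 1;
	}
	return NULL;
}

int main(void)
{
	static const char modinfo[] =
		"license=GPL\0vermagic=2.6.36 SMP mod_unload\0author=example";

	printf("license:  %s\n", find_tag(modinfo, sizeof(modinfo), "license"));
	printf("vermagic: %s\n", find_tag(modinfo, sizeof(modinfo), "vermagic"));
	return 0;
}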
1825static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1848static void setup_modinfo(struct module *mod, struct load_info *info)
1826 unsigned int infoindex)
1827{ 1849{
1828 struct module_attribute *attr; 1850 struct module_attribute *attr;
1829 int i; 1851 int i;
1830 1852
1831 for (i = 0; (attr = modinfo_attrs[i]); i++) { 1853 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1832 if (attr->setup) 1854 if (attr->setup)
1833 attr->setup(mod, 1855 attr->setup(mod, get_modinfo(info, attr->attr.name));
1834 get_modinfo(sechdrs,
1835 infoindex,
1836 attr->attr.name));
1837 } 1856 }
1838} 1857}
1839 1858
@@ -1874,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value,
1874} 1893}
1875 1894
1876/* As per nm */ 1895/* As per nm */
1877static char elf_type(const Elf_Sym *sym, 1896static char elf_type(const Elf_Sym *sym, const struct load_info *info)
1878 Elf_Shdr *sechdrs,
1879 const char *secstrings,
1880 struct module *mod)
1881{ 1897{
1898 const Elf_Shdr *sechdrs = info->sechdrs;
1899
1882 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1900 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1883 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1901 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1884 return 'v'; 1902 return 'v';
@@ -1908,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym,
1908 else 1926 else
1909 return 'b'; 1927 return 'b';
1910 } 1928 }
1911 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) 1929 if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
1930 ".debug")) {
1912 return 'n'; 1931 return 'n';
1932 }
1913 return '?'; 1933 return '?';
1914} 1934}
1915 1935
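Editor's note: for readers who have not memorized the nm letter scheme, the character elf_type() computes mostly reflects the flags of the section the symbol lives in (text, read-only data, bss, data), with special cases for weak, undefined and absolute symbols. A simplified sketch covering only the section-flag cases; the uppercasing for global symbols is how nm presents them, not something elf_type() itself does.

#include <ctype.h>
#include <elf.h>
#include <stdio.h>

static char classify(const Elf64_Sym *sym, const Elf64_Shdr *sec)
{
	char c;

	if (sec->sh_flags & SHF_EXECINSTR)
		c = 't';			/* text               */
	else if (!(sec->sh_flags & SHF_WRITE))
		c = 'r';			/* read-only data     */
	else if (sec->sh_type == SHT_NOBITS)
		c = 'b';			/* zero-filled (.bss) */
	else
		c = 'd';			/* initialised data   */

	return ELF64_ST_BIND(sym->st_info) == STB_GLOBAL ? toupper(c) : c;
}

int main(void)
{
	Elf64_Shdr text = { .sh_flags = SHF_ALLOC | SHF_EXECINSTR };
	Elf64_Sym local = { .st_info = ELF64_ST_INFO(STB_LOCAL, STT_FUNC) };
	Elf64_Sym global = { .st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC) };

	printf("%c %c\n", classify(&local, &text), classify(&global, &text));
	return 0;
}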
@@ -1934,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1934 return true; 1954 return true;
1935} 1955}
1936 1956
1937static unsigned long layout_symtab(struct module *mod, 1957static void layout_symtab(struct module *mod, struct load_info *info)
1938 Elf_Shdr *sechdrs,
1939 unsigned int symindex,
1940 unsigned int strindex,
1941 const Elf_Ehdr *hdr,
1942 const char *secstrings,
1943 unsigned long *pstroffs,
1944 unsigned long *strmap)
1945{ 1958{
1946 unsigned long symoffs; 1959 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
1947 Elf_Shdr *symsect = sechdrs + symindex; 1960 Elf_Shdr *strsect = info->sechdrs + info->index.str;
1948 Elf_Shdr *strsect = sechdrs + strindex;
1949 const Elf_Sym *src; 1961 const Elf_Sym *src;
1950 const char *strtab;
1951 unsigned int i, nsrc, ndst; 1962 unsigned int i, nsrc, ndst;
1952 1963
1953 /* Put symbol section at end of init part of module. */ 1964 /* Put symbol section at end of init part of module. */
1954 symsect->sh_flags |= SHF_ALLOC; 1965 symsect->sh_flags |= SHF_ALLOC;
1955 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 1966 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1956 symindex) | INIT_OFFSET_MASK; 1967 info->index.sym) | INIT_OFFSET_MASK;
1957 DEBUGP("\t%s\n", secstrings + symsect->sh_name); 1968 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
1958 1969
1959 src = (void *)hdr + symsect->sh_offset; 1970 src = (void *)info->hdr + symsect->sh_offset;
1960 nsrc = symsect->sh_size / sizeof(*src); 1971 nsrc = symsect->sh_size / sizeof(*src);
1961 strtab = (void *)hdr + strsect->sh_offset;
1962 for (ndst = i = 1; i < nsrc; ++i, ++src) 1972 for (ndst = i = 1; i < nsrc; ++i, ++src)
1963 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { 1973 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
1964 unsigned int j = src->st_name; 1974 unsigned int j = src->st_name;
1965 1975
1966 while(!__test_and_set_bit(j, strmap) && strtab[j]) 1976 while (!__test_and_set_bit(j, info->strmap)
1977 && info->strtab[j])
1967 ++j; 1978 ++j;
1968 ++ndst; 1979 ++ndst;
1969 } 1980 }
1970 1981
1971 /* Append room for core symbols at end of core part. */ 1982 /* Append room for core symbols at end of core part. */
1972 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 1983 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1973 mod->core_size = symoffs + ndst * sizeof(Elf_Sym); 1984 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
1974 1985
1975 /* Put string table section at end of init part of module. */ 1986 /* Put string table section at end of init part of module. */
1976 strsect->sh_flags |= SHF_ALLOC; 1987 strsect->sh_flags |= SHF_ALLOC;
1977 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 1988 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1978 strindex) | INIT_OFFSET_MASK; 1989 info->index.str) | INIT_OFFSET_MASK;
1979 DEBUGP("\t%s\n", secstrings + strsect->sh_name); 1990 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
1980 1991
1981 /* Append room for core symbols' strings at end of core part. */ 1992 /* Append room for core symbols' strings at end of core part. */
1982 *pstroffs = mod->core_size; 1993 info->stroffs = mod->core_size;
1983 __set_bit(0, strmap); 1994 __set_bit(0, info->strmap);
1984 mod->core_size += bitmap_weight(strmap, strsect->sh_size); 1995 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
1985
1986 return symoffs;
1987} 1996}
1988 1997
1989static void add_kallsyms(struct module *mod, 1998static void add_kallsyms(struct module *mod, const struct load_info *info)
1990 Elf_Shdr *sechdrs,
1991 unsigned int shnum,
1992 unsigned int symindex,
1993 unsigned int strindex,
1994 unsigned long symoffs,
1995 unsigned long stroffs,
1996 const char *secstrings,
1997 unsigned long *strmap)
1998{ 1999{
1999 unsigned int i, ndst; 2000 unsigned int i, ndst;
2000 const Elf_Sym *src; 2001 const Elf_Sym *src;
2001 Elf_Sym *dst; 2002 Elf_Sym *dst;
2002 char *s; 2003 char *s;
2004 Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
2003 2005
2004 mod->symtab = (void *)sechdrs[symindex].sh_addr; 2006 mod->symtab = (void *)symsec->sh_addr;
2005 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 2007 mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2006 mod->strtab = (void *)sechdrs[strindex].sh_addr; 2008 /* Make sure we get permanent strtab: don't use info->strtab. */
2009 mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2007 2010
2008 /* Set types up while we still have access to sections. */ 2011 /* Set types up while we still have access to sections. */
2009 for (i = 0; i < mod->num_symtab; i++) 2012 for (i = 0; i < mod->num_symtab; i++)
2010 mod->symtab[i].st_info 2013 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2011 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
2012 2014
2013 mod->core_symtab = dst = mod->module_core + symoffs; 2015 mod->core_symtab = dst = mod->module_core + info->symoffs;
2014 src = mod->symtab; 2016 src = mod->symtab;
2015 *dst = *src; 2017 *dst = *src;
2016 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2018 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2017 if (!is_core_symbol(src, sechdrs, shnum)) 2019 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2018 continue; 2020 continue;
2019 dst[ndst] = *src; 2021 dst[ndst] = *src;
2020 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); 2022 dst[ndst].st_name = bitmap_weight(info->strmap,
2023 dst[ndst].st_name);
2021 ++ndst; 2024 ++ndst;
2022 } 2025 }
2023 mod->core_num_syms = ndst; 2026 mod->core_num_syms = ndst;
2024 2027
2025 mod->core_strtab = s = mod->module_core + stroffs; 2028 mod->core_strtab = s = mod->module_core + info->stroffs;
2026 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) 2029 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2027 if (test_bit(i, strmap)) 2030 if (test_bit(i, info->strmap))
2028 *++s = mod->strtab[i]; 2031 *++s = mod->strtab[i];
2029} 2032}
2030#else 2033#else
2031static inline unsigned long layout_symtab(struct module *mod, 2034static inline void layout_symtab(struct module *mod, struct load_info *info)
2032 Elf_Shdr *sechdrs,
2033 unsigned int symindex,
2034 unsigned int strindex,
2035 const Elf_Ehdr *hdr,
2036 const char *secstrings,
2037 unsigned long *pstroffs,
2038 unsigned long *strmap)
2039{ 2035{
2040 return 0;
2041} 2036}
2042 2037
2043static inline void add_kallsyms(struct module *mod, 2038static void add_kallsyms(struct module *mod, struct load_info *info)
2044 Elf_Shdr *sechdrs,
2045 unsigned int shnum,
2046 unsigned int symindex,
2047 unsigned int strindex,
2048 unsigned long symoffs,
2049 unsigned long stroffs,
2050 const char *secstrings,
2051 const unsigned long *strmap)
2052{ 2039{
2053} 2040}
2054#endif /* CONFIG_KALLSYMS */ 2041#endif /* CONFIG_KALLSYMS */
2055 2042
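Editor's note: the strmap bitmap deserves a word, because the trick is spread across layout_symtab() and add_kallsyms(). Every byte of a string that a core symbol needs, including its NUL, gets its bit set; the compacted string table is just the marked bytes copied in order; and a symbol's new st_name is the number of marked bytes in front of its old offset. A userspace model of that round trip, using a plain char array in place of the kernel bitmap.

#include <stdio.h>

static unsigned int weight(const unsigned char *map, unsigned int upto)
{
	unsigned int i, w = 0;

	for (i = 0; i < upto; i++)
		w += map[i];
	return w;
}

int main(void)
{
	/* Old string table; keep "alpha" (offset 1) and "gamma" (offset 12). */
	const char strtab[] = "\0alpha\0beta\0gamma";
	unsigned char map[sizeof(strtab)] = { 1 };	/* byte 0 always kept */
	const unsigned int keep[] = { 1, 12 };
	char out[sizeof(strtab)];
	unsigned int i, j, n;

	for (i = 0; i < 2; i++) {			/* layout_symtab()    */
		for (j = keep[i]; strtab[j]; j++)
			map[j] = 1;			/* mark name bytes    */
		map[j] = 1;				/* ...and its NUL     */
	}

	for (n = 0, i = 0; i < sizeof(strtab); i++)	/* add_kallsyms()     */
		if (map[i])
			out[n++] = strtab[i];

	for (i = 0; i < 2; i++)
		printf("st_name %u -> %u (\"%s\")\n",
		       keep[i], weight(map, keep[i]), &out[weight(map, keep[i])]);
	return 0;
}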
2056static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) 2043static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
2057{ 2044{
2045 if (!debug)
2046 return;
2058#ifdef CONFIG_DYNAMIC_DEBUG 2047#ifdef CONFIG_DYNAMIC_DEBUG
2059 if (ddebug_add_module(debug, num, debug->modname)) 2048 if (ddebug_add_module(debug, num, debug->modname))
2060 printk(KERN_ERR "dynamic debug error adding module: %s\n", 2049 printk(KERN_ERR "dynamic debug error adding module: %s\n",
@@ -2085,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size)
2085} 2074}
2086 2075
2087#ifdef CONFIG_DEBUG_KMEMLEAK 2076#ifdef CONFIG_DEBUG_KMEMLEAK
2088static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2077static void kmemleak_load_module(const struct module *mod,
2089 Elf_Shdr *sechdrs, char *secstrings) 2078 const struct load_info *info)
2090{ 2079{
2091 unsigned int i; 2080 unsigned int i;
2092 2081
2093 /* only scan the sections containing data */ 2082 /* only scan the sections containing data */
2094 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2083 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2095 2084
2096 for (i = 1; i < hdr->e_shnum; i++) { 2085 for (i = 1; i < info->hdr->e_shnum; i++) {
2097 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2086 const char *name = info->secstrings + info->sechdrs[i].sh_name;
2087 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
2098 continue; 2088 continue;
2099 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 2089 if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
2100 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2101 continue; 2090 continue;
2102 2091
2103 kmemleak_scan_area((void *)sechdrs[i].sh_addr, 2092 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
2104 sechdrs[i].sh_size, GFP_KERNEL); 2093 info->sechdrs[i].sh_size, GFP_KERNEL);
2105 } 2094 }
2106} 2095}
2107#else 2096#else
2108static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, 2097static inline void kmemleak_load_module(const struct module *mod,
2109 Elf_Shdr *sechdrs, char *secstrings) 2098 const struct load_info *info)
2110{ 2099{
2111} 2100}
2112#endif 2101#endif
2113 2102
2114/* Allocate and load the module: note that size of section 0 is always 2103/* Sets info->hdr and info->len. */
2115 zero, and we rely on this for optional sections. */ 2104static int copy_and_check(struct load_info *info,
2116static noinline struct module *load_module(void __user *umod, 2105 const void __user *umod, unsigned long len,
2117 unsigned long len, 2106 const char __user *uargs)
2118 const char __user *uargs)
2119{ 2107{
2108 int err;
2120 Elf_Ehdr *hdr; 2109 Elf_Ehdr *hdr;
2121 Elf_Shdr *sechdrs;
2122 char *secstrings, *args, *modmagic, *strtab = NULL;
2123 char *staging;
2124 unsigned int i;
2125 unsigned int symindex = 0;
2126 unsigned int strindex = 0;
2127 unsigned int modindex, versindex, infoindex, pcpuindex;
2128 struct module *mod;
2129 long err = 0;
2130 void *ptr = NULL; /* Stops spurious gcc warning */
2131 unsigned long symoffs, stroffs, *strmap;
2132 void __percpu *percpu;
2133 struct _ddebug *debug = NULL;
2134 unsigned int num_debug = 0;
2135
2136 mm_segment_t old_fs;
2137 2110
2138 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2139 umod, len, uargs);
2140 if (len < sizeof(*hdr)) 2111 if (len < sizeof(*hdr))
2141 return ERR_PTR(-ENOEXEC); 2112 return -ENOEXEC;
2142 2113
2143 /* Suck in entire file: we'll want most of it. */ 2114 /* Suck in entire file: we'll want most of it. */
2144 /* vmalloc barfs on "unusual" numbers. Check here */ 2115 /* vmalloc barfs on "unusual" numbers. Check here */
2145 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 2116 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2146 return ERR_PTR(-ENOMEM); 2117 return -ENOMEM;
2147 2118
2148 if (copy_from_user(hdr, umod, len) != 0) { 2119 if (copy_from_user(hdr, umod, len) != 0) {
2149 err = -EFAULT; 2120 err = -EFAULT;
@@ -2151,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod,
2151 } 2122 }
2152 2123
2153 /* Sanity checks against insmoding binaries or wrong arch, 2124 /* Sanity checks against insmoding binaries or wrong arch,
2154 weird elf version */ 2125 weird elf version */
2155 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2126 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2156 || hdr->e_type != ET_REL 2127 || hdr->e_type != ET_REL
2157 || !elf_check_arch(hdr) 2128 || !elf_check_arch(hdr)
2158 || hdr->e_shentsize != sizeof(*sechdrs)) { 2129 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2130 err = -ENOEXEC;
2131 goto free_hdr;
2132 }
2133
2134 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2159 err = -ENOEXEC; 2135 err = -ENOEXEC;
2160 goto free_hdr; 2136 goto free_hdr;
2161 } 2137 }
2162 2138
2163 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 2139 info->hdr = hdr;
2164 goto truncated; 2140 info->len = len;
2141 return 0;
2142
2143free_hdr:
2144 vfree(hdr);
2145 return err;
2146}
2147
2148static void free_copy(struct load_info *info)
2149{
2150 vfree(info->hdr);
2151}
2152
2153static int rewrite_section_headers(struct load_info *info)
2154{
2155 unsigned int i;
2165 2156
2166 /* Convenience variables */ 2157 /* This should always be true, but let's be sure. */
2167 sechdrs = (void *)hdr + hdr->e_shoff; 2158 info->sechdrs[0].sh_addr = 0;
2168 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
2169 sechdrs[0].sh_addr = 0;
2170 2159
2171 for (i = 1; i < hdr->e_shnum; i++) { 2160 for (i = 1; i < info->hdr->e_shnum; i++) {
2172 if (sechdrs[i].sh_type != SHT_NOBITS 2161 Elf_Shdr *shdr = &info->sechdrs[i];
2173 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 2162 if (shdr->sh_type != SHT_NOBITS
2174 goto truncated; 2163 && info->len < shdr->sh_offset + shdr->sh_size) {
2164 printk(KERN_ERR "Module len %lu truncated\n",
2165 info->len);
2166 return -ENOEXEC;
2167 }
2175 2168
2176 /* Mark all sections sh_addr with their address in the 2169 /* Mark all sections sh_addr with their address in the
2177 temporary image. */ 2170 temporary image. */
2178 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 2171 shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
2179 2172
2180 /* Internal symbols and strings. */
2181 if (sechdrs[i].sh_type == SHT_SYMTAB) {
2182 symindex = i;
2183 strindex = sechdrs[i].sh_link;
2184 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
2185 }
2186#ifndef CONFIG_MODULE_UNLOAD 2173#ifndef CONFIG_MODULE_UNLOAD
2187 /* Don't load .exit sections */ 2174 /* Don't load .exit sections */
2188 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 2175 if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
2189 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 2176 shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
2190#endif 2177#endif
2191 } 2178 }
2192 2179
2193 modindex = find_sec(hdr, sechdrs, secstrings, 2180 /* Track but don't keep modinfo and version sections. */
2194 ".gnu.linkonce.this_module"); 2181 info->index.vers = find_sec(info, "__versions");
2195 if (!modindex) { 2182 info->index.info = find_sec(info, ".modinfo");
2183 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2184 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2185 return 0;
2186}
2187
2188/*
2189 * Set up our basic convenience variables (pointers to section headers,
2190 * search for module section index etc), and do some basic section
2191 * verification.
2192 *
2193 * Return the temporary module pointer (we'll replace it with the final
2194 * one when we move the module sections around).
2195 */
2196static struct module *setup_load_info(struct load_info *info)
2197{
2198 unsigned int i;
2199 int err;
2200 struct module *mod;
2201
2202 /* Set up the convenience variables */
2203 info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
2204 info->secstrings = (void *)info->hdr
2205 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2206
2207 err = rewrite_section_headers(info);
2208 if (err)
2209 return ERR_PTR(err);
2210
2211 /* Find internal symbols and strings. */
2212 for (i = 1; i < info->hdr->e_shnum; i++) {
2213 if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
2214 info->index.sym = i;
2215 info->index.str = info->sechdrs[i].sh_link;
2216 info->strtab = (char *)info->hdr
2217 + info->sechdrs[info->index.str].sh_offset;
2218 break;
2219 }
2220 }
2221
2222 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2223 if (!info->index.mod) {
2196 printk(KERN_WARNING "No module found in object\n"); 2224 printk(KERN_WARNING "No module found in object\n");
2197 err = -ENOEXEC; 2225 return ERR_PTR(-ENOEXEC);
2198 goto free_hdr;
2199 } 2226 }
2200 /* This is temporary: point mod into copy of data. */ 2227 /* This is temporary: point mod into copy of data. */
2201 mod = (void *)sechdrs[modindex].sh_addr; 2228 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2202 2229
2203 if (symindex == 0) { 2230 if (info->index.sym == 0) {
2204 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 2231 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
2205 mod->name); 2232 mod->name);
2206 err = -ENOEXEC; 2233 return ERR_PTR(-ENOEXEC);
2207 goto free_hdr;
2208 } 2234 }
2209 2235
2210 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 2236 info->index.pcpu = find_pcpusec(info);
2211 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
2212 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
2213
2214 /* Don't keep modinfo and version sections. */
2215 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2216 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2217 2237
2218 /* Check module struct version now, before we try to use module. */ 2238 /* Check module struct version now, before we try to use module. */
2219 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2239 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
2220 err = -ENOEXEC; 2240 return ERR_PTR(-ENOEXEC);
2221 goto free_hdr; 2241
2222 } 2242 return mod;
2243}
2244
2245static int check_modinfo(struct module *mod, struct load_info *info)
2246{
2247 const char *modmagic = get_modinfo(info, "vermagic");
2248 int err;
2223 2249
2224 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2225 /* This is allowed: modprobe --force will invalidate it. */ 2250 /* This is allowed: modprobe --force will invalidate it. */
2226 if (!modmagic) { 2251 if (!modmagic) {
2227 err = try_to_force_load(mod, "bad vermagic"); 2252 err = try_to_force_load(mod, "bad vermagic");
2228 if (err) 2253 if (err)
2229 goto free_hdr; 2254 return err;
2230 } else if (!same_magic(modmagic, vermagic, versindex)) { 2255 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2231 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 2256 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
2232 mod->name, modmagic, vermagic); 2257 mod->name, modmagic, vermagic);
2233 err = -ENOEXEC; 2258 return -ENOEXEC;
2234 goto free_hdr;
2235 } 2259 }
2236 2260
2237 staging = get_modinfo(sechdrs, infoindex, "staging"); 2261 if (get_modinfo(info, "staging")) {
2238 if (staging) {
2239 add_taint_module(mod, TAINT_CRAP); 2262 add_taint_module(mod, TAINT_CRAP);
2240 printk(KERN_WARNING "%s: module is from the staging directory," 2263 printk(KERN_WARNING "%s: module is from the staging directory,"
2241 " the quality is unknown, you have been warned.\n", 2264 " the quality is unknown, you have been warned.\n",
2242 mod->name); 2265 mod->name);
2243 } 2266 }
2244 2267
2245 /* Now copy in args */ 2268 /* Set up license info based on the info section */
2246 args = strndup_user(uargs, ~0UL >> 1); 2269 set_license(mod, get_modinfo(info, "license"));
2247 if (IS_ERR(args)) {
2248 err = PTR_ERR(args);
2249 goto free_hdr;
2250 }
2251 2270
2252 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) 2271 return 0;
2253 * sizeof(long), GFP_KERNEL); 2272}
2254 if (!strmap) {
2255 err = -ENOMEM;
2256 goto free_mod;
2257 }
2258 2273
2259 mod->state = MODULE_STATE_COMING; 2274static void find_module_sections(struct module *mod, struct load_info *info)
2275{
2276 mod->kp = section_objs(info, "__param",
2277 sizeof(*mod->kp), &mod->num_kp);
2278 mod->syms = section_objs(info, "__ksymtab",
2279 sizeof(*mod->syms), &mod->num_syms);
2280 mod->crcs = section_addr(info, "__kcrctab");
2281 mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
2282 sizeof(*mod->gpl_syms),
2283 &mod->num_gpl_syms);
2284 mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
2285 mod->gpl_future_syms = section_objs(info,
2286 "__ksymtab_gpl_future",
2287 sizeof(*mod->gpl_future_syms),
2288 &mod->num_gpl_future_syms);
2289 mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
2260 2290
2261 /* Allow arches to frob section contents and sizes. */ 2291#ifdef CONFIG_UNUSED_SYMBOLS
2262 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 2292 mod->unused_syms = section_objs(info, "__ksymtab_unused",
2263 if (err < 0) 2293 sizeof(*mod->unused_syms),
2264 goto free_mod; 2294 &mod->num_unused_syms);
2295 mod->unused_crcs = section_addr(info, "__kcrctab_unused");
2296 mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
2297 sizeof(*mod->unused_gpl_syms),
2298 &mod->num_unused_gpl_syms);
2299 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2300#endif
2301#ifdef CONFIG_CONSTRUCTORS
2302 mod->ctors = section_objs(info, ".ctors",
2303 sizeof(*mod->ctors), &mod->num_ctors);
2304#endif
2265 2305
2266 if (pcpuindex) { 2306#ifdef CONFIG_TRACEPOINTS
2267 /* We have a special allocation for this section. */ 2307 mod->tracepoints = section_objs(info, "__tracepoints",
2268 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, 2308 sizeof(*mod->tracepoints),
2269 sechdrs[pcpuindex].sh_addralign); 2309 &mod->num_tracepoints);
2270 if (err) 2310#endif
2271 goto free_mod; 2311#ifdef CONFIG_EVENT_TRACING
2272 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2312 mod->trace_events = section_objs(info, "_ftrace_events",
2273 } 2313 sizeof(*mod->trace_events),
2274 /* Keep this around for failure path. */ 2314 &mod->num_trace_events);
2275 percpu = mod_percpu(mod); 2315 /*
2316 * This section contains pointers to allocated objects in the trace
2317 * code and not scanning it leads to false positives.
2318 */
2319 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2320 mod->num_trace_events, GFP_KERNEL);
2321#endif
2322#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2323 /* sechdrs[0].sh_size is always zero */
2324 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
2325 sizeof(*mod->ftrace_callsites),
2326 &mod->num_ftrace_callsites);
2327#endif
2276 2328
2277 /* Determine total sizes, and put offsets in sh_entsize. For now 2329 mod->extable = section_objs(info, "__ex_table",
2278 this is done generically; there doesn't appear to be any 2330 sizeof(*mod->extable), &mod->num_exentries);
2279 special cases for the architectures. */ 2331
2280 layout_sections(mod, hdr, sechdrs, secstrings); 2332 if (section_addr(info, "__obsparm"))
2281 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, 2333 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2282 secstrings, &stroffs, strmap); 2334 mod->name);
2335
2336 info->debug = section_objs(info, "__verbose",
2337 sizeof(*info->debug), &info->num_debug);
2338}
2339
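Editor's note: find_module_sections() leans on three helpers whose definitions sit earlier in module.c. For orientation, a sketch of what they amount to after this patch; the struct and details are reconstructed from usage, so treat it as illustrative rather than the upstream code.

#include <elf.h>
#include <string.h>

/* minimal stand-in for the fields the helpers need */
struct load_info_sketch {
	Elf64_Ehdr *hdr;
	Elf64_Shdr *sechdrs;
	char *secstrings;
};

static unsigned int find_sec(const struct load_info_sketch *info, const char *name)
{
	unsigned int i;

	for (i = 1; i < info->hdr->e_shnum; i++) {
		Elf64_Shdr *shdr = &info->sechdrs[i];

		/* the real helper also requires SHF_ALLOC to be set */
		if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
			return i;
	}
	return 0;			/* section 0 is always empty */
}

static void *section_addr(const struct load_info_sketch *info, const char *name)
{
	return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
}

static void *section_objs(const struct load_info_sketch *info, const char *name,
			  size_t object_size, unsigned int *num)
{
	unsigned int sec = find_sec(info, name);

	/* section 0 has size zero, so a missing section yields count 0 */
	*num = info->sechdrs[sec].sh_size / object_size;
	return (void *)info->sechdrs[sec].sh_addr;
}

Because section 0 of an ELF file always has size zero, asking for a section that does not exist harmlessly yields address 0 and a count of zero, which is exactly what the optional-section lookups above rely on.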
2340static int move_module(struct module *mod, struct load_info *info)
2341{
2342 int i;
2343 void *ptr;
2283 2344
2284 /* Do the allocs. */ 2345 /* Do the allocs. */
2285 ptr = module_alloc_update_bounds(mod->core_size); 2346 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2289,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod,
2289 * leak. 2350 * leak.
2290 */ 2351 */
2291 kmemleak_not_leak(ptr); 2352 kmemleak_not_leak(ptr);
2292 if (!ptr) { 2353 if (!ptr)
2293 err = -ENOMEM; 2354 return -ENOMEM;
2294 goto free_percpu; 2355
2295 }
2296 memset(ptr, 0, mod->core_size); 2356 memset(ptr, 0, mod->core_size);
2297 mod->module_core = ptr; 2357 mod->module_core = ptr;
2298 2358
@@ -2305,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod,
2305 */ 2365 */
2306 kmemleak_ignore(ptr); 2366 kmemleak_ignore(ptr);
2307 if (!ptr && mod->init_size) { 2367 if (!ptr && mod->init_size) {
2308 err = -ENOMEM; 2368 module_free(mod, mod->module_core);
2309 goto free_core; 2369 return -ENOMEM;
2310 } 2370 }
2311 memset(ptr, 0, mod->init_size); 2371 memset(ptr, 0, mod->init_size);
2312 mod->module_init = ptr; 2372 mod->module_init = ptr;
2313 2373
2314 /* Transfer each section which specifies SHF_ALLOC */ 2374 /* Transfer each section which specifies SHF_ALLOC */
2315 DEBUGP("final section addresses:\n"); 2375 DEBUGP("final section addresses:\n");
2316 for (i = 0; i < hdr->e_shnum; i++) { 2376 for (i = 0; i < info->hdr->e_shnum; i++) {
2317 void *dest; 2377 void *dest;
2378 Elf_Shdr *shdr = &info->sechdrs[i];
2318 2379
2319 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 2380 if (!(shdr->sh_flags & SHF_ALLOC))
2320 continue; 2381 continue;
2321 2382
2322 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 2383 if (shdr->sh_entsize & INIT_OFFSET_MASK)
2323 dest = mod->module_init 2384 dest = mod->module_init
2324 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 2385 + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
2325 else 2386 else
2326 dest = mod->module_core + sechdrs[i].sh_entsize; 2387 dest = mod->module_core + shdr->sh_entsize;
2327 2388
2328 if (sechdrs[i].sh_type != SHT_NOBITS) 2389 if (shdr->sh_type != SHT_NOBITS)
2329 memcpy(dest, (void *)sechdrs[i].sh_addr, 2390 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2330 sechdrs[i].sh_size);
2331 /* Update sh_addr to point to copy in image. */ 2391 /* Update sh_addr to point to copy in image. */
2332 sechdrs[i].sh_addr = (unsigned long)dest; 2392 shdr->sh_addr = (unsigned long)dest;
2333 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 2393 DEBUGP("\t0x%lx %s\n",
2394 shdr->sh_addr, info->secstrings + shdr->sh_name);
2334 } 2395 }
2335 /* Module has been moved. */
2336 mod = (void *)sechdrs[modindex].sh_addr;
2337 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2338 2396
2339#if defined(CONFIG_MODULE_UNLOAD) 2397 return 0;
2340 mod->refptr = alloc_percpu(struct module_ref); 2398}
2341 if (!mod->refptr) {
2342 err = -ENOMEM;
2343 goto free_init;
2344 }
2345#endif
2346 /* Now we've moved module, initialize linked lists, etc. */
2347 module_unload_init(mod);
2348
2349 /* Set up license info based on the info section */
2350 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
2351 2399
2400static int check_module_license_and_versions(struct module *mod)
2401{
2352 /* 2402 /*
2353 * ndiswrapper is under GPL by itself, but loads proprietary modules. 2403 * ndiswrapper is under GPL by itself, but loads proprietary modules.
2354 * Don't use add_taint_module(), as it would prevent ndiswrapper from 2404 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2361,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod,
2361 if (strcmp(mod->name, "driverloader") == 0) 2411 if (strcmp(mod->name, "driverloader") == 0)
2362 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2412 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2363 2413
2364 /* Set up MODINFO_ATTR fields */
2365 setup_modinfo(mod, sechdrs, infoindex);
2366
2367 /* Fix up syms, so that st_value is a pointer to location. */
2368 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
2369 mod);
2370 if (err < 0)
2371 goto cleanup;
2372
2373 /* Now we've got everything in the final locations, we can
2374 * find optional sections. */
2375 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2376 sizeof(*mod->kp), &mod->num_kp);
2377 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2378 sizeof(*mod->syms), &mod->num_syms);
2379 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2380 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2381 sizeof(*mod->gpl_syms),
2382 &mod->num_gpl_syms);
2383 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2384 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2385 "__ksymtab_gpl_future",
2386 sizeof(*mod->gpl_future_syms),
2387 &mod->num_gpl_future_syms);
2388 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2389 "__kcrctab_gpl_future");
2390
2391#ifdef CONFIG_UNUSED_SYMBOLS
2392 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2393 "__ksymtab_unused",
2394 sizeof(*mod->unused_syms),
2395 &mod->num_unused_syms);
2396 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2397 "__kcrctab_unused");
2398 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2399 "__ksymtab_unused_gpl",
2400 sizeof(*mod->unused_gpl_syms),
2401 &mod->num_unused_gpl_syms);
2402 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2403 "__kcrctab_unused_gpl");
2404#endif
2405#ifdef CONFIG_CONSTRUCTORS
2406 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2407 sizeof(*mod->ctors), &mod->num_ctors);
2408#endif
2409
2410#ifdef CONFIG_TRACEPOINTS
2411 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2412 "__tracepoints",
2413 sizeof(*mod->tracepoints),
2414 &mod->num_tracepoints);
2415#endif
2416#ifdef CONFIG_EVENT_TRACING
2417 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2418 "_ftrace_events",
2419 sizeof(*mod->trace_events),
2420 &mod->num_trace_events);
2421 /*
2422 * This section contains pointers to allocated objects in the trace
2423 * code and not scanning it leads to false positives.
2424 */
2425 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2426 mod->num_trace_events, GFP_KERNEL);
2427#endif
2428#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2429 /* sechdrs[0].sh_size is always zero */
2430 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2431 "__mcount_loc",
2432 sizeof(*mod->ftrace_callsites),
2433 &mod->num_ftrace_callsites);
2434#endif
2435#ifdef CONFIG_MODVERSIONS 2414#ifdef CONFIG_MODVERSIONS
2436 if ((mod->num_syms && !mod->crcs) 2415 if ((mod->num_syms && !mod->crcs)
2437 || (mod->num_gpl_syms && !mod->gpl_crcs) 2416 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2441,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod,
2441 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2420 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2442#endif 2421#endif
2443 ) { 2422 ) {
2444 err = try_to_force_load(mod, 2423 return try_to_force_load(mod,
2445 "no versions for exported symbols"); 2424 "no versions for exported symbols");
2446 if (err)
2447 goto cleanup;
2448 } 2425 }
2449#endif 2426#endif
2427 return 0;
2428}
2450 2429
2451 /* Now do relocations. */ 2430static void flush_module_icache(const struct module *mod)
2452 for (i = 1; i < hdr->e_shnum; i++) { 2431{
2453 const char *strtab = (char *)sechdrs[strindex].sh_addr; 2432 mm_segment_t old_fs;
2454 unsigned int info = sechdrs[i].sh_info;
2455
2456 /* Not a valid relocation section? */
2457 if (info >= hdr->e_shnum)
2458 continue;
2459
2460 /* Don't bother with non-allocated sections */
2461 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
2462 continue;
2463
2464 if (sechdrs[i].sh_type == SHT_REL)
2465 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
2466 else if (sechdrs[i].sh_type == SHT_RELA)
2467 err = apply_relocate_add(sechdrs, strtab, symindex, i,
2468 mod);
2469 if (err < 0)
2470 goto cleanup;
2471 }
2472
2473 /* Set up and sort exception table */
2474 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2475 sizeof(*mod->extable), &mod->num_exentries);
2476 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2477
2478 /* Finally, copy percpu area over. */
2479 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2480 sechdrs[pcpuindex].sh_size);
2481
2482 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2483 symoffs, stroffs, secstrings, strmap);
2484 kfree(strmap);
2485 strmap = NULL;
2486
2487 if (!mod->taints)
2488 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2489 sizeof(*debug), &num_debug);
2490
2491 err = module_finalize(hdr, sechdrs, mod);
2492 if (err < 0)
2493 goto cleanup;
2494 2433
2495 /* flush the icache in correct context */ 2434 /* flush the icache in correct context */
2496 old_fs = get_fs(); 2435 old_fs = get_fs();
@@ -2509,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod,
2509 (unsigned long)mod->module_core + mod->core_size); 2448 (unsigned long)mod->module_core + mod->core_size);
2510 2449
2511 set_fs(old_fs); 2450 set_fs(old_fs);
2451}
2512 2452
2513 mod->args = args; 2453static struct module *layout_and_allocate(struct load_info *info)
2514 if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) 2454{
2515 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2455 /* Module within temporary copy. */
2516 mod->name); 2456 struct module *mod;
2457 Elf_Shdr *pcpusec;
2458 int err;
2459
2460 mod = setup_load_info(info);
2461 if (IS_ERR(mod))
2462 return mod;
2463
2464 err = check_modinfo(mod, info);
2465 if (err)
2466 return ERR_PTR(err);
2467
2468 /* Allow arches to frob section contents and sizes. */
2469 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2470 info->secstrings, mod);
2471 if (err < 0)
2472 goto out;
2473
2474 pcpusec = &info->sechdrs[info->index.pcpu];
2475 if (pcpusec->sh_size) {
2476 /* We have a special allocation for this section. */
2477 err = percpu_modalloc(mod,
2478 pcpusec->sh_size, pcpusec->sh_addralign);
2479 if (err)
2480 goto out;
2481 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2482 }
2483
2484 /* Determine total sizes, and put offsets in sh_entsize. For now
2485 this is done generically; there doesn't appear to be any
2486 special cases for the architectures. */
2487 layout_sections(mod, info);
2488
2489 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2490 * sizeof(long), GFP_KERNEL);
2491 if (!info->strmap) {
2492 err = -ENOMEM;
2493 goto free_percpu;
2494 }
2495 layout_symtab(mod, info);
2496
2497 /* Allocate and move to the final place */
2498 err = move_module(mod, info);
2499 if (err)
2500 goto free_strmap;
2501
2502 /* Module has been copied to its final place now: return it. */
2503 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2504 kmemleak_load_module(mod, info);
2505 return mod;
2506
2507free_strmap:
2508 kfree(info->strmap);
2509free_percpu:
2510 percpu_modfree(mod);
2511out:
2512 return ERR_PTR(err);
2513}
2514
2515/* mod is no longer valid after this! */
2516static void module_deallocate(struct module *mod, struct load_info *info)
2517{
2518 kfree(info->strmap);
2519 percpu_modfree(mod);
2520 module_free(mod, mod->module_init);
2521 module_free(mod, mod->module_core);
2522}
2523
2524static int post_relocation(struct module *mod, const struct load_info *info)
2525{
2526 /* Sort exception table now relocations are done. */
2527 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2528
2529 /* Copy relocated percpu area over. */
2530 percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
2531 info->sechdrs[info->index.pcpu].sh_size);
2532
2533 /* Setup kallsyms-specific fields. */
2534 add_kallsyms(mod, info);
2535
2536 /* Arch-specific module finalizing. */
2537 return module_finalize(info->hdr, info->sechdrs, mod);
2538}
2539
2540/* Allocate and load the module: note that size of section 0 is always
2541 zero, and we rely on this for optional sections. */
2542static struct module *load_module(void __user *umod,
2543 unsigned long len,
2544 const char __user *uargs)
2545{
2546 struct load_info info = { NULL, };
2547 struct module *mod;
2548 long err;
2549
2550 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
2551 umod, len, uargs);
2552
2553 /* Copy in the blobs from userspace, check they are vaguely sane. */
2554 err = copy_and_check(&info, umod, len, uargs);
2555 if (err)
2556 return ERR_PTR(err);
2557
2558 /* Figure out module layout, and allocate all the memory. */
2559 mod = layout_and_allocate(&info);
2560 if (IS_ERR(mod)) {
2561 err = PTR_ERR(mod);
2562 goto free_copy;
2563 }
2564
2565 /* Now module is in final location, initialize linked lists, etc. */
2566 err = module_unload_init(mod);
2567 if (err)
2568 goto free_module;
2569
2570 /* Now we've got everything in the final locations, we can
2571 * find optional sections. */
2572 find_module_sections(mod, &info);
2573
2574 err = check_module_license_and_versions(mod);
2575 if (err)
2576 goto free_unload;
2577
2578 /* Set up MODINFO_ATTR fields */
2579 setup_modinfo(mod, &info);
2580
2581 /* Fix up syms, so that st_value is a pointer to location. */
2582 err = simplify_symbols(mod, &info);
2583 if (err < 0)
2584 goto free_modinfo;
2585
2586 err = apply_relocations(mod, &info);
2587 if (err < 0)
2588 goto free_modinfo;
2589
2590 err = post_relocation(mod, &info);
2591 if (err < 0)
2592 goto free_modinfo;
2593
2594 flush_module_icache(mod);
2595
2596 /* Now copy in args */
2597 mod->args = strndup_user(uargs, ~0UL >> 1);
2598 if (IS_ERR(mod->args)) {
2599 err = PTR_ERR(mod->args);
2600 goto free_arch_cleanup;
2601 }
2602
2603 /* Mark state as coming so strong_try_module_get() ignores us. */
2604 mod->state = MODULE_STATE_COMING;
2517 2605
2518 /* Now sew it into the lists so we can get lockdep and oops 2606 /* Now sew it into the lists so we can get lockdep and oops
2519 * info during argument parsing. No one should access us, since 2607 * info during argument parsing. No one should access us, since
@@ -2528,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod,
2528 goto unlock; 2616 goto unlock;
2529 } 2617 }
2530 2618
2531 if (debug) 2619 /* This has to be done once we're sure module name is unique. */
2532 dynamic_debug_setup(debug, num_debug); 2620 if (!mod->taints)
2621 dynamic_debug_setup(info.debug, info.num_debug);
2533 2622
2534 /* Find duplicate symbols */ 2623 /* Find duplicate symbols */
2535 err = verify_export_symbols(mod); 2624 err = verify_export_symbols(mod);
@@ -2539,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod,
2539 list_add_rcu(&mod->list, &modules); 2628 list_add_rcu(&mod->list, &modules);
2540 mutex_unlock(&module_mutex); 2629 mutex_unlock(&module_mutex);
2541 2630
2631 /* Module is ready to execute: parsing args may do that. */
2542 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2632 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2543 if (err < 0) 2633 if (err < 0)
2544 goto unlink; 2634 goto unlink;
2545 2635
2546 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2636 /* Link in to sysfs. */
2637 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
2547 if (err < 0) 2638 if (err < 0)
2548 goto unlink; 2639 goto unlink;
2549 2640
2550 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2641 /* Get rid of temporary copy and strmap. */
2551 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2642 kfree(info.strmap);
2552 2643 free_copy(&info);
2553 /* Get rid of temporary copy */
2554 vfree(hdr);
2555
2556 trace_module_load(mod);
2557 2644
2558 /* Done! */ 2645 /* Done! */
2646 trace_module_load(mod);
2559 return mod; 2647 return mod;
2560 2648
2561 unlink: 2649 unlink:
@@ -2563,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod,
2563 /* Unlink carefully: kallsyms could be walking list. */ 2651 /* Unlink carefully: kallsyms could be walking list. */
2564 list_del_rcu(&mod->list); 2652 list_del_rcu(&mod->list);
2565 ddebug: 2653 ddebug:
2566 dynamic_debug_remove(debug); 2654 if (!mod->taints)
2655 dynamic_debug_remove(info.debug);
2567 unlock: 2656 unlock:
2568 mutex_unlock(&module_mutex); 2657 mutex_unlock(&module_mutex);
2569 synchronize_sched(); 2658 synchronize_sched();
2659 kfree(mod->args);
2660 free_arch_cleanup:
2570 module_arch_cleanup(mod); 2661 module_arch_cleanup(mod);
2571 cleanup: 2662 free_modinfo:
2572 free_modinfo(mod); 2663 free_modinfo(mod);
2664 free_unload:
2573 module_unload_free(mod); 2665 module_unload_free(mod);
2574#if defined(CONFIG_MODULE_UNLOAD) 2666 free_module:
2575 free_percpu(mod->refptr); 2667 module_deallocate(mod, &info);
2576 free_init: 2668 free_copy:
2577#endif 2669 free_copy(&info);
2578 module_free(mod, mod->module_init);
2579 free_core:
2580 module_free(mod, mod->module_core);
2581 /* mod will be freed with core. Don't access it beyond this line! */
2582 free_percpu:
2583 free_percpu(percpu);
2584 free_mod:
2585 kfree(args);
2586 kfree(strmap);
2587 free_hdr:
2588 vfree(hdr);
2589 return ERR_PTR(err); 2670 return ERR_PTR(err);
2590
2591 truncated:
2592 printk(KERN_ERR "Module len %lu truncated\n", len);
2593 err = -ENOEXEC;
2594 goto free_hdr;
2595} 2671}
2596 2672
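Editor's note: the rewritten error path is a textbook unwind-by-goto ladder: each label frees exactly what was acquired before the failing step, so reading the labels bottom-up gives the setup order in reverse. A stripped-down illustration of the idiom; the resources here are plain allocations, not module state.

#include <stdio.h>
#include <stdlib.h>

static int setup_everything(void)
{
	char *a, *b;
	int err = 0;

	a = malloc(16);
	if (!a) {
		err = -1;
		goto out;
	}

	b = malloc(16);
	if (!b) {
		err = -2;
		goto free_a;
	}

	/* further steps would each get their own label below */
	printf("all steps succeeded\n");
	free(b);		/* load_module() keeps its resources; the toy does not */
	free(a);
	return 0;

free_a:
	free(a);
out:
	return err;
}

int main(void)
{
	return setup_everything() ? EXIT_FAILURE : EXIT_SUCCESS;
}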
2597/* Call module constructors. */ 2673/* Call module constructors. */
diff --git a/kernel/padata.c b/kernel/padata.c
index fdd8ae609ce3..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sysfs.h>
29#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
30 31
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
32#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
33 34
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 36{
36 int cpu, target_cpu; 37 int cpu, target_cpu;
37 38
38 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
39 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
41 42
42 return target_cpu; 43 return target_cpu;
43} 44}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
53 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
54 * seq_nr modulo the number of cpus in use. 55 * seq_nr modulo the number of cpus in use.
55 */ 56 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
57 58
58 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
59} 60}
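Editor's note: in other words, the object's sequence number is reduced modulo the number of CPUs in the parallel mask, and the result is used as an index into that mask. A quick userspace model of padata_cpu_hash() plus padata_index_to_cpu(); the CPU ids are arbitrary.

#include <stdio.h>

int main(void)
{
	const int parallel_cpus[] = { 1, 3, 4, 7 };	/* stands in for cpumask.pcpu */
	const int ncpus = sizeof(parallel_cpus) / sizeof(parallel_cpus[0]);
	unsigned int seq_nr;

	for (seq_nr = 40; seq_nr < 46; seq_nr++) {
		int cpu_index = seq_nr % ncpus;		/* padata_cpu_hash()          */
		printf("seq_nr %u -> cpu %d\n",		/* padata_index_to_cpu()      */
		       seq_nr, parallel_cpus[cpu_index]);
	}
	return 0;
}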
60 61
61static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
62{ 63{
63 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
64 struct parallel_data *pd; 65 struct parallel_data *pd;
65 struct padata_instance *pinst; 66 struct padata_instance *pinst;
66 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
67 68
68 local_bh_disable(); 69 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
70 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
71 pinst = pd->pinst; 73 pinst = pd->pinst;
72 74
73 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
76 78
77 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
78 struct padata_priv *padata; 80 struct padata_priv *padata;
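To make the hashing comment in this hunk concrete, here is a minimal sketch (not part of the patch; the CPU numbers and seq_nr are illustrative only) of how a sequence number now selects a CPU from the parallel cpumask, cpumask.pcpu:

	/* Illustrative only: three CPUs set in pd->cpumask.pcpu, say CPUs 0, 2 and 5 */
	int num_cpus   = cpumask_weight(pd->cpumask.pcpu);	/* 3 */
	int cpu_index  = padata->seq_nr % num_cpus;		/* seq_nr 7 -> 7 % 3 == 1 */
	int target_cpu = padata_index_to_cpu(pd, cpu_index);	/* second set bit -> CPU 2 */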
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work)
94 * @pinst: padata instance 96 * @pinst: padata instance
95 * @padata: object to be parallelized 97 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
 97 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata (i.e. cpumask.cbcpu).
98 * 100 *
99 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
105{ 107{
106 int target_cpu, err; 108 int target_cpu, err;
107 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
108 struct parallel_data *pd; 110 struct parallel_data *pd;
109 111
110 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
111 113
112 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
113 115
114 err = 0; 116 err = -EINVAL;
115 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
116 goto out; 121 goto out;
117 122
118 err = -EBUSY; 123 err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out; 128 goto out;
124 129
125 err = -EINVAL; 130 err = 0;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
131 padata->pd = pd; 132 padata->pd = pd;
132 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138 139
139 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
141 142
142 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
145 146
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
147 148
148out: 149out:
149 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
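A caller-side sketch of the reworked submission path (not part of the patch; struct my_request and my_submit are hypothetical names): cb_cpu must now be in cpumask.cbcpu, and 0 rather than -EINPROGRESS indicates successful queueing.

	/* Hypothetical request type embedding the padata control structure */
	struct my_request {
		struct padata_priv padata;
		/* ... payload ... */
	};

	static int my_submit(struct padata_instance *pinst, struct my_request *req,
			     int cb_cpu)
	{
		int err;

		err = padata_do_parallel(pinst, &req->padata, cb_cpu);
		if (err == -EINVAL)
			return err;	/* not started, PADATA_INVALID, or cb_cpu not in cpumask.cbcpu */
		if (err == -EBUSY)
			return err;	/* cpumask change in progress or too many objects in flight */

		/* err == 0: the parallel callback will run with BHs off on the hashed CPU */
		return 0;
	}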
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel);
171 */ 172 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{ 174{
174 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
175 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
176 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
177 struct padata_priv *padata; 178 struct padata_priv *padata;
178 struct padata_list *reorder; 179 struct padata_list *reorder;
179 180
180 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198 182
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) { 183 /*
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; 184 * Calculate the percpu reorder queue and the sequence
201 overrun = 1; 185 * number of the next object.
202 } 186 */
203 187 next_nr = pd->processed;
204 if (!list_empty(&reorder->list)) { 188 next_index = next_nr % num_cpus;
205 padata = list_entry(reorder->list.next, 189 cpu = padata_index_to_cpu(pd, next_index);
206 struct padata_priv, list); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
207 191
208 seq_nr = padata->seq_nr; 192 if (unlikely(next_nr > pd->max_seq_nr)) {
209 BUG_ON(calc_seq_nr != seq_nr); 193 next_nr = next_nr - pd->max_seq_nr - 1;
210 } else { 194 next_index = next_nr % num_cpus;
211 seq_nr = calc_seq_nr; 195 cpu = padata_index_to_cpu(pd, next_index);
212 empty++; 196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
213 } 197 pd->processed = 0;
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 } 198 }
222 199
223 padata = NULL; 200 padata = NULL;
224 201
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
229 203
230 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
232 struct padata_priv, list); 206 struct padata_priv, list);
233 207
234 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240 209
241 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
242 list_del_init(&padata->list); 211 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
245 214
246 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
247 216
248 goto out; 217 goto out;
249 } 218 }
250 219
251 queue = per_cpu_ptr(pd->queue, smp_processor_id()); 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) { 221 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
254 goto out; 223 goto out;
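The comment above replaces the old scan over every reorder queue with a direct computation; a sketch with illustrative numbers (not from the patch) of how pd->processed picks the next queue:

	/* Illustrative only: four CPUs in pd->cpumask.pcpu, ten objects already serialized */
	int num_cpus   = cpumask_weight(pd->cpumask.pcpu);	/* 4 */
	int next_nr    = pd->processed;				/* 10 */
	int next_index = next_nr % num_cpus;			/* 10 % 4 == 2 */
	int cpu        = padata_index_to_cpu(pd, next_index);	/* third CPU in the mask */
	struct padata_parallel_queue *next_queue = per_cpu_ptr(pd->pqueue, cpu);
	/* the head of next_queue->reorder.list must carry seq_nr 10, or the BUG_ON() above fires */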
@@ -262,7 +231,7 @@ out:
262static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
263{ 232{
264 struct padata_priv *padata; 233 struct padata_priv *padata;
265 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
266 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
267 236
268 /* 237 /*
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
301 return; 270 return;
302 } 271 }
303 272
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
305 274
306 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
308 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
309 278
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
311 } 280 }
312 281
313 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg)
333 padata_reorder(pd); 302 padata_reorder(pd);
334} 303}
335 304
336static void padata_serial_worker(struct work_struct *work) 305static void padata_serial_worker(struct work_struct *serial_work)
337{ 306{
338 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
339 struct parallel_data *pd; 308 struct parallel_data *pd;
340 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
341 310
342 local_bh_disable(); 311 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
344 pd = queue->pd; 313 pd = squeue->pd;
345 314
346 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
349 318
350 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
351 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
372void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
373{ 342{
374 int cpu; 343 int cpu;
375 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
376 struct parallel_data *pd; 345 struct parallel_data *pd;
377 346
378 pd = padata->pd; 347 pd = padata->pd;
379 348
380 cpu = get_cpu(); 349 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
382 351
383 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
384 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
386 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
387 356
388 put_cpu(); 357 put_cpu();
389 358
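For completeness, a hedged sketch of the completion side (my_parallel, my_serial and my_request are hypothetical): the parallel callback hands the object back through padata_do_serial(), and the serial callback later runs, in submission order, on the cb_cpu chosen at padata_do_parallel() time.

	static void my_parallel(struct padata_priv *padata)
	{
		struct my_request *req = container_of(padata, struct my_request, padata);

		/* ... CPU-intensive work on the hashed parallel CPU, BHs are off ... */

		padata_do_serial(padata);	/* queue for in-order serialization */
	}

	static void my_serial(struct padata_priv *padata)
	{
		/* runs on cb_cpu, strictly in the original submission order */
	}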
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata)
391} 360}
392EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
393 362
394/* Allocate and initialize the internal cpumask dependend resources. */ 363static int padata_setup_cpumasks(struct parallel_data *pd,
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 364 const struct cpumask *pcpumask,
396 const struct cpumask *cpumask) 365 const struct cpumask *cbcpumask)
397{ 366{
398 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
399 struct padata_queue *queue; 368 return -ENOMEM;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403 369
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
405 if (!pd) 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
 406 goto err; 372 free_cpumask_var(pd->cpumask.pcpu);
373 return -ENOMEM;
374 }
407 375
408 pd->queue = alloc_percpu(struct padata_queue); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
409 if (!pd->queue) 377 return 0;
410 goto err_free_pd; 378}
411 379
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 380static void __padata_list_init(struct padata_list *pd_list)
413 goto err_free_queue; 381{
382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
414 385
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 386/* Initialize all percpu queues used by serial workers */
387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
416 391
417 for_each_cpu(cpu, pd->cpumask) { 392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
418 queue = per_cpu_ptr(pd->queue, cpu); 393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
419 399
420 queue->pd = pd; 400/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
421 405
422 queue->cpu_index = cpu_index; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
423 cpu_index++; 411 cpu_index++;
424 412
425 INIT_LIST_HEAD(&queue->reorder.list); 413 __padata_list_init(&pqueue->reorder);
426 INIT_LIST_HEAD(&queue->parallel.list); 414 __padata_list_init(&pqueue->parallel);
427 INIT_LIST_HEAD(&queue->serial.list); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
428 spin_lock_init(&queue->reorder.lock); 416 atomic_set(&pqueue->num_obj, 0);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 } 417 }
436 418
437 num_cpus = cpumask_weight(pd->cpumask); 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421}
422
 423/* Allocate and initialize the internal cpumask dependent resources. */
424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
425 const struct cpumask *pcpumask,
426 const struct cpumask *cbcpumask)
427{
428 struct parallel_data *pd;
439 429
430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
431 if (!pd)
432 goto err;
433
434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
437
438 pd->squeue = alloc_percpu(struct padata_serial_queue);
439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
446 452
447 return pd; 453 return pd;
448 454
449err_free_queue: 455err_free_squeue:
450 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
451err_free_pd: 459err_free_pd:
452 kfree(pd); 460 kfree(pd);
453err: 461err:
@@ -456,8 +464,10 @@ err:
456 464
457static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
458{ 466{
459 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
460 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
461 kfree(pd); 471 kfree(pd);
462} 472}
463 473
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd)
465static void padata_flush_queues(struct parallel_data *pd) 475static void padata_flush_queues(struct parallel_data *pd)
466{ 476{
467 int cpu; 477 int cpu;
468 struct padata_queue *queue; 478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
469 480
470 for_each_cpu(cpu, pd->cpumask) { 481 for_each_cpu(cpu, pd->cpumask.pcpu) {
471 queue = per_cpu_ptr(pd->queue, cpu); 482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
472 flush_work(&queue->pwork); 483 flush_work(&pqueue->work);
473 } 484 }
474 485
475 del_timer_sync(&pd->timer); 486 del_timer_sync(&pd->timer);
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd)
477 if (atomic_read(&pd->reorder_objects)) 488 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd); 489 padata_reorder(pd);
479 490
480 for_each_cpu(cpu, pd->cpumask) { 491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
481 queue = per_cpu_ptr(pd->queue, cpu); 492 squeue = per_cpu_ptr(pd->squeue, cpu);
482 flush_work(&queue->swork); 493 flush_work(&squeue->work);
483 } 494 }
484 495
485 BUG_ON(atomic_read(&pd->refcnt) != 0); 496 BUG_ON(atomic_read(&pd->refcnt) != 0);
486} 497}
487 498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
 488/* Replace the internal control structure with a new one. */ 518/* Replace the internal control structure with a new one. */
489static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
491{ 521{
492 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
493 524
494 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
495 526
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
497 528
498 synchronize_rcu(); 529 synchronize_rcu();
499 530
531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
532 notification_mask |= PADATA_CPU_PARALLEL;
533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
534 notification_mask |= PADATA_CPU_SERIAL;
535
500 padata_flush_queues(pd_old); 536 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
502 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
503 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
504} 545}
505 546
506/** 547/**
507 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
549 * if either pcpu or cbcpu or both cpumasks change.
508 * 550 *
 509 * @pinst: padata instance 551 * @pinst: A pointer to padata instance
510 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
511 */ 553 */
512int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
513 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
514{ 556{
557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
561
562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
 566 * @pinst: A pointer to padata instance.
 567 * @nblock: A pointer to notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
577
578
579/* If cpumask contains no active cpu, we mark the instance as invalid. */
580static bool padata_validate_cpumask(struct padata_instance *pinst,
581 const struct cpumask *cpumask)
582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
515 struct parallel_data *pd; 597 struct parallel_data *pd;
516 int err = 0; 598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
603 }
604
605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
616
617 padata_replace(pinst, pd);
618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
 628 * by the workers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
 632 * @cbcpumask: the cpumask to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
517 638
518 mutex_lock(&pinst->lock); 639 mutex_lock(&pinst->lock);
640 get_online_cpus();
519 641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
653 * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
654 * equivalent to @cpumask.
655 *
656 * @pinst: padata instance
657 * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
658 * to parallel and serial cpumasks respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
520 get_online_cpus(); 668 get_online_cpus();
521 669
522 pd = padata_alloc_pd(pinst, cpumask); 670 switch (cpumask_type) {
523 if (!pd) { 671 case PADATA_CPU_PARALLEL:
524 err = -ENOMEM; 672 serial_mask = pinst->cpumask.cbcpu;
525 goto out; 673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
526 } 681 }
527 682
528 cpumask_copy(pinst->cpumask, cpumask); 683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
529
530 padata_replace(pinst, pd);
531 684
532out: 685out:
533 put_online_cpus(); 686 put_online_cpus();
534
535 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
536 688
537 return err; 689 return err;
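A usage sketch for the new per-type cpumask interface (not part of the patch; my_nb, my_cpumask_notify and new_mask are hypothetical): one mask can now be replaced at a time, and a notifier reports which effective masks actually changed.

	static int my_cpumask_notify(struct notifier_block *nb, unsigned long mask_type,
				     void *data)
	{
		/* mask_type is PADATA_CPU_PARALLEL and/or PADATA_CPU_SERIAL; data points
		 * at the instance's new cpumask pair (assumption based on the call above) */
		return NOTIFY_OK;
	}

	static struct notifier_block my_nb = {
		.notifier_call = my_cpumask_notify,
	};

	static int my_change_parallel_mask(struct padata_instance *pinst,
					   cpumask_var_t new_mask)
	{
		int err;

		err = padata_register_cpumask_notifier(pinst, &my_nb);
		if (err)
			return err;

		/* only the parallel mask is replaced, the serial mask is left untouched */
		return padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, new_mask);
	}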
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
543 struct parallel_data *pd; 695 struct parallel_data *pd;
544 696
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
547 if (!pd) 700 if (!pd)
548 return -ENOMEM; 701 return -ENOMEM;
549 702
550 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
551 } 708 }
552 709
553 return 0; 710 return 0;
554} 711}
555 712
556/** 713 /**
 557 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both (parallel and serial)
715 * padata cpumasks.
558 * 716 *
559 * @pinst: padata instance 717 * @pinst: padata instance
560 * @cpu: cpu to add 718 * @cpu: cpu to add
 719 * @mask: bitmask of flags specifying to which cpumask @cpu should be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
561 */ 723 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
563{ 726{
564 int err; 727 int err;
565 728
729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
731
566 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
567 733
568 get_online_cpus(); 734 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask); 735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
570 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus(); 741 put_online_cpus();
572 742
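A brief sketch of the extended hotplug helper (my_add_cpu is a hypothetical wrapper): the new third argument selects which cpumask, or both, the CPU joins.

	static int my_add_cpu(struct padata_instance *pinst, int cpu)
	{
		/* add @cpu to both the parallel and the serial cpumask;
		 * returns -EINVAL if no valid mask flag is passed */
		return padata_add_cpu(pinst, cpu, PADATA_CPU_PARALLEL | PADATA_CPU_SERIAL);
	}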
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
578 748
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{ 750{
581 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
582 752
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
585 if (!pd) 761 if (!pd)
586 return -ENOMEM; 762 return -ENOMEM;
587 763
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
591 return 0; 767 return 0;
592} 768}
593 769
594/** 770 /**
 595 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from one or both (serial and parallel)
772 * padata cpumasks.
596 * 773 *
597 * @pinst: padata instance 774 * @pinst: padata instance
598 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
599 */ 780 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
601{ 782{
602 int err; 783 int err;
603 784
785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
787
604 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
605 789
606 get_online_cpus(); 790 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask); 791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
608 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus(); 797 put_online_cpus();
610 798
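The removal path mirrors this (my_remove_serial_cpu is a hypothetical wrapper); a CPU can be dropped from just one of the two masks:

	static int my_remove_serial_cpu(struct padata_instance *pinst, int cpu)
	{
		/* drop @cpu from the serial (callback) cpumask only */
		return padata_remove_cpu(pinst, cpu, PADATA_CPU_SERIAL);
	}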
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
619 * 807 *
620 * @pinst: padata instance to start 808 * @pinst: padata instance to start
621 */ 809 */
622void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
623{ 811{
812 int err = 0;
813
624 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
817 err =-EINVAL;
818
819 __padata_start(pinst);
820
626 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
627} 824}
628EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
629 826
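Since padata_start() now reports whether the instance is usable, callers are expected to check its return value; a minimal sketch (my_start is a hypothetical wrapper):

	static int my_start(struct padata_instance *pinst)
	{
		int err = padata_start(pinst);

		if (err)	/* -EINVAL: the cpumasks contain no active CPU (PADATA_INVALID) */
			printk(KERN_WARNING "padata instance not started: %d\n", err);

		return err;
	}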
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start);
635void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
636{ 833{
637 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
639 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
640} 837}
641EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
642 839
643#ifdef CONFIG_HOTPLUG_CPU 840#ifdef CONFIG_HOTPLUG_CPU
841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
644static int padata_cpu_callback(struct notifier_block *nfb, 849static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
646{ 851{
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
653 switch (action) { 858 switch (action) {
654 case CPU_ONLINE: 859 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
657 break; 862 break;
658 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
664 869
665 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
668 break; 873 break;
669 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
675 880
676 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
679 break; 884 break;
680 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
683 888
684 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
687 break; 892 break;
688 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb,
694} 899}
695#endif 900#endif
696 901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
915#define kobj2pinst(_kobj) \
916 container_of(_kobj, struct padata_instance, kobj)
917#define attr2pentry(_attr) \
918 container_of(_attr, struct padata_sysfs_entry, attr)
919
920static void padata_sysfs_release(struct kobject *kobj)
921{
922 struct padata_instance *pinst = kobj2pinst(kobj);
923 __padata_free(pinst);
924}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
992
993/*
994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
 1028 if (pentry->store)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
697/** 1045/**
698 * padata_alloc - allocate and initialize a padata instance 1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
699 * 1049 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
702 */ 1051 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
704 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
705{ 1069{
706 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
707 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
708 1072
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst) 1074 if (!pinst)
711 goto err; 1075 goto err;
712 1076
713 get_online_cpus(); 1077 get_online_cpus();
714 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst; 1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
718 1087
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
720 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
721 1091
722 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
723 1093
724 pinst->wq = wq; 1094 pinst->wq = wq;
725 1095
726 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
727 1098
728 pinst->flags = 0; 1099 pinst->flags = 0;
729 1100
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
735 1106
736 put_online_cpus(); 1107 put_online_cpus();
737 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
738 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
739 1112
740 return pinst; 1113 return pinst;
741 1114
742err_free_pd: 1115err_free_masks:
743 padata_free_pd(pd); 1116 free_cpumask_var(pinst->cpumask.pcpu);
1117 free_cpumask_var(pinst->cpumask.cbcpu);
744err_free_inst: 1118err_free_inst:
745 kfree(pinst); 1119 kfree(pinst);
746 put_online_cpus(); 1120 put_online_cpus();
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc);
756 */ 1130 */
757void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
758{ 1132{
759 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773} 1134}
774EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
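Putting the reworked API together, a hedged end-to-end sketch of an instance lifecycle (my_init, my_exit and the workqueue name are hypothetical; module wiring and error unwinding are trimmed): allocation now also sets up the sysfs kobject, and padata_free() merely drops that reference, with the real teardown running from the kobject release.

	static struct padata_instance *pinst;

	static int __init my_init(void)
	{
		struct workqueue_struct *wq;

		wq = create_workqueue("my_padata");
		if (!wq)
			return -ENOMEM;

		/* use all possible CPUs for both parallel and serial work */
		pinst = padata_alloc_possible(wq);
		if (!pinst)
			return -ENOMEM;	/* sketch: wq teardown omitted */

		return padata_start(pinst);
	}

	static void __exit my_exit(void)
	{
		padata_stop(pinst);
		padata_free(pinst);	/* kobject_put(); __padata_free() runs on release */
	}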
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..403d1804b198 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 214
215static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
216{ 216{
217 return cpu_clock(raw_smp_processor_id()); 217 return local_clock();
218} 218}
219 219
220/* 220/*
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
675 struct perf_event *event, *partial_group = NULL; 675 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 676 const struct pmu *pmu = group_event->pmu;
677 bool txn = false; 677 bool txn = false;
678 int ret;
679 678
680 if (group_event->state == PERF_EVENT_STATE_OFF) 679 if (group_event->state == PERF_EVENT_STATE_OFF)
681 return 0; 680 return 0;
@@ -703,14 +702,8 @@ group_sched_in(struct perf_event *group_event,
703 } 702 }
704 } 703 }
705 704
706 if (!txn) 705 if (!txn || !pmu->commit_txn(pmu))
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0; 706 return 0;
713 }
714 707
715group_error: 708group_error:
716 /* 709 /*
@@ -1155,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
1155 * In order to keep per-task stats reliable we need to flip the event 1148 * In order to keep per-task stats reliable we need to flip the event
1156 * values when we flip the contexts. 1149 * values when we flip the contexts.
1157 */ 1150 */
1158 value = atomic64_read(&next_event->count); 1151 value = local64_read(&next_event->count);
1159 value = atomic64_xchg(&event->count, value); 1152 value = local64_xchg(&event->count, value);
1160 atomic64_set(&next_event->count, value); 1153 local64_set(&next_event->count, value);
1161 1154
1162 swap(event->total_time_enabled, next_event->total_time_enabled); 1155 swap(event->total_time_enabled, next_event->total_time_enabled);
1163 swap(event->total_time_running, next_event->total_time_running); 1156 swap(event->total_time_running, next_event->total_time_running);
@@ -1547,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1547 1540
1548 hwc->sample_period = sample_period; 1541 hwc->sample_period = sample_period;
1549 1542
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1543 if (local64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable(); 1544 perf_disable();
1552 perf_event_stop(event); 1545 perf_event_stop(event);
1553 atomic64_set(&hwc->period_left, 0); 1546 local64_set(&hwc->period_left, 0);
1554 perf_event_start(event); 1547 perf_event_start(event);
1555 perf_enable(); 1548 perf_enable();
1556 } 1549 }
@@ -1591,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1591 1584
1592 perf_disable(); 1585 perf_disable();
1593 event->pmu->read(event); 1586 event->pmu->read(event);
1594 now = atomic64_read(&event->count); 1587 now = local64_read(&event->count);
1595 delta = now - hwc->freq_count_stamp; 1588 delta = now - hwc->freq_count_stamp;
1596 hwc->freq_count_stamp = now; 1589 hwc->freq_count_stamp = now;
1597 1590
@@ -1743,6 +1736,11 @@ static void __perf_event_read(void *info)
1743 event->pmu->read(event); 1736 event->pmu->read(event);
1744} 1737}
1745 1738
1739static inline u64 perf_event_count(struct perf_event *event)
1740{
1741 return local64_read(&event->count) + atomic64_read(&event->child_count);
1742}
1743
1746static u64 perf_event_read(struct perf_event *event) 1744static u64 perf_event_read(struct perf_event *event)
1747{ 1745{
1748 /* 1746 /*
@@ -1762,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event)
1762 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1760 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1763 } 1761 }
1764 1762
1765 return atomic64_read(&event->count); 1763 return perf_event_count(event);
1766} 1764}
1767 1765
1768/* 1766/*
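The new perf_event_count() helper makes the split explicit after the local64 conversion; a short reading sketch (my_read_total is hypothetical): the event's own count becomes a local64_t, updated only from its own context, while counts inherited from exited children stay in a shared atomic64_t.

	static u64 my_read_total(struct perf_event *event)
	{
		/* self count: only ever updated from the event's own context */
		u64 self  = local64_read(&event->count);
		/* counts folded in when inherited (child) events exit */
		u64 child = atomic64_read(&event->child_count);

		return self + child;	/* same sum perf_event_count() returns above */
	}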
@@ -1883,7 +1881,7 @@ static void free_event_rcu(struct rcu_head *head)
1883} 1881}
1884 1882
1885static void perf_pending_sync(struct perf_event *event); 1883static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data); 1884static void perf_buffer_put(struct perf_buffer *buffer);
1887 1885
1888static void free_event(struct perf_event *event) 1886static void free_event(struct perf_event *event)
1889{ 1887{
@@ -1891,7 +1889,7 @@ static void free_event(struct perf_event *event)
1891 1889
1892 if (!event->parent) { 1890 if (!event->parent) {
1893 atomic_dec(&nr_events); 1891 atomic_dec(&nr_events);
1894 if (event->attr.mmap) 1892 if (event->attr.mmap || event->attr.mmap_data)
1895 atomic_dec(&nr_mmap_events); 1893 atomic_dec(&nr_mmap_events);
1896 if (event->attr.comm) 1894 if (event->attr.comm)
1897 atomic_dec(&nr_comm_events); 1895 atomic_dec(&nr_comm_events);
@@ -1899,9 +1897,9 @@ static void free_event(struct perf_event *event)
1899 atomic_dec(&nr_task_events); 1897 atomic_dec(&nr_task_events);
1900 } 1898 }
1901 1899
1902 if (event->data) { 1900 if (event->buffer) {
1903 perf_mmap_data_put(event->data); 1901 perf_buffer_put(event->buffer);
1904 event->data = NULL; 1902 event->buffer = NULL;
1905 } 1903 }
1906 1904
1907 if (event->destroy) 1905 if (event->destroy)
@@ -2126,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2126static unsigned int perf_poll(struct file *file, poll_table *wait) 2124static unsigned int perf_poll(struct file *file, poll_table *wait)
2127{ 2125{
2128 struct perf_event *event = file->private_data; 2126 struct perf_event *event = file->private_data;
2129 struct perf_mmap_data *data; 2127 struct perf_buffer *buffer;
2130 unsigned int events = POLL_HUP; 2128 unsigned int events = POLL_HUP;
2131 2129
2132 rcu_read_lock(); 2130 rcu_read_lock();
2133 data = rcu_dereference(event->data); 2131 buffer = rcu_dereference(event->buffer);
2134 if (data) 2132 if (buffer)
2135 events = atomic_xchg(&data->poll, 0); 2133 events = atomic_xchg(&buffer->poll, 0);
2136 rcu_read_unlock(); 2134 rcu_read_unlock();
2137 2135
2138 poll_wait(file, &event->waitq, wait); 2136 poll_wait(file, &event->waitq, wait);
@@ -2143,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
2143static void perf_event_reset(struct perf_event *event) 2141static void perf_event_reset(struct perf_event *event)
2144{ 2142{
2145 (void)perf_event_read(event); 2143 (void)perf_event_read(event);
2146 atomic64_set(&event->count, 0); 2144 local64_set(&event->count, 0);
2147 perf_event_update_userpage(event); 2145 perf_event_update_userpage(event);
2148} 2146}
2149 2147
@@ -2342,14 +2340,14 @@ static int perf_event_index(struct perf_event *event)
2342void perf_event_update_userpage(struct perf_event *event) 2340void perf_event_update_userpage(struct perf_event *event)
2343{ 2341{
2344 struct perf_event_mmap_page *userpg; 2342 struct perf_event_mmap_page *userpg;
2345 struct perf_mmap_data *data; 2343 struct perf_buffer *buffer;
2346 2344
2347 rcu_read_lock(); 2345 rcu_read_lock();
2348 data = rcu_dereference(event->data); 2346 buffer = rcu_dereference(event->buffer);
2349 if (!data) 2347 if (!buffer)
2350 goto unlock; 2348 goto unlock;
2351 2349
2352 userpg = data->user_page; 2350 userpg = buffer->user_page;
2353 2351
2354 /* 2352 /*
2355 * Disable preemption so as to not let the corresponding user-space 2353 * Disable preemption so as to not let the corresponding user-space
@@ -2359,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event)
2359 ++userpg->lock; 2357 ++userpg->lock;
2360 barrier(); 2358 barrier();
2361 userpg->index = perf_event_index(event); 2359 userpg->index = perf_event_index(event);
2362 userpg->offset = atomic64_read(&event->count); 2360 userpg->offset = perf_event_count(event);
2363 if (event->state == PERF_EVENT_STATE_ACTIVE) 2361 if (event->state == PERF_EVENT_STATE_ACTIVE)
2364 userpg->offset -= atomic64_read(&event->hw.prev_count); 2362 userpg->offset -= local64_read(&event->hw.prev_count);
2365 2363
2366 userpg->time_enabled = event->total_time_enabled + 2364 userpg->time_enabled = event->total_time_enabled +
2367 atomic64_read(&event->child_total_time_enabled); 2365 atomic64_read(&event->child_total_time_enabled);
@@ -2376,6 +2374,25 @@ unlock:
2376 rcu_read_unlock(); 2374 rcu_read_unlock();
2377} 2375}
2378 2376
2377static unsigned long perf_data_size(struct perf_buffer *buffer);
2378
2379static void
2380perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2381{
2382 long max_size = perf_data_size(buffer);
2383
2384 if (watermark)
2385 buffer->watermark = min(max_size, watermark);
2386
2387 if (!buffer->watermark)
2388 buffer->watermark = max_size / 2;
2389
2390 if (flags & PERF_BUFFER_WRITABLE)
2391 buffer->writable = 1;
2392
2393 atomic_set(&buffer->refcount, 1);
2394}
2395
2379#ifndef CONFIG_PERF_USE_VMALLOC 2396#ifndef CONFIG_PERF_USE_VMALLOC
2380 2397
2381/* 2398/*
@@ -2383,15 +2400,15 @@ unlock:
2383 */ 2400 */
2384 2401
2385static struct page * 2402static struct page *
2386perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2403perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2387{ 2404{
2388 if (pgoff > data->nr_pages) 2405 if (pgoff > buffer->nr_pages)
2389 return NULL; 2406 return NULL;
2390 2407
2391 if (pgoff == 0) 2408 if (pgoff == 0)
2392 return virt_to_page(data->user_page); 2409 return virt_to_page(buffer->user_page);
2393 2410
2394 return virt_to_page(data->data_pages[pgoff - 1]); 2411 return virt_to_page(buffer->data_pages[pgoff - 1]);
2395} 2412}
2396 2413
2397static void *perf_mmap_alloc_page(int cpu) 2414static void *perf_mmap_alloc_page(int cpu)
@@ -2407,42 +2424,44 @@ static void *perf_mmap_alloc_page(int cpu)
2407 return page_address(page); 2424 return page_address(page);
2408} 2425}
2409 2426
2410static struct perf_mmap_data * 2427static struct perf_buffer *
2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2428perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2412{ 2429{
2413 struct perf_mmap_data *data; 2430 struct perf_buffer *buffer;
2414 unsigned long size; 2431 unsigned long size;
2415 int i; 2432 int i;
2416 2433
2417 size = sizeof(struct perf_mmap_data); 2434 size = sizeof(struct perf_buffer);
2418 size += nr_pages * sizeof(void *); 2435 size += nr_pages * sizeof(void *);
2419 2436
2420 data = kzalloc(size, GFP_KERNEL); 2437 buffer = kzalloc(size, GFP_KERNEL);
2421 if (!data) 2438 if (!buffer)
2422 goto fail; 2439 goto fail;
2423 2440
2424 data->user_page = perf_mmap_alloc_page(event->cpu); 2441 buffer->user_page = perf_mmap_alloc_page(cpu);
2425 if (!data->user_page) 2442 if (!buffer->user_page)
2426 goto fail_user_page; 2443 goto fail_user_page;
2427 2444
2428 for (i = 0; i < nr_pages; i++) { 2445 for (i = 0; i < nr_pages; i++) {
2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu); 2446 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2430 if (!data->data_pages[i]) 2447 if (!buffer->data_pages[i])
2431 goto fail_data_pages; 2448 goto fail_data_pages;
2432 } 2449 }
2433 2450
2434 data->nr_pages = nr_pages; 2451 buffer->nr_pages = nr_pages;
2452
2453 perf_buffer_init(buffer, watermark, flags);
2435 2454
2436 return data; 2455 return buffer;
2437 2456
2438fail_data_pages: 2457fail_data_pages:
2439 for (i--; i >= 0; i--) 2458 for (i--; i >= 0; i--)
2440 free_page((unsigned long)data->data_pages[i]); 2459 free_page((unsigned long)buffer->data_pages[i]);
2441 2460
2442 free_page((unsigned long)data->user_page); 2461 free_page((unsigned long)buffer->user_page);
2443 2462
2444fail_user_page: 2463fail_user_page:
2445 kfree(data); 2464 kfree(buffer);
2446 2465
2447fail: 2466fail:
2448 return NULL; 2467 return NULL;
@@ -2456,17 +2475,17 @@ static void perf_mmap_free_page(unsigned long addr)
2456 __free_page(page); 2475 __free_page(page);
2457} 2476}
2458 2477
2459static void perf_mmap_data_free(struct perf_mmap_data *data) 2478static void perf_buffer_free(struct perf_buffer *buffer)
2460{ 2479{
2461 int i; 2480 int i;
2462 2481
2463 perf_mmap_free_page((unsigned long)data->user_page); 2482 perf_mmap_free_page((unsigned long)buffer->user_page);
2464 for (i = 0; i < data->nr_pages; i++) 2483 for (i = 0; i < buffer->nr_pages; i++)
2465 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2484 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2466 kfree(data); 2485 kfree(buffer);
2467} 2486}
2468 2487
2469static inline int page_order(struct perf_mmap_data *data) 2488static inline int page_order(struct perf_buffer *buffer)
2470{ 2489{
2471 return 0; 2490 return 0;
2472} 2491}
@@ -2479,18 +2498,18 @@ static inline int page_order(struct perf_mmap_data *data)
2479 * Required for architectures that have d-cache aliasing issues. 2498 * Required for architectures that have d-cache aliasing issues.
2480 */ 2499 */
2481 2500
2482static inline int page_order(struct perf_mmap_data *data) 2501static inline int page_order(struct perf_buffer *buffer)
2483{ 2502{
2484 return data->page_order; 2503 return buffer->page_order;
2485} 2504}
2486 2505
2487static struct page * 2506static struct page *
2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2507perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2489{ 2508{
2490 if (pgoff > (1UL << page_order(data))) 2509 if (pgoff > (1UL << page_order(buffer)))
2491 return NULL; 2510 return NULL;
2492 2511
2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2512 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2494} 2513}
2495 2514
2496static void perf_mmap_unmark_page(void *addr) 2515static void perf_mmap_unmark_page(void *addr)
@@ -2500,57 +2519,59 @@ static void perf_mmap_unmark_page(void *addr)
2500 page->mapping = NULL; 2519 page->mapping = NULL;
2501} 2520}
2502 2521
2503static void perf_mmap_data_free_work(struct work_struct *work) 2522static void perf_buffer_free_work(struct work_struct *work)
2504{ 2523{
2505 struct perf_mmap_data *data; 2524 struct perf_buffer *buffer;
2506 void *base; 2525 void *base;
2507 int i, nr; 2526 int i, nr;
2508 2527
2509 data = container_of(work, struct perf_mmap_data, work); 2528 buffer = container_of(work, struct perf_buffer, work);
2510 nr = 1 << page_order(data); 2529 nr = 1 << page_order(buffer);
2511 2530
2512 base = data->user_page; 2531 base = buffer->user_page;
2513 for (i = 0; i < nr + 1; i++) 2532 for (i = 0; i < nr + 1; i++)
2514 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2533 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2515 2534
2516 vfree(base); 2535 vfree(base);
2517 kfree(data); 2536 kfree(buffer);
2518} 2537}
2519 2538
2520static void perf_mmap_data_free(struct perf_mmap_data *data) 2539static void perf_buffer_free(struct perf_buffer *buffer)
2521{ 2540{
2522 schedule_work(&data->work); 2541 schedule_work(&buffer->work);
2523} 2542}
2524 2543
2525static struct perf_mmap_data * 2544static struct perf_buffer *
2526perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2545perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2527{ 2546{
2528 struct perf_mmap_data *data; 2547 struct perf_buffer *buffer;
2529 unsigned long size; 2548 unsigned long size;
2530 void *all_buf; 2549 void *all_buf;
2531 2550
2532 size = sizeof(struct perf_mmap_data); 2551 size = sizeof(struct perf_buffer);
2533 size += sizeof(void *); 2552 size += sizeof(void *);
2534 2553
2535 data = kzalloc(size, GFP_KERNEL); 2554 buffer = kzalloc(size, GFP_KERNEL);
2536 if (!data) 2555 if (!buffer)
2537 goto fail; 2556 goto fail;
2538 2557
2539 INIT_WORK(&data->work, perf_mmap_data_free_work); 2558 INIT_WORK(&buffer->work, perf_buffer_free_work);
2540 2559
2541 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2560 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2542 if (!all_buf) 2561 if (!all_buf)
2543 goto fail_all_buf; 2562 goto fail_all_buf;
2544 2563
2545 data->user_page = all_buf; 2564 buffer->user_page = all_buf;
2546 data->data_pages[0] = all_buf + PAGE_SIZE; 2565 buffer->data_pages[0] = all_buf + PAGE_SIZE;
2547 data->page_order = ilog2(nr_pages); 2566 buffer->page_order = ilog2(nr_pages);
2548 data->nr_pages = 1; 2567 buffer->nr_pages = 1;
2568
2569 perf_buffer_init(buffer, watermark, flags);
2549 2570
2550 return data; 2571 return buffer;
2551 2572
2552fail_all_buf: 2573fail_all_buf:
2553 kfree(data); 2574 kfree(buffer);
2554 2575
2555fail: 2576fail:
2556 return NULL; 2577 return NULL;
@@ -2558,15 +2579,15 @@ fail:
2558 2579
2559#endif 2580#endif
2560 2581
2561static unsigned long perf_data_size(struct perf_mmap_data *data) 2582static unsigned long perf_data_size(struct perf_buffer *buffer)
2562{ 2583{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data)); 2584 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2564} 2585}
2565 2586
2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2587static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2567{ 2588{
2568 struct perf_event *event = vma->vm_file->private_data; 2589 struct perf_event *event = vma->vm_file->private_data;
2569 struct perf_mmap_data *data; 2590 struct perf_buffer *buffer;
2570 int ret = VM_FAULT_SIGBUS; 2591 int ret = VM_FAULT_SIGBUS;
2571 2592
2572 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2593 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2576,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2576 } 2597 }
2577 2598
2578 rcu_read_lock(); 2599 rcu_read_lock();
2579 data = rcu_dereference(event->data); 2600 buffer = rcu_dereference(event->buffer);
2580 if (!data) 2601 if (!buffer)
2581 goto unlock; 2602 goto unlock;
2582 2603
2583 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2604 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2584 goto unlock; 2605 goto unlock;
2585 2606
2586 vmf->page = perf_mmap_to_page(data, vmf->pgoff); 2607 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
2587 if (!vmf->page) 2608 if (!vmf->page)
2588 goto unlock; 2609 goto unlock;
2589 2610
@@ -2598,52 +2619,35 @@ unlock:
2598 return ret; 2619 return ret;
2599} 2620}
2600 2621
2601static void 2622static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
2602perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2603{
2604 long max_size = perf_data_size(data);
2605
2606 if (event->attr.watermark) {
2607 data->watermark = min_t(long, max_size,
2608 event->attr.wakeup_watermark);
2609 }
2610
2611 if (!data->watermark)
2612 data->watermark = max_size / 2;
2613
2614 atomic_set(&data->refcount, 1);
2615 rcu_assign_pointer(event->data, data);
2616}
2617
2618static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2619{ 2623{
2620 struct perf_mmap_data *data; 2624 struct perf_buffer *buffer;
2621 2625
2622 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2626 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
2623 perf_mmap_data_free(data); 2627 perf_buffer_free(buffer);
2624} 2628}
2625 2629
2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) 2630static struct perf_buffer *perf_buffer_get(struct perf_event *event)
2627{ 2631{
2628 struct perf_mmap_data *data; 2632 struct perf_buffer *buffer;
2629 2633
2630 rcu_read_lock(); 2634 rcu_read_lock();
2631 data = rcu_dereference(event->data); 2635 buffer = rcu_dereference(event->buffer);
2632 if (data) { 2636 if (buffer) {
2633 if (!atomic_inc_not_zero(&data->refcount)) 2637 if (!atomic_inc_not_zero(&buffer->refcount))
2634 data = NULL; 2638 buffer = NULL;
2635 } 2639 }
2636 rcu_read_unlock(); 2640 rcu_read_unlock();
2637 2641
2638 return data; 2642 return buffer;
2639} 2643}
2640 2644
2641static void perf_mmap_data_put(struct perf_mmap_data *data) 2645static void perf_buffer_put(struct perf_buffer *buffer)
2642{ 2646{
2643 if (!atomic_dec_and_test(&data->refcount)) 2647 if (!atomic_dec_and_test(&buffer->refcount))
2644 return; 2648 return;
2645 2649
2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2650 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
2647} 2651}
2648 2652
2649static void perf_mmap_open(struct vm_area_struct *vma) 2653static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2658,16 +2662,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2658 struct perf_event *event = vma->vm_file->private_data; 2662 struct perf_event *event = vma->vm_file->private_data;
2659 2663
2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2664 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2661 unsigned long size = perf_data_size(event->data); 2665 unsigned long size = perf_data_size(event->buffer);
2662 struct user_struct *user = event->mmap_user; 2666 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data; 2667 struct perf_buffer *buffer = event->buffer;
2664 2668
2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2669 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2666 vma->vm_mm->locked_vm -= event->mmap_locked; 2670 vma->vm_mm->locked_vm -= event->mmap_locked;
2667 rcu_assign_pointer(event->data, NULL); 2671 rcu_assign_pointer(event->buffer, NULL);
2668 mutex_unlock(&event->mmap_mutex); 2672 mutex_unlock(&event->mmap_mutex);
2669 2673
2670 perf_mmap_data_put(data); 2674 perf_buffer_put(buffer);
2671 free_uid(user); 2675 free_uid(user);
2672 } 2676 }
2673} 2677}
@@ -2685,11 +2689,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2685 unsigned long user_locked, user_lock_limit; 2689 unsigned long user_locked, user_lock_limit;
2686 struct user_struct *user = current_user(); 2690 struct user_struct *user = current_user();
2687 unsigned long locked, lock_limit; 2691 unsigned long locked, lock_limit;
2688 struct perf_mmap_data *data; 2692 struct perf_buffer *buffer;
2689 unsigned long vma_size; 2693 unsigned long vma_size;
2690 unsigned long nr_pages; 2694 unsigned long nr_pages;
2691 long user_extra, extra; 2695 long user_extra, extra;
2692 int ret = 0; 2696 int ret = 0, flags = 0;
2693 2697
2694 /* 2698 /*
2695 * Don't allow mmap() of inherited per-task counters. This would 2699 * Don't allow mmap() of inherited per-task counters. This would
@@ -2706,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2706 nr_pages = (vma_size / PAGE_SIZE) - 1; 2710 nr_pages = (vma_size / PAGE_SIZE) - 1;
2707 2711
2708 /* 2712 /*
2709 * If we have data pages ensure they're a power-of-two number, so we 2713 * If we have buffer pages ensure they're a power-of-two number, so we
2710 * can do bitmasks instead of modulo. 2714 * can do bitmasks instead of modulo.
2711 */ 2715 */
2712 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 2716 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2720,9 +2724,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2720 2724
2721 WARN_ON_ONCE(event->ctx->parent_ctx); 2725 WARN_ON_ONCE(event->ctx->parent_ctx);
2722 mutex_lock(&event->mmap_mutex); 2726 mutex_lock(&event->mmap_mutex);
2723 if (event->data) { 2727 if (event->buffer) {
2724 if (event->data->nr_pages == nr_pages) 2728 if (event->buffer->nr_pages == nr_pages)
2725 atomic_inc(&event->data->refcount); 2729 atomic_inc(&event->buffer->refcount);
2726 else 2730 else
2727 ret = -EINVAL; 2731 ret = -EINVAL;
2728 goto unlock; 2732 goto unlock;
@@ -2752,17 +2756,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2752 goto unlock; 2756 goto unlock;
2753 } 2757 }
2754 2758
2755 WARN_ON(event->data); 2759 WARN_ON(event->buffer);
2760
2761 if (vma->vm_flags & VM_WRITE)
2762 flags |= PERF_BUFFER_WRITABLE;
2756 2763
2757 data = perf_mmap_data_alloc(event, nr_pages); 2764 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
2758 if (!data) { 2765 event->cpu, flags);
2766 if (!buffer) {
2759 ret = -ENOMEM; 2767 ret = -ENOMEM;
2760 goto unlock; 2768 goto unlock;
2761 } 2769 }
2762 2770 rcu_assign_pointer(event->buffer, buffer);
2763 perf_mmap_data_init(event, data);
2764 if (vma->vm_flags & VM_WRITE)
2765 event->data->writable = 1;
2766 2771
2767 atomic_long_add(user_extra, &user->locked_vm); 2772 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra; 2773 event->mmap_locked = extra;
@@ -2941,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2941 return NULL; 2946 return NULL;
2942} 2947}
2943 2948
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949 2949
2950/* 2950/*
2951 * We assume there is only KVM supporting the callbacks. 2951 * We assume there is only KVM supporting the callbacks.
@@ -2971,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2971/* 2971/*
2972 * Output 2972 * Output
2973 */ 2973 */
2974static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2974static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
2975 unsigned long offset, unsigned long head) 2975 unsigned long offset, unsigned long head)
2976{ 2976{
2977 unsigned long mask; 2977 unsigned long mask;
2978 2978
2979 if (!data->writable) 2979 if (!buffer->writable)
2980 return true; 2980 return true;
2981 2981
2982 mask = perf_data_size(data) - 1; 2982 mask = perf_data_size(buffer) - 1;
2983 2983
2984 offset = (offset - tail) & mask; 2984 offset = (offset - tail) & mask;
2985 head = (head - tail) & mask; 2985 head = (head - tail) & mask;
@@ -2992,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2992 2992
2993static void perf_output_wakeup(struct perf_output_handle *handle) 2993static void perf_output_wakeup(struct perf_output_handle *handle)
2994{ 2994{
2995 atomic_set(&handle->data->poll, POLL_IN); 2995 atomic_set(&handle->buffer->poll, POLL_IN);
2996 2996
2997 if (handle->nmi) { 2997 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 2998 handle->event->pending_wakeup = 1;
@@ -3012,45 +3012,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
3012 */ 3012 */
3013static void perf_output_get_handle(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
3014{ 3014{
3015 struct perf_mmap_data *data = handle->data; 3015 struct perf_buffer *buffer = handle->buffer;
3016 3016
3017 preempt_disable(); 3017 preempt_disable();
3018 local_inc(&data->nest); 3018 local_inc(&buffer->nest);
3019 handle->wakeup = local_read(&data->wakeup); 3019 handle->wakeup = local_read(&buffer->wakeup);
3020} 3020}
3021 3021
3022static void perf_output_put_handle(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
3023{ 3023{
3024 struct perf_mmap_data *data = handle->data; 3024 struct perf_buffer *buffer = handle->buffer;
3025 unsigned long head; 3025 unsigned long head;
3026 3026
3027again: 3027again:
3028 head = local_read(&data->head); 3028 head = local_read(&buffer->head);
3029 3029
3030 /* 3030 /*
3031 * IRQ/NMI can happen here, which means we can miss a head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
3032 */ 3032 */
3033 3033
3034 if (!local_dec_and_test(&data->nest)) 3034 if (!local_dec_and_test(&buffer->nest))
3035 goto out; 3035 goto out;
3036 3036
3037 /* 3037 /*
3038 * Publish the known good head. Rely on the full barrier implied 3038 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() order the data->head read and this 3039 * by atomic_dec_and_test() order the buffer->head read and this
3040 * write. 3040 * write.
3041 */ 3041 */
3042 data->user_page->data_head = head; 3042 buffer->user_page->data_head = head;
3043 3043
3044 /* 3044 /*
3045 * Now check if we missed an update, rely on the (compiler) 3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head. 3046 * barrier in atomic_dec_and_test() to re-read buffer->head.
3047 */ 3047 */
3048 if (unlikely(head != local_read(&data->head))) { 3048 if (unlikely(head != local_read(&buffer->head))) {
3049 local_inc(&data->nest); 3049 local_inc(&buffer->nest);
3050 goto again; 3050 goto again;
3051 } 3051 }
3052 3052
3053 if (handle->wakeup != local_read(&data->wakeup)) 3053 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
3055 3055
3056 out: 3056 out:
@@ -3070,12 +3070,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3070 buf += size; 3070 buf += size;
3071 handle->size -= size; 3071 handle->size -= size;
3072 if (!handle->size) { 3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data; 3073 struct perf_buffer *buffer = handle->buffer;
3074 3074
3075 handle->page++; 3075 handle->page++;
3076 handle->page &= data->nr_pages - 1; 3076 handle->page &= buffer->nr_pages - 1;
3077 handle->addr = data->data_pages[handle->page]; 3077 handle->addr = buffer->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data); 3078 handle->size = PAGE_SIZE << page_order(buffer);
3079 } 3079 }
3080 } while (len); 3080 } while (len);
3081} 3081}
@@ -3084,7 +3084,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3084 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
3085 int nmi, int sample) 3085 int nmi, int sample)
3086{ 3086{
3087 struct perf_mmap_data *data; 3087 struct perf_buffer *buffer;
3088 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
3089 int have_lost; 3089 int have_lost;
3090 struct { 3090 struct {
@@ -3100,19 +3100,19 @@ int perf_output_begin(struct perf_output_handle *handle,
3100 if (event->parent) 3100 if (event->parent)
3101 event = event->parent; 3101 event = event->parent;
3102 3102
3103 data = rcu_dereference(event->data); 3103 buffer = rcu_dereference(event->buffer);
3104 if (!data) 3104 if (!buffer)
3105 goto out; 3105 goto out;
3106 3106
3107 handle->data = data; 3107 handle->buffer = buffer;
3108 handle->event = event; 3108 handle->event = event;
3109 handle->nmi = nmi; 3109 handle->nmi = nmi;
3110 handle->sample = sample; 3110 handle->sample = sample;
3111 3111
3112 if (!data->nr_pages) 3112 if (!buffer->nr_pages)
3113 goto out; 3113 goto out;
3114 3114
3115 have_lost = local_read(&data->lost); 3115 have_lost = local_read(&buffer->lost);
3116 if (have_lost) 3116 if (have_lost)
3117 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
3118 3118
@@ -3124,30 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle,
3124 * tail pointer. So that all reads will be completed before the 3124 * tail pointer. So that all reads will be completed before the
3125 * write is issued. 3125 * write is issued.
3126 */ 3126 */
3127 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3128 smp_rmb(); 3128 smp_rmb();
3129 offset = head = local_read(&data->head); 3129 offset = head = local_read(&buffer->head);
3130 head += size; 3130 head += size;
3131 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3132 goto fail; 3132 goto fail;
3133 } while (local_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3134 3134
3135 if (head - local_read(&data->wakeup) > data->watermark) 3135 if (head - local_read(&buffer->wakeup) > buffer->watermark)
3136 local_add(data->watermark, &data->wakeup); 3136 local_add(buffer->watermark, &buffer->wakeup);
3137 3137
3138 handle->page = offset >> (PAGE_SHIFT + page_order(data)); 3138 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3139 handle->page &= data->nr_pages - 1; 3139 handle->page &= buffer->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); 3140 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3141 handle->addr = data->data_pages[handle->page]; 3141 handle->addr = buffer->data_pages[handle->page];
3142 handle->addr += handle->size; 3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size; 3143 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3144 3144
3145 if (have_lost) { 3145 if (have_lost) {
3146 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
3147 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
3148 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
3149 lost_event.id = event->id; 3149 lost_event.id = event->id;
3150 lost_event.lost = local_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&buffer->lost, 0);
3151 3151
3152 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
3153 } 3153 }
@@ -3155,7 +3155,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3155 return 0; 3155 return 0;
3156 3156
3157fail: 3157fail:
3158 local_inc(&data->lost); 3158 local_inc(&buffer->lost);
3159 perf_output_put_handle(handle); 3159 perf_output_put_handle(handle);
3160out: 3160out:
3161 rcu_read_unlock(); 3161 rcu_read_unlock();
@@ -3166,15 +3166,15 @@ out:
3166void perf_output_end(struct perf_output_handle *handle) 3166void perf_output_end(struct perf_output_handle *handle)
3167{ 3167{
3168 struct perf_event *event = handle->event; 3168 struct perf_event *event = handle->event;
3169 struct perf_mmap_data *data = handle->data; 3169 struct perf_buffer *buffer = handle->buffer;
3170 3170
3171 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
3172 3172
3173 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
3174 int events = local_inc_return(&data->events); 3174 int events = local_inc_return(&buffer->events);
3175 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
3176 local_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &buffer->events);
3177 local_inc(&data->wakeup); 3177 local_inc(&buffer->wakeup);
3178 } 3178 }
3179 } 3179 }
3180 3180
@@ -3211,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3211 u64 values[4]; 3211 u64 values[4];
3212 int n = 0; 3212 int n = 0;
3213 3213
3214 values[n++] = atomic64_read(&event->count); 3214 values[n++] = perf_event_count(event);
3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3215 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3216 values[n++] = event->total_time_enabled + 3216 values[n++] = event->total_time_enabled +
3217 atomic64_read(&event->child_total_time_enabled); 3217 atomic64_read(&event->child_total_time_enabled);
@@ -3248,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3248 if (leader != event) 3248 if (leader != event)
3249 leader->pmu->read(leader); 3249 leader->pmu->read(leader);
3250 3250
3251 values[n++] = atomic64_read(&leader->count); 3251 values[n++] = perf_event_count(leader);
3252 if (read_format & PERF_FORMAT_ID) 3252 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(leader); 3253 values[n++] = primary_event_id(leader);
3254 3254
@@ -3260,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3260 if (sub != event) 3260 if (sub != event)
3261 sub->pmu->read(sub); 3261 sub->pmu->read(sub);
3262 3262
3263 values[n++] = atomic64_read(&sub->count); 3263 values[n++] = perf_event_count(sub);
3264 if (read_format & PERF_FORMAT_ID) 3264 if (read_format & PERF_FORMAT_ID)
3265 values[n++] = primary_event_id(sub); 3265 values[n++] = primary_event_id(sub);
3266 3266
@@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event,
3491/* 3491/*
3492 * task tracking -- fork/exit 3492 * task tracking -- fork/exit
3493 * 3493 *
3494 * enabled by: attr.comm | attr.mmap | attr.task 3494 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3495 */ 3495 */
3496 3496
3497struct perf_task_event { 3497struct perf_task_event {
@@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event)
3541 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3541 if (event->cpu != -1 && event->cpu != smp_processor_id())
3542 return 0; 3542 return 0;
3543 3543
3544 if (event->attr.comm || event->attr.mmap || event->attr.task) 3544 if (event->attr.comm || event->attr.mmap ||
3545 event->attr.mmap_data || event->attr.task)
3545 return 1; 3546 return 1;
3546 3547
3547 return 0; 3548 return 0;
@@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event,
3766} 3767}
3767 3768
3768static int perf_event_mmap_match(struct perf_event *event, 3769static int perf_event_mmap_match(struct perf_event *event,
3769 struct perf_mmap_event *mmap_event) 3770 struct perf_mmap_event *mmap_event,
3771 int executable)
3770{ 3772{
3771 if (event->state < PERF_EVENT_STATE_INACTIVE) 3773 if (event->state < PERF_EVENT_STATE_INACTIVE)
3772 return 0; 3774 return 0;
@@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event,
3774 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3776 if (event->cpu != -1 && event->cpu != smp_processor_id())
3775 return 0; 3777 return 0;
3776 3778
3777 if (event->attr.mmap) 3779 if ((!executable && event->attr.mmap_data) ||
3780 (executable && event->attr.mmap))
3778 return 1; 3781 return 1;
3779 3782
3780 return 0; 3783 return 0;
3781} 3784}
3782 3785
3783static void perf_event_mmap_ctx(struct perf_event_context *ctx, 3786static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3784 struct perf_mmap_event *mmap_event) 3787 struct perf_mmap_event *mmap_event,
3788 int executable)
3785{ 3789{
3786 struct perf_event *event; 3790 struct perf_event *event;
3787 3791
3788 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3792 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3789 if (perf_event_mmap_match(event, mmap_event)) 3793 if (perf_event_mmap_match(event, mmap_event, executable))
3790 perf_event_mmap_output(event, mmap_event); 3794 perf_event_mmap_output(event, mmap_event);
3791 } 3795 }
3792} 3796}
@@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3830 if (!vma->vm_mm) { 3834 if (!vma->vm_mm) {
3831 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3835 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3832 goto got_name; 3836 goto got_name;
3837 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
3838 vma->vm_end >= vma->vm_mm->brk) {
3839 name = strncpy(tmp, "[heap]", sizeof(tmp));
3840 goto got_name;
3841 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
3842 vma->vm_end >= vma->vm_mm->start_stack) {
3843 name = strncpy(tmp, "[stack]", sizeof(tmp));
3844 goto got_name;
3833 } 3845 }
3834 3846
3835 name = strncpy(tmp, "//anon", sizeof(tmp)); 3847 name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3846,17 +3858,17 @@ got_name:
3846 3858
3847 rcu_read_lock(); 3859 rcu_read_lock();
3848 cpuctx = &get_cpu_var(perf_cpu_context); 3860 cpuctx = &get_cpu_var(perf_cpu_context);
3849 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3861 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
3850 ctx = rcu_dereference(current->perf_event_ctxp); 3862 ctx = rcu_dereference(current->perf_event_ctxp);
3851 if (ctx) 3863 if (ctx)
3852 perf_event_mmap_ctx(ctx, mmap_event); 3864 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
3853 put_cpu_var(perf_cpu_context); 3865 put_cpu_var(perf_cpu_context);
3854 rcu_read_unlock(); 3866 rcu_read_unlock();
3855 3867
3856 kfree(buf); 3868 kfree(buf);
3857} 3869}
3858 3870
3859void __perf_event_mmap(struct vm_area_struct *vma) 3871void perf_event_mmap(struct vm_area_struct *vma)
3860{ 3872{
3861 struct perf_mmap_event mmap_event; 3873 struct perf_mmap_event mmap_event;
3862 3874
@@ -4018,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
4018 hwc->last_period = hwc->sample_period; 4030 hwc->last_period = hwc->sample_period;
4019 4031
4020again: 4032again:
4021 old = val = atomic64_read(&hwc->period_left); 4033 old = val = local64_read(&hwc->period_left);
4022 if (val < 0) 4034 if (val < 0)
4023 return 0; 4035 return 0;
4024 4036
4025 nr = div64_u64(period + val, period); 4037 nr = div64_u64(period + val, period);
4026 offset = nr * period; 4038 offset = nr * period;
4027 val -= offset; 4039 val -= offset;
4028 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) 4040 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4029 goto again; 4041 goto again;
4030 4042
4031 return nr; 4043 return nr;
@@ -4064,7 +4076,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4064{ 4076{
4065 struct hw_perf_event *hwc = &event->hw; 4077 struct hw_perf_event *hwc = &event->hw;
4066 4078
4067 atomic64_add(nr, &event->count); 4079 local64_add(nr, &event->count);
4068 4080
4069 if (!regs) 4081 if (!regs)
4070 return; 4082 return;
@@ -4075,7 +4087,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4075 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4087 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4076 return perf_swevent_overflow(event, 1, nmi, data, regs); 4088 return perf_swevent_overflow(event, 1, nmi, data, regs);
4077 4089
4078 if (atomic64_add_negative(nr, &hwc->period_left)) 4090 if (local64_add_negative(nr, &hwc->period_left))
4079 return; 4091 return;
4080 4092
4081 perf_swevent_overflow(event, 0, nmi, data, regs); 4093 perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4213,14 +4225,12 @@ int perf_swevent_get_recursion_context(void)
4213} 4225}
4214EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4226EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4215 4227
4216void perf_swevent_put_recursion_context(int rctx) 4228inline void perf_swevent_put_recursion_context(int rctx)
4217{ 4229{
4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4230 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4219 barrier(); 4231 barrier();
4220 cpuctx->recursion[rctx]--; 4232 cpuctx->recursion[rctx]--;
4221} 4233}
4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4223
4224 4234
4225void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4235void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4226 struct pt_regs *regs, u64 addr) 4236 struct pt_regs *regs, u64 addr)
@@ -4368,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4368 u64 now; 4378 u64 now;
4369 4379
4370 now = cpu_clock(cpu); 4380 now = cpu_clock(cpu);
4371 prev = atomic64_xchg(&event->hw.prev_count, now); 4381 prev = local64_xchg(&event->hw.prev_count, now);
4372 atomic64_add(now - prev, &event->count); 4382 local64_add(now - prev, &event->count);
4373} 4383}
4374 4384
4375static int cpu_clock_perf_event_enable(struct perf_event *event) 4385static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4377,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
4377 struct hw_perf_event *hwc = &event->hw; 4387 struct hw_perf_event *hwc = &event->hw;
4378 int cpu = raw_smp_processor_id(); 4388 int cpu = raw_smp_processor_id();
4379 4389
4380 atomic64_set(&hwc->prev_count, cpu_clock(cpu)); 4390 local64_set(&hwc->prev_count, cpu_clock(cpu));
4381 perf_swevent_start_hrtimer(event); 4391 perf_swevent_start_hrtimer(event);
4382 4392
4383 return 0; 4393 return 0;
@@ -4409,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4409 u64 prev; 4419 u64 prev;
4410 s64 delta; 4420 s64 delta;
4411 4421
4412 prev = atomic64_xchg(&event->hw.prev_count, now); 4422 prev = local64_xchg(&event->hw.prev_count, now);
4413 delta = now - prev; 4423 delta = now - prev;
4414 atomic64_add(delta, &event->count); 4424 local64_add(delta, &event->count);
4415} 4425}
4416 4426
4417static int task_clock_perf_event_enable(struct perf_event *event) 4427static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4421,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
4421 4431
4422 now = event->ctx->time; 4432 now = event->ctx->time;
4423 4433
4424 atomic64_set(&hwc->prev_count, now); 4434 local64_set(&hwc->prev_count, now);
4425 4435
4426 perf_swevent_start_hrtimer(event); 4436 perf_swevent_start_hrtimer(event);
4427 4437
@@ -4601,7 +4611,7 @@ static int perf_tp_event_match(struct perf_event *event,
4601} 4611}
4602 4612
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4613void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head) 4614 struct pt_regs *regs, struct hlist_head *head, int rctx)
4605{ 4615{
4606 struct perf_sample_data data; 4616 struct perf_sample_data data;
4607 struct perf_event *event; 4617 struct perf_event *event;
@@ -4615,12 +4625,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615 perf_sample_data_init(&data, addr); 4625 perf_sample_data_init(&data, addr);
4616 data.raw = &raw; 4626 data.raw = &raw;
4617 4627
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4628 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs)) 4629 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs); 4630 perf_swevent_add(event, count, 1, &data, regs);
4622 } 4631 }
4623 rcu_read_unlock(); 4632
4633 perf_swevent_put_recursion_context(rctx);
4624} 4634}
4625EXPORT_SYMBOL_GPL(perf_tp_event); 4635EXPORT_SYMBOL_GPL(perf_tp_event);
4626 4636
@@ -4864,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4864 hwc->sample_period = 1; 4874 hwc->sample_period = 1;
4865 hwc->last_period = hwc->sample_period; 4875 hwc->last_period = hwc->sample_period;
4866 4876
4867 atomic64_set(&hwc->period_left, hwc->sample_period); 4877 local64_set(&hwc->period_left, hwc->sample_period);
4868 4878
4869 /* 4879 /*
4870 * we currently do not support PERF_FORMAT_GROUP on inherited events 4880 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -4913,7 +4923,7 @@ done:
4913 4923
4914 if (!event->parent) { 4924 if (!event->parent) {
4915 atomic_inc(&nr_events); 4925 atomic_inc(&nr_events);
4916 if (event->attr.mmap) 4926 if (event->attr.mmap || event->attr.mmap_data)
4917 atomic_inc(&nr_mmap_events); 4927 atomic_inc(&nr_mmap_events);
4918 if (event->attr.comm) 4928 if (event->attr.comm)
4919 atomic_inc(&nr_comm_events); 4929 atomic_inc(&nr_comm_events);
@@ -5007,7 +5017,7 @@ err_size:
5007static int 5017static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5018perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5009{ 5019{
5010 struct perf_mmap_data *data = NULL, *old_data = NULL; 5020 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5011 int ret = -EINVAL; 5021 int ret = -EINVAL;
5012 5022
5013 if (!output_event) 5023 if (!output_event)
@@ -5037,19 +5047,19 @@ set:
5037 5047
5038 if (output_event) { 5048 if (output_event) {
5039 /* get the buffer we want to redirect to */ 5049 /* get the buffer we want to redirect to */
5040 data = perf_mmap_data_get(output_event); 5050 buffer = perf_buffer_get(output_event);
5041 if (!data) 5051 if (!buffer)
5042 goto unlock; 5052 goto unlock;
5043 } 5053 }
5044 5054
5045 old_data = event->data; 5055 old_buffer = event->buffer;
5046 rcu_assign_pointer(event->data, data); 5056 rcu_assign_pointer(event->buffer, buffer);
5047 ret = 0; 5057 ret = 0;
5048unlock: 5058unlock:
5049 mutex_unlock(&event->mmap_mutex); 5059 mutex_unlock(&event->mmap_mutex);
5050 5060
5051 if (old_data) 5061 if (old_buffer)
5052 perf_mmap_data_put(old_data); 5062 perf_buffer_put(old_buffer);
5053out: 5063out:
5054 return ret; 5064 return ret;
5055} 5065}
@@ -5298,7 +5308,7 @@ inherit_event(struct perf_event *parent_event,
5298 hwc->sample_period = sample_period; 5308 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period; 5309 hwc->last_period = sample_period;
5300 5310
5301 atomic64_set(&hwc->period_left, sample_period); 5311 local64_set(&hwc->period_left, sample_period);
5302 } 5312 }
5303 5313
5304 child_event->overflow_handler = parent_event->overflow_handler; 5314 child_event->overflow_handler = parent_event->overflow_handler;
@@ -5359,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event,
5359 if (child_event->attr.inherit_stat) 5369 if (child_event->attr.inherit_stat)
5360 perf_event_read_event(child_event, child); 5370 perf_event_read_event(child_event, child);
5361 5371
5362 child_val = atomic64_read(&child_event->count); 5372 child_val = perf_event_count(child_event);
5363 5373
5364 /* 5374 /*
5365 * Add back the child's count to the parent's count: 5375 * Add back the child's count to the parent's count:
5366 */ 5376 */
5367 atomic64_add(child_val, &parent_event->count); 5377 atomic64_add(child_val, &parent_event->child_count);
5368 atomic64_add(child_event->total_time_enabled, 5378 atomic64_add(child_event->total_time_enabled,
5369 &parent_event->child_total_time_enabled); 5379 &parent_event->child_total_time_enabled);
5370 atomic64_add(child_event->total_time_running, 5380 atomic64_add(child_event->total_time_running,
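The perf_mmap_data -> perf_buffer rename above keeps the buffer's lifetime rules intact: lookups go through RCU, a reference is taken only while the refcount is still non-zero, and the final put defers the free past a grace period via call_rcu(). Below is a condensed, kernel-style sketch of that get/put pattern; struct my_buf and the my_buf_* helpers are illustrative stand-ins, not code from the patch.

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_buf {
	atomic_t		refcount;
	struct rcu_head		rcu_head;
	/* payload ... */
};

static struct my_buf *current_buf;	/* published with rcu_assign_pointer() */

static struct my_buf *my_buf_get(void)
{
	struct my_buf *buf;

	rcu_read_lock();
	buf = rcu_dereference(current_buf);
	/* Only take a reference if the buffer is not already being torn down. */
	if (buf && !atomic_inc_not_zero(&buf->refcount))
		buf = NULL;
	rcu_read_unlock();

	return buf;
}

static void my_buf_free_rcu(struct rcu_head *rcu_head)
{
	kfree(container_of(rcu_head, struct my_buf, rcu_head));
}

static void my_buf_put(struct my_buf *buf)
{
	if (!atomic_dec_and_test(&buf->refcount))
		return;
	/* Last reference gone: wait out concurrent RCU readers, then free. */
	call_rcu(&buf->rcu_head, my_buf_free_rcu);
}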
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
49 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
50 */ 50 */
51struct pm_qos_request_list { 51enum pm_qos_type {
52 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
53 union { 53 PM_QOS_MIN /* return the smallest value */
54 s32 value;
55 s32 usec;
56 s32 kbps;
57 };
58 int pm_qos_class;
59}; 54};
60 55
61static s32 max_compare(s32 v1, s32 v2);
62static s32 min_compare(s32 v1, s32 v2);
63
64struct pm_qos_object { 56struct pm_qos_object {
65 struct pm_qos_request_list requests; 57 struct plist_head requests;
66 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
67 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
68 char *name; 60 char *name;
69 s32 default_value; 61 s32 default_value;
70 atomic_t target_value; 62 enum pm_qos_type type;
71 s32 (*comparitor)(s32, s32);
72}; 63};
73 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
74static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
78 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
80 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
81 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
82 .comparitor = min_compare
83}; 75};
84 76
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
88 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 81 .name = "network_latency",
90 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
91 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
92 .comparitor = min_compare
93}; 84};
94 85
95 86
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
99 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 91 .name = "network_throughput",
101 .default_value = 0, 92 .default_value = 0,
102 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
103 .comparitor = max_compare
104}; 94};
105 95
106 96
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
111 &network_throughput_pm_qos 101 &network_throughput_pm_qos
112}; 102};
113 103
114static DEFINE_SPINLOCK(pm_qos_lock);
115
116static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
117 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
118static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
124 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
125}; 113};
126 114
127/* static helper functions */ 115/* unlocked internal variant */
128static s32 max_compare(s32 v1, s32 v2) 116static inline int pm_qos_get_value(struct pm_qos_object *o)
129{ 117{
130 return max(v1, v2); 118 if (plist_head_empty(&o->requests))
131} 119 return o->default_value;
132 120
133static s32 min_compare(s32 v1, s32 v2) 121 switch (o->type) {
134{ 122 case PM_QOS_MIN:
135 return min(v1, v2); 123 return plist_last(&o->requests)->prio;
136}
137 124
125 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio;
138 127
139static void update_target(int pm_qos_class) 128 default:
129 /* runtime check for not using enum */
130 BUG();
131 }
132}
133
134static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value)
140{ 136{
141 s32 extreme_value;
142 struct pm_qos_request_list *node;
143 unsigned long flags; 137 unsigned long flags;
144 int call_notifier = 0; 138 int prev_value, curr_value;
145 139
146 spin_lock_irqsave(&pm_qos_lock, flags); 140 spin_lock_irqsave(&pm_qos_lock, flags);
147 extreme_value = pm_qos_array[pm_qos_class]->default_value; 141 prev_value = pm_qos_get_value(o);
148 list_for_each_entry(node, 142 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
149 &pm_qos_array[pm_qos_class]->requests.list, list) { 143 if (value != PM_QOS_DEFAULT_VALUE) {
150 extreme_value = pm_qos_array[pm_qos_class]->comparitor( 144 /*
151 extreme_value, node->value); 145 * to change the list, we atomically remove, reinit
152 } 146 * with new value and add, then see if the extremal
153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != 147 * changed
154 extreme_value) { 148 */
155 call_notifier = 1; 149 plist_del(node, &o->requests);
156 atomic_set(&pm_qos_array[pm_qos_class]->target_value, 150 plist_node_init(node, value);
157 extreme_value); 151 plist_add(node, &o->requests);
158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, 152 } else if (del) {
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value)); 153 plist_del(node, &o->requests);
154 } else {
155 plist_add(node, &o->requests);
160 } 156 }
157 curr_value = pm_qos_get_value(o);
161 spin_unlock_irqrestore(&pm_qos_lock, flags); 158 spin_unlock_irqrestore(&pm_qos_lock, flags);
162 159
163 if (call_notifier) 160 if (prev_value != curr_value)
164 blocking_notifier_call_chain( 161 blocking_notifier_call_chain(o->notifiers,
165 pm_qos_array[pm_qos_class]->notifiers, 162 (unsigned long)curr_value,
166 (unsigned long) extreme_value, NULL); 163 NULL);
167} 164}
168 165
169static int register_pm_qos_misc(struct pm_qos_object *qos) 166static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor)
196 */ 193 */
197int pm_qos_request(int pm_qos_class) 194int pm_qos_request(int pm_qos_class)
198{ 195{
199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 196 unsigned long flags;
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
200} 204}
201EXPORT_SYMBOL_GPL(pm_qos_request); 205EXPORT_SYMBOL_GPL(pm_qos_request);
202 206
207int pm_qos_request_active(struct pm_qos_request_list *req)
208{
209 return req->pm_qos_class != 0;
210}
211EXPORT_SYMBOL_GPL(pm_qos_request_active);
212
203/** 213/**
204 * pm_qos_add_request - inserts new qos request into the list 214 * pm_qos_add_request - inserts new qos request into the list
 205 * @pm_qos_class: identifies which list of qos request to use 215 * @pm_qos_class: identifies which list of qos request to use
@@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
211 * element as a handle for use in updating and removal. Call needs to save 221 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use. 222 * this handle for later use.
213 */ 223 */
214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) 224void pm_qos_add_request(struct pm_qos_request_list *dep,
225 int pm_qos_class, s32 value)
215{ 226{
216 struct pm_qos_request_list *dep; 227 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
217 unsigned long flags; 228 int new_value;
218 229
219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); 230 if (pm_qos_request_active(dep)) {
220 if (dep) { 231 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
221 if (value == PM_QOS_DEFAULT_VALUE) 232 return;
222 dep->value = pm_qos_array[pm_qos_class]->default_value;
223 else
224 dep->value = value;
225 dep->pm_qos_class = pm_qos_class;
226
227 spin_lock_irqsave(&pm_qos_lock, flags);
228 list_add(&dep->list,
229 &pm_qos_array[pm_qos_class]->requests.list);
230 spin_unlock_irqrestore(&pm_qos_lock, flags);
231 update_target(pm_qos_class);
232 } 233 }
233 234 if (value == PM_QOS_DEFAULT_VALUE)
234 return dep; 235 new_value = o->default_value;
236 else
237 new_value = value;
238 plist_node_init(&dep->list, new_value);
239 dep->pm_qos_class = pm_qos_class;
240 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
235} 241}
236EXPORT_SYMBOL_GPL(pm_qos_add_request); 242EXPORT_SYMBOL_GPL(pm_qos_add_request);
237 243
@@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
246 * Attempts are made to make this code callable on hot code paths. 252 * Attempts are made to make this code callable on hot code paths.
247 */ 253 */
248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 254void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value) 255 s32 new_value)
250{ 256{
251 unsigned long flags;
252 int pending_update = 0;
253 s32 temp; 257 s32 temp;
258 struct pm_qos_object *o;
259
260 if (!pm_qos_req) /*guard against callers passing in null */
261 return;
254 262
255 if (pm_qos_req) { /*guard against callers passing in null */ 263 if (!pm_qos_request_active(pm_qos_req)) {
256 spin_lock_irqsave(&pm_qos_lock, flags); 264 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
257 if (new_value == PM_QOS_DEFAULT_VALUE) 265 return;
258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
259 else
260 temp = new_value;
261
262 if (temp != pm_qos_req->value) {
263 pending_update = 1;
264 pm_qos_req->value = temp;
265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
269 } 266 }
267
268 o = pm_qos_array[pm_qos_req->pm_qos_class];
269
270 if (new_value == PM_QOS_DEFAULT_VALUE)
271 temp = o->default_value;
272 else
273 temp = new_value;
274
275 if (temp != pm_qos_req->list.prio)
276 update_target(o, &pm_qos_req->list, 0, temp);
270} 277}
271EXPORT_SYMBOL_GPL(pm_qos_update_request); 278EXPORT_SYMBOL_GPL(pm_qos_update_request);
272 279
@@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
280 */ 287 */
281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 288void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
282{ 289{
283 unsigned long flags; 290 struct pm_qos_object *o;
284 int qos_class;
285 291
286 if (pm_qos_req == NULL) 292 if (pm_qos_req == NULL)
287 return; 293 return;
288 /* silent return to keep pcm code cleaner */ 294 /* silent return to keep pcm code cleaner */
289 295
290 qos_class = pm_qos_req->pm_qos_class; 296 if (!pm_qos_request_active(pm_qos_req)) {
291 spin_lock_irqsave(&pm_qos_lock, flags); 297 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
292 list_del(&pm_qos_req->list); 298 return;
293 kfree(pm_qos_req); 299 }
294 spin_unlock_irqrestore(&pm_qos_lock, flags); 300
295 update_target(qos_class); 301 o = pm_qos_array[pm_qos_req->pm_qos_class];
302 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
303 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
296} 304}
297EXPORT_SYMBOL_GPL(pm_qos_remove_request); 305EXPORT_SYMBOL_GPL(pm_qos_remove_request);
298 306
@@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
340 348
341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 349 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
342 if (pm_qos_class >= 0) { 350 if (pm_qos_class >= 0) {
 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class, 351 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
344 PM_QOS_DEFAULT_VALUE); 352 if (!req)
353 return -ENOMEM;
354
355 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
356 filp->private_data = req;
345 357
346 if (filp->private_data) 358 if (filp->private_data)
347 return 0; 359 return 0;
@@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
353{ 365{
354 struct pm_qos_request_list *req; 366 struct pm_qos_request_list *req;
355 367
356 req = (struct pm_qos_request_list *)filp->private_data; 368 req = filp->private_data;
357 pm_qos_remove_request(req); 369 pm_qos_remove_request(req);
370 kfree(req);
358 371
359 return 0; 372 return 0;
360} 373}
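With the plist rework above, a pm_qos request is now a caller-embedded struct pm_qos_request_list rather than a kmalloc'ed handle, and the aggregate target is read straight off the sorted list (plist_first() for PM_QOS_MAX, plist_last() for PM_QOS_MIN). A hedged sketch of how a driver would use the reworked API follows; the my_driver_* functions and the 50-usec figure are made up for illustration.

#include <linux/pm_qos_params.h>
#include <linux/types.h>

static struct pm_qos_request_list my_dma_latency_req;

static void my_driver_start(void)
{
	/* Cap CPU DMA latency at 50 usecs while the device is active. */
	pm_qos_add_request(&my_dma_latency_req, PM_QOS_CPU_DMA_LATENCY, 50);
}

static void my_driver_retune(s32 usecs)
{
	/*
	 * Re-sorts this node on the plist; the notifier chain only fires
	 * if the aggregated (minimum) value actually changes.
	 */
	pm_qos_update_request(&my_dma_latency_req, usecs);
}

static void my_driver_stop(void)
{
	/* Drops the node from the plist and marks the request inactive. */
	pm_qos_remove_request(&my_dma_latency_req);
}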
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..f66bdd33a6c6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
232 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{ 234{
235 struct sighand_struct *sighand; 235 struct signal_struct *sig = tsk->signal;
236 struct signal_struct *sig;
237 struct task_struct *t; 236 struct task_struct *t;
238 237
239 *times = INIT_CPUTIME; 238 times->utime = sig->utime;
239 times->stime = sig->stime;
240 times->sum_exec_runtime = sig->sum_sched_runtime;
240 241
241 rcu_read_lock(); 242 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand); 243 /* make sure we can trust tsk->thread_group list */
243 if (!sighand) 244 if (!likely(pid_alive(tsk)))
244 goto out; 245 goto out;
245 246
246 sig = tsk->signal;
247
248 t = tsk; 247 t = tsk;
249 do { 248 do {
250 times->utime = cputime_add(times->utime, t->utime); 249 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime); 250 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime; 251 times->sum_exec_runtime += t->se.sum_exec_runtime;
253 252 } while_each_thread(tsk, t);
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out: 253out:
261 rcu_read_unlock(); 254 rcu_read_unlock();
262} 255}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1279{ 1272{
1280 struct signal_struct *sig; 1273 struct signal_struct *sig;
1281 1274
1282 /* tsk == current, ensure it is safe to use ->signal/sighand */
1283 if (unlikely(tsk->exit_state))
1284 return 0;
1285
1286 if (!task_cputime_zero(&tsk->cputime_expires)) { 1275 if (!task_cputime_zero(&tsk->cputime_expires)) {
1287 struct task_cputime task_sample = { 1276 struct task_cputime task_sample = {
1288 .utime = tsk->utime, 1277 .utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1298 if (sig->cputimer.running) { 1287 if (sig->cputimer.running) {
1299 struct task_cputime group_sample; 1288 struct task_cputime group_sample;
1300 1289
1301 thread_group_cputimer(tsk, &group_sample); 1290 spin_lock(&sig->cputimer.lock);
1291 group_sample = sig->cputimer.cputime;
1292 spin_unlock(&sig->cputimer.lock);
1293
1302 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1294 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1303 return 1; 1295 return 1;
1304 } 1296 }
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1315{ 1307{
1316 LIST_HEAD(firing); 1308 LIST_HEAD(firing);
1317 struct k_itimer *timer, *next; 1309 struct k_itimer *timer, *next;
1310 unsigned long flags;
1318 1311
1319 BUG_ON(!irqs_disabled()); 1312 BUG_ON(!irqs_disabled());
1320 1313
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1325 if (!fastpath_timer_check(tsk)) 1318 if (!fastpath_timer_check(tsk))
1326 return; 1319 return;
1327 1320
1328 spin_lock(&tsk->sighand->siglock); 1321 if (!lock_task_sighand(tsk, &flags))
1322 return;
1329 /* 1323 /*
1330 * Here we take off tsk->signal->cpu_timers[N] and 1324 * Here we take off tsk->signal->cpu_timers[N] and
1331 * tsk->cpu_timers[N] all the timers that are firing, and 1325 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1347 * that gets the timer lock before we do will give it up and 1341 * that gets the timer lock before we do will give it up and
1348 * spin until we've taken care of that timer below. 1342 * spin until we've taken care of that timer below.
1349 */ 1343 */
1350 spin_unlock(&tsk->sighand->siglock); 1344 unlock_task_sighand(tsk, &flags);
1351 1345
1352 /* 1346 /*
1353 * Now that all the timers on our list have the firing flag, 1347 * Now that all the timers on our list have the firing flag,
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..8dc31e02ae12 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
277 goto Enable_irqs; 277 goto Enable_irqs;
278 } 278 }
279 279
280 if (hibernation_test(TEST_CORE)) 280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
281 goto Power_up; 281 goto Power_up;
282 282
283 in_suspend = 1; 283 in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
288 error); 288 error);
289 /* Restore control flow magically appears here */ 289 /* Restore control flow magically appears here */
290 restore_processor_state(); 290 restore_processor_state();
291 if (!in_suspend) 291 if (!in_suspend) {
292 events_check_enabled = false;
292 platform_leave(platform_mode); 293 platform_leave(platform_mode);
294 }
293 295
294 Power_up: 296 Power_up:
295 sysdev_resume(); 297 sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
328 330
329 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
330 if (error) 332 if (error)
331 return error; 333 goto Close;
332 334
333 /* Preallocate image memory before shutting down devices. */ 335 /* Preallocate image memory before shutting down devices. */
334 error = hibernate_preallocate_memory(); 336 error = hibernate_preallocate_memory();
@@ -511,18 +513,24 @@ int hibernation_platform_enter(void)
511 513
512 local_irq_disable(); 514 local_irq_disable();
513 sysdev_suspend(PMSG_HIBERNATE); 515 sysdev_suspend(PMSG_HIBERNATE);
516 if (!pm_check_wakeup_events()) {
517 error = -EAGAIN;
518 goto Power_up;
519 }
520
514 hibernation_ops->enter(); 521 hibernation_ops->enter();
515 /* We should never get here */ 522 /* We should never get here */
516 while (1); 523 while (1);
517 524
518 /* 525 Power_up:
519 * We don't need to reenable the nonboot CPUs or resume consoles, since 526 sysdev_resume();
520 * the system is going to be halted anyway. 527 local_irq_enable();
521 */ 528 enable_nonboot_cpus();
529
522 Platform_finish: 530 Platform_finish:
523 hibernation_ops->finish(); 531 hibernation_ops->finish();
524 532
525 dpm_suspend_noirq(PMSG_RESTORE); 533 dpm_resume_noirq(PMSG_RESTORE);
526 534
527 Resume_devices: 535 Resume_devices:
528 entering_platform_hibernation = false; 536 entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
 218 * given sleep state should fail during a certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
227 * writing to 'state'. It first should read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
 230 * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned long val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned long val;
250
251 if (sscanf(buf, "%lu", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
236#endif 290#endif
237#ifdef CONFIG_PM_SLEEP 291#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 292 &pm_async_attr.attr,
293 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 294#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 295 &pm_test_attr.attr,
241#endif 296#endif
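The comment block introduced above defines a userspace handshake: read /sys/power/wakeup_count, finish your own suspend preparations, write the saved value back, and only then write to /sys/power/state; if the write-back fails, a wakeup event arrived in the meantime and the suspend attempt should be dropped. A minimal userspace sketch of that sequence (error handling trimmed, the "mem" state string is just an example):

#include <stdio.h>

int main(void)
{
	unsigned long count;
	FILE *f;

	/* 1. Snapshot the current wakeup event count. */
	f = fopen("/sys/power/wakeup_count", "r");
	if (!f || fscanf(f, "%lu", &count) != 1)
		return 1;
	fclose(f);

	/* ... userspace carries out its own pre-suspend work here ... */

	/*
	 * 2. Hand the count back.  The kernel rejects the write if wakeup
	 *    events were detected after the value was read, in which case
	 *    the suspend attempt should be abandoned.
	 */
	f = fopen("/sys/power/wakeup_count", "w");
	if (!f || fprintf(f, "%lu\n", count) < 0 || fclose(f) != 0)
		return 1;

	/*
	 * 3. Now it is safe to start the transition; the kernel itself
	 *    aborts it if further wakeup events show up before completion.
	 */
	f = fopen("/sys/power/state", "w");
	if (!f)
		return 1;
	fprintf(f, "mem\n");
	fclose(f);
	return 0;
}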
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..f6cd6faf84fd 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
136 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
137 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
138 if (error) 138 if (error)
139 return error; 139 goto Platform_finish;
140 } 140 }
141 141
142 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
143 if (error) { 143 if (error) {
144 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
145 goto Platfrom_finish; 145 goto Platform_finish;
146 } 146 }
147 147
148 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
149 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
150 if (error) 150 if (error)
151 goto Power_up_devices; 151 goto Platform_wake;
152 } 152 }
153 153
154 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
163 163
164 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 165 if (!error) {
166 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
167 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
168 sysdev_resume(); 170 sysdev_resume();
169 } 171 }
170 172
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
178 if (suspend_ops->wake) 180 if (suspend_ops->wake)
179 suspend_ops->wake(); 181 suspend_ops->wake();
180 182
181 Power_up_devices:
182 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
183 184
184 Platfrom_finish: 185 Platform_finish:
185 if (suspend_ops->finish) 186 if (suspend_ops->finish)
186 suspend_ops->finish(); 187 suspend_ops->finish();
187 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..e6a5bdf61a37 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -32,7 +32,7 @@
32/* 32/*
33 * The swap map is a data structure used for keeping track of each page 33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page 34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries. 35 * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the 36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member. 37 * help of the .next_swap member.
38 * 38 *
@@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
148 148
149/** 149/**
150 * free_all_swap_pages - free swap pages allocated for saving image data. 150 * free_all_swap_pages - free swap pages allocated for saving image data.
151 * It also frees the extents used to register which swap entres had been 151 * It also frees the extents used to register which swap entries had been
152 * allocated. 152 * allocated.
153 */ 153 */
154 154
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..4ab0164bcf84 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h>
41#include <linux/notifier.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42 44
@@ -985,6 +987,32 @@ void resume_console(void)
985} 987}
986 988
987/** 989/**
990 * console_cpu_notify - print deferred console messages after CPU hotplug
991 * @self: notifier struct
992 * @action: CPU hotplug event
993 * @hcpu: unused
994 *
995 * If printk() is called from a CPU that is not online yet, the messages
996 * will be spooled but will not show up on the console. This function is
997 * called when a new CPU comes online (or fails to come up), and ensures
998 * that any such output gets printed.
999 */
1000static int __cpuinit console_cpu_notify(struct notifier_block *self,
1001 unsigned long action, void *hcpu)
1002{
1003 switch (action) {
1004 case CPU_ONLINE:
1005 case CPU_DEAD:
1006 case CPU_DYING:
1007 case CPU_DOWN_FAILED:
1008 case CPU_UP_CANCELED:
1009 acquire_console_sem();
1010 release_console_sem();
1011 }
1012 return NOTIFY_OK;
1013}
1014
1015/**
988 * acquire_console_sem - lock the console system for exclusive use. 1016 * acquire_console_sem - lock the console system for exclusive use.
989 * 1017 *
990 * Acquires a semaphore which guarantees that the caller has 1018 * Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1399,7 @@ int unregister_console(struct console *console)
1371} 1399}
1372EXPORT_SYMBOL(unregister_console); 1400EXPORT_SYMBOL(unregister_console);
1373 1401
1374static int __init disable_boot_consoles(void) 1402static int __init printk_late_init(void)
1375{ 1403{
1376 struct console *con; 1404 struct console *con;
1377 1405
@@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void)
1382 unregister_console(con); 1410 unregister_console(con);
1383 } 1411 }
1384 } 1412 }
1413 hotcpu_notifier(console_cpu_notify, 0);
1385 return 0; 1414 return 0;
1386} 1415}
1387late_initcall(disable_boot_consoles); 1416late_initcall(printk_late_init);
1388 1417
1389#if defined CONFIG_PRINTK 1418#if defined CONFIG_PRINTK
1390 1419
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); 115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */ 116#endif /* #ifdef CONFIG_PROVE_RCU */
117
118#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
119static inline void debug_init_rcu_head(struct rcu_head *head)
120{
121 debug_object_init(head, &rcuhead_debug_descr);
122}
123
124static inline void debug_rcu_head_free(struct rcu_head *head)
125{
126 debug_object_free(head, &rcuhead_debug_descr);
127}
128
129/*
130 * fixup_init is called when:
131 * - an active object is initialized
132 */
133static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
134{
135 struct rcu_head *head = addr;
136
137 switch (state) {
138 case ODEBUG_STATE_ACTIVE:
139 /*
140 * Ensure that queued callbacks are all executed.
141 * If we detect that we are nested in a RCU read-side critical
142 * section, we should simply fail, otherwise we would deadlock.
143 */
144 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
145 irqs_disabled()) {
146 WARN_ON(1);
147 return 0;
148 }
149 rcu_barrier();
150 rcu_barrier_sched();
151 rcu_barrier_bh();
152 debug_object_init(head, &rcuhead_debug_descr);
153 return 1;
154 default:
155 return 0;
156 }
157}
158
159/*
160 * fixup_activate is called when:
161 * - an active object is activated
162 * - an unknown object is activated (might be a statically initialized object)
163 * Activation is performed internally by call_rcu().
164 */
165static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
166{
167 struct rcu_head *head = addr;
168
169 switch (state) {
170
171 case ODEBUG_STATE_NOTAVAILABLE:
172 /*
173 * This is not really a fixup. We just make sure that it is
174 * tracked in the object tracker.
175 */
176 debug_object_init(head, &rcuhead_debug_descr);
177 debug_object_activate(head, &rcuhead_debug_descr);
178 return 0;
179
180 case ODEBUG_STATE_ACTIVE:
181 /*
182 * Ensure that queued callbacks are all executed.
183 * If we detect that we are nested in a RCU read-side critical
184 * section, we should simply fail, otherwise we would deadlock.
185 */
186 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
187 irqs_disabled()) {
188 WARN_ON(1);
189 return 0;
190 }
191 rcu_barrier();
192 rcu_barrier_sched();
193 rcu_barrier_bh();
194 debug_object_activate(head, &rcuhead_debug_descr);
195 return 1;
196 default:
197 return 0;
198 }
199}
200
201/*
202 * fixup_free is called when:
203 * - an active object is freed
204 */
205static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
206{
207 struct rcu_head *head = addr;
208
209 switch (state) {
210 case ODEBUG_STATE_ACTIVE:
211 /*
212 * Ensure that queued callbacks are all executed.
213 * If we detect that we are nested in a RCU read-side critical
214 * section, we should simply fail, otherwise we would deadlock.
215 */
216#ifndef CONFIG_PREEMPT
217 WARN_ON(1);
218 return 0;
219#else
220 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
221 irqs_disabled()) {
222 WARN_ON(1);
223 return 0;
224 }
225 rcu_barrier();
226 rcu_barrier_sched();
227 rcu_barrier_bh();
228 debug_object_free(head, &rcuhead_debug_descr);
229 return 1;
230#endif
231 default:
232 return 0;
233 }
234}
235
236/**
237 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
238 * @head: pointer to rcu_head structure to be initialized
239 *
240 * This function informs debugobjects of a new rcu_head structure that
241 * has been allocated as an auto variable on the stack. This function
242 * is not required for rcu_head structures that are statically defined or
243 * that are dynamically allocated on the heap. This function has no
244 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
245 */
246void init_rcu_head_on_stack(struct rcu_head *head)
247{
248 debug_object_init_on_stack(head, &rcuhead_debug_descr);
249}
250EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
251
252/**
253 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
254 * @head: pointer to rcu_head structure to be initialized
255 *
256 * This function informs debugobjects that an on-stack rcu_head structure
257 * is about to go out of scope. As with init_rcu_head_on_stack(), this
258 * function is not required for rcu_head structures that are statically
259 * defined or that are dynamically allocated on the heap. Also as with
260 * init_rcu_head_on_stack(), this function has no effect for
261 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
262 */
263void destroy_rcu_head_on_stack(struct rcu_head *head)
264{
265 debug_object_free(head, &rcuhead_debug_descr);
266}
267EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
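A minimal usage sketch, assuming a caller that wants to block for one grace period with an rcu_head on its stack. The wrapper struct, callback and function names below are invented for illustration; init_rcu_head_on_stack(), destroy_rcu_head_on_stack(), call_rcu() and the completion API are the real interfaces being paired.

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct stack_rcu_waiter {		/* hypothetical helper type */
	struct rcu_head head;
	struct completion done;
};

static void stack_rcu_cb(struct rcu_head *head)
{
	struct stack_rcu_waiter *w = container_of(head, struct stack_rcu_waiter, head);

	complete(&w->done);
}

static void wait_one_grace_period(void)
{
	struct stack_rcu_waiter w;

	init_completion(&w.done);
	init_rcu_head_on_stack(&w.head);	/* tell debugobjects about the on-stack head */
	call_rcu(&w.head, stack_rcu_cb);
	wait_for_completion(&w.done);		/* callback has run after a grace period */
	destroy_rcu_head_on_stack(&w.head);	/* head is about to go out of scope */
}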
268
269struct debug_obj_descr rcuhead_debug_descr = {
270 .name = "rcu_head",
271 .fixup_init = rcuhead_fixup_init,
272 .fixup_activate = rcuhead_fixup_activate,
273 .fixup_free = rcuhead_fixup_free,
274};
275EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
276#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 while (list) { 169 while (list) {
170 next = list->next; 170 next = list->next;
171 prefetch(next); 171 prefetch(next);
172 debug_rcu_head_unqueue(list);
172 list->func(list); 173 list->func(list);
173 list = next; 174 list = next;
174 } 175 }
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
211{ 212{
212 unsigned long flags; 213 unsigned long flags;
213 214
215 debug_rcu_head_queue(head);
214 head->func = func; 216 head->func = func;
215 head->next = NULL; 217 head->next = NULL;
216 218
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
239rcu_random(struct rcu_random_state *rrsp) 239rcu_random(struct rcu_random_state *rrsp)
240{ 240{
241 if (--rrsp->rrs_count < 0) { 241 if (--rrsp->rrs_count < 0) {
242 rrsp->rrs_state += 242 rrsp->rrs_state += (unsigned long)local_clock();
243 (unsigned long)cpu_clock(raw_smp_processor_id());
244 rrsp->rrs_count = RCU_RANDOM_REFRESH; 243 rrsp->rrs_count = RCU_RANDOM_REFRESH;
245 } 244 }
246 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 245 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1112 while (list) { 1112 while (list) {
1113 next = list->next; 1113 next = list->next;
1114 prefetch(next); 1114 prefetch(next);
1115 debug_rcu_head_unqueue(list);
1115 list->func(list); 1116 list->func(list);
1116 list = next; 1117 list = next;
1117 if (++count >= rdp->blimit) 1118 if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1388 unsigned long flags; 1389 unsigned long flags;
1389 struct rcu_data *rdp; 1390 struct rcu_data *rdp;
1390 1391
1392 debug_rcu_head_queue(head);
1391 head->func = func; 1393 head->func = func;
1392 head->next = NULL; 1394 head->next = NULL;
1393 1395
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..41541d79e3c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
456 unsigned long nr_running; 457 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 462 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 463 unsigned char nohz_balance_kick;
462#endif 464#endif
463 unsigned int skip_clock_update; 465 unsigned int skip_clock_update;
464 466
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
1193 1195
1194#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1195/* 1197/*
1198 * In the semi idle case, use the nearest busy cpu for migrating timers
1199 * from an idle cpu. This is good for power-savings.
1200 *
1201 * We don't do a similar optimization for a completely idle system, as
1202 * selecting an idle cpu will add more delays to the timers than intended
1203 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
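A hedged sketch of the kind of caller this helper targets: picking a busier cpu for a timer instead of parking it on an idle cpu's wheel. Only get_nohz_timer_target(), idle_cpu() and smp_processor_id() are real interfaces here; the wrapper is invented for illustration.

static int pick_timer_cpu(void)		/* hypothetical helper, not from this patch */
{
	int cpu = smp_processor_id();

	/* If this cpu is idle, hand the timer to the nearest busy cpu instead. */
	if (idle_cpu(cpu))
		cpu = get_nohz_timer_target();

	return cpu;
}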
1218/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1233} 1256}
1234 1257
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1246 1259
1247static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1665 if (root_task_group_empty())
1653 return; 1666 return;
1654 1667
1655 now = cpu_clock(raw_smp_processor_id()); 1668 now = local_clock();
1656 elapsed = now - sd->last_update; 1669 elapsed = now - sd->last_update;
1657 1670
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1671 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1818static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1819static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1820static int get_update_sysctl_factor(void);
1821static void update_cpu_load(struct rq *this_rq);
1808 1822
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1824{
@@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2281}
2268#endif 2282#endif
2269 2283
2270/*** 2284static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2285 bool is_sync, bool is_migrate, bool is_local,
2286 unsigned long en_flags)
2287{
2288 schedstat_inc(p, se.statistics.nr_wakeups);
2289 if (is_sync)
2290 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2291 if (is_migrate)
2292 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2293 if (is_local)
2294 schedstat_inc(p, se.statistics.nr_wakeups_local);
2295 else
2296 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2297
2298 activate_task(rq, p, en_flags);
2299}
2300
2301static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2302 int wake_flags, bool success)
2303{
2304 trace_sched_wakeup(p, success);
2305 check_preempt_curr(rq, p, wake_flags);
2306
2307 p->state = TASK_RUNNING;
2308#ifdef CONFIG_SMP
2309 if (p->sched_class->task_woken)
2310 p->sched_class->task_woken(rq, p);
2311
2312 if (unlikely(rq->idle_stamp)) {
2313 u64 delta = rq->clock - rq->idle_stamp;
2314 u64 max = 2*sysctl_sched_migration_cost;
2315
2316 if (delta > max)
2317 rq->avg_idle = max;
2318 else
2319 update_avg(&rq->avg_idle, delta);
2320 rq->idle_stamp = 0;
2321 }
2322#endif
2323 /* if a worker is waking up, notify workqueue */
2324 if ((p->flags & PF_WQ_WORKER) && success)
2325 wq_worker_waking_up(p, cpu_of(rq));
2326}
2327
2328/**
2271 * try_to_wake_up - wake up a thread 2329 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2330 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2331 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2332 * @wake_flags: wake modifier flags (WF_*)
2275 * 2333 *
2276 * Put it on the run-queue if it's not already there. The "current" 2334 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2335 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2337 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2338 * runnable without the overhead of this.
2281 * 2339 *
2282 * returns failure only if the task is already active. 2340 * Returns %true if @p was woken up, %false if it was already running
2341 * or @state didn't match @p's state.
2283 */ 2342 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2343static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2344 int wake_flags)
@@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2418
2360out_activate: 2419out_activate:
2361#endif /* CONFIG_SMP */ 2420#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2421 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2422 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2423 success = 1;
2373
2374out_running: 2424out_running:
2375 trace_sched_wakeup(p, success); 2425 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2426out:
2395 task_rq_unlock(rq, &flags); 2427 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2428 put_cpu();
@@ -2399,6 +2431,37 @@ out:
2399} 2431}
2400 2432
2401/** 2433/**
2434 * try_to_wake_up_local - try to wake up a local task with rq lock held
2435 * @p: the thread to be awakened
2436 *
2437 * Put @p on the run-queue if it's not already there. The caller must
2438 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2439 * the current task. this_rq() stays locked over invocation.
2440 */
2441static void try_to_wake_up_local(struct task_struct *p)
2442{
2443 struct rq *rq = task_rq(p);
2444 bool success = false;
2445
2446 BUG_ON(rq != this_rq());
2447 BUG_ON(p == current);
2448 lockdep_assert_held(&rq->lock);
2449
2450 if (!(p->state & TASK_NORMAL))
2451 return;
2452
2453 if (!p->se.on_rq) {
2454 if (likely(!task_running(rq, p))) {
2455 schedstat_inc(rq, ttwu_count);
2456 schedstat_inc(rq, ttwu_local);
2457 }
2458 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2459 success = true;
2460 }
2461 ttwu_post_activation(p, rq, 0, success);
2462}
2463
2464/**
2402 * wake_up_process - Wake up a specific process 2465 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2466 * @p: The process to be woken up.
2404 * 2467 *
@@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3075}
3013 3076
3014/* 3077/*
3078 * The exact cpuload at various idx values, calculated at every tick would be
3079 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3080 *
3081 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3082 * on nth tick when cpu may be busy, then we have:
3083 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3084 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3085 *
3086 * decay_load_missed() below does efficient calculation of
3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3088 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3089 *
3090 * The calculation is approximated on a 128 point scale.
3091 * degrade_zero_ticks is the number of ticks after which load at any
3092 * particular idx is approximated to be zero.
3093 * degrade_factor is a precomputed table, a row for each load idx.
3094 * Each column corresponds to degradation factor for a power of two ticks,
3095 * based on 128 point scale.
3096 * Example:
3097 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3098 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3099 *
3100 * With this power of 2 load factors, we can degrade the load n times
3101 * by looking at 1 bits in n and doing as many mult/shift instead of
3102 * n mult/shifts needed by the exact degradation.
3103 */
3104#define DEGRADE_SHIFT 7
3105static const unsigned char
3106 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3107static const unsigned char
3108 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3109 {0, 0, 0, 0, 0, 0, 0, 0},
3110 {64, 32, 8, 0, 0, 0, 0, 0},
3111 {96, 72, 40, 12, 1, 0, 0},
3112 {112, 98, 75, 43, 15, 1, 0},
3113 {120, 112, 98, 76, 45, 16, 2} };
3114
3115/*
3116 * Update cpu_load for any missed ticks due to tickless idle. The backlog
3117 * builds up while the CPU is idle, so we just decay the old load without
3118 * adding any new load.
3119 */
3120static unsigned long
3121decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3122{
3123 int j = 0;
3124
3125 if (!missed_updates)
3126 return load;
3127
3128 if (missed_updates >= degrade_zero_ticks[idx])
3129 return 0;
3130
3131 if (idx == 1)
3132 return load >> missed_updates;
3133
3134 while (missed_updates) {
3135 if (missed_updates % 2)
3136 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3137
3138 missed_updates >>= 1;
3139 j++;
3140 }
3141 return load;
3142}
3143
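To make the example in the comment concrete, here is a small stand-alone C check (illustration only, not part of the patch) that walks the bits of the missed-tick count the same way decay_load_missed() does for idx 2, using row 2 of degrade_factor above.

#include <stdio.h>

#define DEGRADE_SHIFT 7
static const unsigned char factor_idx2[DEGRADE_SHIFT + 1] = {96, 72, 40, 12, 1, 0, 0, 0};

static unsigned long decay_idx2(unsigned long load, unsigned long missed)
{
	int j = 0;

	while (missed) {
		if (missed % 2)
			load = (load * factor_idx2[j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	/* 8 = 0b1000, so only bit 3 is set: a single multiply by 12/128 */
	printf("%lu\n", decay_idx2(1024, 8));	/* prints 96 = 1024 * 12 / 128 */
	return 0;				/* exact (3/4)^8 * 1024 would be ~102 */
}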
3144/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3145 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3146 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3147 * every tick. We fix it up based on jiffies.
3017 */ 3148 */
3018static void update_cpu_load(struct rq *this_rq) 3149static void update_cpu_load(struct rq *this_rq)
3019{ 3150{
3020 unsigned long this_load = this_rq->load.weight; 3151 unsigned long this_load = this_rq->load.weight;
3152 unsigned long curr_jiffies = jiffies;
3153 unsigned long pending_updates;
3021 int i, scale; 3154 int i, scale;
3022 3155
3023 this_rq->nr_load_updates++; 3156 this_rq->nr_load_updates++;
3024 3157
3158 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3159 if (curr_jiffies == this_rq->last_load_update_tick)
3160 return;
3161
3162 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3163 this_rq->last_load_update_tick = curr_jiffies;
3164
3025 /* Update our load: */ 3165 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3166 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3167 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3168 unsigned long old_load, new_load;
3028 3169
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3170 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3171
3031 old_load = this_rq->cpu_load[i]; 3172 old_load = this_rq->cpu_load[i];
3173 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3174 new_load = this_load;
3033 /* 3175 /*
3034 * Round up the averaging division if load is increasing. This 3176 * Round up the averaging division if load is increasing. This
@@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3178 * example.
3037 */ 3179 */
3038 if (new_load > old_load) 3180 if (new_load > old_load)
3039 new_load += scale-1; 3181 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3182
3183 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3184 }
3185}
3186
3187static void update_cpu_load_active(struct rq *this_rq)
3188{
3189 update_cpu_load(this_rq);
3042 3190
3043 calc_load_account_active(this_rq); 3191 calc_load_account_active(this_rq);
3044} 3192}
@@ -3426,7 +3574,7 @@ void scheduler_tick(void)
3426 3574
3427 raw_spin_lock(&rq->lock); 3575 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3576 update_rq_clock(rq);
3429 update_cpu_load(rq); 3577 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3578 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3579 raw_spin_unlock(&rq->lock);
3432 3580
@@ -3598,7 +3746,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3746 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3747 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3748 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3749
3603 release_kernel_lock(prev); 3750 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3751need_resched_nonpreemptible:
@@ -3611,11 +3758,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3758 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3759 clear_tsk_need_resched(prev);
3613 3760
3761 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3762 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3763 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3764 prev->state = TASK_RUNNING;
3617 else 3765 } else {
3766 /*
3767 * If a worker is going to sleep, notify and
3768 * ask workqueue whether it wants to wake up a
3769 * task to maintain concurrency. If so, wake
3770 * up the task.
3771 */
3772 if (prev->flags & PF_WQ_WORKER) {
3773 struct task_struct *to_wakeup;
3774
3775 to_wakeup = wq_worker_sleeping(prev, cpu);
3776 if (to_wakeup)
3777 try_to_wake_up_local(to_wakeup);
3778 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3779 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3780 }
3619 switch_count = &prev->nvcsw; 3781 switch_count = &prev->nvcsw;
3620 } 3782 }
3621 3783
@@ -3637,8 +3799,10 @@ need_resched_nonpreemptible:
3637 3799
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3800 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3801 /*
3640 * the context switch might have flipped the stack from under 3802 * The context switch has flipped the stack from under us
3641 * us, hence refresh the local variables. 3803 * and restored the local variables which were saved when
3804 * this task called schedule() in the past. prev == current
3805 * is still correct, but it can be moved to another cpu/rq.
3642 */ 3806 */
3643 cpu = smp_processor_id(); 3807 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3808 rq = cpu_rq(cpu);
@@ -3647,11 +3811,8 @@ need_resched_nonpreemptible:
3647 3811
3648 post_schedule(rq); 3812 post_schedule(rq);
3649 3813
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3814 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3815 goto need_resched_nonpreemptible;
3654 }
3655 3816
3656 preempt_enable_no_resched(); 3817 preempt_enable_no_resched();
3657 if (need_resched()) 3818 if (need_resched())
@@ -3726,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 3887 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 3888 * occur there and call schedule directly.
3728 */ 3889 */
3729asmlinkage void __sched preempt_schedule(void) 3890asmlinkage void __sched notrace preempt_schedule(void)
3730{ 3891{
3731 struct thread_info *ti = current_thread_info(); 3892 struct thread_info *ti = current_thread_info();
3732 3893
@@ -3738,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 3899 return;
3739 3900
3740 do { 3901 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 3902 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 3903 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 3904 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 3905
3745 /* 3906 /*
3746 * Check again in case we missed a preemption opportunity 3907 * Check again in case we missed a preemption opportunity
@@ -4441,12 +4602,8 @@ recheck:
4441 */ 4602 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4603 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4604 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4605 unsigned long rlim_rtprio =
4445 4606 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4607
4451 /* can't set/change the rt policy */ 4608 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4609 if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 5973 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 5974static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 5975 .notifier_call = migration_call,
5819 .priority = 10 5976 .priority = CPU_PRI_MIGRATION,
5820}; 5977};
5821 5978
5979static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5980 unsigned long action, void *hcpu)
5981{
5982 switch (action & ~CPU_TASKS_FROZEN) {
5983 case CPU_ONLINE:
5984 case CPU_DOWN_FAILED:
5985 set_cpu_active((long)hcpu, true);
5986 return NOTIFY_OK;
5987 default:
5988 return NOTIFY_DONE;
5989 }
5990}
5991
5992static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5993 unsigned long action, void *hcpu)
5994{
5995 switch (action & ~CPU_TASKS_FROZEN) {
5996 case CPU_DOWN_PREPARE:
5997 set_cpu_active((long)hcpu, false);
5998 return NOTIFY_OK;
5999 default:
6000 return NOTIFY_DONE;
6001 }
6002}
6003
5822static int __init migration_init(void) 6004static int __init migration_init(void)
5823{ 6005{
5824 void *cpu = (void *)(long)smp_processor_id(); 6006 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6007 int err;
5826 6008
5827 /* Start one for the boot CPU: */ 6009 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6010 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6011 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6012 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6013 register_cpu_notifier(&migration_notifier);
5832 6014
6015 /* Register cpu active notifiers */
6016 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6017 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6018
5833 return 0; 6019 return 0;
5834} 6020}
5835early_initcall(migration_init); 6021early_initcall(migration_init);
@@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6250 free_rootdomain(old_rd);
6065} 6251}
6066 6252
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6253static int init_rootdomain(struct root_domain *rd)
6068{ 6254{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6255 memset(rd, 0, sizeof(*rd));
6072 6256
6073 if (bootmem) 6257 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6258 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6259 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6260 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6261 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6262 goto free_online;
6082 6263
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6264 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6265 goto free_rto_mask;
6085 return 0; 6266 return 0;
6086 6267
@@ -6096,7 +6277,7 @@ out:
6096 6277
6097static void init_defrootdomain(void) 6278static void init_defrootdomain(void)
6098{ 6279{
6099 init_rootdomain(&def_root_domain, true); 6280 init_rootdomain(&def_root_domain);
6100 6281
6101 atomic_set(&def_root_domain.refcount, 1); 6282 atomic_set(&def_root_domain.refcount, 1);
6102} 6283}
@@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6290 if (!rd)
6110 return NULL; 6291 return NULL;
6111 6292
6112 if (init_rootdomain(rd, false) != 0) { 6293 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6294 kfree(rd);
6114 return NULL; 6295 return NULL;
6115 } 6296 }
@@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7469}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7470#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7471
7291#ifndef CONFIG_CPUSETS
7292/* 7472/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7473 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7474 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7475 * around partition_sched_domains().
7295 */ 7476 */
7296static int update_sched_domains(struct notifier_block *nfb, 7477static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7478 void *hcpu)
7298{ 7479{
7299 switch (action) { 7480 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7481 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7482 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7483 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7484 return NOTIFY_OK;
7485 default:
7486 return NOTIFY_DONE;
7487 }
7488}
7308 7489
7490static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7491 void *hcpu)
7492{
7493 switch (action & ~CPU_TASKS_FROZEN) {
7494 case CPU_DOWN_PREPARE:
7495 cpuset_update_active_cpus();
7496 return NOTIFY_OK;
7309 default: 7497 default:
7310 return NOTIFY_DONE; 7498 return NOTIFY_DONE;
7311 } 7499 }
7312} 7500}
7313#endif
7314 7501
7315static int update_runtime(struct notifier_block *nfb, 7502static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7503 unsigned long action, void *hcpu)
@@ -7356,10 +7543,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7543 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7544 put_online_cpus();
7358 7545
7359#ifndef CONFIG_CPUSETS 7546 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7547 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7548
7364 /* RT runtime code needs to handle some hotplug events */ 7549 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7550 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7789,9 @@ void __init sched_init(void)
7604 7789
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7790 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 7791 rq->cpu_load[j] = 0;
7792
7793 rq->last_load_update_tick = jiffies;
7794
7607#ifdef CONFIG_SMP 7795#ifdef CONFIG_SMP
7608 rq->sd = NULL; 7796 rq->sd = NULL;
7609 rq->rd = NULL; 7797 rq->rd = NULL;
@@ -7617,6 +7805,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 7805 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 7806 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 7807 rq_attach_root(rq, &def_root_domain);
7808#ifdef CONFIG_NO_HZ
7809 rq->nohz_balance_kick = 0;
7810 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7811#endif
7620#endif 7812#endif
7621 init_rq_hrtick(rq); 7813 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 7814 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7853,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7853 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 7854#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 7855#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7856 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7857 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7858 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7859 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7860 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 7861#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 7862 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 7863 if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotonic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and to keep it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
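A brief usage sketch (illustrative, not from this patch): since local_clock() is only promised to be monotonic against timestamps taken on the same cpu, a caller timing a code section keeps both reads on one cpu, for example under preempt_disable(). The helper name is invented; local_clock() is assumed to be declared in linux/sched.h.

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/sched.h>

static void time_section(void (*fn)(void))	/* hypothetical helper */
{
	u64 t0, t1;

	preempt_disable();		/* keep both timestamps on one cpu */
	t0 = local_clock();
	fn();
	t1 = local_clock();
	preempt_enable();

	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}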
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2287 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2288 struct sched_group *sdg = sd->groups;
2289 2289
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2290 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2291 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2292 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2296 power >>= SCHED_LOAD_SHIFT;
2304 } 2297 }
2305 2298
2299 sdg->cpu_power_orig = power;
2300
2301 if (sched_feat(ARCH_POWER))
2302 power *= arch_scale_freq_power(sd, cpu);
2303 else
2304 power *= default_scale_freq_power(sd, cpu);
2305
2306 power >>= SCHED_LOAD_SHIFT;
2307
2306 power *= scale_rt_power(cpu); 2308 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2309 power >>= SCHED_LOAD_SHIFT;
2308 2310
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2337 sdg->cpu_power = power;
2336} 2338}
2337 2339
2340/*
2341 * Try and fix up capacity for tiny siblings; this is needed when
2342 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2343 * which on its own isn't powerful enough.
2344 *
2345 * See update_sd_pick_busiest() and check_asym_packing().
2346 */
2347static inline int
2348fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2349{
2350 /*
2351 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2352 */
2353 if (sd->level != SD_LV_SIBLING)
2354 return 0;
2355
2356 /*
2357 * If ~90% of the cpu_power is still there, we're good.
2358 */
2359 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2360 return 1;
2361
2362 return 0;
2363}
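As a concrete reading of the 32/29 test above: with cpu_power_orig at the default SCHED_LOAD_SCALE of 1024, the group keeps its capacity as long as cpu_power * 32 > 1024 * 29, i.e. cpu_power > 928, which is the roughly 90% (29/32 is about 90.6%) mentioned in the comment.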
2364
2338/** 2365/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2366 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2367 * @sd: The sched_domain whose statistics are to be updated.
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2427 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2428 * to do the newly idle load balance.
2402 */ 2429 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2430 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2431 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2432 *balance = 0;
2406 return; 2433 return;
2434 }
2435 update_group_power(sd, this_cpu);
2407 } 2436 }
2408 2437
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2438 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2439 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2440
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2428 2455
2429 sgs->group_capacity = 2456 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2457 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2458 if (!sgs->group_capacity)
2459 sgs->group_capacity = fix_small_capacity(sd, group);
2460}
2461
2462/**
2463 * update_sd_pick_busiest - return 1 on busiest group
2464 * @sd: sched_domain whose statistics are to be checked
2465 * @sds: sched_domain statistics
2466 * @sg: sched_group candidate to be checked for being the busiest
2467 * @sgs: sched_group statistics
2468 * @this_cpu: the current cpu
2469 *
2470 * Determine if @sg is a busier group than the previously selected
2471 * busiest group.
2472 */
2473static bool update_sd_pick_busiest(struct sched_domain *sd,
2474 struct sd_lb_stats *sds,
2475 struct sched_group *sg,
2476 struct sg_lb_stats *sgs,
2477 int this_cpu)
2478{
2479 if (sgs->avg_load <= sds->max_load)
2480 return false;
2481
2482 if (sgs->sum_nr_running > sgs->group_capacity)
2483 return true;
2484
2485 if (sgs->group_imb)
2486 return true;
2487
2488 /*
2489 * ASYM_PACKING needs to move all the work to the lowest
2490 * numbered CPUs in the group, therefore mark all groups
2491 * higher than ourself as busy.
2492 */
2493 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2494 this_cpu < group_first_cpu(sg)) {
2495 if (!sds->busiest)
2496 return true;
2497
2498 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2499 return true;
2500 }
2501
2502 return false;
2431} 2503}
2432 2504
2433/** 2505/**
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2507 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2508 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2509 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2510 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2511 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2512 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2513 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2518 struct sd_lb_stats *sds)
2447{ 2519{
2448 struct sched_domain *child = sd->child; 2520 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2521 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2522 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2523 int load_idx, prefer_sibling = 0;
2452 2524
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2531 do {
2460 int local_group; 2532 int local_group;
2461 2533
2462 local_group = cpumask_test_cpu(this_cpu, 2534 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2535 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2536 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2537 local_group, cpus, balance, &sgs);
2467 2538
2468 if (local_group && !(*balance)) 2539 if (local_group && !(*balance))
2469 return; 2540 return;
2470 2541
2471 sds->total_load += sgs.group_load; 2542 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2543 sds->total_pwr += sg->cpu_power;
2473 2544
2474 /* 2545 /*
2475 * In case the child domain prefers tasks go to siblings 2546 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2547 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2548 * and move all the excess tasks away.
2478 */ 2549 */
2479 if (prefer_sibling) 2550 if (prefer_sibling)
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2481 2552
2482 if (local_group) { 2553 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2554 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2555 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2556 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2557 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2558 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2559 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2560 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2561 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2562 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2563 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb; 2564 sds->group_imb = sgs.group_imb;
2496 } 2565 }
2497 2566
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2567 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2568 sg = sg->next;
2500 } while (group != sd->groups); 2569 } while (sg != sd->groups);
2570}
2571
2572int __weak arch_sd_sibling_asym_packing(void)
2573{
2574 return 0*SD_ASYM_PACKING;
2575}
2576
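A sketch of how an architecture could override the __weak default above to request packing (hypothetical; a real port would normally gate this on a CPU feature check before returning the flag).

int arch_sd_sibling_asym_packing(void)
{
	/* Ask the scheduler to pack work onto the lowest-numbered siblings. */
	return SD_ASYM_PACKING;
}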
2577/**
2578 * check_asym_packing - Check to see if the group is packed into the
2579 * sched domain.
2580 *
2581 * This is primarily intended to be used at the sibling level. Some
2582 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2583 * case of POWER7, it can move to lower SMT modes only when higher
2584 * threads are idle. When in lower SMT modes, the threads will
2585 * perform better since they share fewer core resources. Hence when we
2586 * have idle threads, we want them to be the higher ones.
2587 *
2588 * This packing function is run on idle threads. It checks to see if
2589 * the busiest CPU in this domain (core in the P7 case) has a higher
2590 * CPU number than the packing function is being run on. Here we are
2591 * assuming a lower CPU number will be equivalent to a lower SMT thread
2592 * number.
2593 *
2594 * Returns 1 when packing is required and a task should be moved to
2595 * this CPU. The amount of the imbalance is returned in *imbalance.
2596 *
2597 * @sd: The sched_domain whose packing is to be checked.
2598 * @sds: Statistics of the sched_domain which is to be packed
2599 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2600 * @imbalance: returns amount of imbalance due to packing.
2601 */
2602static int check_asym_packing(struct sched_domain *sd,
2603 struct sd_lb_stats *sds,
2604 int this_cpu, unsigned long *imbalance)
2605{
2606 int busiest_cpu;
2607
2608 if (!(sd->flags & SD_ASYM_PACKING))
2609 return 0;
2610
2611 if (!sds->busiest)
2612 return 0;
2613
2614 busiest_cpu = group_first_cpu(sds->busiest);
2615 if (this_cpu > busiest_cpu)
2616 return 0;
2617
2618 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2619 SCHED_LOAD_SCALE);
2620 return 1;
2501} 2621}
2502 2622
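For reference, a minimal user-space sketch of the packing decision documented above. This is not the kernel code: asym_packing_imbalance() and its load/power inputs are simplified stand-ins for check_asym_packing(), group_first_cpu(), sds->max_load and sds->busiest->cpu_power, and SCHED_LOAD_SCALE is assumed to be the usual 1024.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/* Round-to-nearest division, mirroring DIV_ROUND_CLOSEST(). */
static unsigned long div_round_closest(unsigned long x, unsigned long divisor)
{
	return (x + divisor / 2) / divisor;
}

/*
 * SD_ASYM_PACKING in a nutshell: only pull when this CPU is numbered lower
 * than the busiest group's first CPU, and express the imbalance as the
 * busiest group's load normalised by its cpu_power.
 */
static int asym_packing_imbalance(int this_cpu, int busiest_cpu,
				  unsigned long max_load,
				  unsigned long busiest_cpu_power,
				  unsigned long *imbalance)
{
	if (this_cpu > busiest_cpu)
		return 0;	/* we are the higher-numbered thread: don't pack here */

	*imbalance = div_round_closest(max_load * busiest_cpu_power,
				       SCHED_LOAD_SCALE);
	return 1;
}

int main(void)
{
	unsigned long imb = 0;

	/* CPU 0 checking against a busy sibling, CPU 2, carrying load 1536. */
	if (asym_packing_imbalance(0, 2, 1536, 1024, &imb))
		printf("pack: pull roughly %lu load units here\n", imb);
	return 0;
}

The early return for this_cpu > busiest_cpu is what keeps work drifting toward the lower-numbered SMT threads.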
2503/** 2623/**
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2692 if (!(*balance)) 2812 if (!(*balance))
2693 goto ret; 2813 goto ret;
2694 2814
2815 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2816 check_asym_packing(sd, &sds, this_cpu, imbalance))
2817 return sds.busiest;
2818
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2819 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2820 goto out_balanced;
2697 2821
@@ -2726,8 +2850,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2850 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2851 */
2728static struct rq * 2852static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2853find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2854 enum cpu_idle_type idle, unsigned long imbalance,
2855 const struct cpumask *cpus)
2731{ 2856{
2732 struct rq *busiest = NULL, *rq; 2857 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2858 unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2863 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2864 unsigned long wl;
2740 2865
2866 if (!capacity)
2867 capacity = fix_small_capacity(sd, group);
2868
2741 if (!cpumask_test_cpu(i, cpus)) 2869 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2870 continue;
2743 2871
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2905/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2906static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2907
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2908static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2909 int busiest_cpu, int this_cpu)
2781{ 2910{
2782 if (idle == CPU_NEWLY_IDLE) { 2911 if (idle == CPU_NEWLY_IDLE) {
2912
2913 /*
2914 * ASYM_PACKING needs to force migrate tasks from busy but
2915 * higher numbered CPUs in order to pack all tasks in the
2916 * lowest numbered CPUs.
2917 */
2918 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2919 return 1;
2920
2783 /* 2921 /*
2784 * The only task running in a non-idle cpu can be moved to this 2922 * The only task running in a non-idle cpu can be moved to this
2785 * cpu in an attempt to completely free up the other CPU 2923 * cpu in an attempt to completely free up the other CPU
@@ -2854,7 +2992,7 @@ redo:
2854 goto out_balanced; 2992 goto out_balanced;
2855 } 2993 }
2856 2994
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 2995 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 2996 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 2997 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 2998 goto out_balanced;
@@ -2898,7 +3036,8 @@ redo:
2898 schedstat_inc(sd, lb_failed[idle]); 3036 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3037 sd->nr_balance_failed++;
2900 3038
2901 if (need_active_balance(sd, sd_idle, idle)) { 3039 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3040 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3041 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3042
2904 /* don't kick the active_load_balance_cpu_stop, 3043 /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ out_unlock:
3093} 3232}
3094 3233
3095#ifdef CONFIG_NO_HZ 3234#ifdef CONFIG_NO_HZ
3235
3236static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3237
3238static void trigger_sched_softirq(void *data)
3239{
3240 raise_softirq_irqoff(SCHED_SOFTIRQ);
3241}
3242
3243static inline void init_sched_softirq_csd(struct call_single_data *csd)
3244{
3245 csd->func = trigger_sched_softirq;
3246 csd->info = NULL;
3247 csd->flags = 0;
3248 csd->priv = 0;
3249}
3250
3251/*
3252 * idle load balancing details
3253 * - One of the idle CPUs nominates itself as idle load_balancer, while
3254 * entering idle.
3255 * - This idle load balancer CPU will also go into tickless mode when
3256 * it is idle, just like all other idle CPUs.
3257 * - When one of the busy CPUs notices that idle rebalancing may be
3258 * needed, it will kick the idle load balancer, which then does idle
3259 * load balancing for all the idle CPUs.
3260 */
3096static struct { 3261static struct {
3097 atomic_t load_balancer; 3262 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3263 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3264 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3265 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3266 cpumask_var_t grp_idle_mask;
3102}; 3267 unsigned long next_balance; /* in jiffy units */
3268} nohz ____cacheline_aligned;
3103 3269
3104int get_nohz_load_balancer(void) 3270int get_nohz_load_balancer(void)
3105{ 3271{
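To make the scheme in the comment above easier to follow, here is a single-threaded toy model of the three roles it describes (idle CPU, busy kicker, nominated balancer). It is deliberately not kernel code: there is no atomicity or locking, NO_CPU stands in for nr_cpu_ids, and the kick is a plain flag rather than an IPI raising SCHED_SOFTIRQ.

#include <stdio.h>

#define NR_CPUS	4
#define NO_CPU	NR_CPUS			/* sentinel, like nr_cpu_ids */

static unsigned int idle_mask;		/* bit i set => CPU i is tickless idle */
static int balancer = NO_CPU;		/* nominated idle load balancer */
static int kick_pending[NR_CPUS];	/* per-CPU analogue of nohz_balance_kick */

/* An idle CPU registers itself and takes the balancer role if it is free. */
static void cpu_enter_idle(int cpu)
{
	idle_mask |= 1u << cpu;
	if (balancer == NO_CPU)
		balancer = cpu;
}

/* A busy CPU that sees rebalancing work kicks the balancer, or any idle CPU. */
static void busy_cpu_kick(void)
{
	int target = balancer;

	if (target == NO_CPU)
		for (target = 0; target < NR_CPUS; target++)
			if (idle_mask & (1u << target))
				break;

	if (target < NR_CPUS)
		kick_pending[target] = 1;	/* kernel: IPI raising SCHED_SOFTIRQ */
}

/* The kicked CPU balances on behalf of every other idle CPU. */
static void nohz_balance(int cpu)
{
	int other;

	if (!kick_pending[cpu])
		return;
	for (other = 0; other < NR_CPUS; other++)
		if (other != cpu && (idle_mask & (1u << other)))
			printf("cpu%d balances on behalf of idle cpu%d\n",
			       cpu, other);
	kick_pending[cpu] = 0;
}

int main(void)
{
	cpu_enter_idle(1);
	cpu_enter_idle(3);
	busy_cpu_kick();	/* some busy CPU notices an imbalance */
	nohz_balance(1);	/* the nominated balancer does the work */
	return 0;
}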
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3319 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3320static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3321{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3322 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3323 sched_group_cpus(ilb_group));
3158 3324
3159 /* 3325 /*
3160 * A sched_group is semi-idle when it has at least one busy cpu 3326 * A sched_group is semi-idle when it has at least one busy cpu
3161 * and at least one idle cpu. 3327 * and at least one idle cpu.
3162 */ 3328 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3329 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3330 return 0;
3165 3331
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3332 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3333 return 0;
3168 3334
3169 return 1; 3335 return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3362 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3363 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3364 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3365 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3366 goto out_done;
3201 3367
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3368 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
3204 3370
3205 do { 3371 do {
3206 if (is_semi_idle_group(ilb_group)) 3372 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3373 return cpumask_first(nohz.grp_idle_mask);
3208 3374
3209 ilb_group = ilb_group->next; 3375 ilb_group = ilb_group->next;
3210 3376
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
3212 } 3378 }
3213 3379
3214out_done: 3380out_done:
3215 return cpumask_first(nohz.cpu_mask); 3381 return nr_cpu_ids;
3216} 3382}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3383#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3384static inline int find_new_ilb(int call_cpu)
3219{ 3385{
3220 return cpumask_first(nohz.cpu_mask); 3386 return nr_cpu_ids;
3221} 3387}
3222#endif 3388#endif
3223 3389
3224/* 3390/*
3391 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
3392 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
3393 * CPU (if there is one).
3394 */
3395static void nohz_balancer_kick(int cpu)
3396{
3397 int ilb_cpu;
3398
3399 nohz.next_balance++;
3400
3401 ilb_cpu = get_nohz_load_balancer();
3402
3403 if (ilb_cpu >= nr_cpu_ids) {
3404 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3405 if (ilb_cpu >= nr_cpu_ids)
3406 return;
3407 }
3408
3409 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3410 struct call_single_data *cp;
3411
3412 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3413 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3414 __smp_call_function_single(ilb_cpu, cp, 0);
3415 }
3416 return;
3417}
3418
3419/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3420 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3421 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3422 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used
3233 * for idle load balancing. ilb owner will still be part of
3234 * nohz.cpu_mask..
3235 * 3423 *
3236 * While stopping the tick, this cpu will become the ilb owner if there 3424 * When the ilb owner becomes busy, we will not have a new ilb owner until some
3237 * is no other owner. And will be the owner till that cpu becomes busy 3425 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3238 * or if all cpus in the system stop their ticks at which point 3426 * idle load balancing by kicking one of the idle CPUs.
3239 * there is no need for ilb owner.
3240 * 3427 *
3241 * When the ilb owner becomes busy, it nominates another owner, during the 3428 * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this
3242 * next busy scheduler_tick() 3429 * ilb owner CPU in the future (when there is a need for idle load balancing on
3430 * behalf of all idle CPUs).
3243 */ 3431 */
3244int select_nohz_load_balancer(int stop_tick) 3432void select_nohz_load_balancer(int stop_tick)
3245{ 3433{
3246 int cpu = smp_processor_id(); 3434 int cpu = smp_processor_id();
3247 3435
3248 if (stop_tick) { 3436 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3437 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3438 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3439 return;
3254 3440
3255 /* 3441 /*
3256 * If we are going offline and still the leader, 3442 * If we are going offline and still the leader,
3257 * give up! 3443 * give up!
3258 */ 3444 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3445 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3446 nr_cpu_ids) != cpu)
3260 BUG(); 3447 BUG();
3261 3448
3262 return 0; 3449 return;
3263 } 3450 }
3264 3451
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3452 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3453
3267 /* time for ilb owner also to sleep */ 3454 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3455 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3456 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3457 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3458
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3459 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3460 int new_ilb;
3280 3461
3281 if (!(sched_smt_power_savings || 3462 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3463 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3464 cpu) != nr_cpu_ids)
3465 return;
3466
3284 /* 3467 /*
3285 * Check to see if there is a more power-efficient 3468 * Check to see if there is a more power-efficient
3286 * ilb. 3469 * ilb.
3287 */ 3470 */
3288 new_ilb = find_new_ilb(cpu); 3471 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3472 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3473 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3474 resched_cpu(new_ilb);
3292 return 0; 3475 return;
3293 } 3476 }
3294 return 1; 3477 return;
3295 } 3478 }
3296 } else { 3479 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3480 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3481 return;
3299 3482
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3483 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3484
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3485 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3486 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3487 nr_cpu_ids) != cpu)
3304 BUG(); 3488 BUG();
3305 } 3489 }
3306 return 0; 3490 return;
3307} 3491}
3308#endif 3492#endif
3309 3493
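The atomic_cmpxchg() calls above hinge on nr_cpu_ids acting as the "no owner" value now that -1 is gone. A small stand-alone sketch of that claim/release pattern, using C11 atomics in place of the kernel's atomic_t; NR_CPU_IDS and the CPU numbers below are made up for illustration.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8	/* stand-in for nr_cpu_ids: the "no CPU" sentinel */

static atomic_int load_balancer = NR_CPU_IDS;

/* A CPU entering tickless idle tries to claim the idle-load-balancer role. */
static int claim_balancer(int cpu)
{
	int none = NR_CPU_IDS;

	/* Succeeds only if no CPU currently holds the role. */
	return atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* A CPU going busy (or offline) gives the role up, but only if it holds it. */
static void release_balancer(int cpu)
{
	int me = cpu;

	atomic_compare_exchange_strong(&load_balancer, &me, NR_CPU_IDS);
}

int main(void)
{
	printf("cpu1 claims: %d\n", claim_balancer(1));	/* 1: role was free */
	printf("cpu3 claims: %d\n", claim_balancer(3));	/* 0: already taken */
	release_balancer(1);
	printf("cpu3 claims: %d\n", claim_balancer(3));	/* 1: free again */
	return 0;
}

Using nr_cpu_ids rather than -1 also means "no owner" and "no idle CPU found" are the same value, which is what lets nohz_balancer_kick() test ilb_cpu >= nr_cpu_ids uniformly.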
@@ -3385,11 +3569,102 @@ out:
3385 rq->next_balance = next_balance; 3569 rq->next_balance = next_balance;
3386} 3570}
3387 3571
3572#ifdef CONFIG_NO_HZ
3388/* 3573/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3574 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3575 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3576 */
3577static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3578{
3579 struct rq *this_rq = cpu_rq(this_cpu);
3580 struct rq *rq;
3581 int balance_cpu;
3582
3583 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3584 return;
3585
3586 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3587 if (balance_cpu == this_cpu)
3588 continue;
3589
3590 /*
3591 * If this cpu gets work to do, stop the load balancing
3592 * work being done for other cpus. Next load
3593 * balancing owner will pick it up.
3594 */
3595 if (need_resched()) {
3596 this_rq->nohz_balance_kick = 0;
3597 break;
3598 }
3599
3600 raw_spin_lock_irq(&this_rq->lock);
3601 update_rq_clock(this_rq);
3602 update_cpu_load(this_rq);
3603 raw_spin_unlock_irq(&this_rq->lock);
3604
3605 rebalance_domains(balance_cpu, CPU_IDLE);
3606
3607 rq = cpu_rq(balance_cpu);
3608 if (time_after(this_rq->next_balance, rq->next_balance))
3609 this_rq->next_balance = rq->next_balance;
3610 }
3611 nohz.next_balance = this_rq->next_balance;
3612 this_rq->nohz_balance_kick = 0;
3613}
3614
3615/*
3616 * Current heuristic for kicking the idle load balancer
3617 * - first_pick_cpu is one of the busy CPUs. It will kick
3618 * the idle load balancer when it has more than one process active. This
3619 * eliminates the need for idle load balancing altogether when we have
3620 * only one running process in the system (common case).
3621 * - If there is more than one busy CPU, the idle load balancer may have
3622 * to run for active_load_balance to happen (i.e., two busy CPUs are
3623 * SMT or core siblings and can run better if they move to different
3624 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs,
3625 * which will kick the idle load balancer as soon as it has any load.
3626 */
3627static inline int nohz_kick_needed(struct rq *rq, int cpu)
3628{
3629 unsigned long now = jiffies;
3630 int ret;
3631 int first_pick_cpu, second_pick_cpu;
3632
3633 if (time_before(now, nohz.next_balance))
3634 return 0;
3635
3636 if (!rq->nr_running)
3637 return 0;
3638
3639 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3640 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3641
3642 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3643 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3644 return 0;
3645
3646 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3647 if (ret == nr_cpu_ids || ret == cpu) {
3648 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3649 if (rq->nr_running > 1)
3650 return 1;
3651 } else {
3652 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3653 if (ret == nr_cpu_ids || ret == cpu) {
3654 if (rq->nr_running)
3655 return 1;
3656 }
3657 }
3658 return 0;
3659}
3660#else
3661static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3662#endif
3663
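A simplified, single-threaded rendering of the first_pick/second_pick heuristic described above. The kernel version additionally rate-limits on nohz.next_balance and claims the slots with atomic_cmpxchg(); both are omitted here, and NO_CPU again stands in for nr_cpu_ids.

#include <stdio.h>

#define NO_CPU 8	/* stand-in for nr_cpu_ids */

static int first_pick = NO_CPU;
static int second_pick = NO_CPU;

/* Return 1 when @cpu, with @nr_running tasks, should kick the idle balancer. */
static int kick_needed(int cpu, int nr_running)
{
	if (!nr_running)
		return 0;

	/* Two other busy CPUs already hold both slots: nothing new to report. */
	if (first_pick != NO_CPU && first_pick != cpu &&
	    second_pick != NO_CPU && second_pick != cpu)
		return 0;

	if (first_pick == NO_CPU || first_pick == cpu) {
		first_pick = cpu;
		if (second_pick == cpu)
			second_pick = NO_CPU;
		/* With a single busy CPU, only kick once it runs >1 task. */
		return nr_running > 1;
	}

	/* A second, different busy CPU kicks as soon as it has any load. */
	second_pick = cpu;
	return 1;
}

int main(void)
{
	printf("%d\n", kick_needed(0, 1));	/* 0: one busy CPU, one task */
	printf("%d\n", kick_needed(0, 2));	/* 1: one busy CPU, two tasks */
	printf("%d\n", kick_needed(2, 1));	/* 1: a second busy CPU appears */
	return 0;
}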
3664/*
3665 * run_rebalance_domains is triggered when needed from the scheduler tick.
3666 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3667 */
3393static void run_rebalance_domains(struct softirq_action *h) 3668static void run_rebalance_domains(struct softirq_action *h)
3394{ 3669{
3395 int this_cpu = smp_processor_id(); 3670 int this_cpu = smp_processor_id();
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3674
3400 rebalance_domains(this_cpu, idle); 3675 rebalance_domains(this_cpu, idle);
3401 3676
3402#ifdef CONFIG_NO_HZ
3403 /* 3677 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3678 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3679 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3680 * stopped.
3407 */ 3681 */
3408 if (this_rq->idle_at_tick && 3682 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3683}
3434 3684
3435static inline int on_null_domain(int cpu) 3685static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
3439 3689
3440/* 3690/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3691 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3692 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3693static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3694{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3695 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3696 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3697 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3698 raise_softirq(SCHED_SOFTIRQ);
3699#ifdef CONFIG_NO_HZ
3700 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3701 nohz_balancer_kick(cpu);
3702#endif
3493} 3703}
3494 3704
3495static void rq_online_fair(struct rq *rq) 3705static void rq_online_fair(struct rq *rq)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1663{
1664 unsigned long soft, hard; 1664 unsigned long soft, hard;
1665 1665
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1666 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1667 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1668 hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
@@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1127 1127
1128/* 1128/*
1129 * send signal info to all the members of a group 1129 * send signal info to all the members of a group
1130 * - the caller must hold the RCU read lock at least
1131 */ 1130 */
1132int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1131int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1133{ 1132{
1134 int ret = check_kill_permission(sig, info, p); 1133 int ret;
1134
1135 rcu_read_lock();
1136 ret = check_kill_permission(sig, info, p);
1137 rcu_read_unlock();
1135 1138
1136 if (!ret && sig) 1139 if (!ret && sig)
1137 ret = do_send_sig_info(sig, info, p, true); 1140 ret = do_send_sig_info(sig, info, p, true);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
deleted file mode 100644
index 4b493f67dcb5..000000000000
--- a/kernel/softlockup.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 *
6 * this code detects soft lockups: incidents where on a CPU
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9#include <linux/mm.h>
10#include <linux/cpu.h>
11#include <linux/nmi.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/freezer.h>
15#include <linux/kthread.h>
16#include <linux/lockdep.h>
17#include <linux/notifier.h>
18#include <linux/module.h>
19#include <linux/sysctl.h>
20
21#include <asm/irq_regs.h>
22
23static DEFINE_SPINLOCK(print_lock);
24
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
29
30static int __read_mostly did_panic;
31int __read_mostly softlockup_thresh = 60;
32
33/*
34 * Should we panic (and reboot, if panic_timeout= is set) when a
35 * soft-lockup occurs:
36 */
37unsigned int __read_mostly softlockup_panic =
38 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
39
40static int __init softlockup_panic_setup(char *str)
41{
42 softlockup_panic = simple_strtoul(str, NULL, 0);
43
44 return 1;
45}
46__setup("softlockup_panic=", softlockup_panic_setup);
47
48static int
49softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
50{
51 did_panic = 1;
52
53 return NOTIFY_DONE;
54}
55
56static struct notifier_block panic_block = {
57 .notifier_call = softlock_panic,
58};
59
60/*
61 * Returns seconds, approximately. We don't need nanosecond
62 * resolution, and we don't need to waste time with a big divide when
63 * 2^30ns == 1.074s.
64 */
65static unsigned long get_timestamp(int this_cpu)
66{
67 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
68}
69
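As a sanity check on the shift-by-30 approximation above: 2^30 ns is 1,073,741,824 ns, so each returned "second" is really about 1.074 s (roughly 7% long), which is plenty accurate for a coarse lockup threshold.

#include <stdio.h>

int main(void)
{
	/* The unit that cpu_clock() >> 30 actually counts in, in seconds. */
	printf("%.6f\n", (double)(1UL << 30) / 1e9);	/* prints 1.073742 */
	return 0;
}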
70static void __touch_softlockup_watchdog(void)
71{
72 int this_cpu = raw_smp_processor_id();
73
74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
75}
76
77void touch_softlockup_watchdog(void)
78{
79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
80}
81EXPORT_SYMBOL(touch_softlockup_watchdog);
82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
89void touch_all_softlockup_watchdogs(void)
90{
91 int cpu;
92
93 /* Cause each CPU to re-update its timestamp rather than complain */
94 for_each_online_cpu(cpu)
95 per_cpu(softlockup_touch_ts, cpu) = 0;
96}
97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
98
99int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 touch_all_softlockup_watchdogs();
104 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
105}
106
107/*
108 * This callback runs from the timer interrupt, and checks
109 * whether the watchdog thread has hung or not:
110 */
111void softlockup_tick(void)
112{
113 int this_cpu = smp_processor_id();
114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
115 unsigned long print_ts;
116 struct pt_regs *regs = get_irq_regs();
117 unsigned long now;
118
119 /* Is detection switched off? */
120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
121 /* Be sure we don't false trigger if switched back on */
122 if (touch_ts)
123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
124 return;
125 }
126
127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
136 __touch_softlockup_watchdog();
137 return;
138 }
139
140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
141
142 /* report at most once a second */
143 if (print_ts == touch_ts || did_panic)
144 return;
145
146 /* do not print during early bootup: */
147 if (unlikely(system_state != SYSTEM_RUNNING)) {
148 __touch_softlockup_watchdog();
149 return;
150 }
151
152 now = get_timestamp(this_cpu);
153
154 /*
155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan.
157 */
158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160
161 /* Warn about unreasonable delays: */
162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return;
164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
166
167 spin_lock(&print_lock);
168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
169 this_cpu, now - touch_ts,
170 current->comm, task_pid_nr(current));
171 print_modules();
172 print_irqtrace_events(current);
173 if (regs)
174 show_regs(regs);
175 else
176 dump_stack();
177 spin_unlock(&print_lock);
178
179 if (softlockup_panic)
180 panic("softlockup: hung tasks");
181}
182
183/*
184 * The watchdog thread - runs every second and touches the timestamp.
185 */
186static int watchdog(void *__bind_cpu)
187{
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 sched_setscheduler(current, SCHED_FIFO, &param);
191
192 /* initialize timestamp */
193 __touch_softlockup_watchdog();
194
195 set_current_state(TASK_INTERRUPTIBLE);
196 /*
197 * Run briefly once per second to reset the softlockup timestamp.
198 * If this gets delayed for more than 60 seconds then the
199 * debug-printout triggers in softlockup_tick().
200 */
201 while (!kthread_should_stop()) {
202 __touch_softlockup_watchdog();
203 schedule();
204
205 if (kthread_should_stop())
206 break;
207
208 set_current_state(TASK_INTERRUPTIBLE);
209 }
210 __set_current_state(TASK_RUNNING);
211
212 return 0;
213}
214
215/*
216 * Create/destroy watchdog threads as CPUs come and go:
217 */
218static int __cpuinit
219cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
220{
221 int hotcpu = (unsigned long)hcpu;
222 struct task_struct *p;
223
224 switch (action) {
225 case CPU_UP_PREPARE:
226 case CPU_UP_PREPARE_FROZEN:
227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
229 if (IS_ERR(p)) {
230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
231 return NOTIFY_BAD;
232 }
233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
234 per_cpu(softlockup_watchdog, hotcpu) = p;
235 kthread_bind(p, hotcpu);
236 break;
237 case CPU_ONLINE:
238 case CPU_ONLINE_FROZEN:
239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
240 break;
241#ifdef CONFIG_HOTPLUG_CPU
242 case CPU_UP_CANCELED:
243 case CPU_UP_CANCELED_FROZEN:
244 if (!per_cpu(softlockup_watchdog, hotcpu))
245 break;
246 /* Unbind so it can run. Fall thru. */
247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
248 cpumask_any(cpu_online_mask));
249 case CPU_DEAD:
250 case CPU_DEAD_FROZEN:
251 p = per_cpu(softlockup_watchdog, hotcpu);
252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
253 kthread_stop(p);
254 break;
255#endif /* CONFIG_HOTPLUG_CPU */
256 }
257 return NOTIFY_OK;
258}
259
260static struct notifier_block __cpuinitdata cpu_nfb = {
261 .notifier_call = cpu_callback
262};
263
264static int __initdata nosoftlockup;
265
266static int __init nosoftlockup_setup(char *str)
267{
268 nosoftlockup = 1;
269 return 1;
270}
271__setup("nosoftlockup", nosoftlockup_setup);
272
273static int __init spawn_softlockup_task(void)
274{
275 void *cpu = (void *)(long)smp_processor_id();
276 int err;
277
278 if (nosoftlockup)
279 return 0;
280
281 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
282 if (err == NOTIFY_BAD) {
283 BUG();
284 return 1;
285 }
286 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
287 register_cpu_notifier(&cpu_nfb);
288
289 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
290
291 return 0;
292}
293early_initcall(spawn_softlockup_task);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..6f79c7f81c96 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,10 @@
76#include <scsi/sg.h> 76#include <scsi/sg.h>
77#endif 77#endif
78 78
79#ifdef CONFIG_LOCKUP_DETECTOR
80#include <linux/nmi.h>
81#endif
82
79 83
80#if defined(CONFIG_SYSCTL) 84#if defined(CONFIG_SYSCTL)
81 85
@@ -106,7 +110,7 @@ extern int blk_iopoll_enabled;
106#endif 110#endif
107 111
108/* Constants used for minimum and maximum */ 112/* Constants used for minimum and maximum */
109#ifdef CONFIG_DETECT_SOFTLOCKUP 113#ifdef CONFIG_LOCKUP_DETECTOR
110static int sixty = 60; 114static int sixty = 60;
111static int neg_one = -1; 115static int neg_one = -1;
112#endif 116#endif
@@ -710,7 +714,34 @@ static struct ctl_table kern_table[] = {
710 .mode = 0444, 714 .mode = 0444,
711 .proc_handler = proc_dointvec, 715 .proc_handler = proc_dointvec,
712 }, 716 },
713#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 717#if defined(CONFIG_LOCKUP_DETECTOR)
718 {
719 .procname = "watchdog",
720 .data = &watchdog_enabled,
721 .maxlen = sizeof (int),
722 .mode = 0644,
723 .proc_handler = proc_dowatchdog_enabled,
724 },
725 {
726 .procname = "watchdog_thresh",
727 .data = &softlockup_thresh,
728 .maxlen = sizeof(int),
729 .mode = 0644,
730 .proc_handler = proc_dowatchdog_thresh,
731 .extra1 = &neg_one,
732 .extra2 = &sixty,
733 },
734 {
735 .procname = "softlockup_panic",
736 .data = &softlockup_panic,
737 .maxlen = sizeof(int),
738 .mode = 0644,
739 .proc_handler = proc_dointvec_minmax,
740 .extra1 = &zero,
741 .extra2 = &one,
742 },
743#endif
744#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
714 { 745 {
715 .procname = "unknown_nmi_panic", 746 .procname = "unknown_nmi_panic",
716 .data = &unknown_nmi_panic, 747 .data = &unknown_nmi_panic,
@@ -813,26 +844,6 @@ static struct ctl_table kern_table[] = {
813 .proc_handler = proc_dointvec, 844 .proc_handler = proc_dointvec,
814 }, 845 },
815#endif 846#endif
816#ifdef CONFIG_DETECT_SOFTLOCKUP
817 {
818 .procname = "softlockup_panic",
819 .data = &softlockup_panic,
820 .maxlen = sizeof(int),
821 .mode = 0644,
822 .proc_handler = proc_dointvec_minmax,
823 .extra1 = &zero,
824 .extra2 = &one,
825 },
826 {
827 .procname = "softlockup_thresh",
828 .data = &softlockup_thresh,
829 .maxlen = sizeof(int),
830 .mode = 0644,
831 .proc_handler = proc_dosoftlockup_thresh,
832 .extra1 = &neg_one,
833 .extra2 = &sixty,
834 },
835#endif
836#ifdef CONFIG_DETECT_HUNG_TASK 847#ifdef CONFIG_DETECT_HUNG_TASK
837 { 848 {
838 .procname = "hung_task_panic", 849 .procname = "hung_task_panic",
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..021d2f878f19 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..d61d16da0b64 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -577,6 +577,19 @@ static void __init_timer(struct timer_list *timer,
577 lockdep_init_map(&timer->lockdep_map, name, key, 0); 577 lockdep_init_map(&timer->lockdep_map, name, key, 0);
578} 578}
579 579
580void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
581 const char *name,
582 struct lock_class_key *key,
583 void (*function)(unsigned long),
584 unsigned long data)
585{
586 timer->function = function;
587 timer->data = data;
588 init_timer_on_stack_key(timer, name, key);
589 timer_set_deferrable(timer);
590}
591EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
592
580/** 593/**
581 * init_timer_key - initialize a timer 594 * init_timer_key - initialize a timer
582 * @timer: the timer to be initialized 595 * @timer: the timer to be initialized
@@ -679,12 +692,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
679 cpu = smp_processor_id(); 692 cpu = smp_processor_id();
680 693
681#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 694#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
682 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 695 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
683 int preferred_cpu = get_nohz_load_balancer(); 696 cpu = get_nohz_timer_target();
684
685 if (preferred_cpu >= 0)
686 cpu = preferred_cpu;
687 }
688#endif 697#endif
689 new_base = per_cpu(tvec_bases, cpu); 698 new_base = per_cpu(tvec_bases, cpu);
690 699
@@ -1289,7 +1298,6 @@ void run_local_timers(void)
1289{ 1298{
1290 hrtimer_run_queues(); 1299 hrtimer_run_queues();
1291 raise_softirq(TIMER_SOFTIRQ); 1300 raise_softirq(TIMER_SOFTIRQ);
1292 softlockup_tick();
1293} 1301}
1294 1302
1295/* 1303/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8b1797c4545b..c7683fd8a03a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -194,15 +194,6 @@ config PREEMPT_TRACER
194 enabled. This option and the irqs-off timing option can be 194 enabled. This option and the irqs-off timing option can be
195 used together or separately.) 195 used together or separately.)
196 196
197config SYSPROF_TRACER
198 bool "Sysprof Tracer"
199 depends on X86
200 select GENERIC_TRACER
201 select CONTEXT_SWITCH_TRACER
202 help
203 This tracer provides the trace needed by the 'Sysprof' userspace
204 tool.
205
206config SCHED_TRACER 197config SCHED_TRACER
207 bool "Scheduling Latency Tracer" 198 bool "Scheduling Latency Tracer"
208 select GENERIC_TRACER 199 select GENERIC_TRACER
@@ -229,23 +220,6 @@ config FTRACE_SYSCALLS
229 help 220 help
230 Basic tracer to catch the syscall entry and exit events. 221 Basic tracer to catch the syscall entry and exit events.
231 222
232config BOOT_TRACER
233 bool "Trace boot initcalls"
234 select GENERIC_TRACER
235 select CONTEXT_SWITCH_TRACER
236 help
237 This tracer helps developers to optimize boot times: it records
238 the timings of the initcalls and traces key events and the identity
239 of tasks that can cause boot delays, such as context-switches.
240
241 Its aim is to be parsed by the scripts/bootgraph.pl tool to
242 produce pretty graphics about boot inefficiencies, giving a visual
243 representation of the delays during initcalls - but the raw
244 /debug/tracing/trace text output is readable too.
245
246 You must pass in initcall_debug and ftrace=initcall to the kernel
247 command line to enable this on bootup.
248
249config TRACE_BRANCH_PROFILING 223config TRACE_BRANCH_PROFILING
250 bool 224 bool
251 select GENERIC_TRACER 225 select GENERIC_TRACER
@@ -325,28 +299,6 @@ config BRANCH_TRACER
325 299
326 Say N if unsure. 300 Say N if unsure.
327 301
328config KSYM_TRACER
329 bool "Trace read and write access on kernel memory locations"
330 depends on HAVE_HW_BREAKPOINT
331 select TRACING
332 help
333 This tracer helps find read and write operations on any given kernel
334 symbol i.e. /proc/kallsyms.
335
336config PROFILE_KSYM_TRACER
337 bool "Profile all kernel memory accesses on 'watched' variables"
338 depends on KSYM_TRACER
339 help
340 This tracer profiles kernel accesses on variables watched through the
341 ksym tracer ftrace plugin. Depending upon the hardware, all read
342 and write operations on kernel variables can be monitored for
343 accesses.
344
345 The results will be displayed in:
346 /debugfs/tracing/profile_ksym
347
348 Say N if unsure.
349
350config STACK_TRACER 302config STACK_TRACER
351 bool "Trace max stack" 303 bool "Trace max stack"
352 depends on HAVE_FUNCTION_TRACER 304 depends on HAVE_FUNCTION_TRACER
@@ -371,26 +323,6 @@ config STACK_TRACER
371 323
372 Say N if unsure. 324 Say N if unsure.
373 325
374config KMEMTRACE
375 bool "Trace SLAB allocations"
376 select GENERIC_TRACER
377 help
378 kmemtrace provides tracing for slab allocator functions, such as
379 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
380 data is then fed to the userspace application in order to analyse
381 allocation hotspots, internal fragmentation and so on, making it
382 possible to see how well an allocator performs, as well as debug
383 and profile kernel code.
384
385 This requires a userspace application to use. See
386 Documentation/trace/kmemtrace.txt for more information.
387
388 Saying Y will make the kernel somewhat larger and slower. However,
389 if you disable kmemtrace at run-time or boot-time, the performance
390 impact is minimal (depending on the arch the kernel is built for).
391
392 If unsure, say N.
393
394config WORKQUEUE_TRACER 326config WORKQUEUE_TRACER
395 bool "Trace workqueues" 327 bool "Trace workqueues"
396 select GENERIC_TRACER 328 select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ffb1a5b0550e..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
30obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
31obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 32obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
33obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
34obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 33obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
35obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 34obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
36obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 35obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
38obj-$(CONFIG_NOP_TRACER) += trace_nop.o 37obj-$(CONFIG_NOP_TRACER) += trace_nop.o
39obj-$(CONFIG_STACK_TRACER) += trace_stack.o 38obj-$(CONFIG_STACK_TRACER) += trace_stack.o
40obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
47ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif 52endif
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
59obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_EVENT_TRACING) += power-traces.o
56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif
60 59
61libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..0d88ce9b9fb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1883 struct hlist_head *hhd;
1884 struct hlist_node *n; 1884 struct hlist_node *n;
1885 unsigned long key; 1885 unsigned long key;
1886 int resched;
1887 1886
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1887 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1888
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1896 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1897 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1898 */
1900 resched = ftrace_preempt_disable(); 1899 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1900 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1901 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1902 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1903 }
1905 ftrace_preempt_enable(resched); 1904 preempt_enable_notrace();
1906} 1905}
1907 1906
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1907static struct ftrace_ops trace_probe_ops __read_mostly =
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <linux/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output instead of the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
46 struct trace_array *tr = kmemtrace_array;
47 struct kmemtrace_alloc_entry *entry;
48 struct ring_buffer_event *event;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
51 if (!event)
52 return;
53
54 entry = ring_buffer_event_data(event);
55 tracing_generic_entry_update(&entry->ent, 0, 0);
56
57 entry->ent.type = TRACE_KMEM_ALLOC;
58 entry->type_id = type_id;
59 entry->call_site = call_site;
60 entry->ptr = ptr;
61 entry->bytes_req = bytes_req;
62 entry->bytes_alloc = bytes_alloc;
63 entry->gfp_flags = gfp_flags;
64 entry->node = node;
65
66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
68
69 trace_wake_up();
70}
71
72static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
73 unsigned long call_site,
74 const void *ptr)
75{
76 struct ftrace_event_call *call = &event_kmem_free;
77 struct trace_array *tr = kmemtrace_array;
78 struct kmemtrace_free_entry *entry;
79 struct ring_buffer_event *event;
80
81 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
82 if (!event)
83 return;
84 entry = ring_buffer_event_data(event);
85 tracing_generic_entry_update(&entry->ent, 0, 0);
86
87 entry->ent.type = TRACE_KMEM_FREE;
88 entry->type_id = type_id;
89 entry->call_site = call_site;
90 entry->ptr = ptr;
91
92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
94
95 trace_wake_up();
96}
97
98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
100 const void *ptr,
101 size_t bytes_req,
102 size_t bytes_alloc,
103 gfp_t gfp_flags)
104{
105 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
106 bytes_req, bytes_alloc, gfp_flags, -1);
107}
108
109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
111 const void *ptr,
112 size_t bytes_req,
113 size_t bytes_alloc,
114 gfp_t gfp_flags)
115{
116 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
117 bytes_req, bytes_alloc, gfp_flags, -1);
118}
119
120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
122 const void *ptr,
123 size_t bytes_req,
124 size_t bytes_alloc,
125 gfp_t gfp_flags,
126 int node)
127{
128 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
129 bytes_req, bytes_alloc, gfp_flags, node);
130}
131
132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
134 const void *ptr,
135 size_t bytes_req,
136 size_t bytes_alloc,
137 gfp_t gfp_flags,
138 int node)
139{
140 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
141 bytes_req, bytes_alloc, gfp_flags, node);
142}
143
144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
146{
147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
148}
149
150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
152{
153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
154}
155
156static int kmemtrace_start_probes(void)
157{
158 int err;
159
160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
161 if (err)
162 return err;
163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
164 if (err)
165 return err;
166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
167 if (err)
168 return err;
169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
170 if (err)
171 return err;
172 err = register_trace_kfree(kmemtrace_kfree, NULL);
173 if (err)
174 return err;
175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
176
177 return err;
178}
179
180static void kmemtrace_stop_probes(void)
181{
182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
186 unregister_trace_kfree(kmemtrace_kfree, NULL);
187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
188}
189
190static int kmem_trace_init(struct trace_array *tr)
191{
192 kmemtrace_array = tr;
193
194 tracing_reset_online_cpus(tr);
195
196 kmemtrace_start_probes();
197
198 return 0;
199}
200
201static void kmem_trace_reset(struct trace_array *tr)
202{
203 kmemtrace_stop_probes();
204}
205
206static void kmemtrace_headers(struct seq_file *s)
207{
208 /* Don't need headers for the original kmemtrace output */
209 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
210 return;
211
212 seq_printf(s, "#\n");
213 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
214 " POINTER NODE CALLER\n");
215 seq_printf(s, "# FREE | | | | "
216 " | | | |\n");
217 seq_printf(s, "# |\n\n");
218}
219
220/*
221 * The following functions give the original output from kmemtrace,
222 * plus the origin CPU, since reordering occurs in-kernel now.
223 */
224
225#define KMEMTRACE_USER_ALLOC 0
226#define KMEMTRACE_USER_FREE 1
227
228struct kmemtrace_user_event {
229 u8 event_id;
230 u8 type_id;
231 u16 event_size;
232 u32 cpu;
233 u64 timestamp;
234 unsigned long call_site;
235 unsigned long ptr;
236};
237
238struct kmemtrace_user_event_alloc {
239 size_t bytes_req;
240 size_t bytes_alloc;
241 unsigned gfp_flags;
242 int node;
243};
244
245static enum print_line_t
246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
248{
249 struct trace_seq *s = &iter->seq;
250 struct kmemtrace_alloc_entry *entry;
251 int ret;
252
253 trace_assign_type(entry, iter->ent);
254
255 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
256 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
257 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
258 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
259 (unsigned long)entry->gfp_flags, entry->node);
260
261 if (!ret)
262 return TRACE_TYPE_PARTIAL_LINE;
263 return TRACE_TYPE_HANDLED;
264}
265
266static enum print_line_t
267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
269{
270 struct trace_seq *s = &iter->seq;
271 struct kmemtrace_free_entry *entry;
272 int ret;
273
274 trace_assign_type(entry, iter->ent);
275
276 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
277 entry->type_id, (void *)entry->call_site,
278 (unsigned long)entry->ptr);
279
280 if (!ret)
281 return TRACE_TYPE_PARTIAL_LINE;
282 return TRACE_TYPE_HANDLED;
283}
284
285static enum print_line_t
286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
288{
289 struct trace_seq *s = &iter->seq;
290 struct kmemtrace_alloc_entry *entry;
291 struct kmemtrace_user_event *ev;
292 struct kmemtrace_user_event_alloc *ev_alloc;
293
294 trace_assign_type(entry, iter->ent);
295
296 ev = trace_seq_reserve(s, sizeof(*ev));
297 if (!ev)
298 return TRACE_TYPE_PARTIAL_LINE;
299
300 ev->event_id = KMEMTRACE_USER_ALLOC;
301 ev->type_id = entry->type_id;
302 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
303 ev->cpu = iter->cpu;
304 ev->timestamp = iter->ts;
305 ev->call_site = entry->call_site;
306 ev->ptr = (unsigned long)entry->ptr;
307
308 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
309 if (!ev_alloc)
310 return TRACE_TYPE_PARTIAL_LINE;
311
312 ev_alloc->bytes_req = entry->bytes_req;
313 ev_alloc->bytes_alloc = entry->bytes_alloc;
314 ev_alloc->gfp_flags = entry->gfp_flags;
315 ev_alloc->node = entry->node;
316
317 return TRACE_TYPE_HANDLED;
318}
319
320static enum print_line_t
321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
323{
324 struct trace_seq *s = &iter->seq;
325 struct kmemtrace_free_entry *entry;
326 struct kmemtrace_user_event *ev;
327
328 trace_assign_type(entry, iter->ent);
329
330 ev = trace_seq_reserve(s, sizeof(*ev));
331 if (!ev)
332 return TRACE_TYPE_PARTIAL_LINE;
333
334 ev->event_id = KMEMTRACE_USER_FREE;
335 ev->type_id = entry->type_id;
336 ev->event_size = sizeof(*ev);
337 ev->cpu = iter->cpu;
338 ev->timestamp = iter->ts;
339 ev->call_site = entry->call_site;
340 ev->ptr = (unsigned long)entry->ptr;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345/* The following two functions provide a more minimal output */
346static enum print_line_t
347kmemtrace_print_alloc_compress(struct trace_iterator *iter)
348{
349 struct kmemtrace_alloc_entry *entry;
350 struct trace_seq *s = &iter->seq;
351 int ret;
352
353 trace_assign_type(entry, iter->ent);
354
355 /* Alloc entry */
356 ret = trace_seq_printf(s, " + ");
357 if (!ret)
358 return TRACE_TYPE_PARTIAL_LINE;
359
360 /* Type */
361 switch (entry->type_id) {
362 case KMEMTRACE_TYPE_KMALLOC:
363 ret = trace_seq_printf(s, "K ");
364 break;
365 case KMEMTRACE_TYPE_CACHE:
366 ret = trace_seq_printf(s, "C ");
367 break;
368 case KMEMTRACE_TYPE_PAGES:
369 ret = trace_seq_printf(s, "P ");
370 break;
371 default:
372 ret = trace_seq_printf(s, "? ");
373 }
374
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 /* Requested */
379 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
380 if (!ret)
381 return TRACE_TYPE_PARTIAL_LINE;
382
383 /* Allocated */
384 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
385 if (!ret)
386 return TRACE_TYPE_PARTIAL_LINE;
387
388 /* Flags
389 * TODO: it would be better to print the names of the GFP flags
390 */
391 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Node and call site */
401 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
402 (void *)entry->call_site);
403 if (!ret)
404 return TRACE_TYPE_PARTIAL_LINE;
405
406 return TRACE_TYPE_HANDLED;
407}
408
409static enum print_line_t
410kmemtrace_print_free_compress(struct trace_iterator *iter)
411{
412 struct kmemtrace_free_entry *entry;
413 struct trace_seq *s = &iter->seq;
414 int ret;
415
416 trace_assign_type(entry, iter->ent);
417
418 /* Free entry */
419 ret = trace_seq_printf(s, " - ");
420 if (!ret)
421 return TRACE_TYPE_PARTIAL_LINE;
422
423 /* Type */
424 switch (entry->type_id) {
425 case KMEMTRACE_TYPE_KMALLOC:
426 ret = trace_seq_printf(s, "K ");
427 break;
428 case KMEMTRACE_TYPE_CACHE:
429 ret = trace_seq_printf(s, "C ");
430 break;
431 case KMEMTRACE_TYPE_PAGES:
432 ret = trace_seq_printf(s, "P ");
433 break;
434 default:
435 ret = trace_seq_printf(s, "? ");
436 }
437
438 if (!ret)
439 return TRACE_TYPE_PARTIAL_LINE;
440
441 /* Skip requested/allocated/flags */
442 ret = trace_seq_printf(s, " ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445
446 /* Pointer to allocated */
447 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
448 if (!ret)
449 return TRACE_TYPE_PARTIAL_LINE;
450
451 /* Skip node and print call site */
452 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
453 if (!ret)
454 return TRACE_TYPE_PARTIAL_LINE;
455
456 return TRACE_TYPE_HANDLED;
457}
458
459static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
460{
461 struct trace_entry *entry = iter->ent;
462
463 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
464 return TRACE_TYPE_UNHANDLED;
465
466 switch (entry->type) {
467 case TRACE_KMEM_ALLOC:
468 return kmemtrace_print_alloc_compress(iter);
469 case TRACE_KMEM_FREE:
470 return kmemtrace_print_free_compress(iter);
471 default:
472 return TRACE_TYPE_UNHANDLED;
473 }
474}
475
476static struct trace_event_functions kmem_trace_alloc_funcs = {
477 .trace = kmemtrace_print_alloc,
478 .binary = kmemtrace_print_alloc_user,
479};
480
481static struct trace_event kmem_trace_alloc = {
482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
487 .trace = kmemtrace_print_free,
488 .binary = kmemtrace_print_free_user,
489};
490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
496static struct tracer kmem_tracer __read_mostly = {
497 .name = "kmemtrace",
498 .init = kmem_trace_init,
499 .reset = kmem_trace_reset,
500 .print_line = kmemtrace_print_line,
501 .print_header = kmemtrace_headers,
502 .flags = &kmem_tracer_flags
503};
504
505void kmemtrace_init(void)
506{
507 /* earliest opportunity to start kmem tracing */
508}
509
510static int __init init_kmem_tracer(void)
511{
512 if (!register_ftrace_event(&kmem_trace_alloc)) {
513 pr_warning("Warning: could not register kmem events\n");
514 return 1;
515 }
516
517 if (!register_ftrace_event(&kmem_trace_free)) {
518 pr_warning("Warning: could not register kmem events\n");
519 return 1;
520 }
521
522 if (register_tracer(&kmem_tracer) != 0) {
523 pr_warning("Warning: could not register the kmem tracer\n");
524 return 1;
525 }
526
527 return 0;
528}
529device_initcall(init_kmem_tracer);
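
For reference, the binary records emitted by kmemtrace_print_alloc_user() and kmemtrace_print_free_user() above follow the kmemtrace_user_event layout reserved via trace_seq_reserve(). The sketch below is a hypothetical userspace decoder for such a stream; the program itself, and the assumption that the raw records are fed to it on stdin with the same struct layout the kernel wrote, are illustrative and not part of this file.

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	/* Mirrors of the kernel-side structs above; layout assumed identical. */
	struct kmemtrace_user_event {
		uint8_t		event_id;	/* KMEMTRACE_USER_ALLOC / _FREE */
		uint8_t		type_id;
		uint16_t	event_size;	/* header plus optional alloc tail */
		uint32_t	cpu;
		uint64_t	timestamp;
		unsigned long	call_site;
		unsigned long	ptr;
	};

	struct kmemtrace_user_event_alloc {
		size_t		bytes_req;
		size_t		bytes_alloc;
		unsigned	gfp_flags;
		int		node;
	};

	int main(void)
	{
		struct kmemtrace_user_event ev;

		/* event_size tells us whether an alloc tail follows the header. */
		while (fread(&ev, sizeof(ev), 1, stdin) == 1) {
			if (ev.event_size > sizeof(ev)) {
				struct kmemtrace_user_event_alloc tail;

				if (fread(&tail, sizeof(tail), 1, stdin) != 1)
					break;
				printf("cpu%u alloc ptr=%#lx req=%zu got=%zu node=%d\n",
				       (unsigned)ev.cpu, ev.ptr, tail.bytes_req,
				       tail.bytes_alloc, tail.node);
			} else {
				printf("cpu%u free  ptr=%#lx\n",
				       (unsigned)ev.cpu, ev.ptr);
			}
		}
		return 0;
	}

The event_size field is what lets a reader distinguish the two record kinds, since a free record carries no allocation tail.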
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..3632ce87674f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
443 */ 443 */
444struct ring_buffer_per_cpu { 444struct ring_buffer_per_cpu {
445 int cpu; 445 int cpu;
446 atomic_t record_disabled;
446 struct ring_buffer *buffer; 447 struct ring_buffer *buffer;
447 spinlock_t reader_lock; /* serialize readers */ 448 spinlock_t reader_lock; /* serialize readers */
448 arch_spinlock_t lock; 449 arch_spinlock_t lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
462 unsigned long read; 463 unsigned long read;
463 u64 write_stamp; 464 u64 write_stamp;
464 u64 read_stamp; 465 u64 read_stamp;
465 atomic_t record_disabled;
466}; 466};
467 467
468struct ring_buffer { 468struct ring_buffer {
@@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void)
2242 2242
2243#endif 2243#endif
2244 2244
2245static DEFINE_PER_CPU(int, rb_need_resched);
2246
2247/** 2245/**
2248 * ring_buffer_lock_reserve - reserve a part of the buffer 2246 * ring_buffer_lock_reserve - reserve a part of the buffer
2249 * @buffer: the ring buffer to reserve from 2247 * @buffer: the ring buffer to reserve from
@@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2264{ 2262{
2265 struct ring_buffer_per_cpu *cpu_buffer; 2263 struct ring_buffer_per_cpu *cpu_buffer;
2266 struct ring_buffer_event *event; 2264 struct ring_buffer_event *event;
2267 int cpu, resched; 2265 int cpu;
2268 2266
2269 if (ring_buffer_flags != RB_BUFFERS_ON) 2267 if (ring_buffer_flags != RB_BUFFERS_ON)
2270 return NULL; 2268 return NULL;
2271 2269
2272 /* If we are tracing schedule, we don't want to recurse */ 2270 /* If we are tracing schedule, we don't want to recurse */
2273 resched = ftrace_preempt_disable(); 2271 preempt_disable_notrace();
2274 2272
2275 if (atomic_read(&buffer->record_disabled)) 2273 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck; 2274 goto out_nocheck;
@@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2295 if (!event) 2293 if (!event)
2296 goto out; 2294 goto out;
2297 2295
2298 /*
2299 * Need to store resched state on this cpu.
2300 * Only the first needs to.
2301 */
2302
2303 if (preempt_count() == 1)
2304 per_cpu(rb_need_resched, cpu) = resched;
2305
2306 return event; 2296 return event;
2307 2297
2308 out: 2298 out:
2309 trace_recursive_unlock(); 2299 trace_recursive_unlock();
2310 2300
2311 out_nocheck: 2301 out_nocheck:
2312 ftrace_preempt_enable(resched); 2302 preempt_enable_notrace();
2313 return NULL; 2303 return NULL;
2314} 2304}
2315EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2305EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2355 2345
2356 trace_recursive_unlock(); 2346 trace_recursive_unlock();
2357 2347
2358 /* 2348 preempt_enable_notrace();
2359 * Only the last preempt count needs to restore preemption.
2360 */
2361 if (preempt_count() == 1)
2362 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2363 else
2364 preempt_enable_no_resched_notrace();
2365 2349
2366 return 0; 2350 return 0;
2367} 2351}
@@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2469 2453
2470 trace_recursive_unlock(); 2454 trace_recursive_unlock();
2471 2455
2472 /* 2456 preempt_enable_notrace();
2473 * Only the last preempt count needs to restore preemption.
2474 */
2475 if (preempt_count() == 1)
2476 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2477 else
2478 preempt_enable_no_resched_notrace();
2479 2457
2480} 2458}
2481EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2459EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2501 struct ring_buffer_event *event; 2479 struct ring_buffer_event *event;
2502 void *body; 2480 void *body;
2503 int ret = -EBUSY; 2481 int ret = -EBUSY;
2504 int cpu, resched; 2482 int cpu;
2505 2483
2506 if (ring_buffer_flags != RB_BUFFERS_ON) 2484 if (ring_buffer_flags != RB_BUFFERS_ON)
2507 return -EBUSY; 2485 return -EBUSY;
2508 2486
2509 resched = ftrace_preempt_disable(); 2487 preempt_disable_notrace();
2510 2488
2511 if (atomic_read(&buffer->record_disabled)) 2489 if (atomic_read(&buffer->record_disabled))
2512 goto out; 2490 goto out;
@@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2536 2514
2537 ret = 0; 2515 ret = 0;
2538 out: 2516 out:
2539 ftrace_preempt_enable(resched); 2517 preempt_enable_notrace();
2540 2518
2541 return ret; 2519 return ret;
2542} 2520}
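
The hunks above drop the ftrace_preempt_disable()/ftrace_preempt_enable() pair, and the per-CPU rb_need_resched bookkeeping that went with it, in favour of plain preempt_disable_notrace()/preempt_enable_notrace() calls inside the reserve and commit paths. A minimal caller sketch using the reserve/commit API shown above; my_record and my_write_sample() are made-up names, not functions from this patch.

	#include <linux/errno.h>
	#include <linux/ring_buffer.h>

	struct my_record {
		unsigned long	ip;
		unsigned long	value;
	};

	static int my_write_sample(struct ring_buffer *buffer,
				   unsigned long ip, unsigned long value)
	{
		struct ring_buffer_event *event;
		struct my_record *rec;

		/* The reserve path now does preempt_disable_notrace() itself, so
		 * the caller no longer carries a "resched" cookie to the commit. */
		event = ring_buffer_lock_reserve(buffer, sizeof(*rec));
		if (!event)
			return -EBUSY;

		rec = ring_buffer_event_data(event);
		rec->ip = ip;
		rec->value = value;

		/* The commit re-enables preemption via preempt_enable_notrace(). */
		return ring_buffer_unlock_commit(buffer, event);
	}
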
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..ed1032d6f81d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
104static cpumask_var_t __read_mostly tracing_buffer_mask; 104cpumask_var_t __read_mostly tracing_buffer_mask;
105
106#define for_each_tracing_cpu(cpu) \
107 for_each_cpu(cpu, tracing_buffer_mask)
108 105
109/* 106/*
110 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 107 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 341/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 342unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 343 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME; 344 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
348 345
349static int trace_stop_count; 346static int trace_stop_count;
350static DEFINE_SPINLOCK(tracing_start_lock); 347static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +425,7 @@ static const char *trace_options[] = {
428 "latency-format", 425 "latency-format",
429 "sleep-time", 426 "sleep-time",
430 "graph-time", 427 "graph-time",
428 "record-cmd",
431 NULL 429 NULL
432}; 430};
433 431
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
659 return; 657 return;
660 658
661 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
660 if (!current_trace->use_max_tr) {
661 WARN_ON_ONCE(1);
662 return;
663 }
662 arch_spin_lock(&ftrace_max_lock); 664 arch_spin_lock(&ftrace_max_lock);
663 665
664 tr->buffer = max_tr.buffer; 666 tr->buffer = max_tr.buffer;
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
685 return; 687 return;
686 688
687 WARN_ON_ONCE(!irqs_disabled()); 689 WARN_ON_ONCE(!irqs_disabled());
690 if (!current_trace->use_max_tr) {
691 WARN_ON_ONCE(1);
692 return;
693 }
694
688 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
689 696
690 ftrace_disable_cpu(); 697 ftrace_disable_cpu();
@@ -729,7 +736,7 @@ __acquires(kernel_lock)
729 return -1; 736 return -1;
730 } 737 }
731 738
732 if (strlen(type->name) > MAX_TRACER_SIZE) { 739 if (strlen(type->name) >= MAX_TRACER_SIZE) {
733 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); 740 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
734 return -1; 741 return -1;
735 } 742 }
@@ -1331,61 +1338,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1331 1338
1332#endif /* CONFIG_STACKTRACE */ 1339#endif /* CONFIG_STACKTRACE */
1333 1340
1334static void
1335ftrace_trace_special(void *__tr,
1336 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1337 int pc)
1338{
1339 struct ftrace_event_call *call = &event_special;
1340 struct ring_buffer_event *event;
1341 struct trace_array *tr = __tr;
1342 struct ring_buffer *buffer = tr->buffer;
1343 struct special_entry *entry;
1344
1345 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1346 sizeof(*entry), 0, pc);
1347 if (!event)
1348 return;
1349 entry = ring_buffer_event_data(event);
1350 entry->arg1 = arg1;
1351 entry->arg2 = arg2;
1352 entry->arg3 = arg3;
1353
1354 if (!filter_check_discard(call, entry, buffer, event))
1355 trace_buffer_unlock_commit(buffer, event, 0, pc);
1356}
1357
1358void
1359__trace_special(void *__tr, void *__data,
1360 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1361{
1362 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1363}
1364
1365void
1366ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1367{
1368 struct trace_array *tr = &global_trace;
1369 struct trace_array_cpu *data;
1370 unsigned long flags;
1371 int cpu;
1372 int pc;
1373
1374 if (tracing_disabled)
1375 return;
1376
1377 pc = preempt_count();
1378 local_irq_save(flags);
1379 cpu = raw_smp_processor_id();
1380 data = tr->data[cpu];
1381
1382 if (likely(atomic_inc_return(&data->disabled) == 1))
1383 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1384
1385 atomic_dec(&data->disabled);
1386 local_irq_restore(flags);
1387}
1388
1389/** 1341/**
1390 * trace_vbprintk - write binary msg to tracing buffer 1342 * trace_vbprintk - write binary msg to tracing buffer
1391 * 1343 *
@@ -1404,7 +1356,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1356 struct bprint_entry *entry;
1405 unsigned long flags; 1357 unsigned long flags;
1406 int disable; 1358 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1359 int cpu, len = 0, size, pc;
1409 1360
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1361 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1365,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1365 pause_graph_tracing();
1415 1366
1416 pc = preempt_count(); 1367 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1368 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1369 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1370 data = tr->data[cpu];
1420 1371
@@ -1452,7 +1403,7 @@ out_unlock:
1452 1403
1453out: 1404out:
1454 atomic_dec_return(&data->disabled); 1405 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1406 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1407 unpause_graph_tracing();
1457 1408
1458 return len; 1409 return len;
@@ -1539,11 +1490,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1539} 1490}
1540EXPORT_SYMBOL_GPL(trace_vprintk); 1491EXPORT_SYMBOL_GPL(trace_vprintk);
1541 1492
1542enum trace_file_type {
1543 TRACE_FILE_LAT_FMT = 1,
1544 TRACE_FILE_ANNOTATE = 2,
1545};
1546
1547static void trace_iterator_increment(struct trace_iterator *iter) 1493static void trace_iterator_increment(struct trace_iterator *iter)
1548{ 1494{
1549 /* Don't allow ftrace to trace into the ring buffers */ 1495 /* Don't allow ftrace to trace into the ring buffers */
@@ -1641,7 +1587,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1641} 1587}
1642 1588
1643/* Find the next real entry, and increment the iterator to the next entry */ 1589/* Find the next real entry, and increment the iterator to the next entry */
1644static void *find_next_entry_inc(struct trace_iterator *iter) 1590void *trace_find_next_entry_inc(struct trace_iterator *iter)
1645{ 1591{
1646 iter->ent = __find_next_entry(iter, &iter->cpu, 1592 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts); 1593 &iter->lost_events, &iter->ts);
@@ -1676,19 +1622,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1676 return NULL; 1622 return NULL;
1677 1623
1678 if (iter->idx < 0) 1624 if (iter->idx < 0)
1679 ent = find_next_entry_inc(iter); 1625 ent = trace_find_next_entry_inc(iter);
1680 else 1626 else
1681 ent = iter; 1627 ent = iter;
1682 1628
1683 while (ent && iter->idx < i) 1629 while (ent && iter->idx < i)
1684 ent = find_next_entry_inc(iter); 1630 ent = trace_find_next_entry_inc(iter);
1685 1631
1686 iter->pos = *pos; 1632 iter->pos = *pos;
1687 1633
1688 return ent; 1634 return ent;
1689} 1635}
1690 1636
1691static void tracing_iter_reset(struct trace_iterator *iter, int cpu) 1637void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1692{ 1638{
1693 struct trace_array *tr = iter->tr; 1639 struct trace_array *tr = iter->tr;
1694 struct ring_buffer_event *event; 1640 struct ring_buffer_event *event;
@@ -2049,7 +1995,7 @@ int trace_empty(struct trace_iterator *iter)
2049} 1995}
2050 1996
2051/* Called with trace_event_read_lock() held. */ 1997/* Called with trace_event_read_lock() held. */
2052static enum print_line_t print_trace_line(struct trace_iterator *iter) 1998enum print_line_t print_trace_line(struct trace_iterator *iter)
2053{ 1999{
2054 enum print_line_t ret; 2000 enum print_line_t ret;
2055 2001
@@ -2394,6 +2340,7 @@ static const struct file_operations show_traces_fops = {
2394 .open = show_traces_open, 2340 .open = show_traces_open,
2395 .read = seq_read, 2341 .read = seq_read,
2396 .release = seq_release, 2342 .release = seq_release,
2343 .llseek = seq_lseek,
2397}; 2344};
2398 2345
2399/* 2346/*
@@ -2487,6 +2434,7 @@ static const struct file_operations tracing_cpumask_fops = {
2487 .open = tracing_open_generic, 2434 .open = tracing_open_generic,
2488 .read = tracing_cpumask_read, 2435 .read = tracing_cpumask_read,
2489 .write = tracing_cpumask_write, 2436 .write = tracing_cpumask_write,
2437 .llseek = generic_file_llseek,
2490}; 2438};
2491 2439
2492static int tracing_trace_options_show(struct seq_file *m, void *v) 2440static int tracing_trace_options_show(struct seq_file *m, void *v)
@@ -2562,6 +2510,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2562 trace_flags |= mask; 2510 trace_flags |= mask;
2563 else 2511 else
2564 trace_flags &= ~mask; 2512 trace_flags &= ~mask;
2513
2514 if (mask == TRACE_ITER_RECORD_CMD)
2515 trace_event_enable_cmd_record(enabled);
2565} 2516}
2566 2517
2567static ssize_t 2518static ssize_t
@@ -2653,6 +2604,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2653static const struct file_operations tracing_readme_fops = { 2604static const struct file_operations tracing_readme_fops = {
2654 .open = tracing_open_generic, 2605 .open = tracing_open_generic,
2655 .read = tracing_readme_read, 2606 .read = tracing_readme_read,
2607 .llseek = generic_file_llseek,
2656}; 2608};
2657 2609
2658static ssize_t 2610static ssize_t
@@ -2703,6 +2655,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2703static const struct file_operations tracing_saved_cmdlines_fops = { 2655static const struct file_operations tracing_saved_cmdlines_fops = {
2704 .open = tracing_open_generic, 2656 .open = tracing_open_generic,
2705 .read = tracing_saved_cmdlines_read, 2657 .read = tracing_saved_cmdlines_read,
2658 .llseek = generic_file_llseek,
2706}; 2659};
2707 2660
2708static ssize_t 2661static ssize_t
@@ -2798,6 +2751,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
2798 if (ret < 0) 2751 if (ret < 0)
2799 return ret; 2752 return ret;
2800 2753
2754 if (!current_trace->use_max_tr)
2755 goto out;
2756
2801 ret = ring_buffer_resize(max_tr.buffer, size); 2757 ret = ring_buffer_resize(max_tr.buffer, size);
2802 if (ret < 0) { 2758 if (ret < 0) {
2803 int r; 2759 int r;
@@ -2825,11 +2781,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
2825 return ret; 2781 return ret;
2826 } 2782 }
2827 2783
2784 max_tr.entries = size;
2785 out:
2828 global_trace.entries = size; 2786 global_trace.entries = size;
2829 2787
2830 return ret; 2788 return ret;
2831} 2789}
2832 2790
2791
2833/** 2792/**
2834 * tracing_update_buffers - used by tracing facility to expand ring buffers 2793 * tracing_update_buffers - used by tracing facility to expand ring buffers
2835 * 2794 *
@@ -2890,12 +2849,26 @@ static int tracing_set_tracer(const char *buf)
2890 trace_branch_disable(); 2849 trace_branch_disable();
2891 if (current_trace && current_trace->reset) 2850 if (current_trace && current_trace->reset)
2892 current_trace->reset(tr); 2851 current_trace->reset(tr);
2893 2852 if (current_trace && current_trace->use_max_tr) {
2853 /*
 2854 * We don't free the ring buffer; instead, we resize it, because
 2855 * the max_tr ring buffer has some state (e.g. ring->clock) that
 2856 * we want to preserve.
2857 */
2858 ring_buffer_resize(max_tr.buffer, 1);
2859 max_tr.entries = 1;
2860 }
2894 destroy_trace_option_files(topts); 2861 destroy_trace_option_files(topts);
2895 2862
2896 current_trace = t; 2863 current_trace = t;
2897 2864
2898 topts = create_trace_option_files(current_trace); 2865 topts = create_trace_option_files(current_trace);
2866 if (current_trace->use_max_tr) {
2867 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
2868 if (ret < 0)
2869 goto out;
2870 max_tr.entries = global_trace.entries;
2871 }
2899 2872
2900 if (t->init) { 2873 if (t->init) {
2901 ret = tracer_init(t, tr); 2874 ret = tracer_init(t, tr);
@@ -3032,6 +3005,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3032 if (iter->trace->pipe_open) 3005 if (iter->trace->pipe_open)
3033 iter->trace->pipe_open(iter); 3006 iter->trace->pipe_open(iter);
3034 3007
3008 nonseekable_open(inode, filp);
3035out: 3009out:
3036 mutex_unlock(&trace_types_lock); 3010 mutex_unlock(&trace_types_lock);
3037 return ret; 3011 return ret;
@@ -3211,7 +3185,7 @@ waitagain:
3211 3185
3212 trace_event_read_lock(); 3186 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file); 3187 trace_access_lock(iter->cpu_file);
3214 while (find_next_entry_inc(iter) != NULL) { 3188 while (trace_find_next_entry_inc(iter) != NULL) {
3215 enum print_line_t ret; 3189 enum print_line_t ret;
3216 int len = iter->seq.len; 3190 int len = iter->seq.len;
3217 3191
@@ -3294,7 +3268,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3294 if (ret != TRACE_TYPE_NO_CONSUME) 3268 if (ret != TRACE_TYPE_NO_CONSUME)
3295 trace_consume(iter); 3269 trace_consume(iter);
3296 rem -= count; 3270 rem -= count;
3297 if (!find_next_entry_inc(iter)) { 3271 if (!trace_find_next_entry_inc(iter)) {
3298 rem = 0; 3272 rem = 0;
3299 iter->ent = NULL; 3273 iter->ent = NULL;
3300 break; 3274 break;
@@ -3350,7 +3324,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3350 if (ret <= 0) 3324 if (ret <= 0)
3351 goto out_err; 3325 goto out_err;
3352 3326
3353 if (!iter->ent && !find_next_entry_inc(iter)) { 3327 if (!iter->ent && !trace_find_next_entry_inc(iter)) {
3354 ret = -EFAULT; 3328 ret = -EFAULT;
3355 goto out_err; 3329 goto out_err;
3356 } 3330 }
@@ -3477,7 +3451,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3477 } 3451 }
3478 3452
3479 tracing_start(); 3453 tracing_start();
3480 max_tr.entries = global_trace.entries;
3481 mutex_unlock(&trace_types_lock); 3454 mutex_unlock(&trace_types_lock);
3482 3455
3483 return cnt; 3456 return cnt;
@@ -3590,18 +3563,21 @@ static const struct file_operations tracing_max_lat_fops = {
3590 .open = tracing_open_generic, 3563 .open = tracing_open_generic,
3591 .read = tracing_max_lat_read, 3564 .read = tracing_max_lat_read,
3592 .write = tracing_max_lat_write, 3565 .write = tracing_max_lat_write,
3566 .llseek = generic_file_llseek,
3593}; 3567};
3594 3568
3595static const struct file_operations tracing_ctrl_fops = { 3569static const struct file_operations tracing_ctrl_fops = {
3596 .open = tracing_open_generic, 3570 .open = tracing_open_generic,
3597 .read = tracing_ctrl_read, 3571 .read = tracing_ctrl_read,
3598 .write = tracing_ctrl_write, 3572 .write = tracing_ctrl_write,
3573 .llseek = generic_file_llseek,
3599}; 3574};
3600 3575
3601static const struct file_operations set_tracer_fops = { 3576static const struct file_operations set_tracer_fops = {
3602 .open = tracing_open_generic, 3577 .open = tracing_open_generic,
3603 .read = tracing_set_trace_read, 3578 .read = tracing_set_trace_read,
3604 .write = tracing_set_trace_write, 3579 .write = tracing_set_trace_write,
3580 .llseek = generic_file_llseek,
3605}; 3581};
3606 3582
3607static const struct file_operations tracing_pipe_fops = { 3583static const struct file_operations tracing_pipe_fops = {
@@ -3610,17 +3586,20 @@ static const struct file_operations tracing_pipe_fops = {
3610 .read = tracing_read_pipe, 3586 .read = tracing_read_pipe,
3611 .splice_read = tracing_splice_read_pipe, 3587 .splice_read = tracing_splice_read_pipe,
3612 .release = tracing_release_pipe, 3588 .release = tracing_release_pipe,
3589 .llseek = no_llseek,
3613}; 3590};
3614 3591
3615static const struct file_operations tracing_entries_fops = { 3592static const struct file_operations tracing_entries_fops = {
3616 .open = tracing_open_generic, 3593 .open = tracing_open_generic,
3617 .read = tracing_entries_read, 3594 .read = tracing_entries_read,
3618 .write = tracing_entries_write, 3595 .write = tracing_entries_write,
3596 .llseek = generic_file_llseek,
3619}; 3597};
3620 3598
3621static const struct file_operations tracing_mark_fops = { 3599static const struct file_operations tracing_mark_fops = {
3622 .open = tracing_open_generic, 3600 .open = tracing_open_generic,
3623 .write = tracing_mark_write, 3601 .write = tracing_mark_write,
3602 .llseek = generic_file_llseek,
3624}; 3603};
3625 3604
3626static const struct file_operations trace_clock_fops = { 3605static const struct file_operations trace_clock_fops = {
@@ -3926,6 +3905,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3926static const struct file_operations tracing_stats_fops = { 3905static const struct file_operations tracing_stats_fops = {
3927 .open = tracing_open_generic, 3906 .open = tracing_open_generic,
3928 .read = tracing_stats_read, 3907 .read = tracing_stats_read,
3908 .llseek = generic_file_llseek,
3929}; 3909};
3930 3910
3931#ifdef CONFIG_DYNAMIC_FTRACE 3911#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3962,6 +3942,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3962static const struct file_operations tracing_dyn_info_fops = { 3942static const struct file_operations tracing_dyn_info_fops = {
3963 .open = tracing_open_generic, 3943 .open = tracing_open_generic,
3964 .read = tracing_read_dyn_info, 3944 .read = tracing_read_dyn_info,
3945 .llseek = generic_file_llseek,
3965}; 3946};
3966#endif 3947#endif
3967 3948
@@ -4115,6 +4096,7 @@ static const struct file_operations trace_options_fops = {
4115 .open = tracing_open_generic, 4096 .open = tracing_open_generic,
4116 .read = trace_options_read, 4097 .read = trace_options_read,
4117 .write = trace_options_write, 4098 .write = trace_options_write,
4099 .llseek = generic_file_llseek,
4118}; 4100};
4119 4101
4120static ssize_t 4102static ssize_t
@@ -4166,6 +4148,7 @@ static const struct file_operations trace_options_core_fops = {
4166 .open = tracing_open_generic, 4148 .open = tracing_open_generic,
4167 .read = trace_options_core_read, 4149 .read = trace_options_core_read,
4168 .write = trace_options_core_write, 4150 .write = trace_options_core_write,
4151 .llseek = generic_file_llseek,
4169}; 4152};
4170 4153
4171struct dentry *trace_create_file(const char *name, 4154struct dentry *trace_create_file(const char *name,
@@ -4355,9 +4338,6 @@ static __init int tracer_init_debugfs(void)
4355 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4338 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4356 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4339 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4357#endif 4340#endif
4358#ifdef CONFIG_SYSPROF_TRACER
4359 init_tracer_sysprof_debugfs(d_tracer);
4360#endif
4361 4341
4362 create_trace_options_dir(); 4342 create_trace_options_dir();
4363 4343
@@ -4414,7 +4394,7 @@ static struct notifier_block trace_die_notifier = {
4414 */ 4394 */
4415#define KERN_TRACE KERN_EMERG 4395#define KERN_TRACE KERN_EMERG
4416 4396
4417static void 4397void
4418trace_printk_seq(struct trace_seq *s) 4398trace_printk_seq(struct trace_seq *s)
4419{ 4399{
4420 /* Probably should print a warning here. */ 4400 /* Probably should print a warning here. */
@@ -4429,6 +4409,13 @@ trace_printk_seq(struct trace_seq *s)
4429 trace_seq_init(s); 4409 trace_seq_init(s);
4430} 4410}
4431 4411
4412void trace_init_global_iter(struct trace_iterator *iter)
4413{
4414 iter->tr = &global_trace;
4415 iter->trace = current_trace;
4416 iter->cpu_file = TRACE_PIPE_ALL_CPU;
4417}
4418
4432static void 4419static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) 4420__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4434{ 4421{
@@ -4454,8 +4441,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4454 if (disable_tracing) 4441 if (disable_tracing)
4455 ftrace_kill(); 4442 ftrace_kill();
4456 4443
4444 trace_init_global_iter(&iter);
4445
4457 for_each_tracing_cpu(cpu) { 4446 for_each_tracing_cpu(cpu) {
4458 atomic_inc(&global_trace.data[cpu]->disabled); 4447 atomic_inc(&iter.tr->data[cpu]->disabled);
4459 } 4448 }
4460 4449
4461 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 4450 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4504,7 +4493,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4504 iter.iter_flags |= TRACE_FILE_LAT_FMT; 4493 iter.iter_flags |= TRACE_FILE_LAT_FMT;
4505 iter.pos = -1; 4494 iter.pos = -1;
4506 4495
4507 if (find_next_entry_inc(&iter) != NULL) { 4496 if (trace_find_next_entry_inc(&iter) != NULL) {
4508 int ret; 4497 int ret;
4509 4498
4510 ret = print_trace_line(&iter); 4499 ret = print_trace_line(&iter);
@@ -4526,7 +4515,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4526 trace_flags |= old_userobj; 4515 trace_flags |= old_userobj;
4527 4516
4528 for_each_tracing_cpu(cpu) { 4517 for_each_tracing_cpu(cpu) {
4529 atomic_dec(&global_trace.data[cpu]->disabled); 4518 atomic_dec(&iter.tr->data[cpu]->disabled);
4530 } 4519 }
4531 tracing_on(); 4520 tracing_on();
4532 } 4521 }
@@ -4575,16 +4564,14 @@ __init static int tracer_alloc_buffers(void)
4575 4564
4576 4565
4577#ifdef CONFIG_TRACER_MAX_TRACE 4566#ifdef CONFIG_TRACER_MAX_TRACE
4578 max_tr.buffer = ring_buffer_alloc(ring_buf_size, 4567 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
4579 TRACE_BUFFER_FLAGS);
4580 if (!max_tr.buffer) { 4568 if (!max_tr.buffer) {
4581 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4569 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4582 WARN_ON(1); 4570 WARN_ON(1);
4583 ring_buffer_free(global_trace.buffer); 4571 ring_buffer_free(global_trace.buffer);
4584 goto out_free_cpumask; 4572 goto out_free_cpumask;
4585 } 4573 }
4586 max_tr.entries = ring_buffer_size(max_tr.buffer); 4574 max_tr.entries = 1;
4587 WARN_ON(max_tr.entries != global_trace.entries);
4588#endif 4575#endif
4589 4576
4590 /* Allocate the first page for all buffers */ 4577 /* Allocate the first page for all buffers */
@@ -4597,9 +4584,6 @@ __init static int tracer_alloc_buffers(void)
4597 4584
4598 register_tracer(&nop_trace); 4585 register_tracer(&nop_trace);
4599 current_trace = &nop_trace; 4586 current_trace = &nop_trace;
4600#ifdef CONFIG_BOOT_TRACER
4601 register_tracer(&boot_tracer);
4602#endif
4603 /* All seems OK, enable tracing */ 4587 /* All seems OK, enable tracing */
4604 tracing_disabled = 0; 4588 tracing_disabled = 0;
4605 4589
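
trace_find_next_entry_inc(), print_trace_line(), trace_printk_seq(), trace_init_global_iter() and tracing_iter_reset() are made non-static here so other in-kernel code (the kdb dump command added elsewhere in this series, per the diffstat) can walk the trace buffers the same way __ftrace_dump() does. A rough consumer sketch follows, under the assumption that writers are quiesced by bumping the per-CPU disabled counters; dump_all_cpus() is an illustrative name only.

	#include <linux/string.h>

	#include "trace.h"

	static void dump_all_cpus(void)
	{
		struct trace_iterator iter;
		int cpu;

		memset(&iter, 0, sizeof(iter));
		trace_init_global_iter(&iter);

		/* Keep new writes out of the way while we walk the buffers. */
		for_each_tracing_cpu(cpu)
			atomic_inc(&iter.tr->data[cpu]->disabled);

		/* Rewind every per-CPU buffer, then print entries in time order. */
		for_each_tracing_cpu(cpu)
			tracing_iter_reset(&iter, cpu);

		while (trace_find_next_entry_inc(&iter)) {
			print_trace_line(&iter);
			trace_printk_seq(&iter.seq);
		}

		for_each_tracing_cpu(cpu)
			atomic_dec(&iter.tr->data[cpu]->disabled);
	}
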
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,10 +9,7 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h>
13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h> 12#include <linux/hw_breakpoint.h>
15
16#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
18 15
@@ -25,30 +22,17 @@ enum trace_type {
25 TRACE_STACK, 22 TRACE_STACK,
26 TRACE_PRINT, 23 TRACE_PRINT,
27 TRACE_BPRINT, 24 TRACE_BPRINT,
28 TRACE_SPECIAL,
29 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
30 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
31 TRACE_BRANCH, 27 TRACE_BRANCH,
32 TRACE_BOOT_CALL,
33 TRACE_BOOT_RET,
34 TRACE_GRAPH_RET, 28 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 29 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 30 TRACE_USER_STACK,
37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE,
39 TRACE_BLK, 31 TRACE_BLK,
40 TRACE_KSYM,
41 32
42 __TRACE_LAST_TYPE, 33 __TRACE_LAST_TYPE,
43}; 34};
44 35
45enum kmemtrace_type_id {
46 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49};
50
51extern struct tracer boot_tracer;
52 36
53#undef __field 37#undef __field
54#define __field(type, item) type item; 38#define __field(type, item) type item;
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void);
204 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 188 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
205 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 189 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
206 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 190 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
207 IF_ASSIGN(var, ent, struct special_entry, 0); \
208 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 191 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
209 TRACE_MMIO_RW); \ 192 TRACE_MMIO_RW); \
210 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 193 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
211 TRACE_MMIO_MAP); \ 194 TRACE_MMIO_MAP); \
212 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
213 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
214 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ 195 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
215 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ 196 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
216 TRACE_GRAPH_ENT); \ 197 TRACE_GRAPH_ENT); \
217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 198 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
218 TRACE_GRAPH_RET); \ 199 TRACE_GRAPH_RET); \
219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
220 TRACE_KMEM_ALLOC); \
221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
222 TRACE_KMEM_FREE); \
223 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
224 __ftrace_bad_type(); \ 200 __ftrace_bad_type(); \
225 } while (0) 201 } while (0)
226 202
@@ -298,6 +274,7 @@ struct tracer {
298 struct tracer *next; 274 struct tracer *next;
299 int print_max; 275 int print_max;
300 struct tracer_flags *flags; 276 struct tracer_flags *flags;
277 int use_max_tr;
301}; 278};
302 279
303 280
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name,
318 const struct file_operations *fops); 295 const struct file_operations *fops);
319 296
320struct dentry *tracing_init_dentry(void); 297struct dentry *tracing_init_dentry(void);
321void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
322 298
323struct ring_buffer_event; 299struct ring_buffer_event;
324 300
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
338struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 314struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
339 int *ent_cpu, u64 *ent_ts); 315 int *ent_cpu, u64 *ent_ts);
340 316
317int trace_empty(struct trace_iterator *iter);
318
319void *trace_find_next_entry_inc(struct trace_iterator *iter);
320
321void trace_init_global_iter(struct trace_iterator *iter);
322
323void tracing_iter_reset(struct trace_iterator *iter, int cpu);
324
341void default_wait_pipe(struct trace_iterator *iter); 325void default_wait_pipe(struct trace_iterator *iter);
342void poll_wait_pipe(struct trace_iterator *iter); 326void poll_wait_pipe(struct trace_iterator *iter);
343 327
@@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
355 struct task_struct *wakee, 339 struct task_struct *wakee,
356 struct task_struct *cur, 340 struct task_struct *cur,
357 unsigned long flags, int pc); 341 unsigned long flags, int pc);
358void trace_special(struct trace_array *tr,
359 struct trace_array_cpu *data,
360 unsigned long arg1,
361 unsigned long arg2,
362 unsigned long arg3, int pc);
363void trace_function(struct trace_array *tr, 342void trace_function(struct trace_array *tr,
364 unsigned long ip, 343 unsigned long ip,
365 unsigned long parent_ip, 344 unsigned long parent_ip,
@@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void);
380int register_tracer(struct tracer *type); 359int register_tracer(struct tracer *type);
381void unregister_tracer(struct tracer *type); 360void unregister_tracer(struct tracer *type);
382int is_tracing_stopped(void); 361int is_tracing_stopped(void);
362enum trace_file_type {
363 TRACE_FILE_LAT_FMT = 1,
364 TRACE_FILE_ANNOTATE = 2,
365};
366
367extern cpumask_var_t __read_mostly tracing_buffer_mask;
383 368
384extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); 369#define for_each_tracing_cpu(cpu) \
370 for_each_cpu(cpu, tracing_buffer_mask)
385 371
386extern unsigned long nsecs_to_usecs(unsigned long nsecs); 372extern unsigned long nsecs_to_usecs(unsigned long nsecs);
387 373
@@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
452 struct trace_array *tr); 438 struct trace_array *tr);
453extern int trace_selftest_startup_sched_switch(struct tracer *trace, 439extern int trace_selftest_startup_sched_switch(struct tracer *trace,
454 struct trace_array *tr); 440 struct trace_array *tr);
455extern int trace_selftest_startup_sysprof(struct tracer *trace,
456 struct trace_array *tr);
457extern int trace_selftest_startup_branch(struct tracer *trace, 441extern int trace_selftest_startup_branch(struct tracer *trace,
458 struct trace_array *tr); 442 struct trace_array *tr);
459extern int trace_selftest_startup_ksym(struct tracer *trace,
460 struct trace_array *tr);
461#endif /* CONFIG_FTRACE_STARTUP_TEST */ 443#endif /* CONFIG_FTRACE_STARTUP_TEST */
462 444
463extern void *head_page(struct trace_array_cpu *data); 445extern void *head_page(struct trace_array_cpu *data);
@@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
471 unsigned long ip, const char *fmt, va_list args); 453 unsigned long ip, const char *fmt, va_list args);
472int trace_array_printk(struct trace_array *tr, 454int trace_array_printk(struct trace_array *tr,
473 unsigned long ip, const char *fmt, ...); 455 unsigned long ip, const char *fmt, ...);
456void trace_printk_seq(struct trace_seq *s);
457enum print_line_t print_trace_line(struct trace_iterator *iter);
474 458
475extern unsigned long trace_flags; 459extern unsigned long trace_flags;
476 460
@@ -617,6 +601,7 @@ enum trace_iterator_flags {
617 TRACE_ITER_LATENCY_FMT = 0x20000, 601 TRACE_ITER_LATENCY_FMT = 0x20000,
618 TRACE_ITER_SLEEP_TIME = 0x40000, 602 TRACE_ITER_SLEEP_TIME = 0x40000,
619 TRACE_ITER_GRAPH_TIME = 0x80000, 603 TRACE_ITER_GRAPH_TIME = 0x80000,
604 TRACE_ITER_RECORD_CMD = 0x100000,
620}; 605};
621 606
622/* 607/*
@@ -628,54 +613,6 @@ enum trace_iterator_flags {
628 613
629extern struct tracer nop_trace; 614extern struct tracer nop_trace;
630 615
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
 634 * When tracing can happen inside the scheduler, there are
 635 * cases where the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
 642 * check the flag; if it is set, we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
 645 * The rationale for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665 * any preemption checks. The disabled saved the state of preemption.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 616#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 617extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 618extern void disable_branch_tracing(void);
@@ -766,6 +703,8 @@ struct filter_pred {
766 int pop_n; 703 int pop_n;
767}; 704};
768 705
706extern struct list_head ftrace_common_fields;
707
769extern enum regex_type 708extern enum regex_type
770filter_parse_regex(char *buff, int len, char **search, int *not); 709filter_parse_regex(char *buff, int len, char **search, int *not);
771extern void print_event_filter(struct ftrace_event_call *call, 710extern void print_event_filter(struct ftrace_event_call *call,
@@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 734 return 0;
796} 735}
797 736
737extern void trace_event_enable_cmd_record(bool enable);
738
798extern struct mutex event_mutex; 739extern struct mutex event_mutex;
799extern struct list_head ftrace_events; 740extern struct list_head ftrace_events;
800 741
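
The new use_max_tr field in struct tracer, together with the tracing_set_tracer()/tracing_resize_ring_buffer() changes in trace.c above, means the max_tr snapshot buffer is grown only while a tracer that wants it is active. A sketch of a tracer opting in; the tracer and its callbacks are illustrative and defined nowhere in this patch.

	#include "trace.h"

	static int my_latency_init(struct trace_array *tr)
	{
		tracing_reset_online_cpus(tr);
		return 0;
	}

	static void my_latency_reset(struct trace_array *tr)
	{
		tracing_reset_online_cpus(tr);
	}

	static struct tracer my_latency_tracer __read_mostly = {
		.name		= "my_latency",
		.init		= my_latency_init,
		.reset		= my_latency_reset,
		.print_max	= 1,
		/* Ask the core to size max_tr only while this tracer is active. */
		.use_max_tr	= 1,
	};

Such a tracer would be registered with register_tracer() as usual; tracers that leave use_max_tr at 0 now cost only the single-page placeholder buffer allocated in tracer_alloc_buffers().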
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index c21d5f3956ad..000000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12#include <linux/time.h>
13
14#include "trace.h"
15#include "trace_output.h"
16
17static struct trace_array *boot_trace;
18static bool pre_initcalls_finished;
19
20/* Tells the boot tracer that the pre_smp_initcalls are finished.
 21 * So we are ready.
 22 * It doesn't enable sched events tracing, however.
23 * You have to call enable_boot_trace to do so.
24 */
25void start_boot_trace(void)
26{
27 pre_initcalls_finished = true;
28}
29
30void enable_boot_trace(void)
31{
32 if (boot_trace && pre_initcalls_finished)
33 tracing_start_sched_switch_record();
34}
35
36void disable_boot_trace(void)
37{
38 if (boot_trace && pre_initcalls_finished)
39 tracing_stop_sched_switch_record();
40}
41
42static int boot_trace_init(struct trace_array *tr)
43{
44 boot_trace = tr;
45
46 if (!tr)
47 return 0;
48
49 tracing_reset_online_cpus(tr);
50
51 tracing_sched_switch_assign_trace(tr);
52 return 0;
53}
54
55static enum print_line_t
56initcall_call_print_line(struct trace_iterator *iter)
57{
58 struct trace_entry *entry = iter->ent;
59 struct trace_seq *s = &iter->seq;
60 struct trace_boot_call *field;
61 struct boot_trace_call *call;
62 u64 ts;
63 unsigned long nsec_rem;
64 int ret;
65
66 trace_assign_type(field, entry);
67 call = &field->boot_call;
68 ts = iter->ts;
69 nsec_rem = do_div(ts, NSEC_PER_SEC);
70
71 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
72 (unsigned long)ts, nsec_rem, call->func, call->caller);
73
74 if (!ret)
75 return TRACE_TYPE_PARTIAL_LINE;
76 else
77 return TRACE_TYPE_HANDLED;
78}
79
80static enum print_line_t
81initcall_ret_print_line(struct trace_iterator *iter)
82{
83 struct trace_entry *entry = iter->ent;
84 struct trace_seq *s = &iter->seq;
85 struct trace_boot_ret *field;
86 struct boot_trace_ret *init_ret;
87 u64 ts;
88 unsigned long nsec_rem;
89 int ret;
90
91 trace_assign_type(field, entry);
92 init_ret = &field->boot_ret;
93 ts = iter->ts;
94 nsec_rem = do_div(ts, NSEC_PER_SEC);
95
96 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
97 "returned %d after %llu msecs\n",
98 (unsigned long) ts,
99 nsec_rem,
100 init_ret->func, init_ret->result, init_ret->duration);
101
102 if (!ret)
103 return TRACE_TYPE_PARTIAL_LINE;
104 else
105 return TRACE_TYPE_HANDLED;
106}
107
108static enum print_line_t initcall_print_line(struct trace_iterator *iter)
109{
110 struct trace_entry *entry = iter->ent;
111
112 switch (entry->type) {
113 case TRACE_BOOT_CALL:
114 return initcall_call_print_line(iter);
115 case TRACE_BOOT_RET:
116 return initcall_ret_print_line(iter);
117 default:
118 return TRACE_TYPE_UNHANDLED;
119 }
120}
121
122struct tracer boot_tracer __read_mostly =
123{
124 .name = "initcall",
125 .init = boot_trace_init,
126 .reset = tracing_reset_online_cpus,
127 .print_line = initcall_print_line,
128};
129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{
132 struct ftrace_event_call *call = &event_boot_call;
133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace;
137
138 if (!tr || !pre_initcalls_finished)
139 return;
140
141 /* Get its name now since this function could
142 * disappear because it is in the .init section.
143 */
144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable();
146
147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
149 sizeof(*entry), 0, 0);
150 if (!event)
151 goto out;
152 entry = ring_buffer_event_data(event);
153 entry->boot_call = *bt;
154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
156 out:
157 preempt_enable();
158}
159
160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
161{
162 struct ftrace_event_call *call = &event_boot_ret;
163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
165 struct trace_boot_ret *entry;
166 struct trace_array *tr = boot_trace;
167
168 if (!tr || !pre_initcalls_finished)
169 return;
170
171 sprint_symbol(bt->func, (unsigned long)fn);
172 preempt_disable();
173
174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
176 sizeof(*entry), 0, 0);
177 if (!event)
178 goto out;
179 entry = ring_buffer_event_data(event);
180 entry->boot_ret = *bt;
181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
183 out:
184 preempt_enable();
185}
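
The header comment of this removed file describes a three-step protocol: start_boot_trace() marks the pre-SMP initcalls as finished, and enable_boot_trace()/disable_boot_trace() bracket the phase in which sched events should also be recorded. A hedged sketch of that usage; the call site is illustrative, not quoted from init/main.c, and the declarations are assumed to be visible through the boot tracer's headers.

	#include <linux/init.h>
	#include <linux/ftrace.h>

	static void __init boot_trace_usage_sketch(void)
	{
		/* Pre-SMP initcalls are done; the tracer may start recording. */
		start_boot_trace();

		/* Bracket the phase in which sched events should also be traced. */
		enable_boot_trace();
		/* ... run the later initcalls here ... */
		disable_boot_trace();
	}
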
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void)
56 */ 55 */
57u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
58{ 57{
59 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
60} 59}
61 60
62 61
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index dc008c1240da..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
151); 151);
152 152
153/* 153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry: 154 * Stack-trace entry:
172 */ 155 */
173 156
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
271 __entry->map_id, __entry->opcode) 254 __entry->map_id, __entry->opcode)
272); 255);
273 256
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301 257
302#define TRACE_FUNC_SIZE 30 258#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20 259#define TRACE_FILE_SIZE 20
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 274 __entry->func, __entry->file, __entry->correct)
319); 275);
320 276
321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
322
323 TRACE_KMEM_ALLOC,
324
325 F_STRUCT(
326 __field( enum kmemtrace_type_id, type_id )
327 __field( unsigned long, call_site )
328 __field( const void *, ptr )
329 __field( size_t, bytes_req )
330 __field( size_t, bytes_alloc )
331 __field( gfp_t, gfp_flags )
332 __field( int, node )
333 ),
334
335 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
336 " flags:%x node:%d",
337 __entry->type_id, __entry->call_site, __entry->ptr,
338 __entry->bytes_req, __entry->bytes_alloc,
339 __entry->gfp_flags, __entry->node)
340);
341
342FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
343
344 TRACE_KMEM_FREE,
345
346 F_STRUCT(
347 __field( enum kmemtrace_type_id, type_id )
348 __field( unsigned long, call_site )
349 __field( const void *, ptr )
350 ),
351
352 F_printk("type:%u call_site:%lx ptr:%p",
353 __entry->type_id, __entry->call_site, __entry->ptr)
354);
355
356FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
357
358 TRACE_KSYM,
359
360 F_STRUCT(
361 __field( unsigned long, ip )
362 __field( unsigned char, type )
363 __array( char , cmd, TASK_COMM_LEN )
364 __field( unsigned long, addr )
365 ),
366
367 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
368 (void *)__entry->ip, (unsigned int)__entry->type,
369 (void *)__entry->addr, __entry->cmd)
370);
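
The removed definitions above all use the FTRACE_ENTRY()/F_STRUCT()/F_printk() pattern from this header. As a reminder of that macro's shape, here is a hypothetical entry; TRACE_EXAMPLE and the field names are invented for illustration and exist nowhere in this patch.

	/*
	 * Hypothetical entry, showing the macro shape only.
	 */
	FTRACE_ENTRY(example, example_entry,

		TRACE_EXAMPLE,

		F_STRUCT(
			__field(	unsigned long,	ip	)
			__field(	int,		value	)
		),

		F_printk("%pf: %d", (void *)__entry->ip, __entry->value)
	);
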
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 8a2b73f7c068..000e6e85b445 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
13
14static char *perf_trace_buf[4]; 12static char *perf_trace_buf[4];
15 13
16/* 14/*
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
56 } 54 }
57 } 55 }
58 56
59 if (tp_event->class->reg) 57 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret) 58 if (ret)
67 goto fail; 59 goto fail;
68 60
@@ -96,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event)
96 mutex_lock(&event_mutex); 88 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) { 89 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id && 90 if (tp_event->event.type == event_id &&
99 tp_event->class && 91 tp_event->class && tp_event->class->reg &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
104 break; 94 break;
@@ -138,18 +128,13 @@ void perf_trace_destroy(struct perf_event *p_event)
138 if (--tp_event->perf_refcount > 0) 128 if (--tp_event->perf_refcount > 0)
139 goto out; 129 goto out;
140 130
141 if (tp_event->class->reg) 131 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147 132
148 /* 133 /*
149 * Ensure our callback won't be called anymore. See 134 * Ensure our callback won't be called anymore. The buffers
150 * tracepoint_probe_unregister() and __DO_TRACE(). 135 * will be freed after that.
151 */ 136 */
152 synchronize_sched(); 137 tracepoint_synchronize_unregister();
153 138
154 free_percpu(tp_event->perf_events); 139 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL; 140 tp_event->perf_events = NULL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..09b4fa6e4d3b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -28,6 +28,7 @@
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields);
31 32
32struct list_head * 33struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call) 34trace_get_fields(struct ftrace_event_call *event_call)
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call)
37 return event_call->class->get_fields(event_call); 38 return event_call->class->get_fields(event_call);
38} 39}
39 40
40int trace_define_field(struct ftrace_event_call *call, const char *type, 41static int __trace_define_field(struct list_head *head, const char *type,
41 const char *name, int offset, int size, int is_signed, 42 const char *name, int offset, int size,
42 int filter_type) 43 int is_signed, int filter_type)
43{ 44{
44 struct ftrace_event_field *field; 45 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
49 46
50 field = kzalloc(sizeof(*field), GFP_KERNEL); 47 field = kzalloc(sizeof(*field), GFP_KERNEL);
51 if (!field) 48 if (!field)
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
68 field->size = size; 65 field->size = size;
69 field->is_signed = is_signed; 66 field->is_signed = is_signed;
70 67
71 head = trace_get_fields(call);
72 list_add(&field->link, head); 68 list_add(&field->link, head);
73 69
74 return 0; 70 return 0;
@@ -80,17 +76,32 @@ err:
80 76
81 return -ENOMEM; 77 return -ENOMEM;
82} 78}
79
80int trace_define_field(struct ftrace_event_call *call, const char *type,
81 const char *name, int offset, int size, int is_signed,
82 int filter_type)
83{
84 struct list_head *head;
85
86 if (WARN_ON(!call->class))
87 return 0;
88
89 head = trace_get_fields(call);
90 return __trace_define_field(head, type, name, offset, size,
91 is_signed, filter_type);
92}
83EXPORT_SYMBOL_GPL(trace_define_field); 93EXPORT_SYMBOL_GPL(trace_define_field);
84 94
85#define __common_field(type, item) \ 95#define __common_field(type, item) \
86 ret = trace_define_field(call, #type, "common_" #item, \ 96 ret = __trace_define_field(&ftrace_common_fields, #type, \
87 offsetof(typeof(ent), item), \ 97 "common_" #item, \
88 sizeof(ent.item), \ 98 offsetof(typeof(ent), item), \
89 is_signed_type(type), FILTER_OTHER); \ 99 sizeof(ent.item), \
100 is_signed_type(type), FILTER_OTHER); \
90 if (ret) \ 101 if (ret) \
91 return ret; 102 return ret;
92 103
93static int trace_define_common_fields(struct ftrace_event_call *call) 104static int trace_define_common_fields(void)
94{ 105{
95 int ret; 106 int ret;
96 struct trace_entry ent; 107 struct trace_entry ent;
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call)
130} 141}
131EXPORT_SYMBOL_GPL(trace_event_raw_init); 142EXPORT_SYMBOL_GPL(trace_event_raw_init);
132 143
144int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
145{
146 switch (type) {
147 case TRACE_REG_REGISTER:
148 return tracepoint_probe_register(call->name,
149 call->class->probe,
150 call);
151 case TRACE_REG_UNREGISTER:
152 tracepoint_probe_unregister(call->name,
153 call->class->probe,
154 call);
155 return 0;
156
157#ifdef CONFIG_PERF_EVENTS
158 case TRACE_REG_PERF_REGISTER:
159 return tracepoint_probe_register(call->name,
160 call->class->perf_probe,
161 call);
162 case TRACE_REG_PERF_UNREGISTER:
163 tracepoint_probe_unregister(call->name,
164 call->class->perf_probe,
165 call);
166 return 0;
167#endif
168 }
169 return 0;
170}
171EXPORT_SYMBOL_GPL(ftrace_event_reg);
172
173void trace_event_enable_cmd_record(bool enable)
174{
175 struct ftrace_event_call *call;
176
177 mutex_lock(&event_mutex);
178 list_for_each_entry(call, &ftrace_events, list) {
179 if (!(call->flags & TRACE_EVENT_FL_ENABLED))
180 continue;
181
182 if (enable) {
183 tracing_start_cmdline_record();
184 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
185 } else {
186 tracing_stop_cmdline_record();
187 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
188 }
189 }
190 mutex_unlock(&event_mutex);
191}
192
133static int ftrace_event_enable_disable(struct ftrace_event_call *call, 193static int ftrace_event_enable_disable(struct ftrace_event_call *call,
134 int enable) 194 int enable)
135{ 195{
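trace_event_enable_cmd_record() exists so cmdline recording can be toggled after events are already enabled: it walks every enabled event, starts or stops comm recording, and keeps the per-event TRACE_EVENT_FL_RECORDED_CMD flag in sync. A sketch of the kind of option handler that would drive it, assuming the TRACE_ITER_RECORD_CMD bit whose real handling lives in kernel/trace/trace.c (not shown in this hunk):

    /* Sketch only: flipping the record-cmd option at runtime. */
    static void set_record_cmd_option(bool enabled)
    {
            if (enabled)
                    trace_flags |= TRACE_ITER_RECORD_CMD;
            else
                    trace_flags &= ~TRACE_ITER_RECORD_CMD;

            /* retroactively start/stop comm recording for enabled events */
            trace_event_enable_cmd_record(enabled);
    }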
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
139 case 0: 199 case 0:
140 if (call->flags & TRACE_EVENT_FL_ENABLED) { 200 if (call->flags & TRACE_EVENT_FL_ENABLED) {
141 call->flags &= ~TRACE_EVENT_FL_ENABLED; 201 call->flags &= ~TRACE_EVENT_FL_ENABLED;
142 tracing_stop_cmdline_record(); 202 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
143 if (call->class->reg) 203 tracing_stop_cmdline_record();
144 call->class->reg(call, TRACE_REG_UNREGISTER); 204 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
145 else 205 }
146 tracepoint_probe_unregister(call->name, 206 call->class->reg(call, TRACE_REG_UNREGISTER);
147 call->class->probe,
148 call);
149 } 207 }
150 break; 208 break;
151 case 1: 209 case 1:
152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 210 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
153 tracing_start_cmdline_record(); 211 if (trace_flags & TRACE_ITER_RECORD_CMD) {
154 if (call->class->reg) 212 tracing_start_cmdline_record();
155 ret = call->class->reg(call, TRACE_REG_REGISTER); 213 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
156 else 214 }
157 ret = tracepoint_probe_register(call->name, 215 ret = call->class->reg(call, TRACE_REG_REGISTER);
158 call->class->probe,
159 call);
160 if (ret) { 216 if (ret) {
161 tracing_stop_cmdline_record(); 217 tracing_stop_cmdline_record();
162 pr_info("event trace: Could not enable event " 218 pr_info("event trace: Could not enable event "
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
194 mutex_lock(&event_mutex); 250 mutex_lock(&event_mutex);
195 list_for_each_entry(call, &ftrace_events, list) { 251 list_for_each_entry(call, &ftrace_events, list) {
196 252
197 if (!call->name || !call->class || 253 if (!call->name || !call->class || !call->class->reg)
198 (!call->class->probe && !call->class->reg))
199 continue; 254 continue;
200 255
201 if (match && 256 if (match &&
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
321 * The ftrace subsystem is for showing formats only. 376 * The ftrace subsystem is for showing formats only.
322 * They can not be enabled or disabled via the event files. 377 * They can not be enabled or disabled via the event files.
323 */ 378 */
324 if (call->class && (call->class->probe || call->class->reg)) 379 if (call->class && call->class->reg)
325 return call; 380 return call;
326 } 381 }
327 382
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
474 529
475 mutex_lock(&event_mutex); 530 mutex_lock(&event_mutex);
476 list_for_each_entry(call, &ftrace_events, list) { 531 list_for_each_entry(call, &ftrace_events, list) {
477 if (!call->name || !call->class || 532 if (!call->name || !call->class || !call->class->reg)
478 (!call->class->probe && !call->class->reg))
479 continue; 533 continue;
480 534
481 if (system && strcmp(call->class->system, system) != 0) 535 if (system && strcmp(call->class->system, system) != 0)
@@ -544,32 +598,10 @@ out:
544 return ret; 598 return ret;
545} 599}
546 600
547static ssize_t 601static void print_event_fields(struct trace_seq *s, struct list_head *head)
548event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
549 loff_t *ppos)
550{ 602{
551 struct ftrace_event_call *call = filp->private_data;
552 struct ftrace_event_field *field; 603 struct ftrace_event_field *field;
553 struct list_head *head;
554 struct trace_seq *s;
555 int common_field_count = 5;
556 char *buf;
557 int r = 0;
558
559 if (*ppos)
560 return 0;
561
562 s = kmalloc(sizeof(*s), GFP_KERNEL);
563 if (!s)
564 return -ENOMEM;
565
566 trace_seq_init(s);
567
568 trace_seq_printf(s, "name: %s\n", call->name);
569 trace_seq_printf(s, "ID: %d\n", call->event.type);
570 trace_seq_printf(s, "format:\n");
571 604
572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) { 605 list_for_each_entry_reverse(field, head, link) {
574 /* 606 /*
575 * Smartly shows the array type(except dynamic array). 607 * Smartly shows the array type(except dynamic array).
@@ -584,29 +616,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
584 array_descriptor = NULL; 616 array_descriptor = NULL;
585 617
586 if (!array_descriptor) { 618 if (!array_descriptor) {
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" 619 trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
588 "\tsize:%u;\tsigned:%d;\n", 620 "\tsize:%u;\tsigned:%d;\n",
589 field->type, field->name, field->offset, 621 field->type, field->name, field->offset,
590 field->size, !!field->is_signed); 622 field->size, !!field->is_signed);
591 } else { 623 } else {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" 624 trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
593 "\tsize:%u;\tsigned:%d;\n", 625 "\tsize:%u;\tsigned:%d;\n",
594 (int)(array_descriptor - field->type), 626 (int)(array_descriptor - field->type),
595 field->type, field->name, 627 field->type, field->name,
596 array_descriptor, field->offset, 628 array_descriptor, field->offset,
597 field->size, !!field->is_signed); 629 field->size, !!field->is_signed);
598 } 630 }
631 }
632}
599 633
600 if (--common_field_count == 0) 634static ssize_t
601 r = trace_seq_printf(s, "\n"); 635event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
636 loff_t *ppos)
637{
638 struct ftrace_event_call *call = filp->private_data;
639 struct list_head *head;
640 struct trace_seq *s;
641 char *buf;
642 int r;
602 643
603 if (!r) 644 if (*ppos)
604 break; 645 return 0;
605 } 646
647 s = kmalloc(sizeof(*s), GFP_KERNEL);
648 if (!s)
649 return -ENOMEM;
650
651 trace_seq_init(s);
652
653 trace_seq_printf(s, "name: %s\n", call->name);
654 trace_seq_printf(s, "ID: %d\n", call->event.type);
655 trace_seq_printf(s, "format:\n");
656
657 /* print common fields */
658 print_event_fields(s, &ftrace_common_fields);
606 659
607 if (r) 660 trace_seq_putc(s, '\n');
608 r = trace_seq_printf(s, "\nprint fmt: %s\n", 661
609 call->print_fmt); 662 /* print event specific fields */
663 head = trace_get_fields(call);
664 print_event_fields(s, head);
665
666 r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt);
610 667
611 if (!r) { 668 if (!r) {
612 /* 669 /*
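After this refactor the per-event "format" file is assembled from two lists: print_event_fields() runs once over ftrace_common_fields, a blank line is emitted, then once over the event's own fields, replacing the old common_field_count == 5 counting trick. The file layout itself is unchanged; roughly (abridged sample output, ID and offsets illustrative):

    name: sched_wakeup
    ID: 55
    format:
            field:unsigned short common_type;       offset:0;       size:2; signed:0;
            field:unsigned char common_flags;       offset:2;       size:1; signed:0;
            field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
            field:int common_pid;   offset:4;       size:4; signed:1;
            field:int common_lock_depth;    offset:8;       size:4; signed:1;

            field:char comm[16];    offset:12;      size:16;        signed:1;
            field:pid_t pid;        offset:28;      size:4; signed:1;

    print fmt: "comm=%s pid=%d ...", REC->comm, REC->pid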
@@ -963,35 +1020,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 return -1; 1020 return -1;
964 } 1021 }
965 1022
966 if (call->class->probe || call->class->reg) 1023 if (call->class->reg)
967 trace_create_file("enable", 0644, call->dir, call, 1024 trace_create_file("enable", 0644, call->dir, call,
968 enable); 1025 enable);
969 1026
970#ifdef CONFIG_PERF_EVENTS 1027#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg)) 1028 if (call->event.type && call->class->reg)
972 trace_create_file("id", 0444, call->dir, call, 1029 trace_create_file("id", 0444, call->dir, call,
973 id); 1030 id);
974#endif 1031#endif
975 1032
976 if (call->class->define_fields) { 1033 /*
977 /* 1034 * Other events may have the same class. Only update
978 * Other events may have the same class. Only update 1035 * the fields if they are not already defined.
979 * the fields if they are not already defined. 1036 */
980 */ 1037 head = trace_get_fields(call);
981 head = trace_get_fields(call); 1038 if (list_empty(head)) {
982 if (list_empty(head)) { 1039 ret = call->class->define_fields(call);
983 ret = trace_define_common_fields(call); 1040 if (ret < 0) {
984 if (!ret) 1041 pr_warning("Could not initialize trace point"
985 ret = call->class->define_fields(call); 1042 " events/%s\n", call->name);
986 if (ret < 0) { 1043 return ret;
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
991 } 1044 }
992 trace_create_file("filter", 0644, call->dir, call,
993 filter);
994 } 1045 }
1046 trace_create_file("filter", 0644, call->dir, call,
1047 filter);
995 1048
996 trace_create_file("format", 0444, call->dir, call, 1049 trace_create_file("format", 0444, call->dir, call,
997 format); 1050 format);
@@ -999,11 +1052,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
999 return 0; 1052 return 0;
1000} 1053}
1001 1054
1002static int __trace_add_event_call(struct ftrace_event_call *call) 1055static int
1056__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1057 const struct file_operations *id,
1058 const struct file_operations *enable,
1059 const struct file_operations *filter,
1060 const struct file_operations *format)
1003{ 1061{
1004 struct dentry *d_events; 1062 struct dentry *d_events;
1005 int ret; 1063 int ret;
1006 1064
1065 /* The linker may leave blanks */
1007 if (!call->name) 1066 if (!call->name)
1008 return -EINVAL; 1067 return -EINVAL;
1009 1068
@@ -1011,8 +1070,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1011 ret = call->class->raw_init(call); 1070 ret = call->class->raw_init(call);
1012 if (ret < 0) { 1071 if (ret < 0) {
1013 if (ret != -ENOSYS) 1072 if (ret != -ENOSYS)
1014 pr_warning("Could not initialize trace " 1073 pr_warning("Could not initialize trace events/%s\n",
1015 "events/%s\n", call->name); 1074 call->name);
1016 return ret; 1075 return ret;
1017 } 1076 }
1018 } 1077 }
@@ -1021,11 +1080,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
1021 if (!d_events) 1080 if (!d_events)
1022 return -ENOENT; 1081 return -ENOENT;
1023 1082
1024 ret = event_create_dir(call, d_events, &ftrace_event_id_fops, 1083 ret = event_create_dir(call, d_events, id, enable, filter, format);
1025 &ftrace_enable_fops, &ftrace_event_filter_fops,
1026 &ftrace_event_format_fops);
1027 if (!ret) 1084 if (!ret)
1028 list_add(&call->list, &ftrace_events); 1085 list_add(&call->list, &ftrace_events);
1086 call->mod = mod;
1029 1087
1030 return ret; 1088 return ret;
1031} 1089}
@@ -1035,7 +1093,10 @@ int trace_add_event_call(struct ftrace_event_call *call)
1035{ 1093{
1036 int ret; 1094 int ret;
1037 mutex_lock(&event_mutex); 1095 mutex_lock(&event_mutex);
1038 ret = __trace_add_event_call(call); 1096 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1097 &ftrace_enable_fops,
1098 &ftrace_event_filter_fops,
1099 &ftrace_event_format_fops);
1039 mutex_unlock(&event_mutex); 1100 mutex_unlock(&event_mutex);
1040 return ret; 1101 return ret;
1041} 1102}
@@ -1152,8 +1213,6 @@ static void trace_module_add_events(struct module *mod)
1152{ 1213{
1153 struct ftrace_module_file_ops *file_ops = NULL; 1214 struct ftrace_module_file_ops *file_ops = NULL;
1154 struct ftrace_event_call *call, *start, *end; 1215 struct ftrace_event_call *call, *start, *end;
1155 struct dentry *d_events;
1156 int ret;
1157 1216
1158 start = mod->trace_events; 1217 start = mod->trace_events;
1159 end = mod->trace_events + mod->num_trace_events; 1218 end = mod->trace_events + mod->num_trace_events;
@@ -1161,38 +1220,14 @@ static void trace_module_add_events(struct module *mod)
1161 if (start == end) 1220 if (start == end)
1162 return; 1221 return;
1163 1222
1164 d_events = event_trace_events_dir(); 1223 file_ops = trace_create_file_ops(mod);
1165 if (!d_events) 1224 if (!file_ops)
1166 return; 1225 return;
1167 1226
1168 for_each_event(call, start, end) { 1227 for_each_event(call, start, end) {
1169 /* The linker may leave blanks */ 1228 __trace_add_event_call(call, mod,
1170 if (!call->name)
1171 continue;
1172 if (call->class->raw_init) {
1173 ret = call->class->raw_init(call);
1174 if (ret < 0) {
1175 if (ret != -ENOSYS)
1176 pr_warning("Could not initialize trace "
1177 "point events/%s\n", call->name);
1178 continue;
1179 }
1180 }
1181 /*
1182 * This module has events, create file ops for this module
1183 * if not already done.
1184 */
1185 if (!file_ops) {
1186 file_ops = trace_create_file_ops(mod);
1187 if (!file_ops)
1188 return;
1189 }
1190 call->mod = mod;
1191 ret = event_create_dir(call, d_events,
1192 &file_ops->id, &file_ops->enable, 1229 &file_ops->id, &file_ops->enable,
1193 &file_ops->filter, &file_ops->format); 1230 &file_ops->filter, &file_ops->format);
1194 if (!ret)
1195 list_add(&call->list, &ftrace_events);
1196 } 1231 }
1197} 1232}
1198 1233
@@ -1319,25 +1354,14 @@ static __init int event_trace_init(void)
1319 trace_create_file("enable", 0644, d_events, 1354 trace_create_file("enable", 0644, d_events,
1320 NULL, &ftrace_system_enable_fops); 1355 NULL, &ftrace_system_enable_fops);
1321 1356
1357 if (trace_define_common_fields())
1358 pr_warning("tracing: Failed to allocate common fields");
1359
1322 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1360 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1323 /* The linker may leave blanks */ 1361 __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1324 if (!call->name)
1325 continue;
1326 if (call->class->raw_init) {
1327 ret = call->class->raw_init(call);
1328 if (ret < 0) {
1329 if (ret != -ENOSYS)
1330 pr_warning("Could not initialize trace "
1331 "point events/%s\n", call->name);
1332 continue;
1333 }
1334 }
1335 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1336 &ftrace_enable_fops, 1362 &ftrace_enable_fops,
1337 &ftrace_event_filter_fops, 1363 &ftrace_event_filter_fops,
1338 &ftrace_event_format_fops); 1364 &ftrace_event_format_fops);
1339 if (!ret)
1340 list_add(&call->list, &ftrace_events);
1341 } 1365 }
1342 1366
1343 while (true) { 1367 while (true) {
@@ -1524,12 +1548,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1548 struct ftrace_entry *entry;
1525 unsigned long flags; 1549 unsigned long flags;
1526 long disabled; 1550 long disabled;
1527 int resched;
1528 int cpu; 1551 int cpu;
1529 int pc; 1552 int pc;
1530 1553
1531 pc = preempt_count(); 1554 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1555 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1556 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1557 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1558
@@ -1551,7 +1574,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1574
1552 out: 1575 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1576 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1577 preempt_enable_notrace();
1555} 1578}
1556 1579
1557static struct ftrace_ops trace_ops __initdata = 1580static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..36d40104b17f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system,
497} 497}
498 498
499static struct ftrace_event_field * 499static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500__find_event_field(struct list_head *head, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
504 503
505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) { 504 list_for_each_entry(field, head, link) {
507 if (!strcmp(field->name, name)) 505 if (!strcmp(field->name, name))
508 return field; 506 return field;
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
511 return NULL; 509 return NULL;
512} 510}
513 511
512static struct ftrace_event_field *
513find_event_field(struct ftrace_event_call *call, char *name)
514{
515 struct ftrace_event_field *field;
516 struct list_head *head;
517
518 field = __find_event_field(&ftrace_common_fields, name);
519 if (field)
520 return field;
521
522 head = trace_get_fields(call);
523 return __find_event_field(head, name);
524}
525
514static void filter_free_pred(struct filter_pred *pred) 526static void filter_free_pred(struct filter_pred *pred)
515{ 527{
516 if (!pred) 528 if (!pred)
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system)
627 int err; 639 int err;
628 640
629 list_for_each_entry(call, &ftrace_events, list) { 641 list_for_each_entry(call, &ftrace_events, list) {
630 if (!call->class || !call->class->define_fields)
631 continue;
632
633 if (strcmp(call->class->system, system->name) != 0) 642 if (strcmp(call->class->system, system->name) != 0)
634 continue; 643 continue;
635 644
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
646 struct ftrace_event_call *call; 655 struct ftrace_event_call *call;
647 656
648 list_for_each_entry(call, &ftrace_events, list) { 657 list_for_each_entry(call, &ftrace_events, list) {
649 if (!call->class || !call->class->define_fields)
650 continue;
651
652 if (strcmp(call->class->system, system->name) != 0) 658 if (strcmp(call->class->system, system->name) != 0)
653 continue; 659 continue;
654 660
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system,
1251 list_for_each_entry(call, &ftrace_events, list) { 1257 list_for_each_entry(call, &ftrace_events, list) {
1252 struct event_filter *filter = call->filter; 1258 struct event_filter *filter = call->filter;
1253 1259
1254 if (!call->class || !call->class->define_fields)
1255 continue;
1256
1257 if (strcmp(call->class->system, system->name) != 0) 1260 if (strcmp(call->class->system, system->name) != 0)
1258 continue; 1261 continue;
1259 1262
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 8536e2a65969..4ba44deaac25 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
125 125
126#include "trace_entries.h" 126#include "trace_entries.h"
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{
130 INIT_LIST_HEAD(&call->class->fields);
131 return 0;
132}
133
134#undef __entry 128#undef __entry
135#define __entry REC 129#define __entry REC
136 130
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
158struct ftrace_event_class event_class_ftrace_##call = { \ 152struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 153 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 154 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \ 155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 156}; \
163 \ 157 \
164struct ftrace_event_call __used \ 158struct ftrace_event_call __used \
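The raw_init hook whose only job was to INIT_LIST_HEAD() the fields list is dropped; the list is now initialised statically inside the class definition via LIST_HEAD_INIT. For an entry "foo" in trace_entries.h the updated macro generates, roughly (illustrative expansion, not verbatim preprocessor output):

    struct ftrace_event_class event_class_ftrace_foo = {
            .system         = "ftrace",
            .define_fields  = ftrace_define_fields_foo,
            .fields         = LIST_HEAD_INIT(event_class_ftrace_foo.fields),
    };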
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
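Here, and in the trace_events.c selftest above, the old ftrace_preempt_disable()/ftrace_preempt_enable() pair (which saved and replayed a resched flag) is replaced by plain preempt_disable_notrace()/preempt_enable_notrace(). The resulting callback pattern, as a sketch with a placeholder body:

    /*
     * Sketch only: the _notrace variants keep the preempt bookkeeping
     * itself from being traced and recursing back into this callback.
     * do_trace_work() is a placeholder for the actual recording.
     */
    static void my_function_trace_callback(unsigned long ip, unsigned long parent_ip)
    {
            int pc = preempt_count();

            preempt_disable_notrace();
            do_trace_work(ip, parent_ip, pc);   /* record the call */
            preempt_enable_notrace();
    }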
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..6bff23625781 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
641 641
642 /* Print nsecs (we don't want to exceed 7 numbers) */ 642 /* Print nsecs (we don't want to exceed 7 numbers) */
643 if (len < 7) { 643 if (len < 7) {
644 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); 644 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
645 nsecs_rem);
645 ret = trace_seq_printf(s, ".%s", nsecs_str); 646 ret = trace_seq_printf(s, ".%s", nsecs_str);
646 if (!ret) 647 if (!ret)
647 return TRACE_TYPE_PARTIAL_LINE; 648 return TRACE_TYPE_PARTIAL_LINE;
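The one-line change above clamps the snprintf size: len is the number of digits already used by the seconds part, and 8 - len could exceed the small nsecs_str buffer, so the size argument is now bounded by the buffer itself. The general shape of the guard, with a hypothetical buffer and helper:

    /* Hypothetical illustration of the clamp: never let the requested
     * width exceed the destination buffer. */
    static void format_nsecs(char *dst, size_t dstlen, size_t want,
                             unsigned long nsecs_rem)
    {
            snprintf(dst, min(dstlen, want), "%03lu", nsecs_rem);
    }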
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6fd486e0cef4..73a6b0601f2e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
649#endif 649#endif
650 .open = irqsoff_trace_open, 650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close, 651 .close = irqsoff_trace_close,
652 .use_max_tr = 1,
652}; 653};
653# define register_irqsoff(trace) register_tracer(&trace) 654# define register_irqsoff(trace) register_tracer(&trace)
654#else 655#else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
681#endif 682#endif
682 .open = irqsoff_trace_open, 683 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close, 684 .close = irqsoff_trace_close,
685 .use_max_tr = 1,
684}; 686};
685# define register_preemptoff(trace) register_tracer(&trace) 687# define register_preemptoff(trace) register_tracer(&trace)
686#else 688#else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
715#endif 717#endif
716 .open = irqsoff_trace_open, 718 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close, 719 .close = irqsoff_trace_close,
720 .use_max_tr = 1,
718}; 721};
719 722
720# define register_preemptirqsoff(trace) register_tracer(&trace) 723# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
1/*
2 * kdb helper for dumping the ftrace buffer
3 *
4 * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
5 *
6 * ftrace_dump_buf based on ftrace_dump:
7 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
8 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
9 *
10 */
11#include <linux/init.h>
12#include <linux/kgdb.h>
13#include <linux/kdb.h>
14#include <linux/ftrace.h>
15
16#include "../debug/kdb/kdb_private.h"
17#include "trace.h"
18#include "trace_output.h"
19
20static void ftrace_dump_buf(int skip_lines, long cpu_file)
21{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 unsigned int old_userobj;
25 int cnt = 0, cpu;
26
27 trace_init_global_iter(&iter);
28
29 for_each_tracing_cpu(cpu) {
30 atomic_inc(&iter.tr->data[cpu]->disabled);
31 }
32
33 old_userobj = trace_flags;
34
35 /* don't look at user memory in panic mode */
36 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
37
38 kdb_printf("Dumping ftrace buffer:\n");
39
40 /* reset all but tr, trace, and overruns */
41 memset(&iter.seq, 0,
42 sizeof(struct trace_iterator) -
43 offsetof(struct trace_iterator, seq));
44 iter.iter_flags |= TRACE_FILE_LAT_FMT;
45 iter.pos = -1;
46
47 if (cpu_file == TRACE_PIPE_ALL_CPU) {
48 for_each_tracing_cpu(cpu) {
49 iter.buffer_iter[cpu] =
50 ring_buffer_read_prepare(iter.tr->buffer, cpu);
51 ring_buffer_read_start(iter.buffer_iter[cpu]);
52 tracing_iter_reset(&iter, cpu);
53 }
54 } else {
55 iter.cpu_file = cpu_file;
56 iter.buffer_iter[cpu_file] =
57 ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
58 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
59 tracing_iter_reset(&iter, cpu_file);
60 }
61 if (!trace_empty(&iter))
62 trace_find_next_entry_inc(&iter);
63 while (!trace_empty(&iter)) {
64 if (!cnt)
65 kdb_printf("---------------------------------\n");
66 cnt++;
67
68 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
69 print_trace_line(&iter);
70 if (!skip_lines)
71 trace_printk_seq(&iter.seq);
72 else
73 skip_lines--;
74 if (KDB_FLAG(CMD_INTERRUPT))
75 goto out;
76 }
77
78 if (!cnt)
79 kdb_printf(" (ftrace buffer empty)\n");
80 else
81 kdb_printf("---------------------------------\n");
82
83out:
84 trace_flags = old_userobj;
85
86 for_each_tracing_cpu(cpu) {
87 atomic_dec(&iter.tr->data[cpu]->disabled);
88 }
89
90 for_each_tracing_cpu(cpu)
91 if (iter.buffer_iter[cpu])
92 ring_buffer_read_finish(iter.buffer_iter[cpu]);
93}
94
95/*
96 * kdb_ftdump - Dump the ftrace log buffer
97 */
98static int kdb_ftdump(int argc, const char **argv)
99{
100 int skip_lines = 0;
101 long cpu_file;
102 char *cp;
103
104 if (argc > 2)
105 return KDB_ARGCOUNT;
106
107 if (argc) {
108 skip_lines = simple_strtol(argv[1], &cp, 0);
109 if (*cp)
110 skip_lines = 0;
111 }
112
113 if (argc == 2) {
114 cpu_file = simple_strtol(argv[2], &cp, 0);
115 if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
116 !cpu_online(cpu_file))
117 return KDB_BADINT;
118 } else {
119 cpu_file = TRACE_PIPE_ALL_CPU;
120 }
121
122 kdb_trap_printk++;
123 ftrace_dump_buf(skip_lines, cpu_file);
124 kdb_trap_printk--;
125
126 return 0;
127}
128
129static __init int kdb_ftrace_register(void)
130{
131 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
132 "Dump ftrace log", 0, KDB_REPEAT_NONE);
133 return 0;
134}
135
136late_initcall(kdb_ftrace_register);
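Usage sketch for the new command: from the kdb shell, "ftdump" dumps the whole ftrace ring buffer, "ftdump 25" skips the first 25 lines, and "ftdump 25 1" additionally restricts the dump to CPU 1 (output below is abridged and illustrative):

    [0]kdb> ftdump
    Dumping ftrace buffer:
    ---------------------------------
       <idle>-0       0d..2  1396.122345: sched_switch: ...
       bash-2731      1...1  1396.122401: sys_enter: ...
    ---------------------------------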
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f52b5f50299d..8b27c9849b42 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,6 +30,8 @@
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h> 32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <linux/uaccess.h>
33#include <asm/bitsperlong.h> 35#include <asm/bitsperlong.h>
34 36
35#include "trace.h" 37#include "trace.h"
@@ -38,6 +40,7 @@
38#define MAX_TRACE_ARGS 128 40#define MAX_TRACE_ARGS 128
39#define MAX_ARGSTR_LEN 63 41#define MAX_ARGSTR_LEN 63
40#define MAX_EVENT_NAME_LEN 64 42#define MAX_EVENT_NAME_LEN 64
43#define MAX_STRING_SIZE PATH_MAX
41#define KPROBE_EVENT_SYSTEM "kprobes" 44#define KPROBE_EVENT_SYSTEM "kprobes"
42 45
43/* Reserved field names */ 46/* Reserved field names */
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = {
58}; 61};
59 62
60/* Printing function type */ 63/* Printing function type */
61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); 64typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
65 void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type 66#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type 67#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64 68
65/* Printing in basic type function template */ 69/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 70#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 71static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\ 72 const char *name, \
73 void *data, void *ent)\
69{ \ 74{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 75 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \ 76} \
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) 85DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) 86DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82 87
88/* data_rloc: data relative location, compatible with u32 */
89#define make_data_rloc(len, roffs) \
90 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
91#define get_rloc_len(dl) ((u32)(dl) >> 16)
92#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
93
94static inline void *get_rloc_data(u32 *dl)
95{
96 return (u8 *)dl + get_rloc_offs(*dl);
97}
98
99/* For data_loc conversion */
100static inline void *get_loc_data(u32 *dl, void *ent)
101{
102 return (u8 *)ent + get_rloc_offs(*dl);
103}
104
105/*
106 * Convert data_rloc to data_loc:
107 * data_rloc stores the offset from data_rloc itself, but data_loc
108 * stores the offset from event entry.
109 */
110#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
111
112/* For defining macros, define string/string_size types */
113typedef u32 string;
114typedef u32 string_size;
115
116/* Print type function for string type */
117static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
118 const char *name,
119 void *data, void *ent)
120{
121 int len = *(u32 *)data >> 16;
122
123 if (!len)
124 return trace_seq_printf(s, " %s=(fault)", name);
125 else
126 return trace_seq_printf(s, " %s=\"%s\"", name,
127 (const char *)get_loc_data(data, ent));
128}
129static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
130
83/* Data fetch function type */ 131/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 132typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85 133
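The rloc helpers pack a dynamic item's length and offset into one u32: length in the upper 16 bits, offset (initially relative to the u32 itself, later converted to an offset from the event entry) in the lower 16 bits. A quick worked example, purely illustrative:

    u32 dl = make_data_rloc(32, 24);    /* 32 bytes of data, 24 bytes away      */
                                        /* dl == (32 << 16) | 24 == 0x00200018  */
    BUG_ON(get_rloc_len(dl)  != 32);
    BUG_ON(get_rloc_offs(dl) != 24);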
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm,
94 return fprm->fn(regs, fprm->data, dest); 142 return fprm->fn(regs, fprm->data, dest);
95} 143}
96 144
97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type 145#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
98/* 146/*
99 * Define macro for basic types - we don't need to define s* types, because 147 * Define macro for basic types - we don't need to define s* types, because
100 * we have to care only about bitwidth at recording time. 148 * we have to care only about bitwidth at recording time.
101 */ 149 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \ 150#define DEFINE_BASIC_FETCH_FUNCS(method) \
103DEFINE_FETCH_##kind(u8) \ 151DEFINE_FETCH_##method(u8) \
104DEFINE_FETCH_##kind(u16) \ 152DEFINE_FETCH_##method(u16) \
105DEFINE_FETCH_##kind(u32) \ 153DEFINE_FETCH_##method(u32) \
106DEFINE_FETCH_##kind(u64) 154DEFINE_FETCH_##method(u64)
107 155
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ 156#define CHECK_FETCH_FUNCS(method, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \ 157 (((FETCH_FUNC_NAME(method, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \ 158 (FETCH_FUNC_NAME(method, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \ 159 (FETCH_FUNC_NAME(method, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn)) 160 (FETCH_FUNC_NAME(method, u64) == fn) || \
161 (FETCH_FUNC_NAME(method, string) == fn) || \
162 (FETCH_FUNC_NAME(method, string_size) == fn)) \
163 && (fn != NULL))
113 164
114/* Data fetch function templates */ 165/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \ 166#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 167static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \ 168 void *offset, void *dest) \
118{ \ 169{ \
119 *(type *)dest = (type)regs_get_register(regs, \ 170 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \ 171 (unsigned int)((unsigned long)offset)); \
121} 172}
122DEFINE_BASIC_FETCH_FUNCS(reg) 173DEFINE_BASIC_FETCH_FUNCS(reg)
174/* No string on the register */
175#define fetch_reg_string NULL
176#define fetch_reg_string_size NULL
123 177
124#define DEFINE_FETCH_stack(type) \ 178#define DEFINE_FETCH_stack(type) \
125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 179static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
129 (unsigned int)((unsigned long)offset)); \ 183 (unsigned int)((unsigned long)offset)); \
130} 184}
131DEFINE_BASIC_FETCH_FUNCS(stack) 185DEFINE_BASIC_FETCH_FUNCS(stack)
186/* No string on the stack entry */
187#define fetch_stack_string NULL
188#define fetch_stack_string_size NULL
132 189
133#define DEFINE_FETCH_retval(type) \ 190#define DEFINE_FETCH_retval(type) \
134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 191static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
137 *(type *)dest = (type)regs_return_value(regs); \ 194 *(type *)dest = (type)regs_return_value(regs); \
138} 195}
139DEFINE_BASIC_FETCH_FUNCS(retval) 196DEFINE_BASIC_FETCH_FUNCS(retval)
197/* No string on the retval */
198#define fetch_retval_string NULL
199#define fetch_retval_string_size NULL
140 200
141#define DEFINE_FETCH_memory(type) \ 201#define DEFINE_FETCH_memory(type) \
142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 202static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
149 *(type *)dest = retval; \ 209 *(type *)dest = retval; \
150} 210}
151DEFINE_BASIC_FETCH_FUNCS(memory) 211DEFINE_BASIC_FETCH_FUNCS(memory)
212/*
213 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
214 * length and relative data location.
215 */
216static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
217 void *addr, void *dest)
218{
219 long ret;
220 int maxlen = get_rloc_len(*(u32 *)dest);
221 u8 *dst = get_rloc_data(dest);
222 u8 *src = addr;
223 mm_segment_t old_fs = get_fs();
224 if (!maxlen)
225 return;
226 /*
227 * Try to get string again, since the string can be changed while
228 * probing.
229 */
230 set_fs(KERNEL_DS);
231 pagefault_disable();
232 do
233 ret = __copy_from_user_inatomic(dst++, src++, 1);
234 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
235 dst[-1] = '\0';
236 pagefault_enable();
237 set_fs(old_fs);
238
239 if (ret < 0) { /* Failed to fetch string */
240 ((u8 *)get_rloc_data(dest))[0] = '\0';
241 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
242 } else
243 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
244 get_rloc_offs(*(u32 *)dest));
245}
246/* Return the length of string -- including null terminal byte */
247static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
248 void *addr, void *dest)
249{
250 int ret, len = 0;
251 u8 c;
252 mm_segment_t old_fs = get_fs();
253
254 set_fs(KERNEL_DS);
255 pagefault_disable();
256 do {
257 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
258 len++;
259 } while (c && ret == 0 && len < MAX_STRING_SIZE);
260 pagefault_enable();
261 set_fs(old_fs);
262
263 if (ret < 0) /* Failed to check the length */
264 *(u32 *)dest = 0;
265 else
266 *(u32 *)dest = len;
267}
152 268
153/* Memory fetching by symbol */ 269/* Memory fetching by symbol */
154struct symbol_cache { 270struct symbol_cache {
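These two fetch routines copy a NUL-terminated string (or just measure its length) from an arbitrary kernel address with page faults disabled, which is what makes ":string" usable as a kprobe-event argument type. A hypothetical probe definition using it would look something like this (the register holding the filename pointer depends on the architecture and calling convention):

    echo 'p:myopen do_sys_open path=+0(%si):string' > /sys/kernel/debug/tracing/kprobe_events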
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
203 *(type *)dest = 0; \ 319 *(type *)dest = 0; \
204} 320}
205DEFINE_BASIC_FETCH_FUNCS(symbol) 321DEFINE_BASIC_FETCH_FUNCS(symbol)
322DEFINE_FETCH_symbol(string)
323DEFINE_FETCH_symbol(string_size)
206 324
207/* Dereference memory access function */ 325/* Dereference memory access function */
208struct deref_fetch_param { 326struct deref_fetch_param {
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
224 *(type *)dest = 0; \ 342 *(type *)dest = 0; \
225} 343}
226DEFINE_BASIC_FETCH_FUNCS(deref) 344DEFINE_BASIC_FETCH_FUNCS(deref)
345DEFINE_FETCH_deref(string)
346DEFINE_FETCH_deref(string_size)
227 347
228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 348static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
229{ 349{
230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) 350 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
231 free_deref_fetch_param(data->orig.data); 351 free_deref_fetch_param(data->orig.data);
232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) 352 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
233 free_symbol_cache(data->orig.data); 353 free_symbol_cache(data->orig.data);
234 kfree(data); 354 kfree(data);
235} 355}
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) 360#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) 361#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242 362
243#define ASSIGN_FETCH_FUNC(kind, type) \ 363/* Fetch types */
244 .kind = FETCH_FUNC_NAME(kind, type) 364enum {
245 365 FETCH_MTD_reg = 0,
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ 366 FETCH_MTD_stack,
247 {.name = #ptype, \ 367 FETCH_MTD_retval,
248 .size = sizeof(ftype), \ 368 FETCH_MTD_memory,
249 .is_signed = sign, \ 369 FETCH_MTD_symbol,
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \ 370 FETCH_MTD_deref,
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \ 371 FETCH_MTD_END,
252ASSIGN_FETCH_FUNC(reg, ftype), \ 372};
253ASSIGN_FETCH_FUNC(stack, ftype), \ 373
254ASSIGN_FETCH_FUNC(retval, ftype), \ 374#define ASSIGN_FETCH_FUNC(method, type) \
255ASSIGN_FETCH_FUNC(memory, ftype), \ 375 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
256ASSIGN_FETCH_FUNC(symbol, ftype), \ 376
257ASSIGN_FETCH_FUNC(deref, ftype), \ 377#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
378 {.name = _name, \
379 .size = _size, \
380 .is_signed = sign, \
381 .print = PRINT_TYPE_FUNC_NAME(ptype), \
382 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
383 .fmttype = _fmttype, \
384 .fetch = { \
385ASSIGN_FETCH_FUNC(reg, ftype), \
386ASSIGN_FETCH_FUNC(stack, ftype), \
387ASSIGN_FETCH_FUNC(retval, ftype), \
388ASSIGN_FETCH_FUNC(memory, ftype), \
389ASSIGN_FETCH_FUNC(symbol, ftype), \
390ASSIGN_FETCH_FUNC(deref, ftype), \
391 } \
258 } 392 }
259 393
394#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
395 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
396
397#define FETCH_TYPE_STRING 0
398#define FETCH_TYPE_STRSIZE 1
399
260/* Fetch type information table */ 400/* Fetch type information table */
261static const struct fetch_type { 401static const struct fetch_type {
262 const char *name; /* Name of type */ 402 const char *name; /* Name of type */
@@ -264,14 +404,16 @@ static const struct fetch_type {
264 int is_signed; /* Signed flag */ 404 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */ 405 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Fromat string */ 406 const char *fmt; /* Fromat string */
407 const char *fmttype; /* Name in format file */
267 /* Fetch functions */ 408 /* Fetch functions */
268 fetch_func_t reg; 409 fetch_func_t fetch[FETCH_MTD_END];
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = { 410} fetch_type_table[] = {
411 /* Special types */
412 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
413 sizeof(u32), 1, "__data_loc char[]"),
414 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
415 string_size, sizeof(u32), 0, "u32"),
416 /* Basic types */
275 ASSIGN_FETCH_TYPE(u8, u8, 0), 417 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0), 418 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0), 419 ASSIGN_FETCH_TYPE(u32, u32, 0),
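The per-method callbacks move from six named struct members into a single fetch[] array indexed by the FETCH_MTD_* enum, so the parsing code can pick a method generically (t->fetch[FETCH_MTD_memory] instead of t->memory) and the string entries can leave unsupported methods NULL. Roughly, ASSIGN_FETCH_TYPE(u32, u32, 0) now expands to (illustrative, not verbatim preprocessor output):

    {
            .name      = "u32",
            .size      = sizeof(u32),
            .is_signed = 0,
            .print     = print_type_u32,
            .fmt       = print_type_format_u32,
            .fmttype   = "u32",
            .fetch     = {
                    [FETCH_MTD_reg]    = fetch_reg_u32,
                    [FETCH_MTD_stack]  = fetch_stack_u32,
                    [FETCH_MTD_retval] = fetch_retval_u32,
                    [FETCH_MTD_memory] = fetch_memory_u32,
                    [FETCH_MTD_symbol] = fetch_symbol_u32,
                    [FETCH_MTD_deref]  = fetch_deref_u32,
            },
    }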
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs,
302 *(unsigned long *)dest = kernel_stack_pointer(regs); 444 *(unsigned long *)dest = kernel_stack_pointer(regs);
303} 445}
304 446
447static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
448 fetch_func_t orig_fn)
449{
450 int i;
451
452 if (type != &fetch_type_table[FETCH_TYPE_STRING])
453 return NULL; /* Only string type needs size function */
454 for (i = 0; i < FETCH_MTD_END; i++)
455 if (type->fetch[i] == orig_fn)
456 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
457
458 WARN_ON(1); /* This should not happen */
459 return NULL;
460}
461
305/** 462/**
306 * Kprobe event core functions 463 * Kprobe event core functions
307 */ 464 */
308 465
309struct probe_arg { 466struct probe_arg {
310 struct fetch_param fetch; 467 struct fetch_param fetch;
468 struct fetch_param fetch_size;
311 unsigned int offset; /* Offset from argument entry */ 469 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */ 470 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */ 471 const char *comm; /* Command of this argument */
@@ -429,9 +587,9 @@ error:
429 587
430static void free_probe_arg(struct probe_arg *arg) 588static void free_probe_arg(struct probe_arg *arg)
431{ 589{
432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) 590 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data); 591 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) 592 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
435 free_symbol_cache(arg->fetch.data); 593 free_symbol_cache(arg->fetch.data);
436 kfree(arg->name); 594 kfree(arg->name);
437 kfree(arg->comm); 595 kfree(arg->comm);
@@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
548 706
549 if (strcmp(arg, "retval") == 0) { 707 if (strcmp(arg, "retval") == 0) {
550 if (is_return) 708 if (is_return)
551 f->fn = t->retval; 709 f->fn = t->fetch[FETCH_MTD_retval];
552 else 710 else
553 ret = -EINVAL; 711 ret = -EINVAL;
554 } else if (strncmp(arg, "stack", 5) == 0) { 712 } else if (strncmp(arg, "stack", 5) == 0) {
@@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
562 if (ret || param > PARAM_MAX_STACK) 720 if (ret || param > PARAM_MAX_STACK)
563 ret = -EINVAL; 721 ret = -EINVAL;
564 else { 722 else {
565 f->fn = t->stack; 723 f->fn = t->fetch[FETCH_MTD_stack];
566 f->data = (void *)param; 724 f->data = (void *)param;
567 } 725 }
568 } else 726 } else
@@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
588 case '%': /* named register */ 746 case '%': /* named register */
589 ret = regs_query_register_offset(arg + 1); 747 ret = regs_query_register_offset(arg + 1);
590 if (ret >= 0) { 748 if (ret >= 0) {
591 f->fn = t->reg; 749 f->fn = t->fetch[FETCH_MTD_reg];
592 f->data = (void *)(unsigned long)ret; 750 f->data = (void *)(unsigned long)ret;
593 ret = 0; 751 ret = 0;
594 } 752 }
@@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
598 ret = strict_strtoul(arg + 1, 0, &param); 756 ret = strict_strtoul(arg + 1, 0, &param);
599 if (ret) 757 if (ret)
600 break; 758 break;
601 f->fn = t->memory; 759 f->fn = t->fetch[FETCH_MTD_memory];
602 f->data = (void *)param; 760 f->data = (void *)param;
603 } else { 761 } else {
604 ret = split_symbol_offset(arg + 1, &offset); 762 ret = split_symbol_offset(arg + 1, &offset);
@@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
606 break; 764 break;
607 f->data = alloc_symbol_cache(arg + 1, offset); 765 f->data = alloc_symbol_cache(arg + 1, offset);
608 if (f->data) 766 if (f->data)
609 f->fn = t->symbol; 767 f->fn = t->fetch[FETCH_MTD_symbol];
610 } 768 }
611 break; 769 break;
612 case '+': /* deref memory */ 770 case '+': /* deref memory */
@@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
636 if (ret) 794 if (ret)
637 kfree(dprm); 795 kfree(dprm);
638 else { 796 else {
639 f->fn = t->deref; 797 f->fn = t->fetch[FETCH_MTD_deref];
640 f->data = (void *)dprm; 798 f->data = (void *)dprm;
641 } 799 }
642 } 800 }
643 break; 801 break;
644 } 802 }
645 if (!ret && !f->fn) 803 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
804 pr_info("%s type has no corresponding fetch method.\n",
805 t->name);
646 ret = -EINVAL; 806 ret = -EINVAL;
807 }
647 return ret; 808 return ret;
648} 809}
649 810
@@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return) 813 struct probe_arg *parg, int is_return)
653{ 814{
654 const char *t; 815 const char *t;
816 int ret;
655 817
656 if (strlen(arg) > MAX_ARGSTR_LEN) { 818 if (strlen(arg) > MAX_ARGSTR_LEN) {
657 pr_info("Argument is too long.: %s\n", arg); 819 pr_info("Argument is too long.: %s\n", arg);
@@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
674 } 836 }
675 parg->offset = tp->size; 837 parg->offset = tp->size;
676 tp->size += parg->type->size; 838 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 839 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
840 if (ret >= 0) {
841 parg->fetch_size.fn = get_fetch_size_function(parg->type,
842 parg->fetch.fn);
843 parg->fetch_size.data = parg->fetch.data;
844 }
845 return ret;
678} 846}
679 847
680/* Return 1 if name is reserved or already used by another argument */ 848/* Return 1 if name is reserved or already used by another argument */
@@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv)
757 pr_info("Delete command needs an event name.\n"); 925 pr_info("Delete command needs an event name.\n");
758 return -EINVAL; 926 return -EINVAL;
759 } 927 }
928 mutex_lock(&probe_lock);
760 tp = find_probe_event(event, group); 929 tp = find_probe_event(event, group);
761 if (!tp) { 930 if (!tp) {
931 mutex_unlock(&probe_lock);
762 pr_info("Event %s/%s doesn't exist.\n", group, event); 932 pr_info("Event %s/%s doesn't exist.\n", group, event);
763 return -ENOENT; 933 return -ENOENT;
764 } 934 }
765 /* delete an event */ 935 /* delete an event */
766 unregister_trace_probe(tp); 936 unregister_trace_probe(tp);
767 free_trace_probe(tp); 937 free_trace_probe(tp);
938 mutex_unlock(&probe_lock);
768 return 0; 939 return 0;
769 } 940 }
770 941
@@ -1043,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = {
1043 .release = seq_release, 1214 .release = seq_release,
1044}; 1215};
1045 1216
1217/* Sum up total data length for dynamic arrays (strings) */
1218static __kprobes int __get_data_size(struct trace_probe *tp,
1219 struct pt_regs *regs)
1220{
1221 int i, ret = 0;
1222 u32 len;
1223
1224 for (i = 0; i < tp->nr_args; i++)
1225 if (unlikely(tp->args[i].fetch_size.fn)) {
1226 call_fetch(&tp->args[i].fetch_size, regs, &len);
1227 ret += len;
1228 }
1229
1230 return ret;
1231}
1232
1233/* Store the value of each argument */
1234static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
1235 struct pt_regs *regs,
1236 u8 *data, int maxlen)
1237{
1238 int i;
1239 u32 end = tp->size;
1240 u32 *dl; /* Data (relative) location */
1241
1242 for (i = 0; i < tp->nr_args; i++) {
1243 if (unlikely(tp->args[i].fetch_size.fn)) {
1244 /*
1245 * First, we set the relative location and
1246 * maximum data length to *dl
1247 */
1248 dl = (u32 *)(data + tp->args[i].offset);
1249 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
1250 /* Then try to fetch string or dynamic array data */
1251 call_fetch(&tp->args[i].fetch, regs, dl);
1252 /* Reduce maximum length */
1253 end += get_rloc_len(*dl);
1254 maxlen -= get_rloc_len(*dl);
1255 /* Trick here, convert data_rloc to data_loc */
1256 *dl = convert_rloc_to_loc(*dl,
1257 ent_size + tp->args[i].offset);
1258 } else
1259 /* Just fetching data normally */
1260 call_fetch(&tp->args[i].fetch, regs,
1261 data + tp->args[i].offset);
1262 }
1263}
1264
1046/* Kprobe handler */ 1265/* Kprobe handler */
1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1266static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1048{ 1267{
@@ -1050,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1050 struct kprobe_trace_entry_head *entry; 1269 struct kprobe_trace_entry_head *entry;
1051 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1052 struct ring_buffer *buffer; 1271 struct ring_buffer *buffer;
1053 u8 *data; 1272 int size, dsize, pc;
1054 int size, i, pc;
1055 unsigned long irq_flags; 1273 unsigned long irq_flags;
1056 struct ftrace_event_call *call = &tp->call; 1274 struct ftrace_event_call *call = &tp->call;
1057 1275
@@ -1060,7 +1278,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1060 local_save_flags(irq_flags); 1278 local_save_flags(irq_flags);
1061 pc = preempt_count(); 1279 pc = preempt_count();
1062 1280
1063 size = sizeof(*entry) + tp->size; 1281 dsize = __get_data_size(tp, regs);
1282 size = sizeof(*entry) + tp->size + dsize;
1064 1283
1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1284 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1066 size, irq_flags, pc); 1285 size, irq_flags, pc);
@@ -1069,9 +1288,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1069 1288
1070 entry = ring_buffer_event_data(event); 1289 entry = ring_buffer_event_data(event);
1071 entry->ip = (unsigned long)kp->addr; 1290 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1]; 1291 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1073 for (i = 0; i < tp->nr_args; i++)
1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1075 1292
1076 if (!filter_current_check_discard(buffer, call, entry, event)) 1293 if (!filter_current_check_discard(buffer, call, entry, event))
1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1294 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1085,15 +1302,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1085 struct kretprobe_trace_entry_head *entry; 1302 struct kretprobe_trace_entry_head *entry;
1086 struct ring_buffer_event *event; 1303 struct ring_buffer_event *event;
1087 struct ring_buffer *buffer; 1304 struct ring_buffer *buffer;
1088 u8 *data; 1305 int size, pc, dsize;
1089 int size, i, pc;
1090 unsigned long irq_flags; 1306 unsigned long irq_flags;
1091 struct ftrace_event_call *call = &tp->call; 1307 struct ftrace_event_call *call = &tp->call;
1092 1308
1093 local_save_flags(irq_flags); 1309 local_save_flags(irq_flags);
1094 pc = preempt_count(); 1310 pc = preempt_count();
1095 1311
1096 size = sizeof(*entry) + tp->size; 1312 dsize = __get_data_size(tp, regs);
1313 size = sizeof(*entry) + tp->size + dsize;
1097 1314
1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 1315 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1099 size, irq_flags, pc); 1316 size, irq_flags, pc);
@@ -1103,9 +1320,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1103 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1104 entry->func = (unsigned long)tp->rp.kp.addr; 1321 entry->func = (unsigned long)tp->rp.kp.addr;
1105 entry->ret_ip = (unsigned long)ri->ret_addr; 1322 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1]; 1323 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1107 for (i = 0; i < tp->nr_args; i++)
1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1109 1324
1110 if (!filter_current_check_discard(buffer, call, entry, event)) 1325 if (!filter_current_check_discard(buffer, call, entry, event))
1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1326 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1137 data = (u8 *)&field[1]; 1352 data = (u8 *)&field[1];
1138 for (i = 0; i < tp->nr_args; i++) 1353 for (i = 0; i < tp->nr_args; i++)
1139 if (!tp->args[i].type->print(s, tp->args[i].name, 1354 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset)) 1355 data + tp->args[i].offset, field))
1141 goto partial; 1356 goto partial;
1142 1357
1143 if (!trace_seq_puts(s, "\n")) 1358 if (!trace_seq_puts(s, "\n"))
@@ -1179,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1179 data = (u8 *)&field[1]; 1394 data = (u8 *)&field[1];
1180 for (i = 0; i < tp->nr_args; i++) 1395 for (i = 0; i < tp->nr_args; i++)
1181 if (!tp->args[i].type->print(s, tp->args[i].name, 1396 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset)) 1397 data + tp->args[i].offset, field))
1183 goto partial; 1398 goto partial;
1184 1399
1185 if (!trace_seq_puts(s, "\n")) 1400 if (!trace_seq_puts(s, "\n"))
@@ -1214,11 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1214 } 1429 }
1215} 1430}
1216 1431
1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1218{
1219 return 0;
1220}
1221
1222#undef DEFINE_FIELD 1432#undef DEFINE_FIELD
1223#define DEFINE_FIELD(type, item, name, is_signed) \ 1433#define DEFINE_FIELD(type, item, name, is_signed) \
1224 do { \ 1434 do { \
@@ -1239,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1449 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1240 /* Set argument names as fields */ 1450 /* Set argument names as fields */
1241 for (i = 0; i < tp->nr_args; i++) { 1451 for (i = 0; i < tp->nr_args; i++) {
1242 ret = trace_define_field(event_call, tp->args[i].type->name, 1452 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1243 tp->args[i].name, 1453 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset, 1454 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size, 1455 tp->args[i].type->size,
@@ -1261,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1471 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1262 /* Set argument names as fields */ 1472 /* Set argument names as fields */
1263 for (i = 0; i < tp->nr_args; i++) { 1473 for (i = 0; i < tp->nr_args; i++) {
1264 ret = trace_define_field(event_call, tp->args[i].type->name, 1474 ret = trace_define_field(event_call, tp->args[i].type->fmttype,
1265 tp->args[i].name, 1475 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset, 1476 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size, 1477 tp->args[i].type->size,
@@ -1301,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1511 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1302 1512
1303 for (i = 0; i < tp->nr_args; i++) { 1513 for (i = 0; i < tp->nr_args; i++) {
1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", 1514 if (strcmp(tp->args[i].type->name, "string") == 0)
1305 tp->args[i].name); 1515 pos += snprintf(buf + pos, LEN_OR_ZERO,
1516 ", __get_str(%s)",
1517 tp->args[i].name);
1518 else
1519 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1520 tp->args[i].name);
1306 } 1521 }
1307 1522
1308#undef LEN_OR_ZERO 1523#undef LEN_OR_ZERO
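The __set_print_fmt() change above emits ", __get_str(name)" for string-typed arguments and ", REC->name" for everything else. A standalone sketch of that branching follows; the argument list, buffer size, and seed format are made up for illustration.

#include <stdio.h>
#include <string.h>

struct arg { const char *name; const char *type; };

/* Append one format reference per argument: string arguments go through
 * __get_str(), everything else through REC-><name>, mirroring the branch
 * added in the hunk above. */
static void build_print_fmt(const struct arg *args, int nr, char *buf, size_t len)
{
	size_t pos = strlen(buf);

	for (int i = 0; i < nr; i++) {
		const char *fmt = strcmp(args[i].type, "string") == 0
				  ? ", __get_str(%s)" : ", REC->%s";
		int n = snprintf(buf + pos, len - pos, fmt, args[i].name);

		if (n < 0 || (size_t)n >= len - pos)
			break;			/* buffer full: stop cleanly */
		pos += n;
	}
}

int main(void)
{
	struct arg args[] = { { "path", "string" }, { "flags", "u32" } };
	char buf[128] = "\"%lx\", REC->ip";

	build_print_fmt(args, 2, buf, sizeof(buf));
	printf("%s\n", buf);
	return 0;
}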
@@ -1339,11 +1554,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1339 struct ftrace_event_call *call = &tp->call; 1554 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1555 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head; 1556 struct hlist_head *head;
1342 u8 *data; 1557 int size, __size, dsize;
1343 int size, __size, i;
1344 int rctx; 1558 int rctx;
1345 1559
1346 __size = sizeof(*entry) + tp->size; 1560 dsize = __get_data_size(tp, regs);
1561 __size = sizeof(*entry) + tp->size + dsize;
1347 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1562 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1348 size -= sizeof(u32); 1563 size -= sizeof(u32);
1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1564 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1355,9 +1570,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1355 return; 1570 return;
1356 1571
1357 entry->ip = (unsigned long)kp->addr; 1572 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1]; 1573 memset(&entry[1], 0, dsize);
1359 for (i = 0; i < tp->nr_args; i++) 1574 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1361 1575
1362 head = this_cpu_ptr(call->perf_events); 1576 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1577 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
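kprobe_perf_func() above sizes its record so that the payload plus the u32 length field ends on a u64 boundary: ALIGN(__size + sizeof(u32), sizeof(u64)) - sizeof(u32). A small sketch of that arithmetic, assuming the usual power-of-two round-up definition of ALIGN:

#include <stdio.h>
#include <stdint.h>

/* Round x up to a power-of-two boundary, as the kernel's ALIGN() does. */
#define ALIGN_UP(x, a)	(((x) + ((a) - 1)) & ~((uintptr_t)(a) - 1))

int main(void)
{
	for (size_t payload = 1; payload <= 24; payload++) {
		/* Reserve room for the u32 length header, align the total to
		 * u64, then report how much payload space that leaves. */
		size_t size = ALIGN_UP(payload + sizeof(uint32_t), sizeof(uint64_t))
			      - sizeof(uint32_t);
		printf("payload %2zu -> reserved %2zu (total %zu, 8-byte aligned)\n",
		       payload, size, size + sizeof(uint32_t));
	}
	return 0;
}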
@@ -1371,11 +1585,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1371 struct ftrace_event_call *call = &tp->call; 1585 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1586 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head; 1587 struct hlist_head *head;
1374 u8 *data; 1588 int size, __size, dsize;
1375 int size, __size, i;
1376 int rctx; 1589 int rctx;
1377 1590
1378 __size = sizeof(*entry) + tp->size; 1591 dsize = __get_data_size(tp, regs);
1592 __size = sizeof(*entry) + tp->size + dsize;
1379 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1593 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1380 size -= sizeof(u32); 1594 size -= sizeof(u32);
1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1595 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1388,9 +1602,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1388 1602
1389 entry->func = (unsigned long)tp->rp.kp.addr; 1603 entry->func = (unsigned long)tp->rp.kp.addr;
1390 entry->ret_ip = (unsigned long)ri->ret_addr; 1604 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1]; 1605 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1392 for (i = 0; i < tp->nr_args; i++)
1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1394 1606
1395 head = this_cpu_ptr(call->perf_events); 1607 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1608 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
@@ -1486,15 +1698,12 @@ static int register_probe_event(struct trace_probe *tp)
1486 int ret; 1698 int ret;
1487 1699
1488 /* Initialize ftrace_event_call */ 1700 /* Initialize ftrace_event_call */
1701 INIT_LIST_HEAD(&call->class->fields);
1489 if (probe_is_return(tp)) { 1702 if (probe_is_return(tp)) {
1490 INIT_LIST_HEAD(&call->class->fields);
1491 call->event.funcs = &kretprobe_funcs; 1703 call->event.funcs = &kretprobe_funcs;
1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields; 1704 call->class->define_fields = kretprobe_event_define_fields;
1494 } else { 1705 } else {
1495 INIT_LIST_HEAD(&call->class->fields);
1496 call->event.funcs = &kprobe_funcs; 1706 call->event.funcs = &kprobe_funcs;
1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields; 1707 call->class->define_fields = kprobe_event_define_fields;
1499 } 1708 }
1500 if (set_print_fmt(tp) < 0) 1709 if (set_print_fmt(tp) < 0)
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
deleted file mode 100644
index 8eaf00749b65..000000000000
--- a/kernel/trace/trace_ksym.c
+++ /dev/null
@@ -1,508 +0,0 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/fs.h>
28
29#include "trace_output.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35#include <asm/atomic.h>
36
37#define KSYM_TRACER_OP_LEN 3 /* rw- */
38
39struct trace_ksym {
40 struct perf_event **ksym_hbp;
41 struct perf_event_attr attr;
42#ifdef CONFIG_PROFILE_KSYM_TRACER
43 atomic64_t counter;
44#endif
45 struct hlist_node ksym_hlist;
46};
47
48static struct trace_array *ksym_trace_array;
49
50static unsigned int ksym_tracing_enabled;
51
52static HLIST_HEAD(ksym_filter_head);
53
54static DEFINE_MUTEX(ksym_tracer_mutex);
55
56#ifdef CONFIG_PROFILE_KSYM_TRACER
57
58#define MAX_UL_INT 0xffffffff
59
60void ksym_collect_stats(unsigned long hbp_hit_addr)
61{
62 struct hlist_node *node;
63 struct trace_ksym *entry;
64
65 rcu_read_lock();
66 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
67 if (entry->attr.bp_addr == hbp_hit_addr) {
68 atomic64_inc(&entry->counter);
69 break;
70 }
71 }
72 rcu_read_unlock();
73}
74#endif /* CONFIG_PROFILE_KSYM_TRACER */
75
76void ksym_hbp_handler(struct perf_event *hbp, int nmi,
77 struct perf_sample_data *data,
78 struct pt_regs *regs)
79{
80 struct ring_buffer_event *event;
81 struct ksym_trace_entry *entry;
82 struct ring_buffer *buffer;
83 int pc;
84
85 if (!ksym_tracing_enabled)
86 return;
87
88 buffer = ksym_trace_array->buffer;
89
90 pc = preempt_count();
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
93 sizeof(*entry), 0, pc);
94 if (!event)
95 return;
96
97 entry = ring_buffer_event_data(event);
98 entry->ip = instruction_pointer(regs);
99 entry->type = hw_breakpoint_type(hbp);
100 entry->addr = hw_breakpoint_addr(hbp);
101 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
102
103#ifdef CONFIG_PROFILE_KSYM_TRACER
104 ksym_collect_stats(hw_breakpoint_addr(hbp));
105#endif /* CONFIG_PROFILE_KSYM_TRACER */
106
107 trace_buffer_unlock_commit(buffer, event, 0, pc);
108}
109
110/* Valid access types are represented as
111 *
112 * rw- : Set Read/Write Access Breakpoint
113 * -w- : Set Write Access Breakpoint
114 * --- : Clear Breakpoints
115 * --x : Set Execution Break points (Not available yet)
116 *
117 */
118static int ksym_trace_get_access_type(char *str)
119{
120 int access = 0;
121
122 if (str[0] == 'r')
123 access |= HW_BREAKPOINT_R;
124
125 if (str[1] == 'w')
126 access |= HW_BREAKPOINT_W;
127
128 if (str[2] == 'x')
129 access |= HW_BREAKPOINT_X;
130
131 switch (access) {
132 case HW_BREAKPOINT_R:
133 case HW_BREAKPOINT_W:
134 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
135 return access;
136 default:
137 return -EINVAL;
138 }
139}
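ksym_trace_get_access_type() above maps an "rw-"-style operator string onto breakpoint flags and rejects anything other than R, W, or RW. A userspace sketch of the same parsing follows; the flag values are local stand-ins for HW_BREAKPOINT_*, not the real kernel constants.

#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Local stand-ins for HW_BREAKPOINT_R/W/X. */
enum { BP_R = 1, BP_W = 2, BP_X = 4 };

static int parse_access(const char *str)
{
	int access = 0;

	if (strlen(str) != 3)
		return -EINVAL;
	if (str[0] == 'r')
		access |= BP_R;
	if (str[1] == 'w')
		access |= BP_W;
	if (str[2] == 'x')
		access |= BP_X;

	switch (access) {
	case BP_R:
	case BP_W:
	case BP_R | BP_W:
		return access;
	default:
		return -EINVAL;	/* execute breakpoints are rejected */
	}
}

int main(void)
{
	const char *tests[] = { "rw-", "-w-", "r--", "--x", "---" };

	for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%s -> %d\n", tests[i], parse_access(tests[i]));
	return 0;
}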
140
141/*
142 * There can be several possible malformed requests and we attempt to capture
143 * all of them. We enumerate some of the rules
144 * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
145 * i.e. multiple ':' symbols disallowed. Possible uses are of the form
146 * <module>:<ksym_name>:<op>.
147 * 2. No delimiter symbol ':' in the input string
148 * 3. Spurious operator symbols or symbols not in their respective positions
149 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
150 * 5. Kernel symbol not a part of /proc/kallsyms
151 * 6. Duplicate requests
152 */
153static int parse_ksym_trace_str(char *input_string, char **ksymname,
154 unsigned long *addr)
155{
156 int ret;
157
158 *ksymname = strsep(&input_string, ":");
159 *addr = kallsyms_lookup_name(*ksymname);
160
161 /* Check for malformed request: (2), (1) and (5) */
162 if ((!input_string) ||
163 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
164 (*addr == 0))
 165 return -EINVAL;
166
167 ret = ksym_trace_get_access_type(input_string);
168
169 return ret;
170}
171
172int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
173{
174 struct trace_ksym *entry;
175 int ret = -ENOMEM;
176
177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
178 if (!entry)
179 return -ENOMEM;
180
181 hw_breakpoint_init(&entry->attr);
182
183 entry->attr.bp_type = op;
184 entry->attr.bp_addr = addr;
185 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
186
187 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
188 ksym_hbp_handler);
189
190 if (IS_ERR(entry->ksym_hbp)) {
191 ret = PTR_ERR(entry->ksym_hbp);
192 if (ret == -ENOSPC) {
193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
199 goto err;
200 }
201
202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
203
204 return 0;
205
206err:
207 kfree(entry);
208
209 return ret;
210}
211
212static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
213 size_t count, loff_t *ppos)
214{
215 struct trace_ksym *entry;
216 struct hlist_node *node;
217 struct trace_seq *s;
218 ssize_t cnt = 0;
219 int ret;
220
221 s = kmalloc(sizeof(*s), GFP_KERNEL);
222 if (!s)
223 return -ENOMEM;
224 trace_seq_init(s);
225
226 mutex_lock(&ksym_tracer_mutex);
227
228 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
229 ret = trace_seq_printf(s, "%pS:",
230 (void *)(unsigned long)entry->attr.bp_addr);
231 if (entry->attr.bp_type == HW_BREAKPOINT_R)
232 ret = trace_seq_puts(s, "r--\n");
233 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
234 ret = trace_seq_puts(s, "-w-\n");
235 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
236 ret = trace_seq_puts(s, "rw-\n");
237 WARN_ON_ONCE(!ret);
238 }
239
240 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
241
242 mutex_unlock(&ksym_tracer_mutex);
243
244 kfree(s);
245
246 return cnt;
247}
248
249static void __ksym_trace_reset(void)
250{
251 struct trace_ksym *entry;
252 struct hlist_node *node, *node1;
253
254 mutex_lock(&ksym_tracer_mutex);
255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
256 ksym_hlist) {
257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
258 hlist_del_rcu(&(entry->ksym_hlist));
259 synchronize_rcu();
260 kfree(entry);
261 }
262 mutex_unlock(&ksym_tracer_mutex);
263}
264
265static ssize_t ksym_trace_filter_write(struct file *file,
266 const char __user *buffer,
267 size_t count, loff_t *ppos)
268{
269 struct trace_ksym *entry;
270 struct hlist_node *node;
271 char *buf, *input_string, *ksymname = NULL;
272 unsigned long ksym_addr = 0;
273 int ret, op, changed = 0;
274
275 buf = kzalloc(count + 1, GFP_KERNEL);
276 if (!buf)
277 return -ENOMEM;
278
279 ret = -EFAULT;
280 if (copy_from_user(buf, buffer, count))
281 goto out;
282
283 buf[count] = '\0';
284 input_string = strstrip(buf);
285
286 /*
287 * Clear all breakpoints if:
288 * 1: echo > ksym_trace_filter
289 * 2: echo 0 > ksym_trace_filter
290 * 3: echo "*:---" > ksym_trace_filter
291 */
292 if (!input_string[0] || !strcmp(input_string, "0") ||
293 !strcmp(input_string, "*:---")) {
294 __ksym_trace_reset();
295 ret = 0;
296 goto out;
297 }
298
299 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
300 if (ret < 0)
301 goto out;
302
303 mutex_lock(&ksym_tracer_mutex);
304
305 ret = -EINVAL;
306 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
307 if (entry->attr.bp_addr == ksym_addr) {
308 /* Check for malformed request: (6) */
309 if (entry->attr.bp_type != op)
310 changed = 1;
311 else
312 goto out_unlock;
313 break;
314 }
315 }
316 if (changed) {
317 unregister_wide_hw_breakpoint(entry->ksym_hbp);
318 entry->attr.bp_type = op;
319 ret = 0;
320 if (op > 0) {
321 entry->ksym_hbp =
322 register_wide_hw_breakpoint(&entry->attr,
323 ksym_hbp_handler);
324 if (IS_ERR(entry->ksym_hbp))
325 ret = PTR_ERR(entry->ksym_hbp);
326 else
327 goto out_unlock;
328 }
329 /* Error or "symbol:---" case: drop it */
330 hlist_del_rcu(&(entry->ksym_hlist));
331 synchronize_rcu();
332 kfree(entry);
333 goto out_unlock;
334 } else {
335 /* Check for malformed request: (4) */
336 if (op)
337 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
338 }
339out_unlock:
340 mutex_unlock(&ksym_tracer_mutex);
341out:
342 kfree(buf);
343 return !ret ? count : ret;
344}
345
346static const struct file_operations ksym_tracing_fops = {
347 .open = tracing_open_generic,
348 .read = ksym_trace_filter_read,
349 .write = ksym_trace_filter_write,
350};
351
352static void ksym_trace_reset(struct trace_array *tr)
353{
354 ksym_tracing_enabled = 0;
355 __ksym_trace_reset();
356}
357
358static int ksym_trace_init(struct trace_array *tr)
359{
360 int cpu, ret = 0;
361
362 for_each_online_cpu(cpu)
363 tracing_reset(tr, cpu);
364 ksym_tracing_enabled = 1;
365 ksym_trace_array = tr;
366
367 return ret;
368}
369
370static void ksym_trace_print_header(struct seq_file *m)
371{
372 seq_puts(m,
373 "# TASK-PID CPU# Symbol "
374 "Type Function\n");
375 seq_puts(m,
376 "# | | | "
377 " | |\n");
378}
379
380static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
381{
382 struct trace_entry *entry = iter->ent;
383 struct trace_seq *s = &iter->seq;
384 struct ksym_trace_entry *field;
385 char str[KSYM_SYMBOL_LEN];
386 int ret;
387
388 if (entry->type != TRACE_KSYM)
389 return TRACE_TYPE_UNHANDLED;
390
391 trace_assign_type(field, entry);
392
393 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
394 entry->pid, iter->cpu, (char *)field->addr);
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397
398 switch (field->type) {
399 case HW_BREAKPOINT_R:
400 ret = trace_seq_printf(s, " R ");
401 break;
402 case HW_BREAKPOINT_W:
403 ret = trace_seq_printf(s, " W ");
404 break;
405 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
406 ret = trace_seq_printf(s, " RW ");
407 break;
408 default:
409 return TRACE_TYPE_PARTIAL_LINE;
410 }
411
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 sprint_symbol(str, field->ip);
416 ret = trace_seq_printf(s, "%s\n", str);
417 if (!ret)
418 return TRACE_TYPE_PARTIAL_LINE;
419
420 return TRACE_TYPE_HANDLED;
421}
422
423struct tracer ksym_tracer __read_mostly =
424{
425 .name = "ksym_tracer",
426 .init = ksym_trace_init,
427 .reset = ksym_trace_reset,
428#ifdef CONFIG_FTRACE_SELFTEST
429 .selftest = trace_selftest_startup_ksym,
430#endif
431 .print_header = ksym_trace_print_header,
432 .print_line = ksym_trace_output
433};
434
435#ifdef CONFIG_PROFILE_KSYM_TRACER
436static int ksym_profile_show(struct seq_file *m, void *v)
437{
438 struct hlist_node *node;
439 struct trace_ksym *entry;
440 int access_type = 0;
441 char fn_name[KSYM_NAME_LEN];
442
443 seq_puts(m, " Access Type ");
444 seq_puts(m, " Symbol Counter\n");
445 seq_puts(m, " ----------- ");
446 seq_puts(m, " ------ -------\n");
447
448 rcu_read_lock();
449 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
450
451 access_type = entry->attr.bp_type;
452
453 switch (access_type) {
454 case HW_BREAKPOINT_R:
455 seq_puts(m, " R ");
456 break;
457 case HW_BREAKPOINT_W:
458 seq_puts(m, " W ");
459 break;
460 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
461 seq_puts(m, " RW ");
462 break;
463 default:
464 seq_puts(m, " NA ");
465 }
466
467 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
468 seq_printf(m, " %-36s", fn_name);
469 else
470 seq_printf(m, " %-36s", "<NA>");
471 seq_printf(m, " %15llu\n",
472 (unsigned long long)atomic64_read(&entry->counter));
473 }
474 rcu_read_unlock();
475
476 return 0;
477}
478
479static int ksym_profile_open(struct inode *node, struct file *file)
480{
481 return single_open(file, ksym_profile_show, NULL);
482}
483
484static const struct file_operations ksym_profile_fops = {
485 .open = ksym_profile_open,
486 .read = seq_read,
487 .llseek = seq_lseek,
488 .release = single_release,
489};
490#endif /* CONFIG_PROFILE_KSYM_TRACER */
491
492__init static int init_ksym_trace(void)
493{
494 struct dentry *d_tracer;
495
496 d_tracer = tracing_init_dentry();
497
498 trace_create_file("ksym_trace_filter", 0644, d_tracer,
499 NULL, &ksym_tracing_fops);
500
501#ifdef CONFIG_PROFILE_KSYM_TRACER
502 trace_create_file("ksym_profile", 0444, d_tracer,
503 NULL, &ksym_profile_fops);
504#endif
505
506 return register_tracer(&ksym_tracer);
507}
508device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..02272baa2206 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18 18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 20
24static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = {
1069 .funcs = &trace_wake_funcs, 1066 .funcs = &trace_wake_funcs,
1070}; 1067};
1071 1068
1072/* TRACE_SPECIAL */
1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1074 int flags, struct trace_event *event)
1075{
1076 struct special_entry *field;
1077
1078 trace_assign_type(field, iter->ent);
1079
1080 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
1081 field->arg1,
1082 field->arg2,
1083 field->arg3))
1084 return TRACE_TYPE_PARTIAL_LINE;
1085
1086 return TRACE_TYPE_HANDLED;
1087}
1088
1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1090 int flags, struct trace_event *event)
1091{
1092 struct special_entry *field;
1093 struct trace_seq *s = &iter->seq;
1094
1095 trace_assign_type(field, iter->ent);
1096
1097 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1098 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1099 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1100
1101 return TRACE_TYPE_HANDLED;
1102}
1103
1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1105 int flags, struct trace_event *event)
1106{
1107 struct special_entry *field;
1108 struct trace_seq *s = &iter->seq;
1109
1110 trace_assign_type(field, iter->ent);
1111
1112 SEQ_PUT_FIELD_RET(s, field->arg1);
1113 SEQ_PUT_FIELD_RET(s, field->arg2);
1114 SEQ_PUT_FIELD_RET(s, field->arg3);
1115
1116 return TRACE_TYPE_HANDLED;
1117}
1118
1119static struct trace_event_functions trace_special_funcs = {
1120 .trace = trace_special_print,
1121 .raw = trace_special_print,
1122 .hex = trace_special_hex,
1123 .binary = trace_special_bin,
1124};
1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1131/* TRACE_STACK */ 1069/* TRACE_STACK */
1132 1070
1133static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1071static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1161 1099
1162static struct trace_event_functions trace_stack_funcs = { 1100static struct trace_event_functions trace_stack_funcs = {
1163 .trace = trace_stack_print, 1101 .trace = trace_stack_print,
1164 .raw = trace_special_print,
1165 .hex = trace_special_hex,
1166 .binary = trace_special_bin,
1167}; 1102};
1168 1103
1169static struct trace_event trace_stack_event = { 1104static struct trace_event trace_stack_event = {
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1194 1129
1195static struct trace_event_functions trace_user_stack_funcs = { 1130static struct trace_event_functions trace_user_stack_funcs = {
1196 .trace = trace_user_stack_print, 1131 .trace = trace_user_stack_print,
1197 .raw = trace_special_print,
1198 .hex = trace_special_hex,
1199 .binary = trace_special_bin,
1200}; 1132};
1201 1133
1202static struct trace_event trace_user_stack_event = { 1134static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = {
1314 &trace_fn_event, 1246 &trace_fn_event,
1315 &trace_ctx_event, 1247 &trace_ctx_event,
1316 &trace_wake_event, 1248 &trace_wake_event,
1317 &trace_special_event,
1318 &trace_stack_event, 1249 &trace_stack_event,
1319 &trace_user_stack_event, 1250 &trace_user_stack_event,
1320 &trace_bprint_event, 1251 &trace_bprint_event,
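The trace_output.c hunks above drop the TRACE_SPECIAL printers and leave the stack events with only a .trace callback; output formats are selected through a struct of function pointers (trace_event_functions), with missing slots handled by the caller. A minimal dispatch-table sketch of that pattern, with invented entry and printer names:

#include <stdio.h>

struct entry { int type; long value; };

/* Per-format printers, analogous to the .trace/.raw/.hex/.binary slots in
 * struct trace_event_functions. Any slot may be left NULL. */
struct event_funcs {
	int (*trace)(const struct entry *e);
	int (*hex)(const struct entry *e);
};

static int stack_trace_print(const struct entry *e)
{
	return printf("stack: %ld\n", e->value);
}

/* Only a text printer is registered, as for trace_stack after this patch. */
static const struct event_funcs stack_funcs = {
	.trace = stack_trace_print,
	/* .hex intentionally unset */
};

static void print_event(const struct event_funcs *funcs, const struct entry *e,
			int want_hex)
{
	if (want_hex && funcs->hex)
		funcs->hex(e);
	else if (funcs->trace)
		funcs->trace(e);
	else
		printf("<unhandled event type %d>\n", e->type);
}

int main(void)
{
	struct entry e = { .type = 1, .value = 42 };

	print_event(&stack_funcs, &e, 1);	/* falls back to .trace */
	return 0;
}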
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..4086eae6e81b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
46 struct trace_array_cpu *data; 46 struct trace_array_cpu *data;
47 unsigned long flags; 47 unsigned long flags;
48 long disabled; 48 long disabled;
49 int resched;
50 int cpu; 49 int cpu;
51 int pc; 50 int pc;
52 51
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
54 return; 53 return;
55 54
56 pc = preempt_count(); 55 pc = preempt_count();
57 resched = ftrace_preempt_disable(); 56 preempt_disable_notrace();
58 57
59 cpu = raw_smp_processor_id(); 58 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 59 if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
74 out: 73 out:
75 atomic_dec(&data->disabled); 74 atomic_dec(&data->disabled);
76 out_enable: 75 out_enable:
77 ftrace_preempt_enable(resched); 76 preempt_enable_notrace();
78} 77}
79 78
80static struct ftrace_ops trace_ops __read_mostly = 79static struct ftrace_ops trace_ops __read_mostly =
@@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
383#ifdef CONFIG_FTRACE_SELFTEST 382#ifdef CONFIG_FTRACE_SELFTEST
384 .selftest = trace_selftest_startup_wakeup, 383 .selftest = trace_selftest_startup_wakeup,
385#endif 384#endif
385 .use_max_tr = 1,
386}; 386};
387 387
388static struct tracer wakeup_rt_tracer __read_mostly = 388static struct tracer wakeup_rt_tracer __read_mostly =
@@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
397#ifdef CONFIG_FTRACE_SELFTEST 397#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 398 .selftest = trace_selftest_startup_wakeup,
399#endif 399#endif
400 .use_max_tr = 1,
400}; 401};
401 402
402__init static int init_wakeup_tracer(void) 403__init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 250e7f9bd2f0..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_WAKE: 13 case TRACE_WAKE:
14 case TRACE_STACK: 14 case TRACE_STACK:
15 case TRACE_PRINT: 15 case TRACE_PRINT:
16 case TRACE_SPECIAL:
17 case TRACE_BRANCH: 16 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
20 case TRACE_KSYM:
21 return 1; 19 return 1;
22 } 20 }
23 return 0; 21 return 0;
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
691} 689}
692#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 690#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
693 691
694#ifdef CONFIG_SYSPROF_TRACER
695int
696trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
697{
698 unsigned long count;
699 int ret;
700
701 /* start the tracing */
702 ret = tracer_init(trace, tr);
703 if (ret) {
704 warn_failed_init_tracer(trace, ret);
705 return ret;
706 }
707
708 /* Sleep for a 1/10 of a second */
709 msleep(100);
710 /* stop the tracing. */
711 tracing_stop();
712 /* check the trace buffer */
713 ret = trace_test_buffer(tr, &count);
714 trace->reset(tr);
715 tracing_start();
716
717 if (!ret && !count) {
718 printk(KERN_CONT ".. no entries found ..");
719 ret = -1;
720 }
721
722 return ret;
723}
724#endif /* CONFIG_SYSPROF_TRACER */
725
726#ifdef CONFIG_BRANCH_TRACER 692#ifdef CONFIG_BRANCH_TRACER
727int 693int
728trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) 694trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 721}
756#endif /* CONFIG_BRANCH_TRACER */ 722#endif /* CONFIG_BRANCH_TRACER */
757 723
758#ifdef CONFIG_KSYM_TRACER
759static int ksym_selftest_dummy;
760
761int
762trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
763{
764 unsigned long count;
765 int ret;
766
767 /* start the tracing */
768 ret = tracer_init(trace, tr);
769 if (ret) {
770 warn_failed_init_tracer(trace, ret);
771 return ret;
772 }
773
774 ksym_selftest_dummy = 0;
775 /* Register the read-write tracing request */
776
777 ret = process_new_ksym_entry("ksym_selftest_dummy",
778 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
779 (unsigned long)(&ksym_selftest_dummy));
780
781 if (ret < 0) {
782 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
783 goto ret_path;
784 }
785 /* Perform a read and a write operation over the dummy variable to
786 * trigger the tracer
787 */
788 if (ksym_selftest_dummy == 0)
789 ksym_selftest_dummy++;
790
791 /* stop the tracing. */
792 tracing_stop();
793 /* check the trace buffer */
794 ret = trace_test_buffer(tr, &count);
795 trace->reset(tr);
796 tracing_start();
797
798 /* read & write operations - one each is performed on the dummy variable
799 * triggering two entries in the trace buffer
800 */
801 if (!ret && count != 2) {
802 printk(KERN_CONT "Ksym tracer startup test failed");
803 ret = -1;
804 }
805
806ret_path:
807 return ret;
808}
809#endif /* CONFIG_KSYM_TRACER */
810
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..056468eae7cf 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
26static struct list_head * 29static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call) 30syscall_get_enter_fields(struct ftrace_event_call *call)
28{ 31{
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34static struct list_head * 37static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call) 38syscall_get_exit_fields(struct ftrace_event_call *call)
36{ 39{
37 struct syscall_metadata *entry = call->data; 40 return &syscall_exit_fields;
38
39 return &entry->exit_fields;
40} 41}
41 42
42struct trace_event_functions enter_syscall_print_funcs = { 43struct trace_event_functions enter_syscall_print_funcs = {
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index a7974a552ca9..000000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
97};
98
99static int
100trace_kernel(struct pt_regs *regs, struct trace_array *tr,
101 struct trace_array_cpu *data)
102{
103 struct backtrace_info info;
104 unsigned long bp;
105 char *stack;
106
107 info.tr = tr;
108 info.data = data;
109 info.pos = 1;
110
111 __trace_special(info.tr, info.data, 1, regs->ip, 0);
112
113 stack = ((char *)regs + sizeof(struct pt_regs));
114#ifdef CONFIG_FRAME_POINTER
115 bp = regs->bp;
116#else
117 bp = 0;
118#endif
119
120 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
121
122 return info.pos;
123}
124
125static void timer_notify(struct pt_regs *regs, int cpu)
126{
127 struct trace_array_cpu *data;
128 struct stack_frame frame;
129 struct trace_array *tr;
130 const void __user *fp;
131 int is_user;
132 int i;
133
134 if (!regs)
135 return;
136
137 tr = sysprof_trace;
138 data = tr->data[cpu];
139 is_user = user_mode(regs);
140
141 if (!current || current->pid == 0)
142 return;
143
144 if (is_user && current->state != TASK_RUNNING)
145 return;
146
147 __trace_special(tr, data, 0, 0, current->pid);
148
149 if (!is_user)
150 i = trace_kernel(regs, tr, data);
151 else
152 i = 0;
153
154 /*
155 * Trace user stack if we are not a kernel thread
156 */
157 if (current->mm && i < sample_max_depth) {
158 regs = (struct pt_regs *)current->thread.sp0 - 1;
159
160 fp = (void __user *)regs->bp;
161
162 __trace_special(tr, data, 2, regs->ip, 0);
163
164 while (i < sample_max_depth) {
165 frame.next_fp = NULL;
166 frame.return_address = 0;
167 if (!copy_stack_frame(fp, &frame))
168 break;
169 if ((unsigned long)fp < regs->sp)
170 break;
171
172 __trace_special(tr, data, 2, frame.return_address,
173 (unsigned long)fp);
174 fp = frame.next_fp;
175
176 i++;
177 }
178
179 }
180
181 /*
182 * Special trace entry if we overflow the max depth:
183 */
184 if (i == sample_max_depth)
185 __trace_special(tr, data, -1, -1, -1);
186
187 __trace_special(tr, data, 3, current->pid, i);
188}
189
190static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
191{
192 /* trace here */
193 timer_notify(get_irq_regs(), smp_processor_id());
194
195 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
196
197 return HRTIMER_RESTART;
198}
199
200static void start_stack_timer(void *unused)
201{
202 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
203
204 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 hrtimer->function = stack_trace_timer_fn;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
208 HRTIMER_MODE_REL_PINNED);
209}
210
211static void start_stack_timers(void)
212{
213 on_each_cpu(start_stack_timer, NULL, 1);
214}
215
216static void stop_stack_timer(int cpu)
217{
218 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
219
220 hrtimer_cancel(hrtimer);
221}
222
223static void stop_stack_timers(void)
224{
225 int cpu;
226
227 for_each_online_cpu(cpu)
228 stop_stack_timer(cpu);
229}
230
231static void stop_stack_trace(struct trace_array *tr)
232{
233 mutex_lock(&sample_timer_lock);
234 stop_stack_timers();
235 tracer_enabled = 0;
236 mutex_unlock(&sample_timer_lock);
237}
238
239static int stack_trace_init(struct trace_array *tr)
240{
241 sysprof_trace = tr;
242
243 tracing_start_cmdline_record();
244
245 mutex_lock(&sample_timer_lock);
246 start_stack_timers();
247 tracer_enabled = 1;
248 mutex_unlock(&sample_timer_lock);
249 return 0;
250}
251
252static void stack_trace_reset(struct trace_array *tr)
253{
254 tracing_stop_cmdline_record();
255 stop_stack_trace(tr);
256}
257
258static struct tracer stack_trace __read_mostly =
259{
260 .name = "sysprof",
261 .init = stack_trace_init,
262 .reset = stack_trace_reset,
263#ifdef CONFIG_FTRACE_SELFTEST
264 .selftest = trace_selftest_startup_sysprof,
265#endif
266};
267
268__init static int init_stack_trace(void)
269{
270 return register_tracer(&stack_trace);
271}
272device_initcall(init_stack_trace);
273
274#define MAX_LONG_DIGITS 22
275
276static ssize_t
277sysprof_sample_read(struct file *filp, char __user *ubuf,
278 size_t cnt, loff_t *ppos)
279{
280 char buf[MAX_LONG_DIGITS];
281 int r;
282
283 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
284
285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
286}
287
288static ssize_t
289sysprof_sample_write(struct file *filp, const char __user *ubuf,
290 size_t cnt, loff_t *ppos)
291{
292 char buf[MAX_LONG_DIGITS];
293 unsigned long val;
294
295 if (cnt > MAX_LONG_DIGITS-1)
296 cnt = MAX_LONG_DIGITS-1;
297
298 if (copy_from_user(&buf, ubuf, cnt))
299 return -EFAULT;
300
301 buf[cnt] = 0;
302
303 val = simple_strtoul(buf, NULL, 10);
304 /*
305 * Enforce a minimum sample period of 100 usecs:
306 */
307 if (val < 100)
308 val = 100;
309
310 mutex_lock(&sample_timer_lock);
311 stop_stack_timers();
312 sample_period = val * 1000;
313 start_stack_timers();
314 mutex_unlock(&sample_timer_lock);
315
316 return cnt;
317}
318
319static const struct file_operations sysprof_sample_fops = {
320 .read = sysprof_sample_read,
321 .write = sysprof_sample_write,
322};
323
324void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
325{
326
327 trace_create_file("sysprof_sample_period", 0644,
328 d_tracer, NULL, &sysprof_sample_fops);
329}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
 96 * or the creator of one of its parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
 117 * or the creator of one of its parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
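Both mapping helpers above walk from the target namespace toward init_user_ns, return 0 (root) when the caller's user created the namespace or one of its ancestors, and otherwise fall back to the overflow ID. A userspace sketch of that walk with a toy namespace chain; the struct layout and field names here are invented and differ from the kernel's.

#include <stdio.h>

#define OVERFLOW_UID 65534u

/* Toy namespace: each one remembers which user created it and a parent
 * pointer for the walk (NULL marks the init namespace). */
struct user;
struct user_ns {
	struct user *creator;
	struct user_ns *parent;
};
struct user { struct user_ns *ns; };

static unsigned int map_uid(struct user_ns *to, struct user *caller, unsigned int uid)
{
	if (to == caller->ns)
		return uid;		/* same namespace: identity mapping */

	for (struct user_ns *tmp = to; tmp->parent; tmp = tmp->parent)
		if (tmp->creator == caller)
			return 0;	/* caller created it (or an ancestor) */

	return OVERFLOW_UID;		/* no useful relationship: overflow uid */
}

int main(void)
{
	struct user_ns init_ns = { 0 };
	struct user admin = { .ns = &init_ns };
	struct user_ns child = { .creator = &admin, .parent = &init_ns };
	struct user outsider = { .ns = &child };

	printf("creator sees uid: %u\n", map_uid(&child, &admin, 1000));
	printf("outsider in init_ns: %u\n", map_uid(&init_ns, &outsider, 1000));
	return 0;
}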
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..613bc1f04610
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,567 @@
1/*
2 * Detect hard and soft lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
 6 * this code detects hard lockups: incidents where a CPU stops
 7 * responding to anything except NMI.
8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well.
13 */
14
15#include <linux/mm.h>
16#include <linux/cpu.h>
17#include <linux/nmi.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/freezer.h>
21#include <linux/kthread.h>
22#include <linux/lockdep.h>
23#include <linux/notifier.h>
24#include <linux/module.h>
25#include <linux/sysctl.h>
26
27#include <asm/irq_regs.h>
28#include <linux/perf_event.h>
29
30int watchdog_enabled;
31int __read_mostly softlockup_thresh = 60;
32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif
45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog;
48
49
50/* boot commands */
51/*
52 * Should we panic when a soft-lockup or hard-lockup occurs:
53 */
54#ifdef CONFIG_HARDLOCKUP_DETECTOR
55static int hardlockup_panic;
56
57static int __init hardlockup_panic_setup(char *str)
58{
59 if (!strncmp(str, "panic", 5))
60 hardlockup_panic = 1;
61 return 1;
62}
63__setup("nmi_watchdog=", hardlockup_panic_setup);
64#endif
65
66unsigned int __read_mostly softlockup_panic =
67 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
68
69static int __init softlockup_panic_setup(char *str)
70{
71 softlockup_panic = simple_strtoul(str, NULL, 0);
72
73 return 1;
74}
75__setup("softlockup_panic=", softlockup_panic_setup);
76
77static int __init nowatchdog_setup(char *str)
78{
79 no_watchdog = 1;
80 return 1;
81}
82__setup("nowatchdog", nowatchdog_setup);
83
84/* deprecated */
85static int __init nosoftlockup_setup(char *str)
86{
87 no_watchdog = 1;
88 return 1;
89}
90__setup("nosoftlockup", nosoftlockup_setup);
91/* */
92
93
94/*
95 * Returns seconds, approximately. We don't need nanosecond
96 * resolution, and we don't need to waste time with a big divide when
97 * 2^30ns == 1.074s.
98 */
99static unsigned long get_timestamp(int this_cpu)
100{
101 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
102}
103
104static unsigned long get_sample_period(void)
105{
 106 /*
 107 * convert softlockup_thresh from seconds to ns;
 108 * the divide by 5 gives the hrtimer 5 chances to
 109 * increment before the hardlockup detector generates
 110 * a warning
 111 */
112 return softlockup_thresh / 5 * NSEC_PER_SEC;
113}
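get_timestamp() above approximates seconds by shifting a nanosecond clock right by 30 (2^30 ns ≈ 1.074 s), and get_sample_period() spreads softlockup_thresh over five hrtimer firings. A userspace sketch of both calculations using clock_gettime(); it deliberately uses a 64-bit type for the period, since 12×10^9 ns does not fit in 32 bits.

#include <stdio.h>
#include <time.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

static int softlockup_thresh = 60;	/* seconds, as in the file above */

/* Approximate seconds: shift by 30 instead of dividing by 1e9, trading a
 * ~7% error for avoiding a 64-bit divide on the hot path. */
static unsigned long get_timestamp(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)(((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec) >> 30);
}

/* Sample period in ns: give the hrtimer five chances to fire before the
 * hardlockup check would complain. */
static uint64_t get_sample_period(void)
{
	return (uint64_t)softlockup_thresh / 5 * NSEC_PER_SEC;
}

int main(void)
{
	printf("now ~ %lu 'seconds' (2^30 ns units)\n", get_timestamp());
	printf("sample period: %llu ns\n",
	       (unsigned long long)get_sample_period());
	return 0;
}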
114
115/* Commands for resetting the watchdog */
116static void __touch_watchdog(void)
117{
118 int this_cpu = smp_processor_id();
119
120 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
121}
122
123void touch_softlockup_watchdog(void)
124{
125 __get_cpu_var(watchdog_touch_ts) = 0;
126}
127EXPORT_SYMBOL(touch_softlockup_watchdog);
128
129void touch_all_softlockup_watchdogs(void)
130{
131 int cpu;
132
 133 /*
 134 * this is done locklessly; if a 0 races with a timestamp,
 135 * all it means is that the softlockup check starts one
 136 * cycle later
 137 */
138 for_each_online_cpu(cpu)
139 per_cpu(watchdog_touch_ts, cpu) = 0;
140}
141
142#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void)
144{
145 __get_cpu_var(watchdog_nmi_touch) = true;
146 touch_softlockup_watchdog();
147}
148EXPORT_SYMBOL(touch_nmi_watchdog);
149
150#endif
151
152void touch_softlockup_watchdog_sync(void)
153{
154 __raw_get_cpu_var(softlockup_touch_sync) = true;
155 __raw_get_cpu_var(watchdog_touch_ts) = 0;
156}
157
158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159/* watchdog detector functions */
160static int is_hardlockup(void)
161{
162 unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
163
164 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
165 return 1;
166
167 __get_cpu_var(hrtimer_interrupts_saved) = hrint;
168 return 0;
169}
170#endif
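is_hardlockup() above compares the hrtimer interrupt count against the value saved at the previous NMI sample; if the counter has not advanced, timer interrupts have stopped and the CPU is considered stuck. A small standalone sketch of that "has the heartbeat advanced" check (single-threaded, so plain globals stand in for the per-CPU variables):

#include <stdio.h>

static unsigned long hrtimer_interrupts;
static unsigned long hrtimer_interrupts_saved;

/* Called from the periodic (NMI-driven) check: stuck if the timer-side
 * counter has not advanced since the last call. */
static int is_hardlockup(void)
{
	unsigned long hrint = hrtimer_interrupts;

	if (hrtimer_interrupts_saved == hrint)
		return 1;
	hrtimer_interrupts_saved = hrint;
	return 0;
}

int main(void)
{
	/* Healthy case: the timer path keeps incrementing the counter. */
	hrtimer_interrupts++;
	printf("after tick:  lockup? %d\n", is_hardlockup());

	/* Stuck case: no tick between two checks. */
	printf("no tick:     lockup? %d\n", is_hardlockup());
	return 0;
}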
171
172static int is_softlockup(unsigned long touch_ts)
173{
174 unsigned long now = get_timestamp(smp_processor_id());
175
176 /* Warn about unreasonable delays: */
177 if (time_after(now, touch_ts + softlockup_thresh))
178 return now - touch_ts;
179
180 return 0;
181}
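is_softlockup() above returns the stall duration once the per-CPU touch timestamp is more than softlockup_thresh "seconds" old, using time_after() so the comparison survives counter wrap. A sketch of a wrap-safe comparison in the spirit of time_after(), plus the threshold check (helper names are mine):

#include <stdio.h>

/* Wrap-safe "a is after b": correct as long as the two values are within
 * half the counter range of each other. */
static int time_after_ul(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

static unsigned long softlockup_thresh = 60;

static unsigned long check_softlockup(unsigned long now, unsigned long touch_ts)
{
	if (time_after_ul(now, touch_ts + softlockup_thresh))
		return now - touch_ts;	/* stall duration */
	return 0;
}

int main(void)
{
	printf("recent touch:  %lu\n", check_softlockup(100, 90));
	printf("stalled 70s:   %lu\n", check_softlockup(170, 100));
	/* Wraparound: touched just before the counter rolled over. */
	printf("across wrap:   %lu\n", check_softlockup(50, (unsigned long)-20));
	return 0;
}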
182
183static int
184watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
185{
186 did_panic = 1;
187
188 return NOTIFY_DONE;
189}
190
191static struct notifier_block panic_block = {
192 .notifier_call = watchdog_panic,
193};
194
195#ifdef CONFIG_HARDLOCKUP_DETECTOR
196static struct perf_event_attr wd_hw_attr = {
197 .type = PERF_TYPE_HARDWARE,
198 .config = PERF_COUNT_HW_CPU_CYCLES,
199 .size = sizeof(struct perf_event_attr),
200 .pinned = 1,
201 .disabled = 1,
202};
203
204/* Callback function for perf event subsystem */
205void watchdog_overflow_callback(struct perf_event *event, int nmi,
206 struct perf_sample_data *data,
207 struct pt_regs *regs)
208{
209 if (__get_cpu_var(watchdog_nmi_touch) == true) {
210 __get_cpu_var(watchdog_nmi_touch) = false;
211 return;
212 }
213
214 /* check for a hardlockup
215 * This is done by making sure our timer interrupt
216 * is incrementing. The timer interrupt should have
 217 * fired multiple times before we overflowed. If it hasn't,
218 * then this is a good indication the cpu is stuck
219 */
220 if (is_hardlockup()) {
221 int this_cpu = smp_processor_id();
222
223 /* only print hardlockups once */
224 if (__get_cpu_var(hard_watchdog_warn) == true)
225 return;
226
227 if (hardlockup_panic)
228 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
229 else
230 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
231
232 __get_cpu_var(hard_watchdog_warn) = true;
233 return;
234 }
235
236 __get_cpu_var(hard_watchdog_warn) = false;
237 return;
238}
239static void watchdog_interrupt_count(void)
240{
241 __get_cpu_var(hrtimer_interrupts)++;
242}
243#else
244static inline void watchdog_interrupt_count(void) { return; }
245#endif /* CONFIG_HARDLOCKUP_DETECTOR */
246
247/* watchdog kicker functions */
248static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
249{
250 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
251 struct pt_regs *regs = get_irq_regs();
252 int duration;
253
254 /* kick the hardlockup detector */
255 watchdog_interrupt_count();
256
257 /* kick the softlockup detector */
258 wake_up_process(__get_cpu_var(softlockup_watchdog));
259
260 /* .. and repeat */
261 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
262
263 if (touch_ts == 0) {
264 if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
265 /*
266 * If the time stamp was touched atomically
267 * make sure the scheduler tick is up to date.
268 */
269 __get_cpu_var(softlockup_touch_sync) = false;
270 sched_clock_tick();
271 }
272 __touch_watchdog();
273 return HRTIMER_RESTART;
274 }
275
276 /* check for a softlockup
277 * This is done by making sure a high priority task is
278 * being scheduled. The task touches the watchdog to
279 * indicate it is getting cpu time. If it hasn't then
280 * this is a good indication some task is hogging the cpu
281 */
282 duration = is_softlockup(touch_ts);
283 if (unlikely(duration)) {
284 /* only warn once */
285 if (__get_cpu_var(soft_watchdog_warn) == true)
286 return HRTIMER_RESTART;
287
288 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
289 smp_processor_id(), duration,
290 current->comm, task_pid_nr(current));
291 print_modules();
292 print_irqtrace_events(current);
293 if (regs)
294 show_regs(regs);
295 else
296 dump_stack();
297
298 if (softlockup_panic)
299 panic("softlockup: hung tasks");
300 __get_cpu_var(soft_watchdog_warn) = true;
301 } else
302 __get_cpu_var(soft_watchdog_warn) = false;
303
304 return HRTIMER_RESTART;
305}
306
307
308/*
309 * The watchdog thread - touches the timestamp.
310 */
311static int watchdog(void *unused)
312{
313 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
314 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
315
316 sched_setscheduler(current, SCHED_FIFO, &param);
317
318 /* initialize timestamp */
319 __touch_watchdog();
320
321 /* kick off the timer for the hardlockup detector */
322 /* done here because hrtimer_start can only pin to smp_processor_id() */
323 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
324 HRTIMER_MODE_REL_PINNED);
325
326 set_current_state(TASK_INTERRUPTIBLE);
327 /*
328 * Run briefly once per second to reset the softlockup timestamp.
329 * If this gets delayed for more than 60 seconds then the
330 * debug-printout triggers in watchdog_timer_fn().
331 */
332 while (!kthread_should_stop()) {
333 __touch_watchdog();
334 schedule();
335
336 if (kthread_should_stop())
337 break;
338
339 set_current_state(TASK_INTERRUPTIBLE);
340 }
341 __set_current_state(TASK_RUNNING);
342
343 return 0;
344}
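The watchdog() thread above runs as a maximum-priority SCHED_FIFO task that refreshes the timestamp, sleeps, and repeats; the per-CPU hrtimer wakes it each sample period, so a stamp that stops moving means the thread could not get the CPU. A rough userspace analogue of that loop using a POSIX thread (no real-time scheduling setup, fixed one-second sleeps; only the loop structure is the point):

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <stdatomic.h>

static atomic_ulong touch_ts;		/* last time the worker got to run */
static atomic_int stop;

static unsigned long now_seconds(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec;
}

/* Analogue of the watchdog kthread: wake up, refresh the timestamp, sleep.
 * In the kernel the wakeup comes from the hrtimer; here we just sleep. */
static void *watchdog_worker(void *unused)
{
	(void)unused;
	while (!atomic_load(&stop)) {
		struct timespec d = { .tv_sec = 1, .tv_nsec = 0 };

		atomic_store(&touch_ts, now_seconds());
		nanosleep(&d, NULL);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;

	atomic_store(&touch_ts, now_seconds());
	pthread_create(&tid, NULL, watchdog_worker, NULL);

	/* Checker side: a stale timestamp would mean the worker is being
	 * starved, which is exactly what the soft-lockup test keys on. */
	for (int i = 0; i < 3; i++) {
		struct timespec d = { .tv_sec = 1, .tv_nsec = 0 };

		nanosleep(&d, NULL);
		printf("age of last touch: %lus\n",
		       now_seconds() - atomic_load(&touch_ts));
	}
	atomic_store(&stop, 1);
	pthread_join(tid, NULL);
	return 0;
}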
345
346
347#ifdef CONFIG_HARDLOCKUP_DETECTOR
348static int watchdog_nmi_enable(int cpu)
349{
350 struct perf_event_attr *wd_attr;
351 struct perf_event *event = per_cpu(watchdog_ev, cpu);
352
353 /* is it already setup and enabled? */
354 if (event && event->state > PERF_EVENT_STATE_OFF)
355 goto out;
356
357 /* it is setup but not enabled */
358 if (event != NULL)
359 goto out_enable;
360
361 /* Try to register using hardware perf events */
362 wd_attr = &wd_hw_attr;
363 wd_attr->sample_period = hw_nmi_get_sample_period();
364 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
365 if (!IS_ERR(event)) {
366 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
367 goto out_save;
368 }
369
370 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
371 return -1;
372
373 /* success path */
374out_save:
375 per_cpu(watchdog_ev, cpu) = event;
376out_enable:
377 perf_event_enable(per_cpu(watchdog_ev, cpu));
378out:
379 return 0;
380}
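watchdog_nmi_enable() above programs a pinned hardware cycles counter whose overflow callback runs the hardlockup check. From userspace, a roughly analogous counter can be opened with the perf_event_open() syscall; the sketch below only opens and reads a CPU-cycles counter for the current task (no overflow or NMI handling, which has no userspace equivalent), using the same type/config/pinned/disabled attributes shown in the hunk.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	/* Same event the watchdog uses: hardware CPU cycles, pinned and
	 * initially disabled. */
	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.size = sizeof(attr);
	attr.pinned = 1;
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	for (volatile int i = 0; i < 1000000; i++)
		;				/* burn some cycles */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %lld\n", count);
	close(fd);
	return 0;
}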
381
382static void watchdog_nmi_disable(int cpu)
383{
384 struct perf_event *event = per_cpu(watchdog_ev, cpu);
385
386 if (event) {
387 perf_event_disable(event);
388 per_cpu(watchdog_ev, cpu) = NULL;
389
390 /* should be in cleanup, but blocks oprofile */
391 perf_event_release_kernel(event);
392 }
393 return;
394}
395#else
396static int watchdog_nmi_enable(int cpu) { return 0; }
397static void watchdog_nmi_disable(int cpu) { return; }
398#endif /* CONFIG_HARDLOCKUP_DETECTOR */
399
400/* prepare/enable/disable routines */
401static int watchdog_prepare_cpu(int cpu)
402{
403 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
404
405 WARN_ON(per_cpu(softlockup_watchdog, cpu));
406 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
407 hrtimer->function = watchdog_timer_fn;
408
409 return 0;
410}
411
412static int watchdog_enable(int cpu)
413{
414 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
415
416 /* enable the perf event */
417 if (watchdog_nmi_enable(cpu) != 0)
418 return -1;
419
420 /* create the watchdog thread */
421 if (!p) {
422 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
423 if (IS_ERR(p)) {
424 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
425 return -1;
426 }
427 kthread_bind(p, cpu);
428 per_cpu(watchdog_touch_ts, cpu) = 0;
429 per_cpu(softlockup_watchdog, cpu) = p;
430 wake_up_process(p);
431 }
432
433 return 0;
434}
435
436static void watchdog_disable(int cpu)
437{
438 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
439 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
440
441 /*
442 * cancel the timer first to stop incrementing the stats
443 * and waking up the kthread
444 */
445 hrtimer_cancel(hrtimer);
446
447 /* disable the perf event */
448 watchdog_nmi_disable(cpu);
449
450 /* stop the watchdog thread */
451 if (p) {
452 per_cpu(softlockup_watchdog, cpu) = NULL;
453 kthread_stop(p);
454 }
455
456 /* if any cpu succeeds, watchdog is considered enabled for the system */
457 watchdog_enabled = 1;
458}
459
460static void watchdog_enable_all_cpus(void)
461{
462 int cpu;
463 int result = 0;
464
465 for_each_online_cpu(cpu)
466 result += watchdog_enable(cpu);
467
468 if (result)
469 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
470
471}
472
473static void watchdog_disable_all_cpus(void)
474{
475 int cpu;
476
477 for_each_online_cpu(cpu)
478 watchdog_disable(cpu);
479
480 /* if all watchdogs are disabled, then they are disabled for the system */
481 watchdog_enabled = 0;
482}
483
484
485/* sysctl functions */
486#ifdef CONFIG_SYSCTL
487/*
488 * proc handler for /proc/sys/kernel/nmi_watchdog
489 */
490
491int proc_dowatchdog_enabled(struct ctl_table *table, int write,
492 void __user *buffer, size_t *length, loff_t *ppos)
493{
494 proc_dointvec(table, write, buffer, length, ppos);
495
496 if (watchdog_enabled)
497 watchdog_enable_all_cpus();
498 else
499 watchdog_disable_all_cpus();
500 return 0;
501}
502
503int proc_dowatchdog_thresh(struct ctl_table *table, int write,
504 void __user *buffer,
505 size_t *lenp, loff_t *ppos)
506{
507 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
508}
509#endif /* CONFIG_SYSCTL */
510
511
512/*
513 * Create/destroy watchdog threads as CPUs come and go:
514 */
515static int __cpuinit
516cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
517{
518 int hotcpu = (unsigned long)hcpu;
519
520 switch (action) {
521 case CPU_UP_PREPARE:
522 case CPU_UP_PREPARE_FROZEN:
523 if (watchdog_prepare_cpu(hotcpu))
524 return NOTIFY_BAD;
525 break;
526 case CPU_ONLINE:
527 case CPU_ONLINE_FROZEN:
528 if (watchdog_enable(hotcpu))
529 return NOTIFY_BAD;
530 break;
531#ifdef CONFIG_HOTPLUG_CPU
532 case CPU_UP_CANCELED:
533 case CPU_UP_CANCELED_FROZEN:
534 watchdog_disable(hotcpu);
535 break;
536 case CPU_DEAD:
537 case CPU_DEAD_FROZEN:
538 watchdog_disable(hotcpu);
539 break;
540#endif /* CONFIG_HOTPLUG_CPU */
541 }
542 return NOTIFY_OK;
543}
544
545static struct notifier_block __cpuinitdata cpu_nfb = {
546 .notifier_call = cpu_callback
547};
548
549static int __init spawn_watchdog_task(void)
550{
551 void *cpu = (void *)(long)smp_processor_id();
552 int err;
553
554 if (no_watchdog)
555 return 0;
556
557 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
558 WARN_ON(err == NOTIFY_BAD);
559
560 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
561 register_cpu_notifier(&cpu_nfb);
562
563 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
564
565 return 0;
566}
567early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..59fef1531dd2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,21 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_LOCKDEP
72/**
73 * in_workqueue_context() - in context of specified workqueue?
74 * @wq: the workqueue of interest
75 *
76 * Checks lockdep state to see if the current task is executing from
77 * within a workqueue item. This function exists only if lockdep is
78 * enabled.
79 */
80int in_workqueue_context(struct workqueue_struct *wq)
81{
82 return lock_is_held(&wq->lockdep_map);
83}
84#endif
85
71#ifdef CONFIG_DEBUG_OBJECTS_WORK 86#ifdef CONFIG_DEBUG_OBJECTS_WORK
72 87
73static struct debug_obj_descr work_debug_descr; 88static struct debug_obj_descr work_debug_descr;
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..af040babb742
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,16 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7static inline void wq_worker_waking_up(struct task_struct *task,
8 unsigned int cpu)
9{
10}
11
12static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
13 unsigned int cpu)
14{
15 return NULL;
16}