author    Jiri Kosina <jkosina@suse.cz>    2010-06-16 12:08:13 -0400
committer Jiri Kosina <jkosina@suse.cz>    2010-06-16 12:08:13 -0400
commit    f1bbbb6912662b9f6070c5bfc4ca9eb1f06a9d5b (patch)
tree      c2c130a74be25b0b2dff992e1a195e2728bdaadd /kernel
parent    fd0961ff67727482bb20ca7e8ea97b83e9de2ddb (diff)
parent    7e27d6e778cd87b6f2415515d7127eba53fe5d02 (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup.c | 3
-rw-r--r-- kernel/cpu.c | 133
-rw-r--r-- kernel/cpuset.c | 78
-rw-r--r-- kernel/cred.c | 60
-rw-r--r-- kernel/debug/kdb/kdb_main.c | 12
-rw-r--r-- kernel/exec_domain.c | 18
-rw-r--r-- kernel/exit.c | 42
-rw-r--r-- kernel/fork.c | 51
-rw-r--r-- kernel/hrtimer.c | 2
-rw-r--r-- kernel/kmod.c | 193
-rw-r--r-- kernel/module.c | 329
-rw-r--r-- kernel/mutex.c | 7
-rw-r--r-- kernel/padata.c | 189
-rw-r--r-- kernel/panic.c | 27
-rw-r--r-- kernel/perf_event.c | 753
-rw-r--r-- kernel/pid.c | 7
-rw-r--r-- kernel/posix-cpu-timers.c | 12
-rw-r--r-- kernel/posix-timers.c | 11
-rw-r--r-- kernel/profile.c | 8
-rw-r--r-- kernel/ptrace.c | 26
-rw-r--r-- kernel/relay.c | 17
-rw-r--r-- kernel/resource.c | 16
-rw-r--r-- kernel/sched.c | 49
-rw-r--r-- kernel/sched_clock.c | 1
-rw-r--r-- kernel/sched_debug.c | 10
-rw-r--r-- kernel/sched_fair.c | 22
-rw-r--r-- kernel/signal.c | 23
-rw-r--r-- kernel/smp.c | 2
-rw-r--r-- kernel/softirq.c | 4
-rw-r--r-- kernel/stop_machine.c | 2
-rw-r--r-- kernel/sys.c | 6
-rw-r--r-- kernel/sysctl.c | 76
-rw-r--r-- kernel/sysctl_binary.c | 9
-rw-r--r-- kernel/time.c | 8
-rw-r--r-- kernel/timer.c | 22
-rw-r--r-- kernel/trace/blktrace.c | 140
-rw-r--r-- kernel/trace/ftrace.c | 7
-rw-r--r-- kernel/trace/kmemtrace.c | 70
-rw-r--r-- kernel/trace/ring_buffer.c | 19
-rw-r--r-- kernel/trace/trace.c | 75
-rw-r--r-- kernel/trace/trace.h | 9
-rw-r--r-- kernel/trace/trace_branch.c | 8
-rw-r--r-- kernel/trace/trace_event_perf.c | 190
-rw-r--r-- kernel/trace/trace_events.c | 139
-rw-r--r-- kernel/trace/trace_events_filter.c | 28
-rw-r--r-- kernel/trace/trace_export.c | 16
-rw-r--r-- kernel/trace/trace_functions_graph.c | 13
-rw-r--r-- kernel/trace/trace_kprobe.c | 113
-rw-r--r-- kernel/trace/trace_output.c | 137
-rw-r--r-- kernel/trace/trace_output.h | 2
-rw-r--r-- kernel/trace/trace_sched_switch.c | 20
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 28
-rw-r--r-- kernel/trace/trace_syscalls.c | 146
-rw-r--r-- kernel/trace/trace_workqueue.c | 26
-rw-r--r-- kernel/tracepoint.c | 91
-rw-r--r-- kernel/workqueue.c | 9
56 files changed, 2085 insertions(+), 1429 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 291775021b2e..3ac6f5b0a64b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2994,7 +2994,6 @@ static void cgroup_event_remove(struct work_struct *work)
 			remove);
 	struct cgroup *cgrp = event->cgrp;
 
-	/* TODO: check return code */
 	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
 
 	eventfd_ctx_put(event->eventfd);
@@ -4599,7 +4598,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
 	parent_css = parent->subsys[subsys_id];
 	child_css = child->subsys[subsys_id];
 	parent_id = parent_css->id;
-	depth = parent_id->depth;
+	depth = parent_id->depth + 1;
 
 	child_id = get_new_cssid(ss, depth);
 	if (IS_ERR(child_id))
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 545777574779..97d1b426a4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,13 +20,29 @@
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+/*
+ * The following two API's must be used when attempting
+ * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ */
+void cpu_maps_update_begin(void)
+{
+	mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+	mutex_unlock(&cpu_add_remove_lock);
+}
+
+static RAW_NOTIFIER_HEAD(cpu_chain);
 
 /* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
  * Should always be manipulated under cpu_add_remove_lock
  */
 static int cpu_hotplug_disabled;
 
+#ifdef CONFIG_HOTPLUG_CPU
+
 static struct {
 	struct task_struct *active_writer;
 	struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -41,8 +57,6 @@ static struct {
 	.refcount = 0,
 };
 
-#ifdef CONFIG_HOTPLUG_CPU
-
 void get_online_cpus(void)
 {
 	might_sleep();
@@ -67,22 +81,6 @@ void put_online_cpus(void)
 }
 EXPORT_SYMBOL_GPL(put_online_cpus);
 
-#endif	/* CONFIG_HOTPLUG_CPU */
-
-/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
- */
-void cpu_maps_update_begin(void)
-{
-	mutex_lock(&cpu_add_remove_lock);
-}
-
-void cpu_maps_update_done(void)
-{
-	mutex_unlock(&cpu_add_remove_lock);
-}
-
 /*
  * This ensures that the hotplug operation can begin only when the
  * refcount goes to zero.
@@ -124,6 +122,12 @@ static void cpu_hotplug_done(void)
 	cpu_hotplug.active_writer = NULL;
 	mutex_unlock(&cpu_hotplug.lock);
 }
+
+#else /* #if CONFIG_HOTPLUG_CPU */
+static void cpu_hotplug_begin(void) {}
+static void cpu_hotplug_done(void) {}
+#endif	/* #esle #if CONFIG_HOTPLUG_CPU */
+
 /* Need to know about CPUs going up/down? */
 int __ref register_cpu_notifier(struct notifier_block *nb)
 {
@@ -134,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
 	return ret;
 }
 
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+			int *nr_calls)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+					nr_calls);
+
+	return notifier_to_errno(ret);
+}
+
+static int cpu_notify(unsigned long val, void *v)
+{
+	return __cpu_notify(val, v, -1, NULL);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+	BUG_ON(cpu_notify(val, v));
+}
+
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -181,8 +206,7 @@ static int __ref take_cpu_down(void *_param)
 	if (err < 0)
 		return err;
 
-	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-				param->hcpu);
+	cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
 	if (task_cpu(param->caller) == cpu)
 		move_task_off_dead_cpu(cpu, param->caller);
@@ -212,17 +236,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	cpu_hotplug_begin();
 	set_cpu_active(cpu, false);
-	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
-					hcpu, -1, &nr_calls);
-	if (err == NOTIFY_BAD) {
+	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+	if (err) {
 		set_cpu_active(cpu, true);
 
 		nr_calls--;
-		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
-					  hcpu, nr_calls, NULL);
+		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
 		       __func__, cpu);
-		err = -EINVAL;
 		goto out_release;
 	}
 
@@ -230,9 +251,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (err) {
 		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
-					    hcpu) == NOTIFY_BAD)
-			BUG();
+		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
 		goto out_release;
 	}
@@ -246,19 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
-	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
-				    hcpu) == NOTIFY_BAD)
-		BUG();
+	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 
 	check_for_tasks(cpu);
 
 out_release:
 	cpu_hotplug_done();
-	if (!err) {
-		if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
-					    hcpu) == NOTIFY_BAD)
-			BUG();
-	}
+	if (!err)
+		cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
 	return err;
 }
 
@@ -293,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-					-1, &nr_calls);
-	if (ret == NOTIFY_BAD) {
+	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+	if (ret) {
 		nr_calls--;
 		printk("%s: attempt to bring up CPU %u failed\n",
 		       __func__, cpu);
-		ret = -EINVAL;
 		goto out_notify;
 	}
 
@@ -312,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	set_cpu_active(cpu, true);
 
 	/* Now call notifier in preparation. */
-	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
+	cpu_notify(CPU_ONLINE | mod, hcpu);
 
 out_notify:
 	if (ret != 0)
-		__raw_notifier_call_chain(&cpu_chain,
-				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
 	cpu_hotplug_done();
 
 	return ret;
@@ -326,6 +337,12 @@ out_notify:
 int __cpuinit cpu_up(unsigned int cpu)
 {
 	int err = 0;
+
+#ifdef	CONFIG_MEMORY_HOTPLUG
+	int nid;
+	pg_data_t	*pgdat;
+#endif
+
 	if (!cpu_possible(cpu)) {
 		printk(KERN_ERR "can't online cpu %d because it is not "
 			"configured as may-hotadd at boot time\n", cpu);
@@ -336,6 +353,28 @@ int __cpuinit cpu_up(unsigned int cpu)
 		return -EINVAL;
 	}
 
+#ifdef	CONFIG_MEMORY_HOTPLUG
+	nid = cpu_to_node(cpu);
+	if (!node_online(nid)) {
+		err = mem_online_node(nid);
+		if (err)
+			return err;
+	}
+
+	pgdat = NODE_DATA(nid);
+	if (!pgdat) {
+		printk(KERN_ERR
+			"Can't online cpu %d due to NULL pgdat\n", cpu);
+		return -ENOMEM;
+	}
+
+	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+		mutex_lock(&zonelists_mutex);
+		build_all_zonelists(NULL);
+		mutex_unlock(&zonelists_mutex);
+	}
+#endif
+
 	cpu_maps_update_begin();
 
 	if (cpu_hotplug_disabled) {
@@ -355,7 +394,7 @@ static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
-	int cpu, first_cpu, error;
+	int cpu, first_cpu, error = 0;
 
 	cpu_maps_update_begin();
 	first_cpu = cpumask_first(cpu_online_mask);
@@ -453,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
 	if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
 		val = CPU_STARTING_FROZEN;
 #endif /* CONFIG_PM_SLEEP_SMP */
-	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+	cpu_notify(val, (void *)(long)cpu);
 }
 
 #endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1a109788592f..7cb37d86a005 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  * In order to avoid seeing no nodes if the old and new nodes are disjoint,
  * we structure updates as setting all new allowed nodes, then clearing newly
  * disallowed ones.
- *
- * Called with task's alloc_lock held
  */
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
+repeat:
+	/*
+	 * Allow tasks that have access to memory reserves because they have
+	 * been OOM killed to get memory anywhere.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)))
+		return;
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return;
+
+	task_lock(tsk);
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, &tsk->mems_allowed);
-	mpol_rebind_task(tsk, newmems);
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+
+	/*
+	 * ensure checking ->mems_allowed_change_disable after setting all new
+	 * allowed nodes.
+	 *
+	 * the read-side task can see an nodemask with new allowed nodes and
+	 * old allowed nodes. and if it allocates page when cpuset clears newly
+	 * disallowed ones continuous, it can see the new allowed bits.
+	 *
+	 * And if setting all new allowed nodes is after the checking, setting
+	 * all new allowed nodes and clearing newly disallowed ones will be done
+	 * continuous, and the read-side task may find no node to alloc page.
+	 */
+	smp_mb();
+
+	/*
+	 * Allocation of memory is very fast, we needn't sleep when waiting
+	 * for the read-side.
+	 */
+	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+		task_unlock(tsk);
+		if (!task_curr(tsk))
+			yield();
+		goto repeat;
+	}
+
+	/*
+	 * ensure checking ->mems_allowed_change_disable before clearing all new
+	 * disallowed nodes.
+	 *
+	 * if clearing newly disallowed bits before the checking, the read-side
+	 * task may find no node to alloc page.
+	 */
+	smp_mb();
+
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
+	task_unlock(tsk);
 }
 
 /*
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	cs = cgroup_cs(scan->cg);
 	guarantee_online_mems(cs, newmems);
 
-	task_lock(p);
 	cpuset_change_task_nodemask(p, newmems);
-	task_unlock(p);
 
 	NODEMASK_FREE(newmems);
 
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	task_lock(tsk);
 	cpuset_change_task_nodemask(tsk, to);
-	task_unlock(tsk);
 	cpuset_update_task_spread_flag(cs, tsk);
 
 }
@@ -2427,7 +2469,8 @@ void cpuset_unlock(void)
 }
 
 /**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
  *
  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
  * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2452,16 +2495,27 @@ void cpuset_unlock(void)
  * See kmem_cache_alloc_node().
  */
 
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
 {
 	int node;
 
-	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	node = next_node(*rotor, current->mems_allowed);
 	if (node == MAX_NUMNODES)
 		node = first_node(current->mems_allowed);
-	current->cpuset_mem_spread_rotor = node;
+	*rotor = node;
 	return node;
 }
+
+int cpuset_mem_spread_node(void)
+{
+	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 
 /**
diff --git a/kernel/cred.c b/kernel/cred.c
index 2c24870c55d1..a2d5504fbcc2 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -347,66 +347,6 @@ struct cred *prepare_exec_creds(void)
 }
 
 /*
- * prepare new credentials for the usermode helper dispatcher
- */
-struct cred *prepare_usermodehelper_creds(void)
-{
-#ifdef CONFIG_KEYS
-	struct thread_group_cred *tgcred = NULL;
-#endif
-	struct cred *new;
-
-#ifdef CONFIG_KEYS
-	tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
-	if (!tgcred)
-		return NULL;
-#endif
-
-	new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
-	if (!new)
-		goto free_tgcred;
-
-	kdebug("prepare_usermodehelper_creds() alloc %p", new);
-
-	memcpy(new, &init_cred, sizeof(struct cred));
-
-	atomic_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
-	get_group_info(new->group_info);
-	get_uid(new->user);
-
-#ifdef CONFIG_KEYS
-	new->thread_keyring = NULL;
-	new->request_key_auth = NULL;
-	new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
-
-	atomic_set(&tgcred->usage, 1);
-	spin_lock_init(&tgcred->lock);
-	new->tgcred = tgcred;
-#endif
-
-#ifdef CONFIG_SECURITY
-	new->security = NULL;
-#endif
-	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
-		goto error;
-	validate_creds(new);
-
-	BUG_ON(atomic_read(&new->usage) != 1);
-	return new;
-
-error:
-	put_cred(new);
-	return NULL;
-
-free_tgcred:
-#ifdef CONFIG_KEYS
-	kfree(tgcred);
-#endif
-	return NULL;
-}
-
-/*
  * Copy credentials for the new process created by fork()
  *
  * We share if we can, but under some circumstances we have to generate a new
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index b724c791b6d4..184cd8209c36 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1857,12 +1857,6 @@ static int kdb_ef(int argc, const char **argv)
 }
 
 #if defined(CONFIG_MODULES)
-/* modules using other modules */
-struct module_use {
-	struct list_head list;
-	struct module *module_which_uses;
-};
-
 /*
  * kdb_lsmod - This function implements the 'lsmod' command.  Lists
  *	currently loaded kernel modules.
@@ -1894,9 +1888,9 @@ static int kdb_lsmod(int argc, const char **argv)
 	{
 		struct module_use *use;
 		kdb_printf(" [ ");
-		list_for_each_entry(use, &mod->modules_which_use_me,
-				    list)
-			kdb_printf("%s ", use->module_which_uses->name);
+		list_for_each_entry(use, &mod->source_list,
+				    source_list)
+			kdb_printf("%s ", use->target->name);
 		kdb_printf("]\n");
 	}
 #endif
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..dd62f8e714ca 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
 static DEFINE_RWLOCK(exec_domains_lock);
 
 
-static u_long ident_map[32] = {
+static unsigned long ident_map[32] = {
 	0,	1,	2,	3,	4,	5,	6,	7,
 	8,	9,	10,	11,	12,	13,	14,	15,
 	16,	17,	18,	19,	20,	21,	22,	23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
 }
 
 static struct exec_domain *
-lookup_exec_domain(u_long personality)
+lookup_exec_domain(unsigned int personality)
 {
-	struct exec_domain *	ep;
-	u_long			pers = personality(personality);
+	unsigned int pers = personality(personality);
+	struct exec_domain *ep;
 
 	read_lock(&exec_domains_lock);
 	for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
 
 #ifdef CONFIG_MODULES
 	read_unlock(&exec_domains_lock);
-	request_module("personality-%ld", pers);
+	request_module("personality-%d", pers);
 	read_lock(&exec_domains_lock);
 
 	for (ep = exec_domains; ep; ep = ep->next) {
@@ -135,7 +135,7 @@ unregister:
 }
 
 int
-__set_personality(u_long personality)
+__set_personality(unsigned int personality)
 {
 	struct exec_domain *ep, *oep;
 
@@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void)
 module_init(proc_execdomains_init);
 #endif
 
-SYSCALL_DEFINE1(personality, u_long, personality)
+SYSCALL_DEFINE1(personality, unsigned int, personality)
 {
-	u_long old = current->personality;
+	unsigned int old = current->personality;
 
 	if (personality != 0xffffffff) {
 		set_personality(personality);
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality)
 			return -EINVAL;
 	}
 
-	return (long)old;
+	return old;
 }
 
 
diff --git a/kernel/exit.c b/kernel/exit.c
index eabca5a73a85..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -58,11 +58,11 @@
 
 static void exit_mm(struct task_struct * tsk);
 
-static void __unhash_process(struct task_struct *p)
+static void __unhash_process(struct task_struct *p, bool group_dead)
 {
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
-	if (thread_group_leader(p)) {
+	if (group_dead) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
 
@@ -79,10 +79,9 @@ static void __unhash_process(struct task_struct *p)
 static void __exit_signal(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
+	bool group_dead = thread_group_leader(tsk);
 	struct sighand_struct *sighand;
-
-	BUG_ON(!sig);
-	BUG_ON(!atomic_read(&sig->count));
+	struct tty_struct *uninitialized_var(tty);
 
 	sighand = rcu_dereference_check(tsk->sighand,
 					rcu_read_lock_held() ||
@@ -90,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk)
 	spin_lock(&sighand->siglock);
 
 	posix_cpu_timers_exit(tsk);
-	if (atomic_dec_and_test(&sig->count))
+	if (group_dead) {
 		posix_cpu_timers_exit_group(tsk);
-	else {
+		tty = sig->tty;
+		sig->tty = NULL;
+	} else {
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
 		 */
-		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
+		if (sig->notify_count > 0 && !--sig->notify_count)
 			wake_up_process(sig->group_exit_task);
 
 		if (tsk == sig->curr_target)
@@ -123,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
-		sig = NULL; /* Marker for below. */
 	}
 
-	__unhash_process(tsk);
+	sig->nr_threads--;
+	__unhash_process(tsk, group_dead);
 
 	/*
 	 * Do this under ->siglock, we can race with another thread
 	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 	 */
 	flush_sigqueue(&tsk->pending);
-
-	tsk->signal = NULL;
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
 	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
-	if (sig) {
+	if (group_dead) {
 		flush_sigqueue(&sig->shared_pending);
-		taskstats_tgid_free(sig);
-		/*
-		 * Make sure ->signal can't go away under rq->lock,
-		 * see account_group_exec_runtime().
-		 */
-		task_rq_unlock_wait(tsk);
-		__cleanup_signal(sig);
+		tty_kref_put(tty);
 	}
 }
 
@@ -856,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
 
-	/* mt-exec, de_thread() is waiting for us */
-	if (thread_group_leader(tsk) &&
-	    tsk->signal->group_exit_task &&
-	    tsk->signal->notify_count < 0)
+	/* mt-exec, de_thread() is waiting for group leader */
+	if (unlikely(tsk->signal->notify_count < 0))
 		wake_up_process(tsk->signal->group_exit_task);
-
 	write_unlock_irq(&tasklist_lock);
 
 	tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -1002,8 +992,10 @@ NORET_TYPE void do_exit(long code)
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
+	task_lock(tsk);
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
+	task_unlock(tsk);
 #endif
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d57d9e3a6e9..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+	taskstats_tgid_free(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+	if (atomic_dec_and_test(&sig->sigcnt))
+		free_signal_struct(sig);
+}
+
 void __put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
@@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
 
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
+	put_signal_struct(tsk->signal);
 
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
@@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	if (!sig)
 		return -ENOMEM;
 
-	atomic_set(&sig->count, 1);
+	sig->nr_threads = 1;
 	atomic_set(&sig->live, 1);
+	atomic_set(&sig->sigcnt, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
 	if (clone_flags & CLONE_NEWPID)
 		sig->flags |= SIGNAL_UNKILLABLE;
@@ -889,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-void __cleanup_signal(struct signal_struct *sig)
-{
-	thread_group_cputime_free(sig);
-	tty_kref_put(sig->tty);
-	kmem_cache_free(signal_cachep, sig);
-}
-
 static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
@@ -1245,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&current->signal->count);
+		current->signal->nr_threads++;
 		atomic_inc(&current->signal->live);
+		atomic_inc(&current->signal->sigcnt);
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 	}
@@ -1259,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			p->nsproxy->pid_ns->child_reaper = p;
 
 			p->signal->leader_pid = pid;
-			tty_kref_put(p->signal->tty);
 			p->signal->tty = tty_kref_get(current->signal->tty);
 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 			attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1292,7 +1299,7 @@ bad_fork_cleanup_mm:
 		mmput(p->mm);
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
-		__cleanup_signal(p->signal);
+		free_signal_struct(p->signal);
 bad_fork_cleanup_sighand:
 	__cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
@@ -1327,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
 	return regs;
 }
 
+static inline void init_idle_pids(struct pid_link *links)
+{
+	enum pid_type type;
+
+	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+		INIT_HLIST_NODE(&links[type].node); /* not really needed */
+		links[type].pid = &init_struct_pid;
+	}
+}
+
 struct task_struct * __cpuinit fork_idle(int cpu)
 {
 	struct task_struct *task;
@@ -1334,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
 			    &init_struct_pid, 0);
-	if (!IS_ERR(task))
+	if (!IS_ERR(task)) {
+		init_idle_pids(task->pids);
 		init_idle(task, cpu);
+	}
 
 	return task;
 }
@@ -1507,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
 		*flags_ptr |= CLONE_SIGHAND;
 
 	/*
-	 * If unsharing signal handlers and the task was created
-	 * using CLONE_THREAD, then must unshare the thread
-	 */
-	if ((*flags_ptr & CLONE_SIGHAND) &&
-	    (atomic_read(&current->signal->count) > 1))
-		*flags_ptr |= CLONE_THREAD;
-
-	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
 	if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b9b134b35088..5c69e996bd0f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
-		xts = current_kernel_time();
+		xts = __current_kernel_time();
 		tom = wall_to_monotonic;
 	} while (read_seqretry(&xtime_lock, seq));
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
 
 	trace_module_request(module_name, wait, _RET_IP_);
 
-	ret = call_usermodehelper(modprobe_path, argv, envp,
-			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+	ret = call_usermodehelper_fns(modprobe_path, argv, envp,
+			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
+			NULL, NULL, NULL);
+
 	atomic_dec(&kmod_concurrent);
 	return ret;
 }
 EXPORT_SYMBOL(__request_module);
 #endif /* CONFIG_MODULES */
 
-struct subprocess_info {
-	struct work_struct work;
-	struct completion *complete;
-	struct cred *cred;
-	char *path;
-	char **argv;
-	char **envp;
-	enum umh_wait wait;
-	int retval;
-	struct file *stdin;
-	void (*cleanup)(char **argv, char **envp);
-};
-
 /*
  * This is the task which runs the usermode application
  */
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
 	struct subprocess_info *sub_info = data;
 	int retval;
 
-	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
-
-	/* Unblock all signals */
 	spin_lock_irq(&current->sighand->siglock);
 	flush_signal_handlers(current, 1);
-	sigemptyset(&current->blocked);
-	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	/* Install the credentials */
-	commit_creds(sub_info->cred);
-	sub_info->cred = NULL;
-
-	/* Install input pipe when needed */
-	if (sub_info->stdin) {
-		struct files_struct *f = current->files;
-		struct fdtable *fdt;
-		/* no races because files should be private here */
-		sys_close(0);
-		fd_install(0, sub_info->stdin);
-		spin_lock(&f->file_lock);
-		fdt = files_fdtable(f);
-		FD_SET(0, fdt->open_fds);
-		FD_CLR(0, fdt->close_on_exec);
-		spin_unlock(&f->file_lock);
-
-		/* and disallow core files too */
-		current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
-	}
-
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed_ptr(current, cpu_all_mask);
 
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
 	 */
 	set_user_nice(current, 0);
 
+	if (sub_info->init) {
+		retval = sub_info->init(sub_info);
+		if (retval)
+			goto fail;
+	}
+
 	retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
 
 	/* Exec failed? */
+fail:
 	sub_info->retval = retval;
 	do_exit(0);
 }
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
 void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
 	if (info->cleanup)
-		(*info->cleanup)(info->argv, info->envp);
-	if (info->cred)
-		put_cred(info->cred);
+		(*info->cleanup)(info);
 	kfree(info);
 }
 EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
 	struct subprocess_info *sub_info = data;
 	pid_t pid;
 
-	/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
-	 * populate the status, but will return -ECHILD. */
-	allow_signal(SIGCHLD);
+	/* If SIGCLD is ignored sys_wait4 won't populate the status. */
+	spin_lock_irq(&current->sighand->siglock);
+	current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
+	spin_unlock_irq(&current->sighand->siglock);
 
 	pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
 	if (pid < 0) {
 		sub_info->retval = pid;
 	} else {
-		int ret;
-
+		int ret = -ECHILD;
 		/*
 		 * Normally it is bogus to call wait4() from in-kernel because
 		 * wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
 		sub_info->retval = ret;
 	}
 
-	if (sub_info->wait == UMH_NO_WAIT)
-		call_usermodehelper_freeinfo(sub_info);
-	else
-		complete(sub_info->complete);
+	complete(sub_info->complete);
 	return 0;
 }
 
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
 {
 	struct subprocess_info *sub_info =
 		container_of(work, struct subprocess_info, work);
-	pid_t pid;
 	enum umh_wait wait = sub_info->wait;
-
-	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+	pid_t pid;
 
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
-	if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
+	if (wait == UMH_WAIT_PROC)
 		pid = kernel_thread(wait_for_helper, sub_info,
 				    CLONE_FS | CLONE_FILES | SIGCHLD);
 	else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
 
 	switch (wait) {
 	case UMH_NO_WAIT:
+		call_usermodehelper_freeinfo(sub_info);
 		break;
 
 	case UMH_WAIT_PROC:
 		if (pid > 0)
 			break;
-		sub_info->retval = pid;
 		/* FALLTHROUGH */
-
 	case UMH_WAIT_EXEC:
+		if (pid < 0)
+			sub_info->retval = pid;
 		complete(sub_info->complete);
 	}
 }
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
-	sub_info->cred = prepare_usermodehelper_creds();
-	if (!sub_info->cred) {
-		kfree(sub_info);
-		return NULL;
-	}
-
   out:
 	return sub_info;
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
 /**
- * call_usermodehelper_setkeys - set the session keys for usermode helper
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @session_keyring: the session keyring for the process
- */
-void call_usermodehelper_setkeys(struct subprocess_info *info,
-				 struct key *session_keyring)
-{
-#ifdef CONFIG_KEYS
-	struct thread_group_cred *tgcred = info->cred->tgcred;
-	key_put(tgcred->session_keyring);
-	tgcred->session_keyring = key_get(session_keyring);
-#else
-	BUG();
-#endif
-}
-EXPORT_SYMBOL(call_usermodehelper_setkeys);
-
-/**
- * call_usermodehelper_setcleanup - set a cleanup function
+ * call_usermodehelper_setfns - set a cleanup/init function
  * @info: a subprocess_info returned by call_usermodehelper_setup
  * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
  *
- * The cleanup function is just befor ethe subprocess_info is about to
+ * The init function is used to customize the helper process prior to
+ * exec.  A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is just before ethe subprocess_info is about to
  * be freed.  This can be used for freeing the argv and envp.  The
  * Function must be runnable in either a process context or the
  * context in which call_usermodehelper_exec is called.
  */
-void call_usermodehelper_setcleanup(struct subprocess_info *info,
-				    void (*cleanup)(char **argv, char **envp))
+void call_usermodehelper_setfns(struct subprocess_info *info,
+		    int (*init)(struct subprocess_info *info),
+		    void (*cleanup)(struct subprocess_info *info),
+		    void *data)
 {
 	info->cleanup = cleanup;
+	info->init = init;
+	info->data = data;
 }
-EXPORT_SYMBOL(call_usermodehelper_setcleanup);
-
-/**
- * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
- * @sub_info: a subprocess_info returned by call_usermodehelper_setup
- * @filp: set to the write-end of a pipe
- *
- * This constructs a pipe, and sets the read end to be the stdin of the
- * subprocess, and returns the write-end in *@filp.
- */
-int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
-				  struct file **filp)
-{
-	struct file *f;
-
-	f = create_write_pipe(0);
-	if (IS_ERR(f))
-		return PTR_ERR(f);
-	*filp = f;
-
-	f = create_read_pipe(f, 0);
-	if (IS_ERR(f)) {
-		free_write_pipe(*filp);
-		return PTR_ERR(f);
-	}
-	sub_info->stdin = f;
-
-	return 0;
-}
-EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+EXPORT_SYMBOL(call_usermodehelper_setfns);
 
 /**
  * call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
-	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
-	validate_creds(sub_info->cred);
-
 	helper_lock();
 	if (sub_info->path[0] == '\0')
 		goto out;
@@ -498,41 +416,6 @@ unlock:
 }
 EXPORT_SYMBOL(call_usermodehelper_exec);
 
-/**
- * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @filp: set to the write-end of a pipe
- *
- * This is a simple wrapper which executes a usermode-helper function
- * with a pipe as stdin.  It is implemented entirely in terms of
- * lower-level call_usermodehelper_* functions.
- */
-int call_usermodehelper_pipe(char *path, char **argv, char **envp,
-			     struct file **filp)
-{
-	struct subprocess_info *sub_info;
-	int ret;
-
-	sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
-	if (sub_info == NULL)
-		return -ENOMEM;
-
-	ret = call_usermodehelper_stdinpipe(sub_info, filp);
-	if (ret < 0) {
-		call_usermodehelper_freeinfo(sub_info);
-		return ret;
-	}
-
-	ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (ret < 0)	/* Failed to execute helper, close pipe */
-		filp_close(*filp, NULL);
-
-	return ret;
-}
-EXPORT_SYMBOL(call_usermodehelper_pipe);
-
 void __init usermodehelper_init(void)
 {
 	khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/module.c b/kernel/module.c
index 3c4fc4bb4b82..8c6b42840dd1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -72,7 +72,11 @@
 /* If this is set, the section belongs in the init part of the module */
 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
 
-/* List of modules, protected by module_mutex or preempt_disable
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable with preempt_disable),
+ * 2) module_use links,
+ * 3) module_addr_min/module_addr_max.
  * (delete uses stop_machine/add uses RCU list operations). */
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
@@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
-/* Bounds of module allocation, for speeding __module_address */
+/* Bounds of module allocation, for speeding __module_address.
+ * Protected by module_mutex. */
 static unsigned long module_addr_min = -1UL, module_addr_max = 0;
 
 int register_module_notifier(struct notifier_block * nb)
@@ -180,8 +185,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
 extern const unsigned long __start___kcrctab_gpl_future[];
@@ -331,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 }
 
 /* Find a symbol and return it, along with, (optional) crc and
- * (optional) module which owns it */
+ * (optional) module which owns it.  Needs preempt disabled or module_mutex. */
 const struct kernel_symbol *find_symbol(const char *name,
 					struct module **owner,
 					const unsigned long **crc,
@@ -405,7 +408,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
 				 const char *secstrings)
 {
-	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+	return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
 }
 
 static void percpu_modcopy(struct module *mod,
@@ -525,7 +528,8 @@ static void module_unload_init(struct module *mod)
 {
 	int cpu;
 
-	INIT_LIST_HEAD(&mod->modules_which_use_me);
+	INIT_LIST_HEAD(&mod->source_list);
+	INIT_LIST_HEAD(&mod->target_list);
 	for_each_possible_cpu(cpu) {
 		per_cpu_ptr(mod->refptr, cpu)->incs = 0;
 		per_cpu_ptr(mod->refptr, cpu)->decs = 0;
@@ -537,20 +541,13 @@ static void module_unload_init(struct module *mod)
 	mod->waiter = current;
 }
 
-/* modules using other modules */
-struct module_use
-{
-	struct list_head list;
-	struct module *module_which_uses;
-};
-
 /* Does a already use b? */
 static int already_uses(struct module *a, struct module *b)
 {
 	struct module_use *use;
 
-	list_for_each_entry(use, &b->modules_which_use_me, list) {
-		if (use->module_which_uses == a) {
+	list_for_each_entry(use, &b->source_list, source_list) {
+		if (use->source == a) {
 			DEBUGP("%s uses %s!\n", a->name, b->name);
 			return 1;
 		}
@@ -559,62 +556,68 @@ static int already_uses(struct module *a, struct module *b)
 	return 0;
 }
 
-/* Module a uses b */
-int use_module(struct module *a, struct module *b)
+/*
+ * Module a uses b
+ *  - we add 'a' as a "source", 'b' as a "target" of module use
+ *  - the module_use is added to the list of 'b' sources (so
+ *    'b' can walk the list to see who sourced them), and of 'a'
+ *    targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
 {
 	struct module_use *use;
-	int no_warn, err;
 
-	if (b == NULL || already_uses(a, b)) return 1;
+	DEBUGP("Allocating new usage for %s.\n", a->name);
+	use = kmalloc(sizeof(*use), GFP_ATOMIC);
+	if (!use) {
+		printk(KERN_WARNING "%s: out of memory loading\n", a->name);
+		return -ENOMEM;
+	}
+
+	use->source = a;
+	use->target = b;
+	list_add(&use->source_list, &b->source_list);
+	list_add(&use->target_list, &a->target_list);
+	return 0;
+}
 
-	/* If we're interrupted or time out, we fail. */
-	if (wait_event_interruptible_timeout(
-		    module_wq, (err = strong_try_module_get(b)) != -EBUSY,
-		    30 * HZ) <= 0) {
-		printk("%s: gave up waiting for init of module %s.\n",
-		       a->name, b->name);
+/* Module a uses b: caller needs module_mutex() */
+int ref_module(struct module *a, struct module *b)
+{
+	int err;
+
+	if (b == NULL || already_uses(a, b))
 		return 0;
-	}
 
-	/* If strong_try_module_get() returned a different error, we fail. */
+	/* If module isn't available, we fail. */
+	err = strong_try_module_get(b);
 	if (err)
-		return 0;
+		return err;
 
-	DEBUGP("Allocating new usage for %s.\n", a->name);
-	use = kmalloc(sizeof(*use), GFP_ATOMIC);
-	if (!use) {
-		printk("%s: out of memory loading\n", a->name);
+	err = add_module_usage(a, b);
+	if (err) {
 		module_put(b);
-		return 0;
+		return err;
 	}
-
-	use->module_which_uses = a;
-	list_add(&use->list, &b->modules_which_use_me);
-	no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
-	return 1;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(use_module);
+EXPORT_SYMBOL_GPL(ref_module);
 
 /* Clear the unload stuff of the module. */
 static void module_unload_free(struct module *mod)
 {
-	struct module *i;
-
-	list_for_each_entry(i, &modules, list) {
-		struct module_use *use;
+	struct module_use *use, *tmp;
 
-		list_for_each_entry(use, &i->modules_which_use_me, list) {
-			if (use->module_which_uses == mod) {
-				DEBUGP("%s unusing %s\n", mod->name, i->name);
-				module_put(i);
-				list_del(&use->list);
-				kfree(use);
-				sysfs_remove_link(i->holders_dir, mod->name);
-				/* There can be at most one match. */
-				break;
-			}
-		}
+	mutex_lock(&module_mutex);
+	list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+		struct module *i = use->target;
+		DEBUGP("%s unusing %s\n", mod->name, i->name);
+		module_put(i);
+		list_del(&use->source_list);
+		list_del(&use->target_list);
+		kfree(use);
 	}
+	mutex_unlock(&module_mutex);
 }
 
 #ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -737,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		goto out;
 	}
 
-	if (!list_empty(&mod->modules_which_use_me)) {
+	if (!list_empty(&mod->source_list)) {
 		/* Other modules depend on us: get rid of them first. */
 		ret = -EWOULDBLOCK;
 		goto out;
@@ -781,13 +784,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
 	async_synchronize_full();
-	mutex_lock(&module_mutex);
+
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
 	ddebug_remove_module(mod->name);
-	free_module(mod);
 
- out:
+	free_module(mod);
+	return 0;
+out:
 	mutex_unlock(&module_mutex);
 	return ret;
 }
@@ -801,9 +805,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
 
 	/* Always include a trailing , so userspace can differentiate
 	   between this and the old multi-field proc format. */
-	list_for_each_entry(use, &mod->modules_which_use_me, list) {
+	list_for_each_entry(use, &mod->source_list, source_list) {
 		printed_something = 1;
-		seq_printf(m, "%s,", use->module_which_uses->name);
+		seq_printf(m, "%s,", use->source->name);
 	}
 
 	if (mod->init != NULL && mod->exit == NULL) {
@@ -882,11 +886,11 @@ static inline void module_unload_free(struct module *mod)
 {
 }
 
-int use_module(struct module *a, struct module *b)
+int ref_module(struct module *a, struct module *b)
 {
-	return strong_try_module_get(b) == 0;
+	return strong_try_module_get(b);
 }
-EXPORT_SYMBOL_GPL(use_module);
+EXPORT_SYMBOL_GPL(ref_module);
 
 static inline void module_unload_init(struct module *mod)
 {
@@ -1003,6 +1007,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
+	/* Since this should be found in kernel (which can't be removed),
+	 * no locking is necessary. */
 	if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
 			 &crc, true, false))
 		BUG();
@@ -1045,29 +1051,62 @@ static inline int same_magic(const char *amagic, const char *bmagic,
 }
 #endif /* CONFIG_MODVERSIONS */
 
-/* Resolve a symbol for this module.  I.e. if we find one, record usage.
-   Must be holding module_mutex. */
+/* Resolve a symbol for this module.  I.e. if we find one, record usage. */
 static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
 						  unsigned int versindex,
 						  const char *name,
-						  struct module *mod)
+						  struct module *mod,
+						  char ownername[])
 {
 	struct module *owner;
 	const struct kernel_symbol *sym;
1057 const unsigned long *crc; 1063 const unsigned long *crc;
1064 int err;
1058 1065
1066 mutex_lock(&module_mutex);
1059 sym = find_symbol(name, &owner, &crc, 1067 sym = find_symbol(name, &owner, &crc,
1060 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1068 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1061 /* use_module can fail due to OOM, 1069 if (!sym)
1062 or module initialization or unloading */ 1070 goto unlock;
1063 if (sym) { 1071
1064 if (!check_version(sechdrs, versindex, name, mod, crc, owner) 1072 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
1065 || !use_module(mod, owner)) 1073 sym = ERR_PTR(-EINVAL);
1066 sym = NULL; 1074 goto getname;
1067 } 1075 }
1076
1077 err = ref_module(mod, owner);
1078 if (err) {
1079 sym = ERR_PTR(err);
1080 goto getname;
1081 }
1082
1083getname:
 1084 /* We must make a copy under the lock in case we failed to get the ref. */
1085 strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
1086unlock:
1087 mutex_unlock(&module_mutex);
1068 return sym; 1088 return sym;
1069} 1089}
1070 1090
1091static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1092 unsigned int versindex,
1093 const char *name,
1094 struct module *mod)
1095{
1096 const struct kernel_symbol *ksym;
1097 char ownername[MODULE_NAME_LEN];
1098
1099 if (wait_event_interruptible_timeout(module_wq,
1100 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
1101 mod, ownername)) ||
1102 PTR_ERR(ksym) != -EBUSY,
1103 30 * HZ) <= 0) {
1104 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1105 mod->name, ownername);
1106 }
1107 return ksym;
1108}
1109
1071/* 1110/*
1072 * /sys/module/foo/sections stuff 1111 * /sys/module/foo/sections stuff
1073 * J. Corbet <corbet@lwn.net> 1112 * J. Corbet <corbet@lwn.net>
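resolve_symbol() now takes module_mutex internally and reports failure as an ERR_PTR value; resolve_symbol_wait() keeps retrying for up to 30 seconds while the result is -EBUSY, i.e. while the owning module is still running its init. A userspace sketch of that retry-until-timeout shape, assuming made-up stand-ins for the symbol lookup and for the kernel's ERR_PTR/IS_ERR helpers:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define ERR_PTR(e) ((void *)(intptr_t)(e))
#define PTR_ERR(p) ((long)(intptr_t)(p))
#define IS_ERR(p)  ((uintptr_t)(p) >= (uintptr_t)-4095)

static int the_symbol = 42;

/* Stand-in for resolve_symbol(): stays "busy" for the first two attempts. */
static void *try_resolve(int *attempts)
{
    return (++*attempts < 3) ? ERR_PTR(-EBUSY) : (void *)&the_symbol;
}

static void *resolve_wait(int timeout_s)
{
    time_t deadline = time(NULL) + timeout_s;
    int attempts = 0;
    void *p;

    /* Only the "still initializing" case is retried; other errors return at once. */
    while (IS_ERR(p = try_resolve(&attempts)) && PTR_ERR(p) == -EBUSY) {
        if (time(NULL) >= deadline) {
            fprintf(stderr, "gave up waiting\n");
            break;
        }
        sleep(1);   /* the kernel sleeps on module_wq instead of polling */
    }
    return p;
}

int main(void)
{
    void *sym = resolve_wait(30);
    if (!IS_ERR(sym))
        printf("resolved: %d\n", *(int *)sym);
    return 0;
}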
@@ -1297,7 +1336,34 @@ static inline void remove_notes_attrs(struct module *mod)
1297#endif 1336#endif
1298 1337
1299#ifdef CONFIG_SYSFS 1338#ifdef CONFIG_SYSFS
1300int module_add_modinfo_attrs(struct module *mod) 1339static void add_usage_links(struct module *mod)
1340{
1341#ifdef CONFIG_MODULE_UNLOAD
1342 struct module_use *use;
1343 int nowarn;
1344
1345 mutex_lock(&module_mutex);
1346 list_for_each_entry(use, &mod->target_list, target_list) {
1347 nowarn = sysfs_create_link(use->target->holders_dir,
1348 &mod->mkobj.kobj, mod->name);
1349 }
1350 mutex_unlock(&module_mutex);
1351#endif
1352}
1353
1354static void del_usage_links(struct module *mod)
1355{
1356#ifdef CONFIG_MODULE_UNLOAD
1357 struct module_use *use;
1358
1359 mutex_lock(&module_mutex);
1360 list_for_each_entry(use, &mod->target_list, target_list)
1361 sysfs_remove_link(use->target->holders_dir, mod->name);
1362 mutex_unlock(&module_mutex);
1363#endif
1364}
1365
1366static int module_add_modinfo_attrs(struct module *mod)
1301{ 1367{
1302 struct module_attribute *attr; 1368 struct module_attribute *attr;
1303 struct module_attribute *temp_attr; 1369 struct module_attribute *temp_attr;
@@ -1323,7 +1389,7 @@ int module_add_modinfo_attrs(struct module *mod)
1323 return error; 1389 return error;
1324} 1390}
1325 1391
1326void module_remove_modinfo_attrs(struct module *mod) 1392static void module_remove_modinfo_attrs(struct module *mod)
1327{ 1393{
1328 struct module_attribute *attr; 1394 struct module_attribute *attr;
1329 int i; 1395 int i;
@@ -1339,7 +1405,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1339 kfree(mod->modinfo_attrs); 1405 kfree(mod->modinfo_attrs);
1340} 1406}
1341 1407
1342int mod_sysfs_init(struct module *mod) 1408static int mod_sysfs_init(struct module *mod)
1343{ 1409{
1344 int err; 1410 int err;
1345 struct kobject *kobj; 1411 struct kobject *kobj;
@@ -1373,12 +1439,16 @@ out:
1373 return err; 1439 return err;
1374} 1440}
1375 1441
1376int mod_sysfs_setup(struct module *mod, 1442static int mod_sysfs_setup(struct module *mod,
1377 struct kernel_param *kparam, 1443 struct kernel_param *kparam,
1378 unsigned int num_params) 1444 unsigned int num_params)
1379{ 1445{
1380 int err; 1446 int err;
1381 1447
1448 err = mod_sysfs_init(mod);
1449 if (err)
1450 goto out;
1451
1382 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); 1452 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1383 if (!mod->holders_dir) { 1453 if (!mod->holders_dir) {
1384 err = -ENOMEM; 1454 err = -ENOMEM;
@@ -1393,6 +1463,8 @@ int mod_sysfs_setup(struct module *mod,
1393 if (err) 1463 if (err)
1394 goto out_unreg_param; 1464 goto out_unreg_param;
1395 1465
1466 add_usage_links(mod);
1467
1396 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1468 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1397 return 0; 1469 return 0;
1398 1470
@@ -1402,6 +1474,7 @@ out_unreg_holders:
1402 kobject_put(mod->holders_dir); 1474 kobject_put(mod->holders_dir);
1403out_unreg: 1475out_unreg:
1404 kobject_put(&mod->mkobj.kobj); 1476 kobject_put(&mod->mkobj.kobj);
1477out:
1405 return err; 1478 return err;
1406} 1479}
1407 1480
@@ -1412,14 +1485,40 @@ static void mod_sysfs_fini(struct module *mod)
1412 1485
1413#else /* CONFIG_SYSFS */ 1486#else /* CONFIG_SYSFS */
1414 1487
1488static inline int mod_sysfs_init(struct module *mod)
1489{
1490 return 0;
1491}
1492
1493static inline int mod_sysfs_setup(struct module *mod,
1494 struct kernel_param *kparam,
1495 unsigned int num_params)
1496{
1497 return 0;
1498}
1499
1500static inline int module_add_modinfo_attrs(struct module *mod)
1501{
1502 return 0;
1503}
1504
1505static inline void module_remove_modinfo_attrs(struct module *mod)
1506{
1507}
1508
1415static void mod_sysfs_fini(struct module *mod) 1509static void mod_sysfs_fini(struct module *mod)
1416{ 1510{
1417} 1511}
1418 1512
1513static void del_usage_links(struct module *mod)
1514{
1515}
1516
1419#endif /* CONFIG_SYSFS */ 1517#endif /* CONFIG_SYSFS */
1420 1518
1421static void mod_kobject_remove(struct module *mod) 1519static void mod_kobject_remove(struct module *mod)
1422{ 1520{
1521 del_usage_links(mod);
1423 module_remove_modinfo_attrs(mod); 1522 module_remove_modinfo_attrs(mod);
1424 module_param_sysfs_remove(mod); 1523 module_param_sysfs_remove(mod);
1425 kobject_put(mod->mkobj.drivers_dir); 1524 kobject_put(mod->mkobj.drivers_dir);
@@ -1438,13 +1537,15 @@ static int __unlink_module(void *_mod)
1438 return 0; 1537 return 0;
1439} 1538}
1440 1539
1441/* Free a module, remove from lists, etc (must hold module_mutex). */ 1540/* Free a module, remove from lists, etc. */
1442static void free_module(struct module *mod) 1541static void free_module(struct module *mod)
1443{ 1542{
1444 trace_module_free(mod); 1543 trace_module_free(mod);
1445 1544
1446 /* Delete from various lists */ 1545 /* Delete from various lists */
1546 mutex_lock(&module_mutex);
1447 stop_machine(__unlink_module, mod, NULL); 1547 stop_machine(__unlink_module, mod, NULL);
1548 mutex_unlock(&module_mutex);
1448 remove_notes_attrs(mod); 1549 remove_notes_attrs(mod);
1449 remove_sect_attrs(mod); 1550 remove_sect_attrs(mod);
1450 mod_kobject_remove(mod); 1551 mod_kobject_remove(mod);
@@ -1495,6 +1596,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1495/* 1596/*
1496 * Ensure that an exported symbol [global namespace] does not already exist 1597 * Ensure that an exported symbol [global namespace] does not already exist
1497 * in the kernel or in some other module's exported symbol table. 1598 * in the kernel or in some other module's exported symbol table.
1599 *
1600 * You must hold the module_mutex.
1498 */ 1601 */
1499static int verify_export_symbols(struct module *mod) 1602static int verify_export_symbols(struct module *mod)
1500{ 1603{
@@ -1560,21 +1663,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1560 break; 1663 break;
1561 1664
1562 case SHN_UNDEF: 1665 case SHN_UNDEF:
1563 ksym = resolve_symbol(sechdrs, versindex, 1666 ksym = resolve_symbol_wait(sechdrs, versindex,
1564 strtab + sym[i].st_name, mod); 1667 strtab + sym[i].st_name,
1668 mod);
1565 /* Ok if resolved. */ 1669 /* Ok if resolved. */
1566 if (ksym) { 1670 if (ksym && !IS_ERR(ksym)) {
1567 sym[i].st_value = ksym->value; 1671 sym[i].st_value = ksym->value;
1568 break; 1672 break;
1569 } 1673 }
1570 1674
1571 /* Ok if weak. */ 1675 /* Ok if weak. */
1572 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1676 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1573 break; 1677 break;
1574 1678
1575 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1679 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1576 mod->name, strtab + sym[i].st_name); 1680 mod->name, strtab + sym[i].st_name,
1577 ret = -ENOENT; 1681 PTR_ERR(ksym));
1682 ret = PTR_ERR(ksym) ?: -ENOENT;
1578 break; 1683 break;
1579 1684
1580 default: 1685 default:
@@ -1962,11 +2067,13 @@ static void *module_alloc_update_bounds(unsigned long size)
1962 void *ret = module_alloc(size); 2067 void *ret = module_alloc(size);
1963 2068
1964 if (ret) { 2069 if (ret) {
2070 mutex_lock(&module_mutex);
1965 /* Update module bounds. */ 2071 /* Update module bounds. */
1966 if ((unsigned long)ret < module_addr_min) 2072 if ((unsigned long)ret < module_addr_min)
1967 module_addr_min = (unsigned long)ret; 2073 module_addr_min = (unsigned long)ret;
1968 if ((unsigned long)ret + size > module_addr_max) 2074 if ((unsigned long)ret + size > module_addr_max)
1969 module_addr_max = (unsigned long)ret + size; 2075 module_addr_max = (unsigned long)ret + size;
2076 mutex_unlock(&module_mutex);
1970 } 2077 }
1971 return ret; 2078 return ret;
1972} 2079}
@@ -2016,6 +2123,7 @@ static noinline struct module *load_module(void __user *umod,
2016 long err = 0; 2123 long err = 0;
2017 void *ptr = NULL; /* Stops spurious gcc warning */ 2124 void *ptr = NULL; /* Stops spurious gcc warning */
2018 unsigned long symoffs, stroffs, *strmap; 2125 unsigned long symoffs, stroffs, *strmap;
2126 void __percpu *percpu;
2019 2127
2020 mm_segment_t old_fs; 2128 mm_segment_t old_fs;
2021 2129
@@ -2140,11 +2248,6 @@ static noinline struct module *load_module(void __user *umod,
2140 goto free_mod; 2248 goto free_mod;
2141 } 2249 }
2142 2250
2143 if (find_module(mod->name)) {
2144 err = -EEXIST;
2145 goto free_mod;
2146 }
2147
2148 mod->state = MODULE_STATE_COMING; 2251 mod->state = MODULE_STATE_COMING;
2149 2252
2150 /* Allow arches to frob section contents and sizes. */ 2253 /* Allow arches to frob section contents and sizes. */
@@ -2160,6 +2263,8 @@ static noinline struct module *load_module(void __user *umod,
2160 goto free_mod; 2263 goto free_mod;
2161 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2264 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2162 } 2265 }
2266 /* Keep this around for failure path. */
2267 percpu = mod_percpu(mod);
2163 2268
2164 /* Determine total sizes, and put offsets in sh_entsize. For now 2269 /* Determine total sizes, and put offsets in sh_entsize. For now
2165 this is done generically; there doesn't appear to be any 2270 this is done generically; there doesn't appear to be any
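The saved percpu pointer matters on the error path: once module_free(mod, mod->module_core) runs, mod itself is gone, so the per-cpu area has to be released through a copy taken while mod was still valid (hence the later free_percpu(percpu) replacing percpu_modfree(mod)). A generic userspace sketch of keeping a local copy of a sub-allocation so the unwind path can free it after the parent object has been released; every name here is hypothetical:

#include <stdlib.h>
#include <string.h>

struct widget {
    char *buffer;   /* sub-allocation owned by the widget */
    int   size;
};

static struct widget *widget_create(int size, const char *init)
{
    struct widget *w = malloc(sizeof(*w));
    char *buffer;    /* kept in a local so the error path can free it
                        even after *w itself has been released */
    if (!w)
        return NULL;

    buffer = malloc(size);
    if (!buffer)
        goto free_widget;
    w->buffer = buffer;
    w->size = size;

    if (strlen(init) >= (size_t)size)
        goto free_all;          /* validation failed: unwind everything */
    strcpy(w->buffer, init);
    return w;

free_all:
    free(w);                    /* after this, w->buffer must not be touched... */
    free(buffer);               /* ...so free through the copy we kept instead */
    return NULL;
free_widget:
    free(w);
    return NULL;
}

int main(void)
{
    struct widget *w = widget_create(16, "hello");
    if (w) {
        free(w->buffer);
        free(w);
    }
    return 0;
}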
@@ -2233,11 +2338,6 @@ static noinline struct module *load_module(void __user *umod,
2233 /* Now we've moved module, initialize linked lists, etc. */ 2338 /* Now we've moved module, initialize linked lists, etc. */
2234 module_unload_init(mod); 2339 module_unload_init(mod);
2235 2340
2236 /* add kobject, so we can reference it. */
2237 err = mod_sysfs_init(mod);
2238 if (err)
2239 goto free_unload;
2240
2241 /* Set up license info based on the info section */ 2341 /* Set up license info based on the info section */
2242 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2342 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
2243 2343
@@ -2362,11 +2462,6 @@ static noinline struct module *load_module(void __user *umod,
2362 goto cleanup; 2462 goto cleanup;
2363 } 2463 }
2364 2464
2365 /* Find duplicate symbols */
2366 err = verify_export_symbols(mod);
2367 if (err < 0)
2368 goto cleanup;
2369
2370 /* Set up and sort exception table */ 2465 /* Set up and sort exception table */
2371 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", 2466 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2372 sizeof(*mod->extable), &mod->num_exentries); 2467 sizeof(*mod->extable), &mod->num_exentries);
@@ -2425,7 +2520,19 @@ static noinline struct module *load_module(void __user *umod,
2425 * function to insert in a way safe to concurrent readers. 2520 * function to insert in a way safe to concurrent readers.
2426 * The mutex protects against concurrent writers. 2521 * The mutex protects against concurrent writers.
2427 */ 2522 */
2523 mutex_lock(&module_mutex);
2524 if (find_module(mod->name)) {
2525 err = -EEXIST;
2526 goto unlock;
2527 }
2528
2529 /* Find duplicate symbols */
2530 err = verify_export_symbols(mod);
2531 if (err < 0)
2532 goto unlock;
2533
2428 list_add_rcu(&mod->list, &modules); 2534 list_add_rcu(&mod->list, &modules);
2535 mutex_unlock(&module_mutex);
2429 2536
2430 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2537 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2431 if (err < 0) 2538 if (err < 0)
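With the early find_module() and verify_export_symbols() calls removed above, both checks now happen in this hunk under the same module_mutex hold as the list_add_rcu(), so two concurrent loads of the same module can no longer both pass the duplicate check before either inserts itself. A compressed userspace illustration of check-then-insert under a single lock, using a pthread mutex in place of module_mutex and an array in place of the module list:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *registry[16];
static int registry_len;

/* Returns 0 on success, -1 if the name is already registered. */
static int register_name(const char *name)
{
    int i, err = 0;

    pthread_mutex_lock(&registry_lock);
    for (i = 0; i < registry_len; i++) {
        if (strcmp(registry[i], name) == 0) {
            err = -1;                      /* duplicate: -EEXIST in the kernel */
            goto unlock;
        }
    }
    registry[registry_len++] = name;       /* insert under the same lock hold */
unlock:
    pthread_mutex_unlock(&registry_lock);
    return err;
}

int main(void)
{
    printf("%d\n", register_name("snd"));  /* 0  */
    printf("%d\n", register_name("snd"));  /* -1 */
    return 0;
}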
@@ -2434,6 +2541,7 @@ static noinline struct module *load_module(void __user *umod,
2434 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2541 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2435 if (err < 0) 2542 if (err < 0)
2436 goto unlink; 2543 goto unlink;
2544
2437 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2545 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2438 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2546 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2439 2547
@@ -2446,15 +2554,15 @@ static noinline struct module *load_module(void __user *umod,
2446 return mod; 2554 return mod;
2447 2555
2448 unlink: 2556 unlink:
2557 mutex_lock(&module_mutex);
2449 /* Unlink carefully: kallsyms could be walking list. */ 2558 /* Unlink carefully: kallsyms could be walking list. */
2450 list_del_rcu(&mod->list); 2559 list_del_rcu(&mod->list);
2560 unlock:
2561 mutex_unlock(&module_mutex);
2451 synchronize_sched(); 2562 synchronize_sched();
2452 module_arch_cleanup(mod); 2563 module_arch_cleanup(mod);
2453 cleanup: 2564 cleanup:
2454 free_modinfo(mod); 2565 free_modinfo(mod);
2455 kobject_del(&mod->mkobj.kobj);
2456 kobject_put(&mod->mkobj.kobj);
2457 free_unload:
2458 module_unload_free(mod); 2566 module_unload_free(mod);
2459#if defined(CONFIG_MODULE_UNLOAD) 2567#if defined(CONFIG_MODULE_UNLOAD)
2460 free_percpu(mod->refptr); 2568 free_percpu(mod->refptr);
@@ -2465,7 +2573,7 @@ static noinline struct module *load_module(void __user *umod,
2465 module_free(mod, mod->module_core); 2573 module_free(mod, mod->module_core);
2466 /* mod will be freed with core. Don't access it beyond this line! */ 2574 /* mod will be freed with core. Don't access it beyond this line! */
2467 free_percpu: 2575 free_percpu:
2468 percpu_modfree(mod); 2576 free_percpu(percpu);
2469 free_mod: 2577 free_mod:
2470 kfree(args); 2578 kfree(args);
2471 kfree(strmap); 2579 kfree(strmap);
@@ -2501,19 +2609,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2501 if (!capable(CAP_SYS_MODULE) || modules_disabled) 2609 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2502 return -EPERM; 2610 return -EPERM;
2503 2611
2504 /* Only one module load at a time, please */
2505 if (mutex_lock_interruptible(&module_mutex) != 0)
2506 return -EINTR;
2507
2508 /* Do all the hard work */ 2612 /* Do all the hard work */
2509 mod = load_module(umod, len, uargs); 2613 mod = load_module(umod, len, uargs);
2510 if (IS_ERR(mod)) { 2614 if (IS_ERR(mod))
2511 mutex_unlock(&module_mutex);
2512 return PTR_ERR(mod); 2615 return PTR_ERR(mod);
2513 }
2514
2515 /* Drop lock so they can recurse */
2516 mutex_unlock(&module_mutex);
2517 2616
2518 blocking_notifier_call_chain(&module_notify_list, 2617 blocking_notifier_call_chain(&module_notify_list,
2519 MODULE_STATE_COMING, mod); 2618 MODULE_STATE_COMING, mod);
@@ -2530,9 +2629,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2530 module_put(mod); 2629 module_put(mod);
2531 blocking_notifier_call_chain(&module_notify_list, 2630 blocking_notifier_call_chain(&module_notify_list,
2532 MODULE_STATE_GOING, mod); 2631 MODULE_STATE_GOING, mod);
2533 mutex_lock(&module_mutex);
2534 free_module(mod); 2632 free_module(mod);
2535 mutex_unlock(&module_mutex);
2536 wake_up(&module_wq); 2633 wake_up(&module_wq);
2537 return ret; 2634 return ret;
2538 } 2635 }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..4c0b7b3e6d2e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
172 struct thread_info *owner; 172 struct thread_info *owner;
173 173
174 /* 174 /*
175 * If we own the BKL, then don't spin. The owner of
176 * the mutex might be waiting on us to release the BKL.
177 */
178 if (unlikely(current->lock_depth >= 0))
179 break;
180
181 /*
175 * If there's an owner, wait for it to either 182 * If there's an owner, wait for it to either
176 * release the lock or go to sleep. 183 * release the lock or go to sleep.
177 */ 184 */
diff --git a/kernel/padata.c b/kernel/padata.c
index fd03513c7327..fdd8ae609ce3 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30 30
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 31#define MAX_SEQ_NR INT_MAX - NR_CPUS
32#define MAX_OBJ_NUM 10000 * NR_CPUS 32#define MAX_OBJ_NUM 1000
33 33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 35{
@@ -88,7 +88,7 @@ static void padata_parallel_worker(struct work_struct *work)
88 local_bh_enable(); 88 local_bh_enable();
89} 89}
90 90
91/* 91/**
92 * padata_do_parallel - padata parallelization function 92 * padata_do_parallel - padata parallelization function
93 * 93 *
94 * @pinst: padata instance 94 * @pinst: padata instance
@@ -152,6 +152,23 @@ out:
152} 152}
153EXPORT_SYMBOL(padata_do_parallel); 153EXPORT_SYMBOL(padata_do_parallel);
154 154
155/*
156 * padata_get_next - Get the next object that needs serialization.
157 *
158 * Return values are:
159 *
160 * A pointer to the control struct of the next object that needs
161 * serialization, if present in one of the percpu reorder queues.
162 *
163 * NULL, if all percpu reorder queues are empty.
164 *
165 * -EINPROGRESS, if the next object that needs serialization will
166 * be parallel processed by another cpu and is not yet present in
167 * the cpu's reorder queue.
168 *
169 * -ENODATA, if this cpu has to do the parallel processing for
170 * the next object.
171 */
155static struct padata_priv *padata_get_next(struct parallel_data *pd) 172static struct padata_priv *padata_get_next(struct parallel_data *pd)
156{ 173{
157 int cpu, num_cpus, empty, calc_seq_nr; 174 int cpu, num_cpus, empty, calc_seq_nr;
@@ -173,7 +190,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
173 190
174 /* 191 /*
175 * Calculate the seq_nr of the object that should be 192 * Calculate the seq_nr of the object that should be
176 * next in this queue. 193 * next in this reorder queue.
177 */ 194 */
178 overrun = 0; 195 overrun = 0;
179 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) 196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
@@ -231,7 +248,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
231 goto out; 248 goto out;
232 } 249 }
233 250
234 if (next_nr % num_cpus == next_queue->cpu_index) { 251 queue = per_cpu_ptr(pd->queue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) {
235 padata = ERR_PTR(-ENODATA); 253 padata = ERR_PTR(-ENODATA);
236 goto out; 254 goto out;
237 } 255 }
@@ -247,19 +265,40 @@ static void padata_reorder(struct parallel_data *pd)
247 struct padata_queue *queue; 265 struct padata_queue *queue;
248 struct padata_instance *pinst = pd->pinst; 266 struct padata_instance *pinst = pd->pinst;
249 267
250try_again: 268 /*
269 * We need to ensure that only one cpu can work on dequeueing of
 270 * the reorder queue at a time. Calculating in which percpu reorder
271 * queue the next object will arrive takes some time. A spinlock
272 * would be highly contended. Also it is not clear in which order
273 * the objects arrive to the reorder queues. So a cpu could wait to
274 * get the lock just to notice that there is nothing to do at the
275 * moment. Therefore we use a trylock and let the holder of the lock
 276 * care for all the objects enqueued during the hold time of the lock.
277 */
251 if (!spin_trylock_bh(&pd->lock)) 278 if (!spin_trylock_bh(&pd->lock))
252 goto out; 279 return;
253 280
254 while (1) { 281 while (1) {
255 padata = padata_get_next(pd); 282 padata = padata_get_next(pd);
256 283
284 /*
285 * All reorder queues are empty, or the next object that needs
286 * serialization is parallel processed by another cpu and is
 287 * still on its way to the cpu's reorder queue, nothing to
288 * do for now.
289 */
257 if (!padata || PTR_ERR(padata) == -EINPROGRESS) 290 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
258 break; 291 break;
259 292
293 /*
294 * This cpu has to do the parallel processing of the next
295 * object. It's waiting in the cpu's parallelization queue,
 296 * so exit immediately.
297 */
260 if (PTR_ERR(padata) == -ENODATA) { 298 if (PTR_ERR(padata) == -ENODATA) {
299 del_timer(&pd->timer);
261 spin_unlock_bh(&pd->lock); 300 spin_unlock_bh(&pd->lock);
262 goto out; 301 return;
263 } 302 }
264 303
265 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
@@ -273,13 +312,27 @@ try_again:
273 312
274 spin_unlock_bh(&pd->lock); 313 spin_unlock_bh(&pd->lock);
275 314
276 if (atomic_read(&pd->reorder_objects)) 315 /*
277 goto try_again; 316 * The next object that needs serialization might have arrived to
317 * the reorder queues in the meantime, we will be called again
 318 * from the timer function if no one else cares for it.
319 */
320 if (atomic_read(&pd->reorder_objects)
321 && !(pinst->flags & PADATA_RESET))
322 mod_timer(&pd->timer, jiffies + HZ);
323 else
324 del_timer(&pd->timer);
278 325
279out:
280 return; 326 return;
281} 327}
282 328
329static void padata_reorder_timer(unsigned long arg)
330{
331 struct parallel_data *pd = (struct parallel_data *)arg;
332
333 padata_reorder(pd);
334}
335
283static void padata_serial_worker(struct work_struct *work) 336static void padata_serial_worker(struct work_struct *work)
284{ 337{
285 struct padata_queue *queue; 338 struct padata_queue *queue;
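The new comment spells out the strategy: whichever cpu wins spin_trylock_bh() drains the reorder queues on behalf of everyone, the losers simply return, and the timer added above catches objects that arrive after the winner has dropped the lock. A small pthreads sketch of the trylock-and-drain part (illustrative only, not the padata code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reorder_lock = PTHREAD_MUTEX_INITIALIZER;
static int pending = 5;                     /* objects waiting for serialization */

static void reorder(void)
{
    if (pthread_mutex_trylock(&reorder_lock) != 0)
        return;                             /* someone else is already draining */

    while (pending > 0) {                   /* lock holder serves everything queued */
        pending--;
        printf("serialized one object, %d left\n", pending);
    }
    pthread_mutex_unlock(&reorder_lock);
    /* padata additionally (re)arms a timer here if new objects arrived meanwhile */
}

int main(void)
{
    reorder();
    return 0;
}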
@@ -308,7 +361,7 @@ static void padata_serial_worker(struct work_struct *work)
308 local_bh_enable(); 361 local_bh_enable();
309} 362}
310 363
311/* 364/**
312 * padata_do_serial - padata serialization function 365 * padata_do_serial - padata serialization function
313 * 366 *
314 * @padata: object to be serialized. 367 * @padata: object to be serialized.
@@ -338,6 +391,7 @@ void padata_do_serial(struct padata_priv *padata)
338} 391}
339EXPORT_SYMBOL(padata_do_serial); 392EXPORT_SYMBOL(padata_do_serial);
340 393
 394/* Allocate and initialize the internal cpumask-dependent resources. */
341static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
342 const struct cpumask *cpumask) 396 const struct cpumask *cpumask)
343{ 397{
@@ -358,17 +412,15 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
358 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
359 goto err_free_queue; 413 goto err_free_queue;
360 414
361 for_each_possible_cpu(cpu) { 415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
416
417 for_each_cpu(cpu, pd->cpumask) {
362 queue = per_cpu_ptr(pd->queue, cpu); 418 queue = per_cpu_ptr(pd->queue, cpu);
363 419
364 queue->pd = pd; 420 queue->pd = pd;
365 421
366 if (cpumask_test_cpu(cpu, cpumask) 422 queue->cpu_index = cpu_index;
367 && cpumask_test_cpu(cpu, cpu_active_mask)) { 423 cpu_index++;
368 queue->cpu_index = cpu_index;
369 cpu_index++;
370 } else
371 queue->cpu_index = -1;
372 424
373 INIT_LIST_HEAD(&queue->reorder.list); 425 INIT_LIST_HEAD(&queue->reorder.list);
374 INIT_LIST_HEAD(&queue->parallel.list); 426 INIT_LIST_HEAD(&queue->parallel.list);
@@ -382,11 +434,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
382 atomic_set(&queue->num_obj, 0); 434 atomic_set(&queue->num_obj, 0);
383 } 435 }
384 436
385 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
386
387 num_cpus = cpumask_weight(pd->cpumask); 437 num_cpus = cpumask_weight(pd->cpumask);
388 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
389 439
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
390 atomic_set(&pd->seq_nr, -1); 441 atomic_set(&pd->seq_nr, -1);
391 atomic_set(&pd->reorder_objects, 0); 442 atomic_set(&pd->reorder_objects, 0);
392 atomic_set(&pd->refcnt, 0); 443 atomic_set(&pd->refcnt, 0);
@@ -410,6 +461,31 @@ static void padata_free_pd(struct parallel_data *pd)
410 kfree(pd); 461 kfree(pd);
411} 462}
412 463
464/* Flush all objects out of the padata queues. */
465static void padata_flush_queues(struct parallel_data *pd)
466{
467 int cpu;
468 struct padata_queue *queue;
469
470 for_each_cpu(cpu, pd->cpumask) {
471 queue = per_cpu_ptr(pd->queue, cpu);
472 flush_work(&queue->pwork);
473 }
474
475 del_timer_sync(&pd->timer);
476
477 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd);
479
480 for_each_cpu(cpu, pd->cpumask) {
481 queue = per_cpu_ptr(pd->queue, cpu);
482 flush_work(&queue->swork);
483 }
484
485 BUG_ON(atomic_read(&pd->refcnt) != 0);
486}
487
 488/* Replace the internal control structure with a new one. */
413static void padata_replace(struct padata_instance *pinst, 489static void padata_replace(struct padata_instance *pinst,
414 struct parallel_data *pd_new) 490 struct parallel_data *pd_new)
415{ 491{
@@ -421,17 +497,13 @@ static void padata_replace(struct padata_instance *pinst,
421 497
422 synchronize_rcu(); 498 synchronize_rcu();
423 499
424 while (atomic_read(&pd_old->refcnt) != 0) 500 padata_flush_queues(pd_old);
425 yield();
426
427 flush_workqueue(pinst->wq);
428
429 padata_free_pd(pd_old); 501 padata_free_pd(pd_old);
430 502
431 pinst->flags &= ~PADATA_RESET; 503 pinst->flags &= ~PADATA_RESET;
432} 504}
433 505
434/* 506/**
435 * padata_set_cpumask - set the cpumask that padata should use 507 * padata_set_cpumask - set the cpumask that padata should use
436 * 508 *
437 * @pinst: padata instance 509 * @pinst: padata instance
@@ -443,10 +515,10 @@ int padata_set_cpumask(struct padata_instance *pinst,
443 struct parallel_data *pd; 515 struct parallel_data *pd;
444 int err = 0; 516 int err = 0;
445 517
446 might_sleep();
447
448 mutex_lock(&pinst->lock); 518 mutex_lock(&pinst->lock);
449 519
520 get_online_cpus();
521
450 pd = padata_alloc_pd(pinst, cpumask); 522 pd = padata_alloc_pd(pinst, cpumask);
451 if (!pd) { 523 if (!pd) {
452 err = -ENOMEM; 524 err = -ENOMEM;
@@ -458,6 +530,8 @@ int padata_set_cpumask(struct padata_instance *pinst,
458 padata_replace(pinst, pd); 530 padata_replace(pinst, pd);
459 531
460out: 532out:
533 put_online_cpus();
534
461 mutex_unlock(&pinst->lock); 535 mutex_unlock(&pinst->lock);
462 536
463 return err; 537 return err;
@@ -479,7 +553,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
479 return 0; 553 return 0;
480} 554}
481 555
482/* 556/**
483 * padata_add_cpu - add a cpu to the padata cpumask 557 * padata_add_cpu - add a cpu to the padata cpumask
484 * 558 *
485 * @pinst: padata instance 559 * @pinst: padata instance
@@ -489,12 +563,12 @@ int padata_add_cpu(struct padata_instance *pinst, int cpu)
489{ 563{
490 int err; 564 int err;
491 565
492 might_sleep();
493
494 mutex_lock(&pinst->lock); 566 mutex_lock(&pinst->lock);
495 567
568 get_online_cpus();
496 cpumask_set_cpu(cpu, pinst->cpumask); 569 cpumask_set_cpu(cpu, pinst->cpumask);
497 err = __padata_add_cpu(pinst, cpu); 570 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus();
498 572
499 mutex_unlock(&pinst->lock); 573 mutex_unlock(&pinst->lock);
500 574
@@ -517,7 +591,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
517 return 0; 591 return 0;
518} 592}
519 593
520/* 594/**
521 * padata_remove_cpu - remove a cpu from the padata cpumask 595 * padata_remove_cpu - remove a cpu from the padata cpumask
522 * 596 *
523 * @pinst: padata instance 597 * @pinst: padata instance
@@ -527,12 +601,12 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
527{ 601{
528 int err; 602 int err;
529 603
530 might_sleep();
531
532 mutex_lock(&pinst->lock); 604 mutex_lock(&pinst->lock);
533 605
606 get_online_cpus();
534 cpumask_clear_cpu(cpu, pinst->cpumask); 607 cpumask_clear_cpu(cpu, pinst->cpumask);
535 err = __padata_remove_cpu(pinst, cpu); 608 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus();
536 610
537 mutex_unlock(&pinst->lock); 611 mutex_unlock(&pinst->lock);
538 612
@@ -540,38 +614,35 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
540} 614}
541EXPORT_SYMBOL(padata_remove_cpu); 615EXPORT_SYMBOL(padata_remove_cpu);
542 616
543/* 617/**
544 * padata_start - start the parallel processing 618 * padata_start - start the parallel processing
545 * 619 *
546 * @pinst: padata instance to start 620 * @pinst: padata instance to start
547 */ 621 */
548void padata_start(struct padata_instance *pinst) 622void padata_start(struct padata_instance *pinst)
549{ 623{
550 might_sleep();
551
552 mutex_lock(&pinst->lock); 624 mutex_lock(&pinst->lock);
553 pinst->flags |= PADATA_INIT; 625 pinst->flags |= PADATA_INIT;
554 mutex_unlock(&pinst->lock); 626 mutex_unlock(&pinst->lock);
555} 627}
556EXPORT_SYMBOL(padata_start); 628EXPORT_SYMBOL(padata_start);
557 629
558/* 630/**
559 * padata_stop - stop the parallel processing 631 * padata_stop - stop the parallel processing
560 * 632 *
561 * @pinst: padata instance to stop 633 * @pinst: padata instance to stop
562 */ 634 */
563void padata_stop(struct padata_instance *pinst) 635void padata_stop(struct padata_instance *pinst)
564{ 636{
565 might_sleep();
566
567 mutex_lock(&pinst->lock); 637 mutex_lock(&pinst->lock);
568 pinst->flags &= ~PADATA_INIT; 638 pinst->flags &= ~PADATA_INIT;
569 mutex_unlock(&pinst->lock); 639 mutex_unlock(&pinst->lock);
570} 640}
571EXPORT_SYMBOL(padata_stop); 641EXPORT_SYMBOL(padata_stop);
572 642
573static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, 643#ifdef CONFIG_HOTPLUG_CPU
574 unsigned long action, void *hcpu) 644static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu)
575{ 646{
576 int err; 647 int err;
577 struct padata_instance *pinst; 648 struct padata_instance *pinst;
@@ -588,7 +659,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
588 err = __padata_add_cpu(pinst, cpu); 659 err = __padata_add_cpu(pinst, cpu);
589 mutex_unlock(&pinst->lock); 660 mutex_unlock(&pinst->lock);
590 if (err) 661 if (err)
591 return NOTIFY_BAD; 662 return notifier_from_errno(err);
592 break; 663 break;
593 664
594 case CPU_DOWN_PREPARE: 665 case CPU_DOWN_PREPARE:
@@ -599,7 +670,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
599 err = __padata_remove_cpu(pinst, cpu); 670 err = __padata_remove_cpu(pinst, cpu);
600 mutex_unlock(&pinst->lock); 671 mutex_unlock(&pinst->lock);
601 if (err) 672 if (err)
602 return NOTIFY_BAD; 673 return notifier_from_errno(err);
603 break; 674 break;
604 675
605 case CPU_UP_CANCELED: 676 case CPU_UP_CANCELED:
@@ -621,8 +692,9 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
621 692
622 return NOTIFY_OK; 693 return NOTIFY_OK;
623} 694}
695#endif
624 696
625/* 697/**
626 * padata_alloc - allocate and initialize a padata instance 698 * padata_alloc - allocate and initialize a padata instance
627 * 699 *
628 * @cpumask: cpumask that padata uses for parallelization 700 * @cpumask: cpumask that padata uses for parallelization
@@ -631,7 +703,6 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
631struct padata_instance *padata_alloc(const struct cpumask *cpumask, 703struct padata_instance *padata_alloc(const struct cpumask *cpumask,
632 struct workqueue_struct *wq) 704 struct workqueue_struct *wq)
633{ 705{
634 int err;
635 struct padata_instance *pinst; 706 struct padata_instance *pinst;
636 struct parallel_data *pd; 707 struct parallel_data *pd;
637 708
@@ -639,6 +710,8 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
639 if (!pinst) 710 if (!pinst)
640 goto err; 711 goto err;
641 712
713 get_online_cpus();
714
642 pd = padata_alloc_pd(pinst, cpumask); 715 pd = padata_alloc_pd(pinst, cpumask);
643 if (!pd) 716 if (!pd)
644 goto err_free_inst; 717 goto err_free_inst;
@@ -654,31 +727,32 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
654 727
655 pinst->flags = 0; 728 pinst->flags = 0;
656 729
730#ifdef CONFIG_HOTPLUG_CPU
657 pinst->cpu_notifier.notifier_call = padata_cpu_callback; 731 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
658 pinst->cpu_notifier.priority = 0; 732 pinst->cpu_notifier.priority = 0;
659 err = register_hotcpu_notifier(&pinst->cpu_notifier); 733 register_hotcpu_notifier(&pinst->cpu_notifier);
660 if (err) 734#endif
661 goto err_free_cpumask; 735
736 put_online_cpus();
662 737
663 mutex_init(&pinst->lock); 738 mutex_init(&pinst->lock);
664 739
665 return pinst; 740 return pinst;
666 741
667err_free_cpumask:
668 free_cpumask_var(pinst->cpumask);
669err_free_pd: 742err_free_pd:
670 padata_free_pd(pd); 743 padata_free_pd(pd);
671err_free_inst: 744err_free_inst:
672 kfree(pinst); 745 kfree(pinst);
746 put_online_cpus();
673err: 747err:
674 return NULL; 748 return NULL;
675} 749}
676EXPORT_SYMBOL(padata_alloc); 750EXPORT_SYMBOL(padata_alloc);
677 751
678/* 752/**
679 * padata_free - free a padata instance 753 * padata_free - free a padata instance
680 * 754 *
681 * @ padata_inst: padata instance to free 755 * @padata_inst: padata instance to free
682 */ 756 */
683void padata_free(struct padata_instance *pinst) 757void padata_free(struct padata_instance *pinst)
684{ 758{
@@ -686,10 +760,13 @@ void padata_free(struct padata_instance *pinst)
686 760
687 synchronize_rcu(); 761 synchronize_rcu();
688 762
689 while (atomic_read(&pinst->pd->refcnt) != 0) 763#ifdef CONFIG_HOTPLUG_CPU
690 yield();
691
692 unregister_hotcpu_notifier(&pinst->cpu_notifier); 764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
693 padata_free_pd(pinst->pd); 770 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask); 771 free_cpumask_var(pinst->cpumask);
695 kfree(pinst); 772 kfree(pinst);
diff --git a/kernel/panic.c b/kernel/panic.c
index 13d966b4c14a..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
87 */ 87 */
88 preempt_disable(); 88 preempt_disable();
89 89
90 console_verbose();
90 bust_spinlocks(1); 91 bust_spinlocks(1);
91 va_start(args, fmt); 92 va_start(args, fmt);
92 vsnprintf(buf, sizeof(buf), fmt, args); 93 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -178,6 +179,7 @@ static const struct tnt tnts[] = {
178 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 179 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
179 { TAINT_WARN, 'W', ' ' }, 180 { TAINT_WARN, 'W', ' ' },
180 { TAINT_CRAP, 'C', ' ' }, 181 { TAINT_CRAP, 'C', ' ' },
182 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
181}; 183};
182 184
183/** 185/**
@@ -194,6 +196,7 @@ static const struct tnt tnts[] = {
194 * 'A' - ACPI table overridden. 196 * 'A' - ACPI table overridden.
195 * 'W' - Taint on warning. 197 * 'W' - Taint on warning.
196 * 'C' - modules from drivers/staging are loaded. 198 * 'C' - modules from drivers/staging are loaded.
199 * 'I' - Working around severe firmware bug.
197 * 200 *
198 * The string is overwritten by the next call to print_tainted(). 201 * The string is overwritten by the next call to print_tainted().
199 */ 202 */
@@ -365,7 +368,8 @@ struct slowpath_args {
365 va_list args; 368 va_list args;
366}; 369};
367 370
368static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) 371static void warn_slowpath_common(const char *file, int line, void *caller,
372 unsigned taint, struct slowpath_args *args)
369{ 373{
370 const char *board; 374 const char *board;
371 375
@@ -381,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
381 print_modules(); 385 print_modules();
382 dump_stack(); 386 dump_stack();
383 print_oops_end_marker(); 387 print_oops_end_marker();
384 add_taint(TAINT_WARN); 388 add_taint(taint);
385} 389}
386 390
387void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 391void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -390,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
390 394
391 args.fmt = fmt; 395 args.fmt = fmt;
392 va_start(args.args, fmt); 396 va_start(args.args, fmt);
393 warn_slowpath_common(file, line, __builtin_return_address(0), &args); 397 warn_slowpath_common(file, line, __builtin_return_address(0),
398 TAINT_WARN, &args);
394 va_end(args.args); 399 va_end(args.args);
395} 400}
396EXPORT_SYMBOL(warn_slowpath_fmt); 401EXPORT_SYMBOL(warn_slowpath_fmt);
397 402
403void warn_slowpath_fmt_taint(const char *file, int line,
404 unsigned taint, const char *fmt, ...)
405{
406 struct slowpath_args args;
407
408 args.fmt = fmt;
409 va_start(args.args, fmt);
410 warn_slowpath_common(file, line, __builtin_return_address(0),
411 taint, &args);
412 va_end(args.args);
413}
414EXPORT_SYMBOL(warn_slowpath_fmt_taint);
415
398void warn_slowpath_null(const char *file, int line) 416void warn_slowpath_null(const char *file, int line)
399{ 417{
400 warn_slowpath_common(file, line, __builtin_return_address(0), NULL); 418 warn_slowpath_common(file, line, __builtin_return_address(0),
419 TAINT_WARN, NULL);
401} 420}
402EXPORT_SYMBOL(warn_slowpath_null); 421EXPORT_SYMBOL(warn_slowpath_null);
403#endif 422#endif
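warn_slowpath_common() now takes the taint flag as a parameter, and the new warn_slowpath_fmt_taint() is a thin variadic wrapper that packs the format string and va_list into a struct slowpath_args before handing them down. A self-contained userspace sketch of that wrapper pattern; the names mirror the kernel's but the bodies are simplified:

#include <stdarg.h>
#include <stdio.h>

struct slowpath_args {
    const char *fmt;
    va_list args;
};

static void warn_common(const char *file, int line, unsigned taint,
                        struct slowpath_args *args)
{
    printf("WARNING at %s:%d (taint %u): ", file, line, taint);
    if (args)
        vprintf(args->fmt, args->args);    /* the va_list travels inside the struct */
    printf("\n");
}

static void warn_fmt_taint(const char *file, int line, unsigned taint,
                           const char *fmt, ...)
{
    struct slowpath_args args;

    args.fmt = fmt;
    va_start(args.args, fmt);
    warn_common(file, line, taint, &args);
    va_end(args.args);
}

int main(void)
{
    warn_fmt_taint(__FILE__, __LINE__, 9, "firmware bug on cpu %d", 0);
    return 0;
}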
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..ff86c558af4c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
283static void 283static void
284list_add_event(struct perf_event *event, struct perf_event_context *ctx) 284list_add_event(struct perf_event *event, struct perf_event_context *ctx)
285{ 285{
286 struct perf_event *group_leader = event->group_leader; 286 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
287 event->attach_state |= PERF_ATTACH_CONTEXT;
287 288
288 /* 289 /*
 289 * Depending on whether it is a standalone or sibling event, 290 * If we're a standalone event or group leader, we go to the context
290 * add it straight to the context's event list, or to the group 291 * list, group events are kept attached to the group so that
291 * leader's sibling list: 292 * perf_group_detach can, at all times, locate all siblings.
292 */ 293 */
293 if (group_leader == event) { 294 if (event->group_leader == event) {
294 struct list_head *list; 295 struct list_head *list;
295 296
296 if (is_software_event(event)) 297 if (is_software_event(event))
@@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
298 299
299 list = ctx_group_list(event, ctx); 300 list = ctx_group_list(event, ctx);
300 list_add_tail(&event->group_entry, list); 301 list_add_tail(&event->group_entry, list);
301 } else {
302 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
303 !is_software_event(event))
304 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
305
306 list_add_tail(&event->group_entry, &group_leader->sibling_list);
307 group_leader->nr_siblings++;
308 } 302 }
309 303
310 list_add_rcu(&event->event_entry, &ctx->event_list); 304 list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
313 ctx->nr_stat++; 307 ctx->nr_stat++;
314} 308}
315 309
310static void perf_group_attach(struct perf_event *event)
311{
312 struct perf_event *group_leader = event->group_leader;
313
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP;
316
317 if (group_leader == event)
318 return;
319
320 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
321 !is_software_event(event))
322 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
323
324 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++;
326}
327
316/* 328/*
317 * Remove a event from the lists for its context. 329 * Remove a event from the lists for its context.
318 * Must be called with ctx->mutex and ctx->lock held. 330 * Must be called with ctx->mutex and ctx->lock held.
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
320static void 332static void
321list_del_event(struct perf_event *event, struct perf_event_context *ctx) 333list_del_event(struct perf_event *event, struct perf_event_context *ctx)
322{ 334{
323 if (list_empty(&event->group_entry)) 335 /*
336 * We can have double detach due to exit/hot-unplug + close.
337 */
338 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
324 return; 339 return;
340
341 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342
325 ctx->nr_events--; 343 ctx->nr_events--;
326 if (event->attr.inherit_stat) 344 if (event->attr.inherit_stat)
327 ctx->nr_stat--; 345 ctx->nr_stat--;
328 346
329 list_del_init(&event->group_entry);
330 list_del_rcu(&event->event_entry); 347 list_del_rcu(&event->event_entry);
331 348
332 if (event->group_leader != event) 349 if (event->group_leader == event)
333 event->group_leader->nr_siblings--; 350 list_del_init(&event->group_entry);
334 351
335 update_group_times(event); 352 update_group_times(event);
336 353
@@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
345 event->state = PERF_EVENT_STATE_OFF; 362 event->state = PERF_EVENT_STATE_OFF;
346} 363}
347 364
348static void 365static void perf_group_detach(struct perf_event *event)
349perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
350{ 366{
351 struct perf_event *sibling, *tmp; 367 struct perf_event *sibling, *tmp;
368 struct list_head *list = NULL;
369
370 /*
371 * We can have double detach due to exit/hot-unplug + close.
372 */
373 if (!(event->attach_state & PERF_ATTACH_GROUP))
374 return;
375
376 event->attach_state &= ~PERF_ATTACH_GROUP;
377
378 /*
379 * If this is a sibling, remove it from its group.
380 */
381 if (event->group_leader != event) {
382 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--;
384 return;
385 }
386
387 if (!list_empty(&event->group_entry))
388 list = &event->group_entry;
352 389
353 /* 390 /*
354 * If this was a group event with sibling events then 391 * If this was a group event with sibling events then
355 * upgrade the siblings to singleton events by adding them 392 * upgrade the siblings to singleton events by adding them
356 * to the context list directly: 393 * to whatever list we are on.
357 */ 394 */
358 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 395 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
359 struct list_head *list; 396 if (list)
360 397 list_move_tail(&sibling->group_entry, list);
361 list = ctx_group_list(event, ctx);
362 list_move_tail(&sibling->group_entry, list);
363 sibling->group_leader = sibling; 398 sibling->group_leader = sibling;
364 399
365 /* Inherit group flags from the previous leader */ 400 /* Inherit group flags from the previous leader */
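The PERF_ATTACH_CONTEXT / PERF_ATTACH_GROUP bits turn attach and detach into idempotent operations: a double attach is flagged as a bug, while a double detach (exit or hot-unplug racing with close) is silently ignored. A tiny userspace sketch of that state-flag guard; the flag names are reused for readability and the actual list manipulation is elided:

#include <stdio.h>

#define ATTACH_CONTEXT 0x01
#define ATTACH_GROUP   0x02

struct event { unsigned attach_state; };

static void attach_context(struct event *e)
{
    if (e->attach_state & ATTACH_CONTEXT)
        fprintf(stderr, "bug: double attach\n");   /* WARN_ON_ONCE in the kernel */
    e->attach_state |= ATTACH_CONTEXT;
    /* ... add to the context's event list ... */
}

static void detach_context(struct event *e)
{
    if (!(e->attach_state & ATTACH_CONTEXT))
        return;             /* double detach (exit/hot-unplug + close) is expected */
    e->attach_state &= ~ATTACH_CONTEXT;
    /* ... remove from the context's event list ... */
}

int main(void)
{
    struct event ev = { 0 };

    attach_context(&ev);
    detach_context(&ev);
    detach_context(&ev);    /* harmless second detach */
    return 0;
}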
@@ -652,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
652 if (txn) 687 if (txn)
653 pmu->start_txn(pmu); 688 pmu->start_txn(pmu);
654 689
655 if (event_sched_in(group_event, cpuctx, ctx)) 690 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn)
692 pmu->cancel_txn(pmu);
656 return -EAGAIN; 693 return -EAGAIN;
694 }
657 695
658 /* 696 /*
659 * Schedule in siblings as one group (if any): 697 * Schedule in siblings as one group (if any):
@@ -675,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
675 } 713 }
676 714
677group_error: 715group_error:
678 if (txn)
679 pmu->cancel_txn(pmu);
680
681 /* 716 /*
682 * Groups can be scheduled in as one unit only, so undo any 717 * Groups can be scheduled in as one unit only, so undo any
683 * partial group before returning: 718 * partial group before returning:
@@ -689,6 +724,9 @@ group_error:
689 } 724 }
690 event_sched_out(group_event, cpuctx, ctx); 725 event_sched_out(group_event, cpuctx, ctx);
691 726
727 if (txn)
728 pmu->cancel_txn(pmu);
729
692 return -EAGAIN; 730 return -EAGAIN;
693} 731}
694 732
@@ -727,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
727 struct perf_event_context *ctx) 765 struct perf_event_context *ctx)
728{ 766{
729 list_add_event(event, ctx); 767 list_add_event(event, ctx);
768 perf_group_attach(event);
730 event->tstamp_enabled = ctx->time; 769 event->tstamp_enabled = ctx->time;
731 event->tstamp_running = ctx->time; 770 event->tstamp_running = ctx->time;
732 event->tstamp_stopped = ctx->time; 771 event->tstamp_stopped = ctx->time;
@@ -1468,6 +1507,9 @@ do { \
1468 divisor = nsec * frequency; 1507 divisor = nsec * frequency;
1469 } 1508 }
1470 1509
1510 if (!divisor)
1511 return dividend;
1512
1471 return div64_u64(dividend, divisor); 1513 return div64_u64(dividend, divisor);
1472} 1514}
1473 1515
@@ -1490,7 +1532,7 @@ static int perf_event_start(struct perf_event *event)
1490static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1491{ 1533{
1492 struct hw_perf_event *hwc = &event->hw; 1534 struct hw_perf_event *hwc = &event->hw;
1493 u64 period, sample_period; 1535 s64 period, sample_period;
1494 s64 delta; 1536 s64 delta;
1495 1537
1496 period = perf_calculate_period(event, nsec, count); 1538 period = perf_calculate_period(event, nsec, count);
@@ -1841,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head)
1841} 1883}
1842 1884
1843static void perf_pending_sync(struct perf_event *event); 1885static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1844 1887
1845static void free_event(struct perf_event *event) 1888static void free_event(struct perf_event *event)
1846{ 1889{
@@ -1856,9 +1899,9 @@ static void free_event(struct perf_event *event)
1856 atomic_dec(&nr_task_events); 1899 atomic_dec(&nr_task_events);
1857 } 1900 }
1858 1901
1859 if (event->output) { 1902 if (event->data) {
1860 fput(event->output->filp); 1903 perf_mmap_data_put(event->data);
1861 event->output = NULL; 1904 event->data = NULL;
1862 } 1905 }
1863 1906
1864 if (event->destroy) 1907 if (event->destroy)
@@ -1893,8 +1936,8 @@ int perf_event_release_kernel(struct perf_event *event)
1893 */ 1936 */
1894 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 1937 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1895 raw_spin_lock_irq(&ctx->lock); 1938 raw_spin_lock_irq(&ctx->lock);
1939 perf_group_detach(event);
1896 list_del_event(event, ctx); 1940 list_del_event(event, ctx);
1897 perf_destroy_group(event, ctx);
1898 raw_spin_unlock_irq(&ctx->lock); 1941 raw_spin_unlock_irq(&ctx->lock);
1899 mutex_unlock(&ctx->mutex); 1942 mutex_unlock(&ctx->mutex);
1900 1943
@@ -2175,7 +2218,27 @@ unlock:
2175 return ret; 2218 return ret;
2176} 2219}
2177 2220
2178static int perf_event_set_output(struct perf_event *event, int output_fd); 2221static const struct file_operations perf_fops;
2222
2223static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2224{
2225 struct file *file;
2226
2227 file = fget_light(fd, fput_needed);
2228 if (!file)
2229 return ERR_PTR(-EBADF);
2230
2231 if (file->f_op != &perf_fops) {
2232 fput_light(file, *fput_needed);
2233 *fput_needed = 0;
2234 return ERR_PTR(-EBADF);
2235 }
2236
2237 return file->private_data;
2238}
2239
2240static int perf_event_set_output(struct perf_event *event,
2241 struct perf_event *output_event);
2179static int perf_event_set_filter(struct perf_event *event, void __user *arg); 2242static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2180 2243
2181static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2244static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2202,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2202 return perf_event_period(event, (u64 __user *)arg); 2265 return perf_event_period(event, (u64 __user *)arg);
2203 2266
2204 case PERF_EVENT_IOC_SET_OUTPUT: 2267 case PERF_EVENT_IOC_SET_OUTPUT:
2205 return perf_event_set_output(event, arg); 2268 {
2269 struct perf_event *output_event = NULL;
2270 int fput_needed = 0;
2271 int ret;
2272
2273 if (arg != -1) {
2274 output_event = perf_fget_light(arg, &fput_needed);
2275 if (IS_ERR(output_event))
2276 return PTR_ERR(output_event);
2277 }
2278
2279 ret = perf_event_set_output(event, output_event);
2280 if (output_event)
2281 fput_light(output_event->filp, fput_needed);
2282
2283 return ret;
2284 }
2206 2285
2207 case PERF_EVENT_IOC_SET_FILTER: 2286 case PERF_EVENT_IOC_SET_FILTER:
2208 return perf_event_set_filter(event, (void __user *)arg); 2287 return perf_event_set_filter(event, (void __user *)arg);
@@ -2297,11 +2376,6 @@ unlock:
2297 rcu_read_unlock(); 2376 rcu_read_unlock();
2298} 2377}
2299 2378
2300static unsigned long perf_data_size(struct perf_mmap_data *data)
2301{
2302 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2303}
2304
2305#ifndef CONFIG_PERF_USE_VMALLOC 2379#ifndef CONFIG_PERF_USE_VMALLOC
2306 2380
2307/* 2381/*
@@ -2320,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2320 return virt_to_page(data->data_pages[pgoff - 1]); 2394 return virt_to_page(data->data_pages[pgoff - 1]);
2321} 2395}
2322 2396
2397static void *perf_mmap_alloc_page(int cpu)
2398{
2399 struct page *page;
2400 int node;
2401
2402 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2403 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2404 if (!page)
2405 return NULL;
2406
2407 return page_address(page);
2408}
2409
2323static struct perf_mmap_data * 2410static struct perf_mmap_data *
2324perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2325{ 2412{
@@ -2327,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2327 unsigned long size; 2414 unsigned long size;
2328 int i; 2415 int i;
2329 2416
2330 WARN_ON(atomic_read(&event->mmap_count));
2331
2332 size = sizeof(struct perf_mmap_data); 2417 size = sizeof(struct perf_mmap_data);
2333 size += nr_pages * sizeof(void *); 2418 size += nr_pages * sizeof(void *);
2334 2419
@@ -2336,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2336 if (!data) 2421 if (!data)
2337 goto fail; 2422 goto fail;
2338 2423
2339 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2424 data->user_page = perf_mmap_alloc_page(event->cpu);
2340 if (!data->user_page) 2425 if (!data->user_page)
2341 goto fail_user_page; 2426 goto fail_user_page;
2342 2427
2343 for (i = 0; i < nr_pages; i++) { 2428 for (i = 0; i < nr_pages; i++) {
2344 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2345 if (!data->data_pages[i]) 2430 if (!data->data_pages[i])
2346 goto fail_data_pages; 2431 goto fail_data_pages;
2347 } 2432 }
2348 2433
2349 data->data_order = 0;
2350 data->nr_pages = nr_pages; 2434 data->nr_pages = nr_pages;
2351 2435
2352 return data; 2436 return data;
@@ -2382,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2382 kfree(data); 2466 kfree(data);
2383} 2467}
2384 2468
2469static inline int page_order(struct perf_mmap_data *data)
2470{
2471 return 0;
2472}
2473
2385#else 2474#else
2386 2475
2387/* 2476/*
@@ -2390,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2390 * Required for architectures that have d-cache aliasing issues. 2479 * Required for architectures that have d-cache aliasing issues.
2391 */ 2480 */
2392 2481
2482static inline int page_order(struct perf_mmap_data *data)
2483{
2484 return data->page_order;
2485}
2486
2393static struct page * 2487static struct page *
2394perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2395{ 2489{
2396 if (pgoff > (1UL << data->data_order)) 2490 if (pgoff > (1UL << page_order(data)))
2397 return NULL; 2491 return NULL;
2398 2492
2399 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2413,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2413 int i, nr; 2507 int i, nr;
2414 2508
2415 data = container_of(work, struct perf_mmap_data, work); 2509 data = container_of(work, struct perf_mmap_data, work);
2416 nr = 1 << data->data_order; 2510 nr = 1 << page_order(data);
2417 2511
2418 base = data->user_page; 2512 base = data->user_page;
2419 for (i = 0; i < nr + 1; i++) 2513 for (i = 0; i < nr + 1; i++)
@@ -2435,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2435 unsigned long size; 2529 unsigned long size;
2436 void *all_buf; 2530 void *all_buf;
2437 2531
2438 WARN_ON(atomic_read(&event->mmap_count));
2439
2440 size = sizeof(struct perf_mmap_data); 2532 size = sizeof(struct perf_mmap_data);
2441 size += sizeof(void *); 2533 size += sizeof(void *);
2442 2534
@@ -2452,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2452 2544
2453 data->user_page = all_buf; 2545 data->user_page = all_buf;
2454 data->data_pages[0] = all_buf + PAGE_SIZE; 2546 data->data_pages[0] = all_buf + PAGE_SIZE;
2455 data->data_order = ilog2(nr_pages); 2547 data->page_order = ilog2(nr_pages);
2456 data->nr_pages = 1; 2548 data->nr_pages = 1;
2457 2549
2458 return data; 2550 return data;
@@ -2466,6 +2558,11 @@ fail:
2466 2558
2467#endif 2559#endif
2468 2560
2561static unsigned long perf_data_size(struct perf_mmap_data *data)
2562{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data));
2564}
2565
2469static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2470{ 2567{
2471 struct perf_event *event = vma->vm_file->private_data; 2568 struct perf_event *event = vma->vm_file->private_data;
@@ -2506,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2506{ 2603{
2507 long max_size = perf_data_size(data); 2604 long max_size = perf_data_size(data);
2508 2605
2509 atomic_set(&data->lock, -1);
2510
2511 if (event->attr.watermark) { 2606 if (event->attr.watermark) {
2512 data->watermark = min_t(long, max_size, 2607 data->watermark = min_t(long, max_size,
2513 event->attr.wakeup_watermark); 2608 event->attr.wakeup_watermark);
@@ -2516,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2516 if (!data->watermark) 2611 if (!data->watermark)
2517 data->watermark = max_size / 2; 2612 data->watermark = max_size / 2;
2518 2613
2519 2614 atomic_set(&data->refcount, 1);
2520 rcu_assign_pointer(event->data, data); 2615 rcu_assign_pointer(event->data, data);
2521} 2616}
2522 2617
@@ -2528,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2528 perf_mmap_data_free(data); 2623 perf_mmap_data_free(data);
2529} 2624}
2530 2625
2531static void perf_mmap_data_release(struct perf_event *event) 2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2532{ 2627{
2533 struct perf_mmap_data *data = event->data; 2628 struct perf_mmap_data *data;
2534 2629
2535 WARN_ON(atomic_read(&event->mmap_count)); 2630 rcu_read_lock();
2631 data = rcu_dereference(event->data);
2632 if (data) {
2633 if (!atomic_inc_not_zero(&data->refcount))
2634 data = NULL;
2635 }
2636 rcu_read_unlock();
2637
2638 return data;
2639}
2640
2641static void perf_mmap_data_put(struct perf_mmap_data *data)
2642{
2643 if (!atomic_dec_and_test(&data->refcount))
2644 return;
2536 2645
2537 rcu_assign_pointer(event->data, NULL);
2538 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2539} 2647}
2540 2648
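
perf_mmap_data_get()/perf_mmap_data_put() replace the old mmap_count-driven release with the usual RCU-plus-refcount lookup: a reference may only be taken inside an RCU read-side section, and a zero count means the buffer is already on its way to call_rcu(). A minimal sketch of that pattern, with hypothetical obj/obj_free_rcu names rather than the patch's own code:

#include <linux/rcupdate.h>

struct obj {
        atomic_t        refcount;
        struct rcu_head rcu;
};

static struct obj *obj_get(struct obj __rcu **slot)
{
        struct obj *o;

        rcu_read_lock();
        o = rcu_dereference(*slot);
        if (o && !atomic_inc_not_zero(&o->refcount))
                o = NULL;               /* raced with the final put */
        rcu_read_unlock();

        return o;
}

static void obj_put(struct obj *o)
{
        if (atomic_dec_and_test(&o->refcount))
                call_rcu(&o->rcu, obj_free_rcu);        /* assumed RCU callback */
}
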
@@ -2549,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2549{ 2657{
2550 struct perf_event *event = vma->vm_file->private_data; 2658 struct perf_event *event = vma->vm_file->private_data;
2551 2659
2552 WARN_ON_ONCE(event->ctx->parent_ctx);
2553 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2554 unsigned long size = perf_data_size(event->data); 2661 unsigned long size = perf_data_size(event->data);
2555 struct user_struct *user = current_user(); 2662 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data;
2556 2664
2557 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2558 vma->vm_mm->locked_vm -= event->data->nr_locked; 2666 vma->vm_mm->locked_vm -= event->mmap_locked;
2559 perf_mmap_data_release(event); 2667 rcu_assign_pointer(event->data, NULL);
2560 mutex_unlock(&event->mmap_mutex); 2668 mutex_unlock(&event->mmap_mutex);
2669
2670 perf_mmap_data_put(data);
2671 free_uid(user);
2561 } 2672 }
2562} 2673}
2563 2674
@@ -2580,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2580 long user_extra, extra; 2691 long user_extra, extra;
2581 int ret = 0; 2692 int ret = 0;
2582 2693
2694 /*
2695 * Don't allow mmap() of inherited per-task counters. This would
2696 * create a performance issue due to all children writing to the
2697 * same buffer.
2698 */
2699 if (event->cpu == -1 && event->attr.inherit)
2700 return -EINVAL;
2701
2583 if (!(vma->vm_flags & VM_SHARED)) 2702 if (!(vma->vm_flags & VM_SHARED))
2584 return -EINVAL; 2703 return -EINVAL;
2585 2704
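
Seen from userspace, the new check means an inherited per-task event can still be read(), but mapping its ring buffer is refused. A hedged fragment, where perf_event_open() stands for the usual raw-syscall wrapper and error handling is omitted:

struct perf_event_attr attr = {
        .type    = PERF_TYPE_HARDWARE,
        .size    = sizeof(attr),
        .config  = PERF_COUNT_HW_INSTRUCTIONS,
        .inherit = 1,
};
int fd = perf_event_open(&attr, getpid(), -1, -1, 0);   /* cpu == -1: per-task */

/* 1 control page + 1 data page; with this change the call fails, errno == EINVAL */
void *p = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
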
@@ -2601,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2601 2720
2602 WARN_ON_ONCE(event->ctx->parent_ctx); 2721 WARN_ON_ONCE(event->ctx->parent_ctx);
2603 mutex_lock(&event->mmap_mutex); 2722 mutex_lock(&event->mmap_mutex);
2604 if (event->output) { 2723 if (event->data) {
2605 ret = -EINVAL; 2724 if (event->data->nr_pages == nr_pages)
2606 goto unlock; 2725 atomic_inc(&event->data->refcount);
2607 } 2726 else
2608
2609 if (atomic_inc_not_zero(&event->mmap_count)) {
2610 if (nr_pages != event->data->nr_pages)
2611 ret = -EINVAL; 2727 ret = -EINVAL;
2612 goto unlock; 2728 goto unlock;
2613 } 2729 }
@@ -2639,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2639 WARN_ON(event->data); 2755 WARN_ON(event->data);
2640 2756
2641 data = perf_mmap_data_alloc(event, nr_pages); 2757 data = perf_mmap_data_alloc(event, nr_pages);
2642 ret = -ENOMEM; 2758 if (!data) {
2643 if (!data) 2759 ret = -ENOMEM;
2644 goto unlock; 2760 goto unlock;
2761 }
2645 2762
2646 ret = 0;
2647 perf_mmap_data_init(event, data); 2763 perf_mmap_data_init(event, data);
2648
2649 atomic_set(&event->mmap_count, 1);
2650 atomic_long_add(user_extra, &user->locked_vm);
2651 vma->vm_mm->locked_vm += extra;
2652 event->data->nr_locked = extra;
2653 if (vma->vm_flags & VM_WRITE) 2764 if (vma->vm_flags & VM_WRITE)
2654 event->data->writable = 1; 2765 event->data->writable = 1;
2655 2766
2767 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra;
2769 event->mmap_user = get_current_user();
2770 vma->vm_mm->locked_vm += event->mmap_locked;
2771
2656unlock: 2772unlock:
2773 if (!ret)
2774 atomic_inc(&event->mmap_count);
2657 mutex_unlock(&event->mmap_mutex); 2775 mutex_unlock(&event->mmap_mutex);
2658 2776
2659 vma->vm_flags |= VM_RESERVED; 2777 vma->vm_flags |= VM_RESERVED;
@@ -2885,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2885} 3003}
2886 3004
2887/* 3005/*
2888 * Curious locking construct.
2889 *
2890 * We need to ensure a later event_id doesn't publish a head when a former 3006 * We need to ensure a later event_id doesn't publish a head when a former
2891 * event_id isn't done writing. However since we need to deal with NMIs we 3007 * event isn't done writing. However since we need to deal with NMIs we
2892 * cannot fully serialize things. 3008 * cannot fully serialize things.
2893 * 3009 *
2894 * What we do is serialize between CPUs so we only have to deal with NMI
2895 * nesting on a single CPU.
2896 *
2897 * We only publish the head (and generate a wakeup) when the outer-most 3010 * We only publish the head (and generate a wakeup) when the outer-most
2898 * event_id completes. 3011 * event completes.
2899 */ 3012 */
2900static void perf_output_lock(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
2901{ 3014{
2902 struct perf_mmap_data *data = handle->data; 3015 struct perf_mmap_data *data = handle->data;
2903 int cur, cpu = get_cpu();
2904
2905 handle->locked = 0;
2906 3016
2907 for (;;) { 3017 preempt_disable();
2908 cur = atomic_cmpxchg(&data->lock, -1, cpu); 3018 local_inc(&data->nest);
2909 if (cur == -1) { 3019 handle->wakeup = local_read(&data->wakeup);
2910 handle->locked = 1;
2911 break;
2912 }
2913 if (cur == cpu)
2914 break;
2915
2916 cpu_relax();
2917 }
2918} 3020}
2919 3021
2920static void perf_output_unlock(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
2921{ 3023{
2922 struct perf_mmap_data *data = handle->data; 3024 struct perf_mmap_data *data = handle->data;
2923 unsigned long head; 3025 unsigned long head;
2924 int cpu;
2925
2926 data->done_head = data->head;
2927
2928 if (!handle->locked)
2929 goto out;
2930 3026
2931again: 3027again:
2932 /* 3028 head = local_read(&data->head);
2933 * The xchg implies a full barrier that ensures all writes are done
2934 * before we publish the new head, matched by a rmb() in userspace when
2935 * reading this position.
2936 */
2937 while ((head = atomic_long_xchg(&data->done_head, 0)))
2938 data->user_page->data_head = head;
2939 3029
2940 /* 3030 /*
2941 * NMI can happen here, which means we can miss a done_head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
2942 */ 3032 */
2943 3033
2944 cpu = atomic_xchg(&data->lock, -1); 3034 if (!local_dec_and_test(&data->nest))
2945 WARN_ON_ONCE(cpu != smp_processor_id()); 3035 goto out;
2946 3036
2947 /* 3037 /*
2948 * Therefore we have to validate we did not indeed do so. 3038 * Publish the known good head. Rely on the full barrier implied
 3039 * by atomic_dec_and_test() to order the data->head read and this
3040 * write.
2949 */ 3041 */
2950 if (unlikely(atomic_long_read(&data->done_head))) { 3042 data->user_page->data_head = head;
2951 /*
2952 * Since we had it locked, we can lock it again.
2953 */
2954 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2955 cpu_relax();
2956 3043
3044 /*
3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head.
3047 */
3048 if (unlikely(head != local_read(&data->head))) {
3049 local_inc(&data->nest);
2957 goto again; 3050 goto again;
2958 } 3051 }
2959 3052
2960 if (atomic_xchg(&data->wakeup, 0)) 3053 if (handle->wakeup != local_read(&data->wakeup))
2961 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
2962out: 3055
2963 put_cpu(); 3056 out:
3057 preempt_enable();
2964} 3058}
2965 3059
2966void perf_output_copy(struct perf_output_handle *handle, 3060__always_inline void perf_output_copy(struct perf_output_handle *handle,
2967 const void *buf, unsigned int len) 3061 const void *buf, unsigned int len)
2968{ 3062{
2969 unsigned int pages_mask;
2970 unsigned long offset;
2971 unsigned int size;
2972 void **pages;
2973
2974 offset = handle->offset;
2975 pages_mask = handle->data->nr_pages - 1;
2976 pages = handle->data->data_pages;
2977
2978 do { 3063 do {
2979 unsigned long page_offset; 3064 unsigned long size = min_t(unsigned long, handle->size, len);
2980 unsigned long page_size;
2981 int nr;
2982 3065
2983 nr = (offset >> PAGE_SHIFT) & pages_mask; 3066 memcpy(handle->addr, buf, size);
2984 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2985 page_offset = offset & (page_size - 1);
2986 size = min_t(unsigned int, page_size - page_offset, len);
2987 3067
2988 memcpy(pages[nr] + page_offset, buf, size); 3068 len -= size;
3069 handle->addr += size;
3070 buf += size;
3071 handle->size -= size;
3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data;
2989 3074
2990 len -= size; 3075 handle->page++;
2991 buf += size; 3076 handle->page &= data->nr_pages - 1;
2992 offset += size; 3077 handle->addr = data->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data);
3079 }
2993 } while (len); 3080 } while (len);
2994
2995 handle->offset = offset;
2996
2997 /*
2998 * Check we didn't copy past our reservation window, taking the
2999 * possible unsigned int wrap into account.
3000 */
3001 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
3002} 3081}
3003 3082
3004int perf_output_begin(struct perf_output_handle *handle, 3083int perf_output_begin(struct perf_output_handle *handle,
3005 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
3006 int nmi, int sample) 3085 int nmi, int sample)
3007{ 3086{
3008 struct perf_event *output_event;
3009 struct perf_mmap_data *data; 3087 struct perf_mmap_data *data;
3010 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
3011 int have_lost; 3089 int have_lost;
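
The cross-CPU spinlock of the old perf_output_lock() is replaced here by a per-buffer nesting counter: only the outer-most writer publishes data_head, and after the decrement it re-checks the head in case a nested IRQ/NMI writer advanced it in the meantime. A minimal userspace sketch of that publish protocol using C11 atomics; field names are illustrative and the memory-ordering details are simplified relative to the local_t code above:

#include <stdatomic.h>

struct ring {
        atomic_long nest;       /* in-flight writers (outer + nested)   */
        atomic_long head;       /* producer position, writers only      */
        atomic_long user_head;  /* position published to the consumer   */
};

static void publish_get(struct ring *r)
{
        atomic_fetch_add(&r->nest, 1);
}

static void publish_put(struct ring *r)
{
        long head;
again:
        head = atomic_load(&r->head);

        /* Nested writers just drop their nesting level and leave. */
        if (atomic_fetch_sub(&r->nest, 1) != 1)
                return;

        /* Outer-most writer: publish the known-good head. */
        atomic_store(&r->user_head, head);

        /* A nested writer may have advanced head meanwhile; if so, become
         * the outer-most writer again and republish. */
        if (head != atomic_load(&r->head)) {
                atomic_fetch_add(&r->nest, 1);
                goto again;
        }
}
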
@@ -3022,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle,
3022 if (event->parent) 3100 if (event->parent)
3023 event = event->parent; 3101 event = event->parent;
3024 3102
3025 output_event = rcu_dereference(event->output);
3026 if (output_event)
3027 event = output_event;
3028
3029 data = rcu_dereference(event->data); 3103 data = rcu_dereference(event->data);
3030 if (!data) 3104 if (!data)
3031 goto out; 3105 goto out;
@@ -3036,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle,
3036 handle->sample = sample; 3110 handle->sample = sample;
3037 3111
3038 if (!data->nr_pages) 3112 if (!data->nr_pages)
3039 goto fail; 3113 goto out;
3040 3114
3041 have_lost = atomic_read(&data->lost); 3115 have_lost = local_read(&data->lost);
3042 if (have_lost) 3116 if (have_lost)
3043 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
3044 3118
3045 perf_output_lock(handle); 3119 perf_output_get_handle(handle);
3046 3120
3047 do { 3121 do {
3048 /* 3122 /*
@@ -3052,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle,
3052 */ 3126 */
3053 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(data->user_page->data_tail);
3054 smp_rmb(); 3128 smp_rmb();
3055 offset = head = atomic_long_read(&data->head); 3129 offset = head = local_read(&data->head);
3056 head += size; 3130 head += size;
3057 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(data, tail, offset, head)))
3058 goto fail; 3132 goto fail;
3059 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&data->head, offset, head) != offset);
3060 3134
3061 handle->offset = offset; 3135 if (head - local_read(&data->wakeup) > data->watermark)
3062 handle->head = head; 3136 local_add(data->watermark, &data->wakeup);
3063 3137
3064 if (head - tail > data->watermark) 3138 handle->page = offset >> (PAGE_SHIFT + page_order(data));
3065 atomic_set(&data->wakeup, 1); 3139 handle->page &= data->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3141 handle->addr = data->data_pages[handle->page];
3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
3066 3144
3067 if (have_lost) { 3145 if (have_lost) {
3068 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
3069 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
3070 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
3071 lost_event.id = event->id; 3149 lost_event.id = event->id;
3072 lost_event.lost = atomic_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&data->lost, 0);
3073 3151
3074 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
3075 } 3153 }
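
perf_output_begin() now precomputes a (page, addr, size) cursor from the reserved offset, so perf_output_copy() can simply memcpy and step to the next chunk when the current one fills up. The index arithmetic, pulled out as a small hypothetical helper that can be checked in userspace:

#include <stddef.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

struct cursor {
        unsigned long page;     /* index into data_pages[], wrapped       */
        unsigned long used;     /* bytes already used inside that chunk   */
        unsigned long space;    /* bytes left before stepping to the next */
};

/* nr_pages must be a power of two; each chunk is PAGE_SIZE << order bytes. */
static struct cursor locate(unsigned long offset, unsigned long nr_pages, int order)
{
        unsigned long chunk = PAGE_SIZE << order;
        struct cursor c;

        c.page  = (offset >> (PAGE_SHIFT + order)) & (nr_pages - 1);
        c.used  = offset & (chunk - 1);
        c.space = chunk - c.used;
        return c;
}
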
@@ -3077,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle,
3077 return 0; 3155 return 0;
3078 3156
3079fail: 3157fail:
3080 atomic_inc(&data->lost); 3158 local_inc(&data->lost);
3081 perf_output_unlock(handle); 3159 perf_output_put_handle(handle);
3082out: 3160out:
3083 rcu_read_unlock(); 3161 rcu_read_unlock();
3084 3162
@@ -3093,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle)
3093 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
3094 3172
3095 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
3096 int events = atomic_inc_return(&data->events); 3174 int events = local_inc_return(&data->events);
3097 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
3098 atomic_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &data->events);
3099 atomic_set(&data->wakeup, 1); 3177 local_inc(&data->wakeup);
3100 } 3178 }
3101 } 3179 }
3102 3180
3103 perf_output_unlock(handle); 3181 perf_output_put_handle(handle);
3104 rcu_read_unlock(); 3182 rcu_read_unlock();
3105} 3183}
3106 3184
@@ -3436,22 +3514,13 @@ static void perf_event_task_output(struct perf_event *event,
3436{ 3514{
3437 struct perf_output_handle handle; 3515 struct perf_output_handle handle;
3438 struct task_struct *task = task_event->task; 3516 struct task_struct *task = task_event->task;
3439 unsigned long flags;
3440 int size, ret; 3517 int size, ret;
3441 3518
3442 /*
3443 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3444 * in perf_output_lock() from interrupt context, it's game over.
3445 */
3446 local_irq_save(flags);
3447
3448 size = task_event->event_id.header.size; 3519 size = task_event->event_id.header.size;
3449 ret = perf_output_begin(&handle, event, size, 0, 0); 3520 ret = perf_output_begin(&handle, event, size, 0, 0);
3450 3521
3451 if (ret) { 3522 if (ret)
3452 local_irq_restore(flags);
3453 return; 3523 return;
3454 }
3455 3524
3456 task_event->event_id.pid = perf_event_pid(event, task); 3525 task_event->event_id.pid = perf_event_pid(event, task);
3457 task_event->event_id.ppid = perf_event_pid(event, current); 3526 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3531,6 @@ static void perf_event_task_output(struct perf_event *event,
3462 perf_output_put(&handle, task_event->event_id); 3531 perf_output_put(&handle, task_event->event_id);
3463 3532
3464 perf_output_end(&handle); 3533 perf_output_end(&handle);
3465 local_irq_restore(flags);
3466} 3534}
3467 3535
3468static int perf_event_task_match(struct perf_event *event) 3536static int perf_event_task_match(struct perf_event *event)
@@ -3990,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3990 } 4058 }
3991} 4059}
3992 4060
3993static void perf_swevent_unthrottle(struct perf_event *event)
3994{
3995 /*
3996 * Nothing to do, we already reset hwc->interrupts.
3997 */
3998}
3999
4000static void perf_swevent_add(struct perf_event *event, u64 nr, 4061static void perf_swevent_add(struct perf_event *event, u64 nr,
4001 int nmi, struct perf_sample_data *data, 4062 int nmi, struct perf_sample_data *data,
4002 struct pt_regs *regs) 4063 struct pt_regs *regs)
@@ -4020,9 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4020 perf_swevent_overflow(event, 0, nmi, data, regs); 4081 perf_swevent_overflow(event, 0, nmi, data, regs);
4021} 4082}
4022 4083
4023static int perf_tp_event_match(struct perf_event *event,
4024 struct perf_sample_data *data);
4025
4026static int perf_exclude_event(struct perf_event *event, 4084static int perf_exclude_event(struct perf_event *event,
4027 struct pt_regs *regs) 4085 struct pt_regs *regs)
4028{ 4086{
@@ -4052,10 +4110,6 @@ static int perf_swevent_match(struct perf_event *event,
4052 if (perf_exclude_event(event, regs)) 4110 if (perf_exclude_event(event, regs))
4053 return 0; 4111 return 0;
4054 4112
4055 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4056 !perf_tp_event_match(event, data))
4057 return 0;
4058
4059 return 1; 4113 return 1;
4060} 4114}
4061 4115
@@ -4066,19 +4120,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
4066 return hash_64(val, SWEVENT_HLIST_BITS); 4120 return hash_64(val, SWEVENT_HLIST_BITS);
4067} 4121}
4068 4122
4069static struct hlist_head * 4123static inline struct hlist_head *
4070find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4124__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4071{ 4125{
4072 u64 hash; 4126 u64 hash = swevent_hash(type, event_id);
4073 struct swevent_hlist *hlist;
4074 4127
4075 hash = swevent_hash(type, event_id); 4128 return &hlist->heads[hash];
4129}
4130
4131/* For the read side: events when they trigger */
4132static inline struct hlist_head *
4133find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4134{
4135 struct swevent_hlist *hlist;
4076 4136
4077 hlist = rcu_dereference(ctx->swevent_hlist); 4137 hlist = rcu_dereference(ctx->swevent_hlist);
4078 if (!hlist) 4138 if (!hlist)
4079 return NULL; 4139 return NULL;
4080 4140
4081 return &hlist->heads[hash]; 4141 return __find_swevent_head(hlist, type, event_id);
4142}
4143
4144/* For the event head insertion and removal in the hlist */
4145static inline struct hlist_head *
4146find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4147{
4148 struct swevent_hlist *hlist;
4149 u32 event_id = event->attr.config;
4150 u64 type = event->attr.type;
4151
4152 /*
4153 * Event scheduling is always serialized against hlist allocation
4154 * and release. Which makes the protected version suitable here.
4155 * The context lock guarantees that.
4156 */
4157 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4158 lockdep_is_held(&event->ctx->lock));
4159 if (!hlist)
4160 return NULL;
4161
4162 return __find_swevent_head(hlist, type, event_id);
4082} 4163}
4083 4164
4084static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4165static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
@@ -4095,7 +4176,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4095 4176
4096 rcu_read_lock(); 4177 rcu_read_lock();
4097 4178
4098 head = find_swevent_head(cpuctx, type, event_id); 4179 head = find_swevent_head_rcu(cpuctx, type, event_id);
4099 4180
4100 if (!head) 4181 if (!head)
4101 goto end; 4182 goto end;
@@ -4110,7 +4191,7 @@ end:
4110 4191
4111int perf_swevent_get_recursion_context(void) 4192int perf_swevent_get_recursion_context(void)
4112{ 4193{
4113 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4114 int rctx; 4195 int rctx;
4115 4196
4116 if (in_nmi()) 4197 if (in_nmi())
@@ -4122,10 +4203,8 @@ int perf_swevent_get_recursion_context(void)
4122 else 4203 else
4123 rctx = 0; 4204 rctx = 0;
4124 4205
4125 if (cpuctx->recursion[rctx]) { 4206 if (cpuctx->recursion[rctx])
4126 put_cpu_var(perf_cpu_context);
4127 return -1; 4207 return -1;
4128 }
4129 4208
4130 cpuctx->recursion[rctx]++; 4209 cpuctx->recursion[rctx]++;
4131 barrier(); 4210 barrier();
@@ -4139,7 +4218,6 @@ void perf_swevent_put_recursion_context(int rctx)
4139 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4140 barrier(); 4219 barrier();
4141 cpuctx->recursion[rctx]--; 4220 cpuctx->recursion[rctx]--;
4142 put_cpu_var(perf_cpu_context);
4143} 4221}
4144EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4145 4223
@@ -4150,6 +4228,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4150 struct perf_sample_data data; 4228 struct perf_sample_data data;
4151 int rctx; 4229 int rctx;
4152 4230
4231 preempt_disable_notrace();
4153 rctx = perf_swevent_get_recursion_context(); 4232 rctx = perf_swevent_get_recursion_context();
4154 if (rctx < 0) 4233 if (rctx < 0)
4155 return; 4234 return;
@@ -4159,6 +4238,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4159 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4238 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4160 4239
4161 perf_swevent_put_recursion_context(rctx); 4240 perf_swevent_put_recursion_context(rctx);
4241 preempt_enable_notrace();
4162} 4242}
4163 4243
4164static void perf_swevent_read(struct perf_event *event) 4244static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4258,7 @@ static int perf_swevent_enable(struct perf_event *event)
4178 perf_swevent_set_period(event); 4258 perf_swevent_set_period(event);
4179 } 4259 }
4180 4260
4181 head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); 4261 head = find_swevent_head(cpuctx, event);
4182 if (WARN_ON_ONCE(!head)) 4262 if (WARN_ON_ONCE(!head))
4183 return -EINVAL; 4263 return -EINVAL;
4184 4264
@@ -4192,11 +4272,22 @@ static void perf_swevent_disable(struct perf_event *event)
4192 hlist_del_rcu(&event->hlist_entry); 4272 hlist_del_rcu(&event->hlist_entry);
4193} 4273}
4194 4274
4275static void perf_swevent_void(struct perf_event *event)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4282}
4283
4195static const struct pmu perf_ops_generic = { 4284static const struct pmu perf_ops_generic = {
4196 .enable = perf_swevent_enable, 4285 .enable = perf_swevent_enable,
4197 .disable = perf_swevent_disable, 4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4198 .read = perf_swevent_read, 4289 .read = perf_swevent_read,
4199 .unthrottle = perf_swevent_unthrottle, 4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4200}; 4291};
4201 4292
4202/* 4293/*
@@ -4366,6 +4457,14 @@ static const struct pmu perf_ops_task_clock = {
4366 .read = task_clock_perf_event_read, 4457 .read = task_clock_perf_event_read,
4367}; 4458};
4368 4459
4460/* Deref the hlist from the update side */
4461static inline struct swevent_hlist *
4462swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4463{
4464 return rcu_dereference_protected(cpuctx->swevent_hlist,
4465 lockdep_is_held(&cpuctx->hlist_mutex));
4466}
4467
4369static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4468static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4370{ 4469{
4371 struct swevent_hlist *hlist; 4470 struct swevent_hlist *hlist;
@@ -4376,12 +4475,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4376 4475
4377static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4476static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4378{ 4477{
4379 struct swevent_hlist *hlist; 4478 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4380 4479
4381 if (!cpuctx->swevent_hlist) 4480 if (!hlist)
4382 return; 4481 return;
4383 4482
4384 hlist = cpuctx->swevent_hlist;
4385 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4483 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4386 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4484 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4387} 4485}
@@ -4418,7 +4516,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4418 4516
4419 mutex_lock(&cpuctx->hlist_mutex); 4517 mutex_lock(&cpuctx->hlist_mutex);
4420 4518
4421 if (!cpuctx->swevent_hlist && cpu_online(cpu)) { 4519 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4422 struct swevent_hlist *hlist; 4520 struct swevent_hlist *hlist;
4423 4521
4424 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4522 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4565,48 @@ static int swevent_hlist_get(struct perf_event *event)
4467 4565
4468#ifdef CONFIG_EVENT_TRACING 4566#ifdef CONFIG_EVENT_TRACING
4469 4567
4470void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4568static const struct pmu perf_ops_tracepoint = {
4471 int entry_size, struct pt_regs *regs) 4569 .enable = perf_trace_enable,
4570 .disable = perf_trace_disable,
4571 .start = perf_swevent_int,
4572 .stop = perf_swevent_void,
4573 .read = perf_swevent_read,
4574 .unthrottle = perf_swevent_void,
4575};
4576
4577static int perf_tp_filter_match(struct perf_event *event,
4578 struct perf_sample_data *data)
4579{
4580 void *record = data->raw->data;
4581
4582 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4583 return 1;
4584 return 0;
4585}
4586
4587static int perf_tp_event_match(struct perf_event *event,
4588 struct perf_sample_data *data,
4589 struct pt_regs *regs)
4590{
4591 /*
4592 * All tracepoints are from kernel-space.
4593 */
4594 if (event->attr.exclude_kernel)
4595 return 0;
4596
4597 if (!perf_tp_filter_match(event, data))
4598 return 0;
4599
4600 return 1;
4601}
4602
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head)
4472{ 4605{
4473 struct perf_sample_data data; 4606 struct perf_sample_data data;
4607 struct perf_event *event;
4608 struct hlist_node *node;
4609
4474 struct perf_raw_record raw = { 4610 struct perf_raw_record raw = {
4475 .size = entry_size, 4611 .size = entry_size,
4476 .data = record, 4612 .data = record,
@@ -4479,26 +4615,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4479 perf_sample_data_init(&data, addr); 4615 perf_sample_data_init(&data, addr);
4480 data.raw = &raw; 4616 data.raw = &raw;
4481 4617
4482 /* Trace events already protected against recursion */ 4618 rcu_read_lock();
4483 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4484 &data, regs); 4620 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs);
4622 }
4623 rcu_read_unlock();
4485} 4624}
4486EXPORT_SYMBOL_GPL(perf_tp_event); 4625EXPORT_SYMBOL_GPL(perf_tp_event);
4487 4626
4488static int perf_tp_event_match(struct perf_event *event,
4489 struct perf_sample_data *data)
4490{
4491 void *record = data->raw->data;
4492
4493 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4494 return 1;
4495 return 0;
4496}
4497
4498static void tp_perf_event_destroy(struct perf_event *event) 4627static void tp_perf_event_destroy(struct perf_event *event)
4499{ 4628{
4500 perf_trace_disable(event->attr.config); 4629 perf_trace_destroy(event);
4501 swevent_hlist_put(event);
4502} 4630}
4503 4631
4504static const struct pmu *tp_perf_event_init(struct perf_event *event) 4632static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4514,17 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4514 !capable(CAP_SYS_ADMIN)) 4642 !capable(CAP_SYS_ADMIN))
4515 return ERR_PTR(-EPERM); 4643 return ERR_PTR(-EPERM);
4516 4644
4517 if (perf_trace_enable(event->attr.config)) 4645 err = perf_trace_init(event);
4646 if (err)
4518 return NULL; 4647 return NULL;
4519 4648
4520 event->destroy = tp_perf_event_destroy; 4649 event->destroy = tp_perf_event_destroy;
4521 err = swevent_hlist_get(event);
4522 if (err) {
4523 perf_trace_disable(event->attr.config);
4524 return ERR_PTR(err);
4525 }
4526 4650
4527 return &perf_ops_generic; 4651 return &perf_ops_tracepoint;
4528} 4652}
4529 4653
4530static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4654static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event)
4552 4676
4553#else 4677#else
4554 4678
4555static int perf_tp_event_match(struct perf_event *event,
4556 struct perf_sample_data *data)
4557{
4558 return 1;
4559}
4560
4561static const struct pmu *tp_perf_event_init(struct perf_event *event) 4679static const struct pmu *tp_perf_event_init(struct perf_event *event)
4562{ 4680{
4563 return NULL; 4681 return NULL;
@@ -4886,54 +5004,53 @@ err_size:
4886 goto out; 5004 goto out;
4887} 5005}
4888 5006
4889static int perf_event_set_output(struct perf_event *event, int output_fd) 5007static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
4890{ 5009{
4891 struct perf_event *output_event = NULL; 5010 struct perf_mmap_data *data = NULL, *old_data = NULL;
4892 struct file *output_file = NULL;
4893 struct perf_event *old_output;
4894 int fput_needed = 0;
4895 int ret = -EINVAL; 5011 int ret = -EINVAL;
4896 5012
4897 if (!output_fd) 5013 if (!output_event)
4898 goto set; 5014 goto set;
4899 5015
4900 output_file = fget_light(output_fd, &fput_needed); 5016 /* don't allow circular references */
4901 if (!output_file) 5017 if (event == output_event)
4902 return -EBADF;
4903
4904 if (output_file->f_op != &perf_fops)
4905 goto out; 5018 goto out;
4906 5019
4907 output_event = output_file->private_data; 5020 /*
4908 5021 * Don't allow cross-cpu buffers
4909 /* Don't chain output fds */ 5022 */
4910 if (output_event->output) 5023 if (output_event->cpu != event->cpu)
4911 goto out; 5024 goto out;
4912 5025
4913 /* Don't set an output fd when we already have an output channel */ 5026 /*
 4914 if (event->data) 5027 * If it's not a per-cpu buffer, it must be the same task.
5028 */
5029 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4915 goto out; 5030 goto out;
4916 5031
4917 atomic_long_inc(&output_file->f_count);
4918
4919set: 5032set:
4920 mutex_lock(&event->mmap_mutex); 5033 mutex_lock(&event->mmap_mutex);
4921 old_output = event->output; 5034 /* Can't redirect output if we've got an active mmap() */
4922 rcu_assign_pointer(event->output, output_event); 5035 if (atomic_read(&event->mmap_count))
4923 mutex_unlock(&event->mmap_mutex); 5036 goto unlock;
4924 5037
4925 if (old_output) { 5038 if (output_event) {
4926 /* 5039 /* get the buffer we want to redirect to */
4927 * we need to make sure no existing perf_output_*() 5040 data = perf_mmap_data_get(output_event);
4928 * is still referencing this event. 5041 if (!data)
4929 */ 5042 goto unlock;
4930 synchronize_rcu();
4931 fput(old_output->filp);
4932 } 5043 }
4933 5044
5045 old_data = event->data;
5046 rcu_assign_pointer(event->data, data);
4934 ret = 0; 5047 ret = 0;
5048unlock:
5049 mutex_unlock(&event->mmap_mutex);
5050
5051 if (old_data)
5052 perf_mmap_data_put(old_data);
4935out: 5053out:
4936 fput_light(output_file, fput_needed);
4937 return ret; 5054 return ret;
4938} 5055}
4939 5056
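
With set_output now operating on a struct perf_event and taking a reference on the target's buffer directly, the userspace side of redirection is unchanged: open two events, mmap one, and point the other at it. A hedged fragment, where perf_event_open() again stands for the usual syscall wrapper and both events are per-task in the same context, as the new checks require:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

static int redirect_into_one_buffer(void)
{
        struct perf_event_attr attr = {
                .type          = PERF_TYPE_SOFTWARE,
                .size          = sizeof(attr),
                .config        = PERF_COUNT_SW_CPU_CLOCK,
                .sample_period = 100000,
                .sample_type   = PERF_SAMPLE_IP,
        };
        int fd1 = perf_event_open(&attr, 0, -1, -1, 0); /* pid 0: current task */

        attr.config = PERF_COUNT_SW_PAGE_FAULTS;
        int fd2 = perf_event_open(&attr, 0, -1, -1, 0);

        /* 1 control page + 8 data pages on fd1; fd2 is never mmap()ed itself. */
        void *buf = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd1, 0);

        /* Samples from fd2 now land in fd1's buffer. */
        ioctl(fd2, PERF_EVENT_IOC_SET_OUTPUT, fd1);

        (void)buf;
        return 0;
}
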
@@ -4949,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open,
4949 struct perf_event_attr __user *, attr_uptr, 5066 struct perf_event_attr __user *, attr_uptr,
4950 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4951{ 5068{
4952 struct perf_event *event, *group_leader; 5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL;
4953 struct perf_event_attr attr; 5070 struct perf_event_attr attr;
4954 struct perf_event_context *ctx; 5071 struct perf_event_context *ctx;
4955 struct file *event_file = NULL; 5072 struct file *event_file = NULL;
4956 struct file *group_file = NULL; 5073 struct file *group_file = NULL;
5074 int event_fd;
4957 int fput_needed = 0; 5075 int fput_needed = 0;
4958 int fput_needed2 = 0;
4959 int err; 5076 int err;
4960 5077
4961 /* for future expandability... */ 5078 /* for future expandability... */
@@ -4976,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open,
4976 return -EINVAL; 5093 return -EINVAL;
4977 } 5094 }
4978 5095
5096 event_fd = get_unused_fd_flags(O_RDWR);
5097 if (event_fd < 0)
5098 return event_fd;
5099
4979 /* 5100 /*
4980 * Get the target context (task or percpu): 5101 * Get the target context (task or percpu):
4981 */ 5102 */
4982 ctx = find_get_context(pid, cpu); 5103 ctx = find_get_context(pid, cpu);
4983 if (IS_ERR(ctx)) 5104 if (IS_ERR(ctx)) {
4984 return PTR_ERR(ctx); 5105 err = PTR_ERR(ctx);
5106 goto err_fd;
5107 }
5108
5109 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader);
5113 goto err_put_context;
5114 }
5115 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT)
5117 output_event = group_leader;
5118 if (flags & PERF_FLAG_FD_NO_GROUP)
5119 group_leader = NULL;
5120 }
4985 5121
4986 /* 5122 /*
4987 * Look up the group leader (we will attach this event to it): 5123 * Look up the group leader (we will attach this event to it):
4988 */ 5124 */
4989 group_leader = NULL; 5125 if (group_leader) {
4990 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4991 err = -EINVAL; 5126 err = -EINVAL;
4992 group_file = fget_light(group_fd, &fput_needed);
4993 if (!group_file)
4994 goto err_put_context;
4995 if (group_file->f_op != &perf_fops)
4996 goto err_put_context;
4997 5127
4998 group_leader = group_file->private_data;
4999 /* 5128 /*
5000 * Do not allow a recursive hierarchy (this new sibling 5129 * Do not allow a recursive hierarchy (this new sibling
5001 * becoming part of another group-sibling): 5130 * becoming part of another group-sibling):
@@ -5017,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open,
5017 5146
5018 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5019 NULL, NULL, GFP_KERNEL); 5148 NULL, NULL, GFP_KERNEL);
5020 err = PTR_ERR(event); 5149 if (IS_ERR(event)) {
5021 if (IS_ERR(event)) 5150 err = PTR_ERR(event);
5022 goto err_put_context; 5151 goto err_put_context;
5152 }
5023 5153
5024 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); 5154 if (output_event) {
5025 if (err < 0) 5155 err = perf_event_set_output(event, output_event);
5026 goto err_free_put_context; 5156 if (err)
5157 goto err_free_put_context;
5158 }
5027 5159
5028 event_file = fget_light(err, &fput_needed2); 5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5029 if (!event_file) 5161 if (IS_ERR(event_file)) {
5162 err = PTR_ERR(event_file);
5030 goto err_free_put_context; 5163 goto err_free_put_context;
5031
5032 if (flags & PERF_FLAG_FD_OUTPUT) {
5033 err = perf_event_set_output(event, group_fd);
5034 if (err)
5035 goto err_fput_free_put_context;
5036 } 5164 }
5037 5165
5038 event->filp = event_file; 5166 event->filp = event_file;
@@ -5048,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open,
5048 list_add_tail(&event->owner_entry, &current->perf_event_list); 5176 list_add_tail(&event->owner_entry, &current->perf_event_list);
5049 mutex_unlock(&current->perf_event_mutex); 5177 mutex_unlock(&current->perf_event_mutex);
5050 5178
5051err_fput_free_put_context: 5179 /*
5052 fput_light(event_file, fput_needed2); 5180 * Drop the reference on the group_event after placing the
5181 * new event on the sibling_list. This ensures destruction
5182 * of the group leader will find the pointer to itself in
5183 * perf_group_detach().
5184 */
5185 fput_light(group_file, fput_needed);
5186 fd_install(event_fd, event_file);
5187 return event_fd;
5053 5188
5054err_free_put_context: 5189err_free_put_context:
5055 if (err < 0) 5190 free_event(event);
5056 free_event(event);
5057
5058err_put_context: 5191err_put_context:
5059 if (err < 0)
5060 put_ctx(ctx);
5061
5062 fput_light(group_file, fput_needed); 5192 fput_light(group_file, fput_needed);
5063 5193 put_ctx(ctx);
5194err_fd:
5195 put_unused_fd(event_fd);
5064 return err; 5196 return err;
5065} 5197}
5066 5198
@@ -5371,6 +5503,7 @@ static void perf_free_event(struct perf_event *event,
5371 5503
5372 fput(parent->filp); 5504 fput(parent->filp);
5373 5505
5506 perf_group_detach(event);
5374 list_del_event(event, ctx); 5507 list_del_event(event, ctx);
5375 free_event(event); 5508 free_event(event);
5376} 5509}
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d9c233..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -513,6 +513,13 @@ void __init pidhash_init(void)
513 513
514void __init pidmap_init(void) 514void __init pidmap_init(void)
515{ 515{
516 /* bump default and minimum pid_max based on number of cpus */
517 pid_max = min(pid_max_max, max_t(int, pid_max,
518 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
519 pid_max_min = max_t(int, pid_max_min,
520 PIDS_PER_CPU_MIN * num_possible_cpus());
521 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
522
516 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 523 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
517 /* Reserve PID 0. We never call free_pidmap(0) */ 524 /* Reserve PID 0. We never call free_pidmap(0) */
518 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
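
As a worked example of the new scaling, assuming the usual constants PIDS_PER_CPU_DEFAULT = 1024, PIDS_PER_CPU_MIN = 8, PID_MAX_DEFAULT = 32768 and a default pid_max_min of 301: on a 64-CPU machine pid_max becomes min(pid_max_max, max(32768, 1024 * 64)) = 65536 and pid_max_min becomes max(301, 8 * 64) = 512, while a 16-CPU machine keeps the old defaults since 1024 * 16 = 16384 is below 32768.
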
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 00bb252f29a2..9829646d399c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -363,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
363 } 363 }
364 } else { 364 } else {
365 read_lock(&tasklist_lock); 365 read_lock(&tasklist_lock);
366 if (thread_group_leader(p) && p->signal) { 366 if (thread_group_leader(p) && p->sighand) {
367 error = 367 error =
368 cpu_clock_sample_group(which_clock, 368 cpu_clock_sample_group(which_clock,
369 p, &rtn); 369 p, &rtn);
@@ -439,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
439 439
440 if (likely(p != NULL)) { 440 if (likely(p != NULL)) {
441 read_lock(&tasklist_lock); 441 read_lock(&tasklist_lock);
442 if (unlikely(p->signal == NULL)) { 442 if (unlikely(p->sighand == NULL)) {
443 /* 443 /*
444 * We raced with the reaping of the task. 444 * We raced with the reaping of the task.
445 * The deletion should have cleared us off the list. 445 * The deletion should have cleared us off the list.
@@ -691,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
691 read_lock(&tasklist_lock); 691 read_lock(&tasklist_lock);
692 /* 692 /*
693 * We need the tasklist_lock to protect against reaping that 693 * We need the tasklist_lock to protect against reaping that
694 * clears p->signal. If p has just been reaped, we can no 694 * clears p->sighand. If p has just been reaped, we can no
695 * longer get any information about it at all. 695 * longer get any information about it at all.
696 */ 696 */
697 if (unlikely(p->signal == NULL)) { 697 if (unlikely(p->sighand == NULL)) {
698 read_unlock(&tasklist_lock); 698 read_unlock(&tasklist_lock);
699 put_task_struct(p); 699 put_task_struct(p);
700 timer->it.cpu.task = NULL; 700 timer->it.cpu.task = NULL;
@@ -863,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
863 clear_dead = p->exit_state; 863 clear_dead = p->exit_state;
864 } else { 864 } else {
865 read_lock(&tasklist_lock); 865 read_lock(&tasklist_lock);
866 if (unlikely(p->signal == NULL)) { 866 if (unlikely(p->sighand == NULL)) {
867 /* 867 /*
868 * The process has been reaped. 868 * The process has been reaped.
869 * We can't even collect a sample any more. 869 * We can't even collect a sample any more.
@@ -1199,7 +1199,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1199 spin_lock(&p->sighand->siglock); 1199 spin_lock(&p->sighand->siglock);
1200 } else { 1200 } else {
1201 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1202 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->sighand == NULL)) {
1203 /* 1203 /*
1204 * The process has been reaped. 1204 * The process has been reaped.
1205 * We can't even collect a sample any more. 1205 * We can't even collect a sample any more.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 00d1fda58ab6..ad723420acc3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
559 new_timer->it_id = (timer_t) new_timer_id; 559 new_timer->it_id = (timer_t) new_timer_id;
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
563 if (error)
564 goto out;
565 562
566 /*
567 * return the timer_id now. The next step is hard to
568 * back out if there is an error.
569 */
570 if (copy_to_user(created_timer_id, 563 if (copy_to_user(created_timer_id,
571 &new_timer_id, sizeof (new_timer_id))) { 564 &new_timer_id, sizeof (new_timer_id))) {
572 error = -EFAULT; 565 error = -EFAULT;
@@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 new_timer->sigq->info.si_tid = new_timer->it_id; 590 new_timer->sigq->info.si_tid = new_timer->it_id;
598 new_timer->sigq->info.si_code = SI_TIMER; 591 new_timer->sigq->info.si_code = SI_TIMER;
599 592
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error)
595 goto out;
596
600 spin_lock_irq(&current->sighand->siglock); 597 spin_lock_irq(&current->sighand->siglock);
601 new_timer->it_signal = current->signal; 598 new_timer->it_signal = current->signal;
602 list_add(&new_timer->list, &current->signal->posix_timers); 599 list_add(&new_timer->list, &current->signal->posix_timers);
diff --git a/kernel/profile.c b/kernel/profile.c
index dfadc5b729f1..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
365 switch (action) { 365 switch (action) {
366 case CPU_UP_PREPARE: 366 case CPU_UP_PREPARE:
367 case CPU_UP_PREPARE_FROZEN: 367 case CPU_UP_PREPARE_FROZEN:
368 node = cpu_to_node(cpu); 368 node = cpu_to_mem(cpu);
369 per_cpu(cpu_profile_flip, cpu) = 0; 369 per_cpu(cpu_profile_flip, cpu) = 0;
370 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 370 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
371 page = alloc_pages_exact_node(node, 371 page = alloc_pages_exact_node(node,
372 GFP_KERNEL | __GFP_ZERO, 372 GFP_KERNEL | __GFP_ZERO,
373 0); 373 0);
374 if (!page) 374 if (!page)
375 return NOTIFY_BAD; 375 return notifier_from_errno(-ENOMEM);
376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
377 } 377 }
378 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 378 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -388,7 +388,7 @@ out_free:
388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
389 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 389 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
390 __free_page(page); 390 __free_page(page);
391 return NOTIFY_BAD; 391 return notifier_from_errno(-ENOMEM);
392 case CPU_ONLINE: 392 case CPU_ONLINE:
393 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
394 if (prof_cpu_mask != NULL) 394 if (prof_cpu_mask != NULL)
@@ -567,7 +567,7 @@ static int create_hash_tables(void)
567 int cpu; 567 int cpu;
568 568
569 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
570 int node = cpu_to_node(cpu); 570 int node = cpu_to_mem(cpu);
571 struct page *page; 571 struct page *page;
572 572
573 page = alloc_pages_exact_node(node, 573 page = alloc_pages_exact_node(node,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6af9cdd558b7..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -594,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
594 ret = ptrace_detach(child, data); 594 ret = ptrace_detach(child, data);
595 break; 595 break;
596 596
597#ifdef CONFIG_BINFMT_ELF_FDPIC
598 case PTRACE_GETFDPIC: {
599 struct mm_struct *mm = get_task_mm(child);
600 unsigned long tmp = 0;
601
602 ret = -ESRCH;
603 if (!mm)
604 break;
605
606 switch (addr) {
607 case PTRACE_GETFDPIC_EXEC:
608 tmp = mm->context.exec_fdpic_loadmap;
609 break;
610 case PTRACE_GETFDPIC_INTERP:
611 tmp = mm->context.interp_fdpic_loadmap;
612 break;
613 default:
614 break;
615 }
616 mmput(mm);
617
618 ret = put_user(tmp, (unsigned long __user *) data);
619 break;
620 }
621#endif
622
597#ifdef PTRACE_SINGLESTEP 623#ifdef PTRACE_SINGLESTEP
598 case PTRACE_SINGLESTEP: 624 case PTRACE_SINGLESTEP:
599#endif 625#endif
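
On FDPIC targets (FRV, Blackfin, no-MMU ARM), a debugger uses this request to fetch the tracee's loadmap addresses; the addr argument selects which map and the value is written back through the data pointer. A hedged userspace sketch; the PTRACE_GETFDPIC constants normally come from the target's ptrace headers, and the fallback values below are assumptions to be checked against <asm/ptrace.h>:

#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GETFDPIC
#define PTRACE_GETFDPIC         31      /* assumed values, verify per arch */
#define PTRACE_GETFDPIC_EXEC    0
#define PTRACE_GETFDPIC_INTERP  1
#endif

static unsigned long fdpic_exec_loadmap(pid_t pid)
{
        unsigned long addr = 0;

        /* 'addr' selects which map; the result is stored through 'data'. */
        ptrace(PTRACE_GETFDPIC, pid, (void *)PTRACE_GETFDPIC_EXEC, &addr);
        return addr;
}
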
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
539 "relay_hotcpu_callback: cpu %d buffer " 539 "relay_hotcpu_callback: cpu %d buffer "
540 "creation failed\n", hotcpu); 540 "creation failed\n", hotcpu);
541 mutex_unlock(&relay_channels_mutex); 541 mutex_unlock(&relay_channels_mutex);
542 return NOTIFY_BAD; 542 return notifier_from_errno(-ENOMEM);
543 } 543 }
544 } 544 }
545 mutex_unlock(&relay_channels_mutex); 545 mutex_unlock(&relay_channels_mutex);
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1245 1245
1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1247 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1248 1250
1249 /* 1251 /*
1250 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1255 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1256 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1257 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1258 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1259 1261
1260 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1261 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
1289 } 1291 }
1290 } 1292 }
1291 1293
1294 ret = 0;
1292 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1293 return 0; 1296 goto out;
1294 1297
1295 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1296 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1297 return ret; 1300 goto out;
1298 1301
1299 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1300 ret += padding; 1303 ret += padding;
1301 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1302 return ret; 1307 return ret;
1303} 1308}
1304 1309
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c358e263534..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/sched.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <linux/device.h> 20#include <linux/device.h>
20#include <linux/pfn.h> 21#include <linux/pfn.h>
@@ -681,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
681 * release_region releases a matching busy region. 682 * release_region releases a matching busy region.
682 */ 683 */
683 684
685static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
686
684/** 687/**
685 * __request_region - create a new busy resource region 688 * __request_region - create a new busy resource region
686 * @parent: parent resource descriptor 689 * @parent: parent resource descriptor
@@ -693,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
693 resource_size_t start, resource_size_t n, 696 resource_size_t start, resource_size_t n,
694 const char *name, int flags) 697 const char *name, int flags)
695{ 698{
699 DECLARE_WAITQUEUE(wait, current);
696 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 700 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
697 701
698 if (!res) 702 if (!res)
@@ -717,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
717 if (!(conflict->flags & IORESOURCE_BUSY)) 721 if (!(conflict->flags & IORESOURCE_BUSY))
718 continue; 722 continue;
719 } 723 }
720 724 if (conflict->flags & flags & IORESOURCE_MUXED) {
725 add_wait_queue(&muxed_resource_wait, &wait);
726 write_unlock(&resource_lock);
727 set_current_state(TASK_UNINTERRUPTIBLE);
728 schedule();
729 remove_wait_queue(&muxed_resource_wait, &wait);
730 write_lock(&resource_lock);
731 continue;
732 }
721 /* Uhhuh, that didn't work out.. */ 733 /* Uhhuh, that didn't work out.. */
722 kfree(res); 734 kfree(res);
723 res = NULL; 735 res = NULL;
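
The intended consumer is drivers that share a multiplexed region, for example Super-I/O index/data port pairs: instead of failing when another muxed holder owns the ports, the request now sleeps until __release_region() wakes the queue. A hedged driver-side sketch, assuming the request_muxed_region() helper added alongside IORESOURCE_MUXED in the same series:

#include <linux/ioport.h>
#include <linux/io.h>

#define SIO_INDEX 0x2e          /* illustrative Super-I/O index/data pair */
#define SIO_DATA  0x2f

static int sio_read_reg(u8 reg)
{
        u8 val;

        /* Sleeps (uninterruptibly) while another muxed owner holds the ports. */
        if (!request_muxed_region(SIO_INDEX, 2, "sio-demo"))
                return -EBUSY;

        outb(reg, SIO_INDEX);
        val = inb(SIO_DATA);

        release_region(SIO_INDEX, 2);   /* wakes any waiter on the muxed queue */
        return val;
}
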
@@ -791,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
791 break; 803 break;
792 *p = res->sibling; 804 *p = res->sibling;
793 write_unlock(&resource_lock); 805 write_unlock(&resource_lock);
806 if (res->flags & IORESOURCE_MUXED)
807 wake_up(&muxed_resource_wait);
794 kfree(res); 808 kfree(res);
795 return; 809 return;
796 } 810 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 054a6012de99..f8b8996228dd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -544,6 +544,8 @@ struct rq {
544 struct root_domain *rd; 544 struct root_domain *rd;
545 struct sched_domain *sd; 545 struct sched_domain *sd;
546 546
547 unsigned long cpu_power;
548
547 unsigned char idle_at_tick; 549 unsigned char idle_at_tick;
548 /* For active balancing */ 550 /* For active balancing */
549 int post_schedule; 551 int post_schedule;
@@ -969,14 +971,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 } 971 }
970} 972}
971 973
972void task_rq_unlock_wait(struct task_struct *p)
973{
974 struct rq *rq = task_rq(p);
975
976 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
977 raw_spin_unlock_wait(&rq->lock);
978}
979
980static void __task_rq_unlock(struct rq *rq) 974static void __task_rq_unlock(struct rq *rq)
981 __releases(rq->lock) 975 __releases(rq->lock)
982{ 976{
@@ -1507,24 +1501,9 @@ static unsigned long target_load(int cpu, int type)
1507 return max(rq->cpu_load[type-1], total); 1501 return max(rq->cpu_load[type-1], total);
1508} 1502}
1509 1503
1510static struct sched_group *group_of(int cpu)
1511{
1512 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1513
1514 if (!sd)
1515 return NULL;
1516
1517 return sd->groups;
1518}
1519
1520static unsigned long power_of(int cpu) 1504static unsigned long power_of(int cpu)
1521{ 1505{
1522 struct sched_group *group = group_of(cpu); 1506 return cpu_rq(cpu)->cpu_power;
1523
1524 if (!group)
1525 return SCHED_LOAD_SCALE;
1526
1527 return group->cpu_power;
1528} 1507}
1529 1508
1530static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1509static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1862,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
1862static void set_load_weight(struct task_struct *p) 1841static void set_load_weight(struct task_struct *p)
1863{ 1842{
1864 if (task_has_rt_policy(p)) { 1843 if (task_has_rt_policy(p)) {
1865 p->se.load.weight = prio_to_weight[0] * 2; 1844 p->se.load.weight = 0;
1866 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1845 p->se.load.inv_weight = WMULT_CONST;
1867 return; 1846 return;
1868 } 1847 }
1869 1848
@@ -4062,6 +4041,23 @@ int __sched wait_for_completion_killable(struct completion *x)
4062EXPORT_SYMBOL(wait_for_completion_killable); 4041EXPORT_SYMBOL(wait_for_completion_killable);
4063 4042
4064/** 4043/**
4044 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4045 * @x: holds the state of this particular completion
4046 * @timeout: timeout value in jiffies
4047 *
4048 * This waits for either a completion of a specific task to be
4049 * signaled or for a specified timeout to expire. It can be
4050 * interrupted by a kill signal. The timeout is in jiffies.
4051 */
4052unsigned long __sched
4053wait_for_completion_killable_timeout(struct completion *x,
4054 unsigned long timeout)
4055{
4056 return wait_for_common(x, timeout, TASK_KILLABLE);
4057}
4058EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4059
4060/**
4065 * try_wait_for_completion - try to decrement a completion without blocking 4061 * try_wait_for_completion - try to decrement a completion without blocking
4066 * @x: completion structure 4062 * @x: completion structure
4067 * 4063 *
@@ -7596,6 +7592,7 @@ void __init sched_init(void)
7596#ifdef CONFIG_SMP 7592#ifdef CONFIG_SMP
7597 rq->sd = NULL; 7593 rq->sd = NULL;
7598 rq->rd = NULL; 7594 rq->rd = NULL;
7595 rq->cpu_power = SCHED_LOAD_SCALE;
7599 rq->post_schedule = 0; 7596 rq->post_schedule = 0;
7600 rq->active_balance = 0; 7597 rq->active_balance = 0;
7601 rq->next_balance = jiffies; 7598 rq->next_balance = jiffies;
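Besides caching cpu_power in the runqueue, the sched.c hunks add wait_for_completion_killable_timeout(). A hedged sketch of a caller, where struct my_dev and its done completion are hypothetical; the return convention (negative on a fatal signal, zero on timeout, remaining jiffies otherwise) follows the other *_timeout completion helpers:

/* Sketch only: my_dev and my_dev_wait() are made up for illustration. */
static int my_dev_wait(struct my_dev *dev)
{
	long ret;

	ret = wait_for_completion_killable_timeout(&dev->done,
						   msecs_to_jiffies(500));
	if (ret < 0)
		return ret;		/* interrupted by a fatal signal */
	if (ret == 0)
		return -ETIMEDOUT;	/* timed out */
	return 0;			/* completed, ret jiffies were left */
}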
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 42 * (NSEC_PER_SEC / HZ);
43} 43}
44EXPORT_SYMBOL_GPL(sched_clock);
44 45
45static __read_mostly int sched_clock_running; 46static __read_mostly int sched_clock_running;
46 47
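EXPORT_SYMBOL_GPL(sched_clock) mainly lets modular code use the scheduler clock for cheap nanosecond timestamps; a small illustrative sketch (the timing helper is hypothetical, and sched_clock() is not guaranteed to be synchronized across CPUs):

#include <linux/kernel.h>
#include <linux/sched.h>

/* Sketch: time a callback with the now-exported scheduler clock. */
static void time_something(void (*fn)(void))
{
	unsigned long long t0 = sched_clock();

	fn();
	pr_info("fn took %llu ns\n", sched_clock() - t0);
}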
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87a330a7185f..35565395d00d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -381,15 +381,9 @@ __initcall(init_sched_debug_procfs);
381void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 381void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
382{ 382{
383 unsigned long nr_switches; 383 unsigned long nr_switches;
384 unsigned long flags;
385 int num_threads = 1;
386
387 if (lock_task_sighand(p, &flags)) {
388 num_threads = atomic_read(&p->signal->count);
389 unlock_task_sighand(p, &flags);
390 }
391 384
392 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 385 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
386 get_nr_threads(p));
393 SEQ_printf(m, 387 SEQ_printf(m,
394 "---------------------------------------------------------\n"); 388 "---------------------------------------------------------\n");
395#define __P(F) \ 389#define __P(F) \
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 217e4a9393e4..eed35eded602 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1225 unsigned long this_load, load; 1225 unsigned long this_load, load;
1226 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1227 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
1228 unsigned int imbalance;
1229 struct task_group *tg; 1228 struct task_group *tg;
1230 unsigned long weight; 1229 unsigned long weight;
1231 int balanced; 1230 int balanced;
@@ -1252,8 +1251,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1252 tg = task_group(p); 1251 tg = task_group(p);
1253 weight = p->se.load.weight; 1252 weight = p->se.load.weight;
1254 1253
1255 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1256
1257 /* 1254 /*
1258 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1255 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1259 * due to the sync cause above having dropped this_load to 0, we'll 1256 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1263,9 +1260,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1263 * Otherwise check if either cpus are near enough in load to allow this 1260 * Otherwise check if either cpus are near enough in load to allow this
1264 * task to be woken on this_cpu. 1261 * task to be woken on this_cpu.
1265 */ 1262 */
1266 balanced = !this_load || 1263 if (this_load) {
1267 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= 1264 unsigned long this_eff_load, prev_eff_load;
1268 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1265
1266 this_eff_load = 100;
1267 this_eff_load *= power_of(prev_cpu);
1268 this_eff_load *= this_load +
1269 effective_load(tg, this_cpu, weight, weight);
1270
1271 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1272 prev_eff_load *= power_of(this_cpu);
1273 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1274
1275 balanced = this_eff_load <= prev_eff_load;
1276 } else
1277 balanced = true;
1269 1278
1270 /* 1279 /*
1271 * If the currently running task will sleep within 1280 * If the currently running task will sleep within
@@ -2298,6 +2307,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2298 if (!power) 2307 if (!power)
2299 power = 1; 2308 power = 1;
2300 2309
2310 cpu_rq(cpu)->cpu_power = power;
2301 sdg->cpu_power = power; 2311 sdg->cpu_power = power;
2302} 2312}
2303 2313
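Worked example for the rewritten wake_affine() test (numbers invented): with imbalance_pct = 125 the prev side gets the factor 100 + 25/2 = 112. Say this_load + effective_load() on this_cpu is 800 and load + effective_load() on prev_cpu is 1000. With equal cpu_power on both CPUs the scaled comparison reduces to the old one, 100 * 800 = 80,000 <= 112 * 1000 = 112,000, so the wakeup is still treated as balanced. But if prev_cpu has cpu_power 2048 while this_cpu has only 1024 (an SMT sibling, for instance), this_eff_load = 100 * 2048 * 800 is about 164M while prev_eff_load = 112 * 1024 * 1000 is about 115M, so the move is no longer considered balanced; the old unscaled check could not see that this_cpu has less capacity behind it.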
diff --git a/kernel/signal.c b/kernel/signal.c
index 825a3f24ad76..906ae5a1779c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -642,7 +642,7 @@ static inline bool si_fromuser(const struct siginfo *info)
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
644{ 644{
645 const struct cred *cred = current_cred(), *tcred; 645 const struct cred *cred, *tcred;
646 struct pid *sid; 646 struct pid *sid;
647 int error; 647 int error;
648 648
@@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 656 if (error)
657 return error; 657 return error;
658 658
659 cred = current_cred();
659 tcred = __task_cred(t); 660 tcred = __task_cred(t);
660 if ((cred->euid ^ tcred->suid) && 661 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) &&
661 (cred->euid ^ tcred->uid) && 663 (cred->euid ^ tcred->uid) &&
662 (cred->uid ^ tcred->suid) && 664 (cred->uid ^ tcred->suid) &&
663 (cred->uid ^ tcred->uid) && 665 (cred->uid ^ tcred->uid) &&
@@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1083/* 1085/*
1084 * Nuke all other threads in the group. 1086 * Nuke all other threads in the group.
1085 */ 1087 */
1086void zap_other_threads(struct task_struct *p) 1088int zap_other_threads(struct task_struct *p)
1087{ 1089{
1088 struct task_struct *t; 1090 struct task_struct *t = p;
1091 int count = 0;
1089 1092
1090 p->signal->group_stop_count = 0; 1093 p->signal->group_stop_count = 0;
1091 1094
1092 for (t = next_thread(p); t != p; t = next_thread(t)) { 1095 while_each_thread(p, t) {
1093 /* 1096 count++;
1094 * Don't bother with already dead threads 1097
1095 */ 1098 /* Don't bother with already dead threads */
1096 if (t->exit_state) 1099 if (t->exit_state)
1097 continue; 1100 continue;
1098
1099 /* SIGKILL will be handled before any pending SIGSTOP */
1100 sigaddset(&t->pending.signal, SIGKILL); 1101 sigaddset(&t->pending.signal, SIGKILL);
1101 signal_wake_up(t, 1); 1102 signal_wake_up(t, 1);
1102 } 1103 }
1104
1105 return count;
1103} 1106}
1104 1107
1105struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
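zap_other_threads() now reports how many sibling threads it signalled; a hedged sketch of a caller (the wrapper name is hypothetical, and as in the kernel the group's siglock must be held around the call):

/* Sketch: signal the rest of the thread group and learn its size. */
static int kill_group_and_count(struct task_struct *tsk)
{
	int nr;

	spin_lock_irq(&tsk->sighand->siglock);
	nr = zap_other_threads(tsk);
	spin_unlock_irq(&tsk->sighand->siglock);

	return nr;	/* other threads still on the group list */
}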
diff --git a/kernel/smp.c b/kernel/smp.c
index 3fc697336183..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
52 case CPU_UP_PREPARE_FROZEN: 52 case CPU_UP_PREPARE_FROZEN:
53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
54 cpu_to_node(cpu))) 54 cpu_to_node(cpu)))
55 return NOTIFY_BAD; 55 return notifier_from_errno(-ENOMEM);
56 break; 56 break;
57 57
58#ifdef CONFIG_HOTPLUG_CPU 58#ifdef CONFIG_HOTPLUG_CPU
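This merge converts several CPU notifiers from returning the bare NOTIFY_BAD to notifier_from_errno(), so callers can recover the real error with notifier_to_errno(). A generic sketch of the pattern (my_cpu_callback() and my_alloc_percpu_state() are hypothetical):

#include <linux/notifier.h>
#include <linux/cpu.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* my_alloc_percpu_state() is a made-up helper. */
		if (!my_alloc_percpu_state((long)hcpu))
			return notifier_from_errno(-ENOMEM);
		break;
	}
	return NOTIFY_OK;
}
/* The notifier core (or a direct caller) can turn that return value
 * back into -ENOMEM with notifier_to_errno(). */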
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0db913a5c60f..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
809 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
810 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
811 return NOTIFY_BAD; 811 return notifier_from_errno(PTR_ERR(p));
812 } 812 }
813 kthread_bind(p, hotcpu); 813 kthread_bind(p, hotcpu);
814 per_cpu(ksoftirqd, hotcpu) = p; 814 per_cpu(ksoftirqd, hotcpu) = p;
@@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
850 void *cpu = (void *)(long)smp_processor_id(); 850 void *cpu = (void *)(long)smp_processor_id();
851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
852 852
853 BUG_ON(err == NOTIFY_BAD); 853 BUG_ON(err != NOTIFY_OK);
854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
855 register_cpu_notifier(&cpu_nfb); 855 register_cpu_notifier(&cpu_nfb);
856 return 0; 856 return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4e7431e7c78..70f8d90331e9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
321 321
322#ifdef CONFIG_HOTPLUG_CPU 322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED: 323 case CPU_UP_CANCELED:
324 case CPU_DEAD: 324 case CPU_POST_DEAD:
325 { 325 {
326 struct cpu_stop_work *work; 326 struct cpu_stop_work *work;
327 327
diff --git a/kernel/sys.c b/kernel/sys.c
index 0d36d889c74d..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1632,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1632 1632
1633char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 1633char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
1634 1634
1635static void argv_cleanup(char **argv, char **envp) 1635static void argv_cleanup(struct subprocess_info *info)
1636{ 1636{
1637 argv_free(argv); 1637 argv_free(info->argv);
1638} 1638}
1639 1639
1640/** 1640/**
@@ -1668,7 +1668,7 @@ int orderly_poweroff(bool force)
1668 goto out; 1668 goto out;
1669 } 1669 }
1670 1670
1671 call_usermodehelper_setcleanup(info, argv_cleanup); 1671 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
1672 1672
1673 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 1673 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
1674 1674
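call_usermodehelper_setcleanup() becomes call_usermodehelper_setfns(), which takes an optional init callback, a cleanup callback and a data pointer. A sketch modeled on the orderly_poweroff() change above (the /sbin/mycmd path and helper names are illustrative):

#include <linux/kmod.h>
#include <linux/string.h>

static void my_cleanup(struct subprocess_info *info)
{
	argv_free(info->argv);
}

static int run_helper(void)
{
	static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;
	char **argv;
	int argc;

	argv = argv_split(GFP_ATOMIC, "/sbin/mycmd --now", &argc);
	if (!argv)
		return -ENOMEM;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
	if (!info) {
		argv_free(argv);
		return -ENOMEM;
	}

	/* No init callback, free argv when the helper is done, no data. */
	call_usermodehelper_setfns(info, NULL, my_cleanup, NULL);
	return call_usermodehelper_exec(info, UMH_NO_WAIT);
}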
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b12583047757..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,6 +37,7 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
40#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
41#include <linux/initrd.h> 42#include <linux/initrd.h>
42#include <linux/key.h> 43#include <linux/key.h>
@@ -52,6 +53,7 @@
52#include <linux/slow-work.h> 53#include <linux/slow-work.h>
53#include <linux/perf_event.h> 54#include <linux/perf_event.h>
54#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h>
55 57
56#include <asm/uaccess.h> 58#include <asm/uaccess.h>
57#include <asm/processor.h> 59#include <asm/processor.h>
@@ -261,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
261static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
262#endif 264#endif
263 265
266#ifdef CONFIG_COMPACTION
267static int min_extfrag_threshold;
268static int max_extfrag_threshold = 1000;
269#endif
270
264static struct ctl_table kern_table[] = { 271static struct ctl_table kern_table[] = {
265 { 272 {
266 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
@@ -1120,6 +1127,25 @@ static struct ctl_table vm_table[] = {
1120 .mode = 0644, 1127 .mode = 0644,
1121 .proc_handler = drop_caches_sysctl_handler, 1128 .proc_handler = drop_caches_sysctl_handler,
1122 }, 1129 },
1130#ifdef CONFIG_COMPACTION
1131 {
1132 .procname = "compact_memory",
1133 .data = &sysctl_compact_memory,
1134 .maxlen = sizeof(int),
1135 .mode = 0200,
1136 .proc_handler = sysctl_compaction_handler,
1137 },
1138 {
1139 .procname = "extfrag_threshold",
1140 .data = &sysctl_extfrag_threshold,
1141 .maxlen = sizeof(int),
1142 .mode = 0644,
1143 .proc_handler = sysctl_extfrag_handler,
1144 .extra1 = &min_extfrag_threshold,
1145 .extra2 = &max_extfrag_threshold,
1146 },
1147
1148#endif /* CONFIG_COMPACTION */
1123 { 1149 {
1124 .procname = "min_free_kbytes", 1150 .procname = "min_free_kbytes",
1125 .data = &min_free_kbytes, 1151 .data = &min_free_kbytes,
@@ -1444,6 +1470,14 @@ static struct ctl_table fs_table[] = {
1444 .child = binfmt_misc_table, 1470 .child = binfmt_misc_table,
1445 }, 1471 },
1446#endif 1472#endif
1473 {
1474 .procname = "pipe-max-size",
1475 .data = &pipe_max_size,
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &pipe_min_size,
1480 },
1447/* 1481/*
1448 * NOTE: do not add new entries to this table unless you have read 1482 * NOTE: do not add new entries to this table unless you have read
1449 * Documentation/sysctl/ctl_unnumbered.txt 1483 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2083,20 +2117,20 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
2083 2117
2084#define TMPBUFLEN 22 2118#define TMPBUFLEN 22
2085/** 2119/**
2086 * proc_get_long - reads an ASCII formated integer from a user buffer 2120 * proc_get_long - reads an ASCII formatted integer from a user buffer
2087 * 2121 *
2088 * @buf - a kernel buffer 2122 * @buf: a kernel buffer
2089 * @size - size of the kernel buffer 2123 * @size: size of the kernel buffer
2090 * @val - this is where the number will be stored 2124 * @val: this is where the number will be stored
2091 * @neg - set to %TRUE if number is negative 2125 * @neg: set to %TRUE if number is negative
2092 * @perm_tr - a vector which contains the allowed trailers 2126 * @perm_tr: a vector which contains the allowed trailers
2093 * @perm_tr_len - size of the perm_tr vector 2127 * @perm_tr_len: size of the perm_tr vector
2094 * @tr - pointer to store the trailer character 2128 * @tr: pointer to store the trailer character
2095 * 2129 *
2096 * In case of success 0 is returned and buf and size are updated with 2130 * In case of success %0 is returned and @buf and @size are updated with
2097 * the amount of bytes read. If tr is non NULL and a trailing 2131 * the amount of bytes read. If @tr is non-NULL and a trailing
2098 * character exist (size is non zero after returning from this 2132 * character exists (size is non-zero after returning from this
2099 * function) tr is updated with the trailing character. 2133 * function), @tr is updated with the trailing character.
2100 */ 2134 */
2101static int proc_get_long(char **buf, size_t *size, 2135static int proc_get_long(char **buf, size_t *size,
2102 unsigned long *val, bool *neg, 2136 unsigned long *val, bool *neg,
@@ -2147,15 +2181,15 @@ static int proc_get_long(char **buf, size_t *size,
2147} 2181}
2148 2182
2149/** 2183/**
2150 * proc_put_long - coverts an integer to a decimal ASCII formated string 2184 * proc_put_long - converts an integer to a decimal ASCII formatted string
2151 * 2185 *
2152 * @buf - the user buffer 2186 * @buf: the user buffer
2153 * @size - the size of the user buffer 2187 * @size: the size of the user buffer
2154 * @val - the integer to be converted 2188 * @val: the integer to be converted
2155 * @neg - sign of the number, %TRUE for negative 2189 * @neg: sign of the number, %TRUE for negative
2156 * 2190 *
2157 * In case of success 0 is returned and buf and size are updated with 2191 * In case of success %0 is returned and @buf and @size are updated with
2158 * the amount of bytes read. 2192 * the amount of bytes written.
2159 */ 2193 */
2160static int proc_put_long(void __user **buf, size_t *size, unsigned long val, 2194static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2161 bool neg) 2195 bool neg)
@@ -2253,6 +2287,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2253 if (write) { 2287 if (write) {
2254 left -= proc_skip_spaces(&kbuf); 2288 left -= proc_skip_spaces(&kbuf);
2255 2289
2290 if (!left)
2291 break;
2256 err = proc_get_long(&kbuf, &left, &lval, &neg, 2292 err = proc_get_long(&kbuf, &left, &lval, &neg,
2257 proc_wspace_sep, 2293 proc_wspace_sep,
2258 sizeof(proc_wspace_sep), NULL); 2294 sizeof(proc_wspace_sep), NULL);
@@ -2279,7 +2315,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2279 2315
2280 if (!write && !first && left && !err) 2316 if (!write && !first && left && !err)
2281 err = proc_put_char(&buffer, &left, '\n'); 2317 err = proc_put_char(&buffer, &left, '\n');
2282 if (write && !err) 2318 if (write && !err && left)
2283 left -= proc_skip_spaces(&kbuf); 2319 left -= proc_skip_spaces(&kbuf);
2284free: 2320free:
2285 if (write) { 2321 if (write) {
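The new extfrag_threshold entry above illustrates the standard pattern for a range-checked integer sysctl; a generic sketch with hypothetical names (proc_dointvec_minmax clamps writes to the [extra1, extra2] range):

/* Sketch: an integer knob clamped to 0..100, hooked under an existing table. */
static int my_knob;
static int my_knob_min;			/* = 0 */
static int my_knob_max = 100;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_knob_min,
		.extra2		= &my_knob_max,
	},
	{ }
};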
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 937d31dc8566..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17 18
18#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
@@ -1124,11 +1125,6 @@ out:
1124 return result; 1125 return result;
1125} 1126}
1126 1127
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{ 1130{
@@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1156 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out; 1153 goto out;
1158 1154
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1160 str += 2; 1157 str += 2;
1161 if (*str == '-') 1158 if (*str == '-')
1162 str++; 1159 str++;
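The open-coded hex_value() goes away in favor of hex_to_bin() from lib/hexdump.c; a small sketch of the same decode step in isolation (decode_hex_byte() is hypothetical):

#include <linux/ctype.h>
#include <linux/kernel.h>

/* Sketch: decode a two-character hex string such as "4f" into one byte. */
static int decode_hex_byte(const char *s, u8 *out)
{
	if (!isxdigit(s[0]) || !isxdigit(s[1]))
		return -EINVAL;

	*out = (hex_to_bin(s[0]) << 4) | hex_to_bin(s[1]);
	return 0;
}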
diff --git a/kernel/time.c b/kernel/time.c
index 50612faa9baf..848b1c2ab09a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,10 +132,10 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
132 */ 132 */
133static inline void warp_clock(void) 133static inline void warp_clock(void)
134{ 134{
135 struct timespec delta, adjust; 135 struct timespec adjust;
136 delta.tv_sec = sys_tz.tz_minuteswest * 60; 136
137 delta.tv_nsec = 0; 137 adjust = current_kernel_time();
138 adjust = timespec_add_safe(current_kernel_time(), delta); 138 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 do_settimeofday(&adjust); 139 do_settimeofday(&adjust);
140} 140}
141 141
diff --git a/kernel/timer.c b/kernel/timer.c
index 9199f3c52215..ee305c8d4e18 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -750,13 +750,18 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
750 unsigned long expires_limit, mask; 750 unsigned long expires_limit, mask;
751 int bit; 751 int bit;
752 752
753 expires_limit = expires + timer->slack; 753 expires_limit = expires;
754 754
755 if (timer->slack < 0) /* auto slack: use 0.4% */ 755 if (timer->slack >= 0) {
756 expires_limit = expires + (expires - jiffies)/256; 756 expires_limit = expires + timer->slack;
757 } else {
758 unsigned long now = jiffies;
757 759
760 /* No slack, if already expired else auto slack 0.4% */
761 if (time_after(expires, now))
762 expires_limit = expires + (expires - now)/256;
763 }
758 mask = expires ^ expires_limit; 764 mask = expires ^ expires_limit;
759
760 if (mask == 0) 765 if (mask == 0)
761 return expires; 766 return expires;
762 767
@@ -1679,11 +1684,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1679 unsigned long action, void *hcpu) 1684 unsigned long action, void *hcpu)
1680{ 1685{
1681 long cpu = (long)hcpu; 1686 long cpu = (long)hcpu;
1687 int err;
1688
1682 switch(action) { 1689 switch(action) {
1683 case CPU_UP_PREPARE: 1690 case CPU_UP_PREPARE:
1684 case CPU_UP_PREPARE_FROZEN: 1691 case CPU_UP_PREPARE_FROZEN:
1685 if (init_timers_cpu(cpu) < 0) 1692 err = init_timers_cpu(cpu);
1686 return NOTIFY_BAD; 1693 if (err < 0)
1694 return notifier_from_errno(err);
1687 break; 1695 break;
1688#ifdef CONFIG_HOTPLUG_CPU 1696#ifdef CONFIG_HOTPLUG_CPU
1689 case CPU_DEAD: 1697 case CPU_DEAD:
@@ -1709,7 +1717,7 @@ void __init init_timers(void)
1709 1717
1710 init_timer_stats(); 1718 init_timer_stats();
1711 1719
1712 BUG_ON(err == NOTIFY_BAD); 1720 BUG_ON(err != NOTIFY_OK);
1713 register_cpu_notifier(&timers_nb); 1721 register_cpu_notifier(&timers_nb);
1714 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1722 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1715} 1723}
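Worked example for the corrected apply_slack(): a timer with no explicit slack that is due 1000 jiffies from now gets expires_limit = expires + 1000/256, i.e. a 3-jiffy (roughly 0.4%) batching window, exactly as before. The difference is a timer whose expiry is already in the past: it now keeps expires_limit == expires, whereas the old code fed the negative delta (huge once treated as unsigned long) into the same formula and could defer an already-expired timer.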
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
675 } 675 }
676} 676}
677 677
678static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
679{ 680{
680 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
681} 682}
682 683
683static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
684{ 686{
685 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
686} 688}
687 689
688static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
689{ 692{
690 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
691} 694}
692 695
693static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
694 struct request *rq) 698 struct request *rq)
695{ 699{
696 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
697} 701}
698 702
699static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
700 struct request *rq) 705 struct request *rq)
701{ 706{
702 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
724 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
725} 730}
726 731
727static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
728{ 734{
729 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
730} 736}
731 737
732static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
733{ 740{
734 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
735} 742}
736 743
737static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
738 struct bio *bio) 746 struct bio *bio)
739{ 747{
740 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
741} 749}
742 750
743static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
744 struct bio *bio) 753 struct bio *bio)
745{ 754{
746 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
747} 756}
748 757
749static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
750{ 760{
751 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
752} 762}
753 763
754static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
755 struct bio *bio, int rw) 766 struct bio *bio, int rw)
756{ 767{
757 if (bio) 768 if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
765} 776}
766 777
767 778
768static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
769 struct bio *bio, int rw) 781 struct bio *bio, int rw)
770{ 782{
771 if (bio) 783 if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
779 } 791 }
780} 792}
781 793
782static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
783{ 795{
784 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
785 797
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
788} 800}
789 801
790static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
791{ 803{
792 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
793 805
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
800 } 812 }
801} 813}
802 814
803static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
804{ 816{
805 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
806 818
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
813 } 825 }
814} 826}
815 827
816static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
817 unsigned int pdu) 830 unsigned int pdu)
818{ 831{
819 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -829,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
829 842
830/** 843/**
831 * blk_add_trace_remap - Add a trace for a remap operation 844 * blk_add_trace_remap - Add a trace for a remap operation
845 * @ignore: trace callback data parameter (not used)
832 * @q: queue the io is for 846 * @q: queue the io is for
833 * @bio: the source bio 847 * @bio: the source bio
834 * @dev: target device 848 * @dev: target device
@@ -839,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
839 * it spans a stripe (or similar). Add a trace for that action. 853 * it spans a stripe (or similar). Add a trace for that action.
840 * 854 *
841 **/ 855 **/
842static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 856static void blk_add_trace_remap(void *ignore,
843 dev_t dev, sector_t from) 857 struct request_queue *q, struct bio *bio,
858 dev_t dev, sector_t from)
844{ 859{
845 struct blk_trace *bt = q->blk_trace; 860 struct blk_trace *bt = q->blk_trace;
846 struct blk_io_trace_remap r; 861 struct blk_io_trace_remap r;
@@ -859,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
859 874
860/** 875/**
861 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 876 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
877 * @ignore: trace callback data parameter (not used)
862 * @q: queue the io is for 878 * @q: queue the io is for
863 * @rq: the source request 879 * @rq: the source request
864 * @dev: target device 880 * @dev: target device
@@ -869,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
869 * Add a trace for that action. 885 * Add a trace for that action.
870 * 886 *
871 **/ 887 **/
872static void blk_add_trace_rq_remap(struct request_queue *q, 888static void blk_add_trace_rq_remap(void *ignore,
889 struct request_queue *q,
873 struct request *rq, dev_t dev, 890 struct request *rq, dev_t dev,
874 sector_t from) 891 sector_t from)
875{ 892{
@@ -921,64 +938,64 @@ static void blk_register_tracepoints(void)
921{ 938{
922 int ret; 939 int ret;
923 940
924 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 941 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
925 WARN_ON(ret); 942 WARN_ON(ret);
926 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 943 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
927 WARN_ON(ret); 944 WARN_ON(ret);
928 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 945 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
929 WARN_ON(ret); 946 WARN_ON(ret);
930 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 947 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
931 WARN_ON(ret); 948 WARN_ON(ret);
932 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 949 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
933 WARN_ON(ret); 950 WARN_ON(ret);
934 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 951 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
935 WARN_ON(ret); 952 WARN_ON(ret);
936 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 953 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
937 WARN_ON(ret); 954 WARN_ON(ret);
938 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 955 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
939 WARN_ON(ret); 956 WARN_ON(ret);
940 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 957 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
941 WARN_ON(ret); 958 WARN_ON(ret);
942 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 959 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
943 WARN_ON(ret); 960 WARN_ON(ret);
944 ret = register_trace_block_getrq(blk_add_trace_getrq); 961 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
945 WARN_ON(ret); 962 WARN_ON(ret);
946 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 963 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
947 WARN_ON(ret); 964 WARN_ON(ret);
948 ret = register_trace_block_plug(blk_add_trace_plug); 965 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
949 WARN_ON(ret); 966 WARN_ON(ret);
950 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 967 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
951 WARN_ON(ret); 968 WARN_ON(ret);
952 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 969 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
953 WARN_ON(ret); 970 WARN_ON(ret);
954 ret = register_trace_block_split(blk_add_trace_split); 971 ret = register_trace_block_split(blk_add_trace_split, NULL);
955 WARN_ON(ret); 972 WARN_ON(ret);
956 ret = register_trace_block_remap(blk_add_trace_remap); 973 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
957 WARN_ON(ret); 974 WARN_ON(ret);
958 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 975 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
959 WARN_ON(ret); 976 WARN_ON(ret);
960} 977}
961 978
962static void blk_unregister_tracepoints(void) 979static void blk_unregister_tracepoints(void)
963{ 980{
964 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 981 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
965 unregister_trace_block_remap(blk_add_trace_remap); 982 unregister_trace_block_remap(blk_add_trace_remap, NULL);
966 unregister_trace_block_split(blk_add_trace_split); 983 unregister_trace_block_split(blk_add_trace_split, NULL);
967 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 984 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
968 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 985 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
969 unregister_trace_block_plug(blk_add_trace_plug); 986 unregister_trace_block_plug(blk_add_trace_plug, NULL);
970 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 987 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
971 unregister_trace_block_getrq(blk_add_trace_getrq); 988 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
972 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 989 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
973 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 990 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
974 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 991 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
975 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 992 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
976 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 993 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
977 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 994 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
978 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 995 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
979 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 996 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
980 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 997 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
981 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 998 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
982 999
983 tracepoint_synchronize_unregister(); 1000 tracepoint_synchronize_unregister();
984} 1001}
@@ -1321,7 +1338,7 @@ out:
1321} 1338}
1322 1339
1323static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1340static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1324 int flags) 1341 int flags, struct trace_event *event)
1325{ 1342{
1326 return print_one_line(iter, false); 1343 return print_one_line(iter, false);
1327} 1344}
@@ -1343,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1343} 1360}
1344 1361
1345static enum print_line_t 1362static enum print_line_t
1346blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1363blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1364 struct trace_event *event)
1347{ 1365{
1348 return blk_trace_synthesize_old_trace(iter) ? 1366 return blk_trace_synthesize_old_trace(iter) ?
1349 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1367 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = {
1381 .set_flag = blk_tracer_set_flag, 1399 .set_flag = blk_tracer_set_flag,
1382}; 1400};
1383 1401
1384static struct trace_event trace_blk_event = { 1402static struct trace_event_functions trace_blk_event_funcs = {
1385 .type = TRACE_BLK,
1386 .trace = blk_trace_event_print, 1403 .trace = blk_trace_event_print,
1387 .binary = blk_trace_event_print_binary, 1404 .binary = blk_trace_event_print_binary,
1388}; 1405};
1389 1406
1407static struct trace_event trace_blk_event = {
1408 .type = TRACE_BLK,
1409 .funcs = &trace_blk_event_funcs,
1410};
1411
1390static int __init init_blk_tracer(void) 1412static int __init init_blk_tracer(void)
1391{ 1413{
1392 if (!register_ftrace_event(&trace_blk_event)) { 1414 if (!register_ftrace_event(&trace_blk_event)) {
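Every blk tracepoint probe above gains a leading void * because tracepoint probes now carry per-registration private data, handed back as the probe's first argument; registration therefore takes the extra data pointer (NULL throughout blktrace). A generic sketch against the sched_switch tracepoint (my_stats and the probe body are illustrative):

#include <trace/events/sched.h>

struct my_stats {				/* hypothetical private state */
	unsigned long switches;
};
static struct my_stats my_stats;

static void my_sched_switch_probe(void *data,
				  struct task_struct *prev,
				  struct task_struct *next)
{
	struct my_stats *stats = data;		/* pointer given at register time */

	stats->switches++;
}

static int my_probe_init(void)
{
	return register_trace_sched_switch(my_sched_switch_probe, &my_stats);
}

static void my_probe_exit(void)
{
	unregister_trace_sched_switch(my_sched_switch_probe, &my_stats);
	tracepoint_synchronize_unregister();
}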
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 32837e19e3bd..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3234,7 +3234,8 @@ free:
3234} 3234}
3235 3235
3236static void 3236static void
3237ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next) 3237ftrace_graph_probe_sched_switch(void *ignore,
3238 struct task_struct *prev, struct task_struct *next)
3238{ 3239{
3239 unsigned long long timestamp; 3240 unsigned long long timestamp;
3240 int index; 3241 int index;
@@ -3288,7 +3289,7 @@ static int start_graph_tracing(void)
3288 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3289 3290
3290 if (!ret) { 3291 if (!ret) {
3291 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3292 if (ret) 3293 if (ret)
3293 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3294 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3364,7 +3365,7 @@ void unregister_ftrace_graph(void)
3364 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3365 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3366 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3367 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); 3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3368 3369
3369 out: 3370 out:
3370 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f6059c5aa94..1da7b6ea8b85 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1768,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1768 * must fill the old tail_page with padding. 1768 * must fill the old tail_page with padding.
1769 */ 1769 */
1770 if (tail >= BUF_PAGE_SIZE) { 1770 if (tail >= BUF_PAGE_SIZE) {
1771 /*
1772 * If the page was filled, then we still need
1773 * to update the real_end. Reset it to zero
1774 * and the reader will ignore it.
1775 */
1776 if (tail == BUF_PAGE_SIZE)
1777 tail_page->real_end = 0;
1778
1771 local_sub(length, &tail_page->write); 1779 local_sub(length, &tail_page->write);
1772 return; 1780 return;
1773 } 1781 }
@@ -3894,12 +3902,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3894 ret = read; 3902 ret = read;
3895 3903
3896 cpu_buffer->lost_events = 0; 3904 cpu_buffer->lost_events = 0;
3905
3906 commit = local_read(&bpage->commit);
3897 /* 3907 /*
3898 * Set a flag in the commit field if we lost events 3908 * Set a flag in the commit field if we lost events
3899 */ 3909 */
3900 if (missed_events) { 3910 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the 3911 /* If there is room at the end of the page to save the
3904 * missed events, then record it there. 3912 * missed events, then record it there.
3905 */ 3913 */
@@ -3907,10 +3915,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3907 memcpy(&bpage->data[commit], &missed_events, 3915 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events)); 3916 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit); 3917 local_add(RB_MISSED_STORED, &bpage->commit);
3918 commit += sizeof(missed_events);
3910 } 3919 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit); 3920 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 } 3921 }
3913 3922
3923 /*
3924 * This page may be off to user land. Zero it out here.
3925 */
3926 if (commit < BUF_PAGE_SIZE)
3927 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3928
3914 out_unlock: 3929 out_unlock:
3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3930 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3916 3931
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 756d7283318b..086d36316805 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1936,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1936 } 1936 }
1937 1937
1938 if (event) 1938 if (event)
1939 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1940 1940
1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1942 goto partial; 1942 goto partial;
@@ -1962,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1962 1962
1963 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1964 if (event) 1964 if (event)
1965 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1966 1966
1967 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1968 goto partial; 1968 goto partial;
@@ -1989,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1989 1989
1990 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1991 if (event) { 1991 if (event) {
1992 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1993 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1994 return ret; 1994 return ret;
1995 } 1995 }
@@ -2014,7 +2014,8 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2014 } 2014 }
2015 2015
2016 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
2017 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
2018} 2019}
2019 2020
2020int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
@@ -3309,12 +3310,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3309 size_t len, 3310 size_t len,
3310 unsigned int flags) 3311 unsigned int flags)
3311{ 3312{
3312 struct page *pages[PIPE_BUFFERS]; 3313 struct page *pages_def[PIPE_DEF_BUFFERS];
3313 struct partial_page partial[PIPE_BUFFERS]; 3314 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3314 struct trace_iterator *iter = filp->private_data; 3315 struct trace_iterator *iter = filp->private_data;
3315 struct splice_pipe_desc spd = { 3316 struct splice_pipe_desc spd = {
3316 .pages = pages, 3317 .pages = pages_def,
3317 .partial = partial, 3318 .partial = partial_def,
3318 .nr_pages = 0, /* This gets updated below. */ 3319 .nr_pages = 0, /* This gets updated below. */
3319 .flags = flags, 3320 .flags = flags,
3320 .ops = &tracing_pipe_buf_ops, 3321 .ops = &tracing_pipe_buf_ops,
@@ -3325,6 +3326,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3325 size_t rem; 3326 size_t rem;
3326 unsigned int i; 3327 unsigned int i;
3327 3328
3329 if (splice_grow_spd(pipe, &spd))
3330 return -ENOMEM;
3331
3328 /* copy the tracer to avoid using a global lock all around */ 3332 /* copy the tracer to avoid using a global lock all around */
3329 mutex_lock(&trace_types_lock); 3333 mutex_lock(&trace_types_lock);
3330 if (unlikely(old_tracer != current_trace && current_trace)) { 3334 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3355,23 +3359,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3355 trace_access_lock(iter->cpu_file); 3359 trace_access_lock(iter->cpu_file);
3356 3360
3357 /* Fill as many pages as possible. */ 3361 /* Fill as many pages as possible. */
3358 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3362 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3359 pages[i] = alloc_page(GFP_KERNEL); 3363 spd.pages[i] = alloc_page(GFP_KERNEL);
3360 if (!pages[i]) 3364 if (!spd.pages[i])
3361 break; 3365 break;
3362 3366
3363 rem = tracing_fill_pipe_page(rem, iter); 3367 rem = tracing_fill_pipe_page(rem, iter);
3364 3368
3365 /* Copy the data into the page, so we can start over. */ 3369 /* Copy the data into the page, so we can start over. */
3366 ret = trace_seq_to_buffer(&iter->seq, 3370 ret = trace_seq_to_buffer(&iter->seq,
3367 page_address(pages[i]), 3371 page_address(spd.pages[i]),
3368 iter->seq.len); 3372 iter->seq.len);
3369 if (ret < 0) { 3373 if (ret < 0) {
3370 __free_page(pages[i]); 3374 __free_page(spd.pages[i]);
3371 break; 3375 break;
3372 } 3376 }
3373 partial[i].offset = 0; 3377 spd.partial[i].offset = 0;
3374 partial[i].len = iter->seq.len; 3378 spd.partial[i].len = iter->seq.len;
3375 3379
3376 trace_seq_init(&iter->seq); 3380 trace_seq_init(&iter->seq);
3377 } 3381 }
@@ -3382,12 +3386,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3382 3386
3383 spd.nr_pages = i; 3387 spd.nr_pages = i;
3384 3388
3385 return splice_to_pipe(pipe, &spd); 3389 ret = splice_to_pipe(pipe, &spd);
3390out:
3391 splice_shrink_spd(pipe, &spd);
3392 return ret;
3386 3393
3387out_err: 3394out_err:
3388 mutex_unlock(&iter->mutex); 3395 mutex_unlock(&iter->mutex);
3389 3396 goto out;
3390 return ret;
3391} 3397}
3392 3398
3393static ssize_t 3399static ssize_t
@@ -3660,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3660 size_t count, loff_t *ppos) 3666 size_t count, loff_t *ppos)
3661{ 3667{
3662 struct ftrace_buffer_info *info = filp->private_data; 3668 struct ftrace_buffer_info *info = filp->private_data;
3663 unsigned int pos;
3664 ssize_t ret; 3669 ssize_t ret;
3665 size_t size; 3670 size_t size;
3666 3671
@@ -3687,11 +3692,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3687 if (ret < 0) 3692 if (ret < 0)
3688 return 0; 3693 return 0;
3689 3694
3690 pos = ring_buffer_page_len(info->spare);
3691
3692 if (pos < PAGE_SIZE)
3693 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3694
3695read: 3695read:
3696 size = PAGE_SIZE - info->read; 3696 size = PAGE_SIZE - info->read;
3697 if (size > count) 3697 if (size > count)
@@ -3786,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3786 unsigned int flags) 3786 unsigned int flags)
3787{ 3787{
3788 struct ftrace_buffer_info *info = file->private_data; 3788 struct ftrace_buffer_info *info = file->private_data;
3789 struct partial_page partial[PIPE_BUFFERS]; 3789 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3790 struct page *pages[PIPE_BUFFERS]; 3790 struct page *pages_def[PIPE_DEF_BUFFERS];
3791 struct splice_pipe_desc spd = { 3791 struct splice_pipe_desc spd = {
3792 .pages = pages, 3792 .pages = pages_def,
3793 .partial = partial, 3793 .partial = partial_def,
3794 .flags = flags, 3794 .flags = flags,
3795 .ops = &buffer_pipe_buf_ops, 3795 .ops = &buffer_pipe_buf_ops,
3796 .spd_release = buffer_spd_release, 3796 .spd_release = buffer_spd_release,
@@ -3799,22 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3799 int entries, size, i; 3799 int entries, size, i;
3800 size_t ret; 3800 size_t ret;
3801 3801
3802 if (splice_grow_spd(pipe, &spd))
3803 return -ENOMEM;
3804
3802 if (*ppos & (PAGE_SIZE - 1)) { 3805 if (*ppos & (PAGE_SIZE - 1)) {
3803 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3806 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3804 return -EINVAL; 3807 ret = -EINVAL;
3808 goto out;
3805 } 3809 }
3806 3810
3807 if (len & (PAGE_SIZE - 1)) { 3811 if (len & (PAGE_SIZE - 1)) {
3808 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3812 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3809 if (len < PAGE_SIZE) 3813 if (len < PAGE_SIZE) {
3810 return -EINVAL; 3814 ret = -EINVAL;
3815 goto out;
3816 }
3811 len &= PAGE_MASK; 3817 len &= PAGE_MASK;
3812 } 3818 }
3813 3819
3814 trace_access_lock(info->cpu); 3820 trace_access_lock(info->cpu);
3815 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3821 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3816 3822
3817 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3823 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3818 struct page *page; 3824 struct page *page;
3819 int r; 3825 int r;
3820 3826
@@ -3869,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3869 else 3875 else
3870 ret = 0; 3876 ret = 0;
3871 /* TODO: block */ 3877 /* TODO: block */
3872 return ret; 3878 goto out;
3873 } 3879 }
3874 3880
3875 ret = splice_to_pipe(pipe, &spd); 3881 ret = splice_to_pipe(pipe, &spd);
3876 3882 splice_shrink_spd(pipe, &spd);
3883out:
3877 return ret; 3884 return ret;
3878} 3885}
3879 3886
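
Both splice paths above now share one descriptor lifecycle: start from small on-stack arrays sized PIPE_DEF_BUFFERS, let splice_grow_spd() enlarge them when the pipe holds more buffers than that, and release whatever was allocated with splice_shrink_spd() once splice_to_pipe() has run. A minimal sketch of that pattern, with the tracer-specific page handling elided (fill_one_page() is a hypothetical stand-in, and .ops/.spd_release are left as in the callers above):

	static ssize_t example_splice_read(struct pipe_inode_info *pipe,
					   size_t len, unsigned int flags)
	{
		struct page *pages_def[PIPE_DEF_BUFFERS];
		struct partial_page partial_def[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages   = pages_def,
			.partial = partial_def,
			.flags   = flags,
			/* .ops and .spd_release as in the callers above */
		};
		ssize_t ret;
		int i;

		if (splice_grow_spd(pipe, &spd))	/* may allocate larger arrays */
			return -ENOMEM;

		for (i = 0; i < pipe->buffers && len; i++, len -= PAGE_SIZE)
			fill_one_page(&spd, i);		/* hypothetical helper */
		spd.nr_pages = i;

		ret = splice_to_pipe(pipe, &spd);
		splice_shrink_spd(pipe, &spd);		/* free any grown arrays */
		return ret;
	}
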
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d1ce0bec1b3f..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
406 int pc); 406 int pc);
407#else 407#else
408static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
409 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
410{ 410{
411} 411}
412 412
413static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
414 unsigned long flags, int pc) 414 unsigned long flags, int pc)
415{ 415{
416} 416}
@@ -778,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
778 struct trace_seq *s); 778 struct trace_seq *s);
779extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
780 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
781static inline int 784static inline int
782filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
783 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
784 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
785{ 788{
786 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
787 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
788 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
789 return 1; 792 return 1;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
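
The trace_branch.c hunk is the first of several identical conversions in this series: the output callbacks move out of struct trace_event into a shared struct trace_event_functions, the event itself keeps only .type and a .funcs pointer, and every callback gains the struct trace_event * it was looked up through. A hedged sketch of the resulting shape for a hypothetical event (the same pattern recurs below in trace_functions_graph.c, trace_kprobe.c and trace_output.c):

	static enum print_line_t example_print(struct trace_iterator *iter,
					       int flags, struct trace_event *event)
	{
		/*
		 * The extra argument lets the callback recover its container,
		 * e.g. via container_of(event, ...), instead of calling
		 * ftrace_find_event() again.
		 */
		return TRACE_TYPE_HANDLED;
	}

	static struct trace_event_functions example_funcs = {
		.trace = example_print,
	};

	static struct trace_event example_event = {
		.type  = TRACE_BRANCH,		/* any event type */
		.funcs = &example_funcs,
	};
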
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..e6f65887842c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,13 +9,9 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); 12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
16 13
17static char *perf_trace_buf; 14static char *perf_trace_buf[4];
18static char *perf_trace_buf_nmi;
19 15
20/* 16/*
21 * Force it to be aligned to unsigned long to avoid misaligned accesses 17 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -27,57 +23,82 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
27/* Count the events in use (per event id, not per instance) */ 23/* Count the events in use (per event id, not per instance) */
28static int total_ref_count; 24static int total_ref_count;
29 25
30static int perf_trace_event_enable(struct ftrace_event_call *event) 26static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event)
31{ 28{
32 char *buf; 29 struct hlist_head *list;
33 int ret = -ENOMEM; 30 int ret = -ENOMEM;
31 int cpu;
34 32
35 if (event->perf_refcount++ > 0) 33 p_event->tp_event = tp_event;
34 if (tp_event->perf_refcount++ > 0)
36 return 0; 35 return 0;
37 36
38 if (!total_ref_count) { 37 list = alloc_percpu(struct hlist_head);
39 buf = (char *)alloc_percpu(perf_trace_t); 38 if (!list)
40 if (!buf) 39 goto fail;
41 goto fail_buf;
42 40
43 rcu_assign_pointer(perf_trace_buf, buf); 41 for_each_possible_cpu(cpu)
42 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
44 43
45 buf = (char *)alloc_percpu(perf_trace_t); 44 tp_event->perf_events = list;
46 if (!buf)
47 goto fail_buf_nmi;
48 45
49 rcu_assign_pointer(perf_trace_buf_nmi, buf); 46 if (!total_ref_count) {
50 } 47 char *buf;
48 int i;
51 49
52 ret = event->perf_event_enable(event); 50 for (i = 0; i < 4; i++) {
53 if (!ret) { 51 buf = (char *)alloc_percpu(perf_trace_t);
54 total_ref_count++; 52 if (!buf)
55 return 0; 53 goto fail;
54
55 perf_trace_buf[i] = buf;
56 }
56 } 57 }
57 58
58fail_buf_nmi: 59 if (tp_event->class->reg)
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret)
67 goto fail;
68
69 total_ref_count++;
70 return 0;
71
72fail:
59 if (!total_ref_count) { 73 if (!total_ref_count) {
60 free_percpu(perf_trace_buf_nmi); 74 int i;
61 free_percpu(perf_trace_buf); 75
62 perf_trace_buf_nmi = NULL; 76 for (i = 0; i < 4; i++) {
63 perf_trace_buf = NULL; 77 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL;
79 }
80 }
81
82 if (!--tp_event->perf_refcount) {
83 free_percpu(tp_event->perf_events);
84 tp_event->perf_events = NULL;
64 } 85 }
65fail_buf:
66 event->perf_refcount--;
67 86
68 return ret; 87 return ret;
69} 88}
70 89
71int perf_trace_enable(int event_id) 90int perf_trace_init(struct perf_event *p_event)
72{ 91{
73 struct ftrace_event_call *event; 92 struct ftrace_event_call *tp_event;
93 int event_id = p_event->attr.config;
74 int ret = -EINVAL; 94 int ret = -EINVAL;
75 95
76 mutex_lock(&event_mutex); 96 mutex_lock(&event_mutex);
77 list_for_each_entry(event, &ftrace_events, list) { 97 list_for_each_entry(tp_event, &ftrace_events, list) {
78 if (event->id == event_id && event->perf_event_enable && 98 if (tp_event->event.type == event_id &&
79 try_module_get(event->mod)) { 99 tp_event->class && tp_event->class->perf_probe &&
80 ret = perf_trace_event_enable(event); 100 try_module_get(tp_event->mod)) {
101 ret = perf_trace_event_init(tp_event, p_event);
81 break; 102 break;
82 } 103 }
83 } 104 }
@@ -86,90 +107,87 @@ int perf_trace_enable(int event_id)
86 return ret; 107 return ret;
87} 108}
88 109
89static void perf_trace_event_disable(struct ftrace_event_call *event) 110int perf_trace_enable(struct perf_event *p_event)
90{ 111{
91 char *buf, *nmi_buf; 112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 struct hlist_head *list;
92 114
93 if (--event->perf_refcount > 0) 115 list = tp_event->perf_events;
94 return; 116 if (WARN_ON_ONCE(!list))
117 return -EINVAL;
95 118
96 event->perf_event_disable(event); 119 list = this_cpu_ptr(list);
97 120 hlist_add_head_rcu(&p_event->hlist_entry, list);
98 if (!--total_ref_count) {
99 buf = perf_trace_buf;
100 rcu_assign_pointer(perf_trace_buf, NULL);
101 121
102 nmi_buf = perf_trace_buf_nmi; 122 return 0;
103 rcu_assign_pointer(perf_trace_buf_nmi, NULL); 123}
104
105 /*
106 * Ensure every events in profiling have finished before
107 * releasing the buffers
108 */
109 synchronize_sched();
110 124
111 free_percpu(buf); 125void perf_trace_disable(struct perf_event *p_event)
112 free_percpu(nmi_buf); 126{
113 } 127 hlist_del_rcu(&p_event->hlist_entry);
114} 128}
115 129
116void perf_trace_disable(int event_id) 130void perf_trace_destroy(struct perf_event *p_event)
117{ 131{
118 struct ftrace_event_call *event; 132 struct ftrace_event_call *tp_event = p_event->tp_event;
133 int i;
119 134
120 mutex_lock(&event_mutex); 135 mutex_lock(&event_mutex);
121 list_for_each_entry(event, &ftrace_events, list) { 136 if (--tp_event->perf_refcount > 0)
122 if (event->id == event_id) { 137 goto out;
123 perf_trace_event_disable(event); 138
124 module_put(event->mod); 139 if (tp_event->class->reg)
125 break; 140 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
141 else
142 tracepoint_probe_unregister(tp_event->name,
143 tp_event->class->perf_probe,
144 tp_event);
145
146 /*
147 * Ensure our callback won't be called anymore. See
148 * tracepoint_probe_unregister() and __DO_TRACE().
149 */
150 synchronize_sched();
151
152 free_percpu(tp_event->perf_events);
153 tp_event->perf_events = NULL;
154
155 if (!--total_ref_count) {
156 for (i = 0; i < 4; i++) {
157 free_percpu(perf_trace_buf[i]);
158 perf_trace_buf[i] = NULL;
126 } 159 }
127 } 160 }
161out:
128 mutex_unlock(&event_mutex); 162 mutex_unlock(&event_mutex);
129} 163}
130 164
131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 165__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
132 int *rctxp, unsigned long *irq_flags) 166 struct pt_regs *regs, int *rctxp)
133{ 167{
134 struct trace_entry *entry; 168 struct trace_entry *entry;
135 char *trace_buf, *raw_data; 169 unsigned long flags;
136 int pc, cpu; 170 char *raw_data;
171 int pc;
137 172
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 173 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139 174
140 pc = preempt_count(); 175 pc = preempt_count();
141 176
142 /* Protect the per cpu buffer, begin the rcu read side */
143 local_irq_save(*irq_flags);
144
145 *rctxp = perf_swevent_get_recursion_context(); 177 *rctxp = perf_swevent_get_recursion_context();
146 if (*rctxp < 0) 178 if (*rctxp < 0)
147 goto err_recursion; 179 return NULL;
148
149 cpu = smp_processor_id();
150
151 if (in_nmi())
152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
153 else
154 trace_buf = rcu_dereference_sched(perf_trace_buf);
155
156 if (!trace_buf)
157 goto err;
158 180
159 raw_data = per_cpu_ptr(trace_buf, cpu); 181 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
160 182
161 /* zero the dead bytes from align to not leak stack to user */ 183 /* zero the dead bytes from align to not leak stack to user */
162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); 184 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
163 185
164 entry = (struct trace_entry *)raw_data; 186 entry = (struct trace_entry *)raw_data;
165 tracing_generic_entry_update(entry, *irq_flags, pc); 187 local_save_flags(flags);
188 tracing_generic_entry_update(entry, flags, pc);
166 entry->type = type; 189 entry->type = type;
167 190
168 return raw_data; 191 return raw_data;
169err:
170 perf_swevent_put_recursion_context(*rctxp);
171err_recursion:
172 local_irq_restore(*irq_flags);
173 return NULL;
174} 192}
175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 193EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
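
After this rework a perf handler no longer saves irq flags or chooses between normal and NMI buffers: perf_trace_buf_prepare() picks one of the four recursion-context buffers itself, and the events to deliver to sit on the per-CPU hlist hung off the event call. A minimal sketch of a handler built on the new helpers (struct trace_entry stands in for the event-specific entry layout, and size is assumed to already be u64-aligned, as the callers in trace_kprobe.c below arrange):

	static void example_perf_probe(struct ftrace_event_call *call,
				       struct pt_regs *regs, u64 addr)
	{
		struct trace_entry *entry;
		struct hlist_head *head;
		int size = sizeof(*entry);	/* assumed u64-aligned */
		int rctx;

		entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
		if (!entry)
			return;

		/* fill in the event-specific payload behind the header here */

		head = this_cpu_ptr(call->perf_events);
		perf_trace_buf_submit(entry, size, rctx, addr, 1, regs, head);
	}
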
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
32int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
33 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
34 int filter_type) 42 int filter_type)
35{ 43{
36 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
37 49
38 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
39 if (!field) 51 if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
56 field->size = size; 68 field->size = size;
57 field->is_signed = is_signed; 69 field->is_signed = is_signed;
58 70
59 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
60 73
61 return 0; 74 return 0;
62 75
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
94void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
95{ 108{
96 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
97 111
98 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
99 list_del(&field->link); 114 list_del(&field->link);
100 kfree(field->type); 115 kfree(field->type);
101 kfree(field->name); 116 kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
107{ 122{
108 int id; 123 int id;
109 124
110 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
111 if (!id) 126 if (!id)
112 return -ENODEV; 127 return -ENODEV;
113 call->id = id;
114 INIT_LIST_HEAD(&call->fields);
115 128
116 return 0; 129 return 0;
117} 130}
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
124 137
125 switch (enable) { 138 switch (enable) {
126 case 0: 139 case 0:
127 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
128 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
129 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
130 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
131 } 149 }
132 break; 150 break;
133 case 1: 151 case 1:
134 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
135 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
136 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
137 if (ret) { 160 if (ret) {
138 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
139 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
140 "%s\n", call->name); 163 "%s\n", call->name);
141 break; 164 break;
142 } 165 }
143 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
144 } 167 }
145 break; 168 break;
146 } 169 }
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
171 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
172 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
173 196
174 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
175 continue; 199 continue;
176 200
177 if (match && 201 if (match &&
178 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
179 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
180 continue; 204 continue;
181 205
182 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
183 continue; 207 continue;
184 208
185 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
297 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
298 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
299 */ 323 */
300 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
301 return call; 325 return call;
302 } 326 }
303 327
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
328 (*pos)++; 352 (*pos)++;
329 353
330 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
331 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
332 return call; 356 return call;
333 } 357 }
334 358
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
355{ 379{
356 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
357 381
358 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
359 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
360 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
361 385
362 return 0; 386 return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
387 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
388 char *buf; 412 char *buf;
389 413
390 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
391 buf = "1\n"; 415 buf = "1\n";
392 else 416 else
393 buf = "0\n"; 417 buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
450 474
451 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
452 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
453 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
454 continue; 479 continue;
455 480
456 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
457 continue; 482 continue;
458 483
459 /* 484 /*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
461 * or if all events or cleared, or if we have 486 * or if all events or cleared, or if we have
462 * a mixture. 487 * a mixture.
463 */ 488 */
464 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
465 490
466 /* 491 /*
467 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
525{ 550{
526 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field; 552 struct ftrace_event_field *field;
553 struct list_head *head;
528 struct trace_seq *s; 554 struct trace_seq *s;
529 int common_field_count = 5; 555 int common_field_count = 5;
530 char *buf; 556 char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
540 trace_seq_init(s); 566 trace_seq_init(s);
541 567
542 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
543 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
544 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
545 571
546 list_for_each_entry_reverse(field, &call->fields, link) { 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
547 /* 574 /*
548 * Smartly shows the array type(except dynamic array). 575 * Smartly shows the array type(except dynamic array).
549 * Normal: 576 * Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
613 return -ENOMEM; 640 return -ENOMEM;
614 641
615 trace_seq_init(s); 642 trace_seq_init(s);
616 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
617 644
618 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
619 s->buffer, s->len); 646 s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
919 const struct file_operations *filter, 946 const struct file_operations *filter,
920 const struct file_operations *format) 947 const struct file_operations *format)
921{ 948{
949 struct list_head *head;
922 int ret; 950 int ret;
923 951
924 /* 952 /*
925 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
926 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
927 */ 955 */
928 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
929 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
930 958
931 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
932 if (!call->dir) { 960 if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
935 return -1; 963 return -1;
936 } 964 }
937 965
938 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
939 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
940 enable); 968 enable);
941 969
942 if (call->id && call->perf_event_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
943 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
944 id); 973 id);
974#endif
945 975
946 if (call->define_fields) { 976 if (call->class->define_fields) {
947 ret = trace_define_common_fields(call); 977 /*
948 if (!ret) 978 * Other events may have the same class. Only update
949 ret = call->define_fields(call); 979 * the fields if they are not already defined.
950 if (ret < 0) { 980 */
951 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
952 " events/%s\n", call->name); 982 if (list_empty(head)) {
953 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
954 } 991 }
955 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
956 filter); 993 filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
970 if (!call->name) 1007 if (!call->name)
971 return -EINVAL; 1008 return -EINVAL;
972 1009
973 if (call->raw_init) { 1010 if (call->class->raw_init) {
974 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
975 if (ret < 0) { 1012 if (ret < 0) {
976 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
977 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1035static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1036{ 1073{
1037 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1038 if (call->event) 1075 if (call->event.funcs)
1039 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1040 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1041 list_del(&call->list); 1078 list_del(&call->list);
1042 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1043 destroy_preds(call); 1080 destroy_preds(call);
1044 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1045} 1082}
1046 1083
1047/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1132 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1133 if (!call->name) 1170 if (!call->name)
1134 continue; 1171 continue;
1135 if (call->raw_init) { 1172 if (call->class->raw_init) {
1136 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1137 if (ret < 0) { 1174 if (ret < 0) {
1138 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1139 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
1286 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1287 if (!call->name) 1324 if (!call->name)
1288 continue; 1325 continue;
1289 if (call->raw_init) { 1326 if (call->class->raw_init) {
1290 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1291 if (ret < 0) { 1328 if (ret < 0) {
1292 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1293 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1388 1425
1389 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1390 1427
1391 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1392 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1393 continue; 1430 continue;
1394 1431
1395/* 1432/*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1399 * syscalls as we test. 1436 * syscalls as we test.
1400 */ 1437 */
1401#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1402 if (call->system && 1439 if (call->class->system &&
1403 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1404 continue; 1441 continue;
1405#endif 1442#endif
1406 1443
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1410 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1411 * it and the self test should not be on. 1448 * it and the self test should not be on.
1412 */ 1449 */
1413 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1414 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1415 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1416 continue; 1453 continue;
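
The per-call regfunc/unregfunc pair (and the perf enable/disable hooks) are folded into a single class-level ->reg(call, type) callback; events that only provide a ->probe keep working through the tracepoint_probe_register() fallback shown above. A sketch of a class that supplies its own ->reg() (probe_enable/probe_disable are hypothetical stand-ins; the real kprobe version, kprobe_register(), appears in the trace_kprobe.c hunk further down):

	static int example_reg(struct ftrace_event_call *call, enum trace_reg type)
	{
		switch (type) {
		case TRACE_REG_REGISTER:
			return probe_enable(call);	/* hypothetical */
		case TRACE_REG_UNREGISTER:
			probe_disable(call);		/* hypothetical */
			return 0;
	#ifdef CONFIG_PERF_EVENTS
		case TRACE_REG_PERF_REGISTER:
		case TRACE_REG_PERF_UNREGISTER:
			return 0;			/* perf side elided */
	#endif
		}
		return 0;
	}

	static struct ftrace_event_class example_class = {
		.system	= "example",
		.reg	= example_reg,
	};
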
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 58092d844a1f..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
503 504
504 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
505 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
506 return field; 508 return field;
507 } 509 }
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
545 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
546 int i; 548 int i;
547 549
548 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
549 filter->n_preds = 0; 551 filter->n_preds = 0;
550 552
551 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
572{ 574{
573 __free_preds(call->filter); 575 __free_preds(call->filter);
574 call->filter = NULL; 576 call->filter = NULL;
575 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
576} 578}
577 579
578static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
611 if (call->filter) 613 if (call->filter)
612 return 0; 614 return 0;
613 615
614 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
615 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
616 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
617 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
625 int err; 627 int err;
626 628
627 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
628 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
629 continue; 631 continue;
630 632
631 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
632 continue; 634 continue;
633 635
634 err = init_preds(call); 636 err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
644 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
645 647
646 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
647 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
648 continue; 650 continue;
649 651
650 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
651 continue; 653 continue;
652 654
653 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1249 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1250 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1251 1253
1252 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1253 continue; 1255 continue;
1254 1256
1255 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1256 continue; 1258 continue;
1257 1259
1258 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1266 if (err) 1268 if (err)
1267 filter_disable_preds(call); 1269 filter_disable_preds(call);
1268 else { 1270 else {
1269 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1270 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1271 } 1273 }
1272 fail = false; 1274 fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1315 if (err) 1317 if (err)
1316 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1317 else 1319 else
1318 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1319out: 1321out:
1320 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1321 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1393 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1394 1396
1395 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1396 if (call->id == event_id) 1398 if (call->event.type == event_id)
1397 break; 1399 break;
1398 } 1400 }
1399 1401
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{ 129{
130 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
131 return 0; 131 return 0;
132} 132}
133 133
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 154
155#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
159__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 167 .name = #call, \
162 .id = type, \ 168 .event.type = etype, \
163 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
164 .raw_init = ftrace_raw_init_event, \
165 .print_fmt = print, \ 170 .print_fmt = print, \
166 .define_fields = ftrace_define_fields_##call, \
167}; \ 171}; \
168 172
169#include "trace_entries.h" 173#include "trace_entries.h"
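
With the FTRACE_ENTRY() change above, each entry pulled in from trace_entries.h now expands to a shared class plus a call that points at it, instead of repeating system/raw_init/define_fields in every call. For a hypothetical FTRACE_ENTRY(foo, foo_entry, TRACE_FOO, ...) the expansion is roughly:

	struct ftrace_event_class event_class_ftrace_foo = {
		.system		= __stringify(TRACE_SYSTEM),
		.define_fields	= ftrace_define_fields_foo,
		.raw_init	= ftrace_raw_init_event,
	};

	struct ftrace_event_call __used
	__attribute__((__aligned__(4)))
	__attribute__((section("_ftrace_events"))) event_foo = {
		.name		= "foo",
		.event.type	= TRACE_FOO,
		.class		= &event_class_ftrace_foo,
		.print_fmt	= "...",	/* produced by F_printk() */
	};
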
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index dd11c830eb84..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1025,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1025 if (!event) 1025 if (!event)
1026 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
1027 1027
1028 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
1029 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
1030 return ret; 1030 return ret;
1031 } 1031 }
@@ -1112,7 +1112,8 @@ print_graph_function(struct trace_iterator *iter)
1112} 1112}
1113 1113
1114static enum print_line_t 1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags) 1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1116{ 1117{
1117 return print_graph_function(iter); 1118 return print_graph_function(iter);
1118} 1119}
@@ -1225,14 +1226,18 @@ void graph_trace_close(struct trace_iterator *iter)
1225 } 1226 }
1226} 1227}
1227 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1228static struct trace_event graph_trace_entry_event = { 1233static struct trace_event graph_trace_entry_event = {
1229 .type = TRACE_GRAPH_ENT, 1234 .type = TRACE_GRAPH_ENT,
1230 .trace = print_graph_function_event, 1235 .funcs = &graph_functions,
1231}; 1236};
1232 1237
1233static struct trace_event graph_trace_ret_event = { 1238static struct trace_event graph_trace_ret_event = {
1234 .type = TRACE_GRAPH_RET, 1239 .type = TRACE_GRAPH_RET,
1235 .trace = print_graph_function_event, 1240 .funcs = &graph_functions
1236}; 1241};
1237 1242
1238static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052b..f52b5f50299d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -324,8 +324,8 @@ struct trace_probe {
324 unsigned long nhit; 324 unsigned long nhit;
325 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
326 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
327 struct ftrace_event_call call; 328 struct ftrace_event_call call;
328 struct trace_event event;
329 ssize_t size; /* trace entry size */ 329 ssize_t size; /* trace entry size */
330 unsigned int nr_args; 330 unsigned int nr_args;
331 struct probe_arg args[]; 331 struct probe_arg args[];
@@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
404 goto error; 404 goto error;
405 } 405 }
406 406
407 tp->call.class = &tp->class;
407 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
408 if (!tp->call.name) 409 if (!tp->call.name)
409 goto error; 410 goto error;
@@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
413 goto error; 414 goto error;
414 } 415 }
415 416
416 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
417 if (!tp->call.system) 418 if (!tp->class.system)
418 goto error; 419 goto error;
419 420
420 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
443 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
444 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
445 446
446 kfree(tp->call.system); 447 kfree(tp->call.class->system);
447 kfree(tp->call.name); 448 kfree(tp->call.name);
448 kfree(tp->symbol); 449 kfree(tp->symbol);
449 kfree(tp); 450 kfree(tp);
@@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
456 457
457 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
458 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
459 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
460 return tp; 461 return tp;
461 return NULL; 462 return NULL;
462} 463}
@@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
481 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
482 483
483 /* register as an event */ 484 /* register as an event */
484 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
485 if (old_tp) { 486 if (old_tp) {
486 /* delete old event */ 487 /* delete old event */
487 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
904 int i; 905 int i;
905 906
906 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
907 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
908 909
909 if (!tp->symbol) 910 if (!tp->symbol)
910 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1061 1062
1062 size = sizeof(*entry) + tp->size; 1063 size = sizeof(*entry) + tp->size;
1063 1064
1064 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1065 irq_flags, pc); 1066 size, irq_flags, pc);
1066 if (!event) 1067 if (!event)
1067 return; 1068 return;
1068 1069
@@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1094 1095
1095 size = sizeof(*entry) + tp->size; 1096 size = sizeof(*entry) + tp->size;
1096 1097
1097 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1098 irq_flags, pc); 1099 size, irq_flags, pc);
1099 if (!event) 1100 if (!event)
1100 return; 1101 return;
1101 1102
@@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1112 1113
1113/* Event entry printers */ 1114/* Event entry printers */
1114enum print_line_t 1115enum print_line_t
1115print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1116{ 1118{
1117 struct kprobe_trace_entry_head *field; 1119 struct kprobe_trace_entry_head *field;
1118 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1119 struct trace_event *event;
1120 struct trace_probe *tp; 1121 struct trace_probe *tp;
1121 u8 *data; 1122 u8 *data;
1122 int i; 1123 int i;
1123 1124
1124 field = (struct kprobe_trace_entry_head *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1125 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1126 tp = container_of(event, struct trace_probe, event);
1127 1127
1128 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1129 goto partial; 1129 goto partial;
@@ -1149,18 +1149,17 @@ partial:
1149} 1149}
1150 1150
1151enum print_line_t 1151enum print_line_t
1152print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1153{ 1154{
1154 struct kretprobe_trace_entry_head *field; 1155 struct kretprobe_trace_entry_head *field;
1155 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1156 struct trace_event *event;
1157 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data; 1158 u8 *data;
1159 int i; 1159 int i;
1160 1160
1161 field = (struct kretprobe_trace_entry_head *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1162 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1163 tp = container_of(event, struct trace_probe, event);
1164 1163
1165 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1166 goto partial; 1165 goto partial;
@@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1217 1216
1218static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1219{ 1218{
1220 INIT_LIST_HEAD(&event_call->fields);
1221
1222 return 0; 1219 return 0;
1223} 1220}
1224 1221
@@ -1341,9 +1338,9 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1341 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1342 struct ftrace_event_call *call = &tp->call; 1339 struct ftrace_event_call *call = &tp->call;
1343 struct kprobe_trace_entry_head *entry; 1340 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head;
1344 u8 *data; 1342 u8 *data;
1345 int size, __size, i; 1343 int size, __size, i;
1346 unsigned long irq_flags;
1347 int rctx; 1344 int rctx;
1348 1345
1349 __size = sizeof(*entry) + tp->size; 1346 __size = sizeof(*entry) + tp->size;
@@ -1353,7 +1350,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1353 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1354 return; 1351 return;
1355 1352
1356 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1353 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1357 if (!entry) 1354 if (!entry)
1358 return; 1355 return;
1359 1356
@@ -1362,7 +1359,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1362 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1363 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1364 1361
1365 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1362 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1366} 1364}
1367 1365
1368/* Kretprobe profile handler */ 1366/* Kretprobe profile handler */
@@ -1372,9 +1370,9 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1372 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1373 struct ftrace_event_call *call = &tp->call; 1371 struct ftrace_event_call *call = &tp->call;
1374 struct kretprobe_trace_entry_head *entry; 1372 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head;
1375 u8 *data; 1374 u8 *data;
1376 int size, __size, i; 1375 int size, __size, i;
1377 unsigned long irq_flags;
1378 int rctx; 1376 int rctx;
1379 1377
1380 __size = sizeof(*entry) + tp->size; 1378 __size = sizeof(*entry) + tp->size;
@@ -1384,7 +1382,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1384 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1385 return; 1383 return;
1386 1384
1387 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1385 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1388 if (!entry) 1386 if (!entry)
1389 return; 1387 return;
1390 1388
@@ -1394,8 +1392,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1394 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1395 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1396 1394
1397 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1395 head = this_cpu_ptr(call->perf_events);
1398 irq_flags, regs); 1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1399} 1397}
1400 1398
1401static int probe_perf_enable(struct ftrace_event_call *call) 1399static int probe_perf_enable(struct ftrace_event_call *call)
@@ -1425,6 +1423,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
1425} 1423}
1426#endif /* CONFIG_PERF_EVENTS */ 1424#endif /* CONFIG_PERF_EVENTS */
1427 1425
1426static __kprobes
1427int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1428{
1429 switch (type) {
1430 case TRACE_REG_REGISTER:
1431 return probe_event_enable(event);
1432 case TRACE_REG_UNREGISTER:
1433 probe_event_disable(event);
1434 return 0;
1435
1436#ifdef CONFIG_PERF_EVENTS
1437 case TRACE_REG_PERF_REGISTER:
1438 return probe_perf_enable(event);
1439 case TRACE_REG_PERF_UNREGISTER:
1440 probe_perf_disable(event);
1441 return 0;
1442#endif
1443 }
1444 return 0;
1445}
1428 1446
1429static __kprobes 1447static __kprobes
1430int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1448int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,6 +1472,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1454 return 0; /* We don't tweek kernel, so just return 0 */ 1472 return 0; /* We don't tweek kernel, so just return 0 */
1455} 1473}
1456 1474
1475static struct trace_event_functions kretprobe_funcs = {
1476 .trace = print_kretprobe_event
1477};
1478
1479static struct trace_event_functions kprobe_funcs = {
1480 .trace = print_kprobe_event
1481};
1482
1457static int register_probe_event(struct trace_probe *tp) 1483static int register_probe_event(struct trace_probe *tp)
1458{ 1484{
1459 struct ftrace_event_call *call = &tp->call; 1485 struct ftrace_event_call *call = &tp->call;
@@ -1461,36 +1487,31 @@ static int register_probe_event(struct trace_probe *tp)
1461 1487
1462 /* Initialize ftrace_event_call */ 1488 /* Initialize ftrace_event_call */
1463 if (probe_is_return(tp)) { 1489 if (probe_is_return(tp)) {
1464 tp->event.trace = print_kretprobe_event; 1490 INIT_LIST_HEAD(&call->class->fields);
1465 call->raw_init = probe_event_raw_init; 1491 call->event.funcs = &kretprobe_funcs;
1466 call->define_fields = kretprobe_event_define_fields; 1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields;
1467 } else { 1494 } else {
1468 tp->event.trace = print_kprobe_event; 1495 INIT_LIST_HEAD(&call->class->fields);
1469 call->raw_init = probe_event_raw_init; 1496 call->event.funcs = &kprobe_funcs;
1470 call->define_fields = kprobe_event_define_fields; 1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields;
1471 } 1499 }
1472 if (set_print_fmt(tp) < 0) 1500 if (set_print_fmt(tp) < 0)
1473 return -ENOMEM; 1501 return -ENOMEM;
1474 call->event = &tp->event; 1502 ret = register_ftrace_event(&call->event);
1475 call->id = register_ftrace_event(&tp->event); 1503 if (!ret) {
1476 if (!call->id) {
1477 kfree(call->print_fmt); 1504 kfree(call->print_fmt);
1478 return -ENODEV; 1505 return -ENODEV;
1479 } 1506 }
1480 call->enabled = 0; 1507 call->flags = 0;
1481 call->regfunc = probe_event_enable; 1508 call->class->reg = kprobe_register;
1482 call->unregfunc = probe_event_disable;
1483
1484#ifdef CONFIG_PERF_EVENTS
1485 call->perf_event_enable = probe_perf_enable;
1486 call->perf_event_disable = probe_perf_disable;
1487#endif
1488 call->data = tp; 1509 call->data = tp;
1489 ret = trace_add_event_call(call); 1510 ret = trace_add_event_call(call);
1490 if (ret) { 1511 if (ret) {
1491 pr_info("Failed to register kprobe event: %s\n", call->name); 1512 pr_info("Failed to register kprobe event: %s\n", call->name);
1492 kfree(call->print_fmt); 1513 kfree(call->print_fmt);
1493 unregister_ftrace_event(&tp->event); 1514 unregister_ftrace_event(&call->event);
1494 } 1515 }
1495 return ret; 1516 return ret;
1496} 1517}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ab13d7008061..57c1b4596470 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -742,6 +742,9 @@ int register_ftrace_event(struct trace_event *event)
742 if (WARN_ON(!event)) 742 if (WARN_ON(!event))
743 goto out; 743 goto out;
744 744
745 if (WARN_ON(!event->funcs))
746 goto out;
747
745 INIT_LIST_HEAD(&event->list); 748 INIT_LIST_HEAD(&event->list);
746 749
747 if (!event->type) { 750 if (!event->type) {
@@ -774,14 +777,14 @@ int register_ftrace_event(struct trace_event *event)
774 goto out; 777 goto out;
775 } 778 }
776 779
777 if (event->trace == NULL) 780 if (event->funcs->trace == NULL)
778 event->trace = trace_nop_print; 781 event->funcs->trace = trace_nop_print;
779 if (event->raw == NULL) 782 if (event->funcs->raw == NULL)
780 event->raw = trace_nop_print; 783 event->funcs->raw = trace_nop_print;
781 if (event->hex == NULL) 784 if (event->funcs->hex == NULL)
782 event->hex = trace_nop_print; 785 event->funcs->hex = trace_nop_print;
783 if (event->binary == NULL) 786 if (event->funcs->binary == NULL)
784 event->binary = trace_nop_print; 787 event->funcs->binary = trace_nop_print;
785 788
786 key = event->type & (EVENT_HASHSIZE - 1); 789 key = event->type & (EVENT_HASHSIZE - 1);
787 790
@@ -823,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
823 * Standard events 826 * Standard events
824 */ 827 */
825 828
826enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 829enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
830 struct trace_event *event)
827{ 831{
828 return TRACE_TYPE_HANDLED; 832 return TRACE_TYPE_HANDLED;
829} 833}
830 834
831/* TRACE_FN */ 835/* TRACE_FN */
832static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 836static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
837 struct trace_event *event)
833{ 838{
834 struct ftrace_entry *field; 839 struct ftrace_entry *field;
835 struct trace_seq *s = &iter->seq; 840 struct trace_seq *s = &iter->seq;
@@ -856,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
856 return TRACE_TYPE_PARTIAL_LINE; 861 return TRACE_TYPE_PARTIAL_LINE;
857} 862}
858 863
859static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 864static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
865 struct trace_event *event)
860{ 866{
861 struct ftrace_entry *field; 867 struct ftrace_entry *field;
862 868
@@ -870,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
870 return TRACE_TYPE_HANDLED; 876 return TRACE_TYPE_HANDLED;
871} 877}
872 878
873static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 879static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
880 struct trace_event *event)
874{ 881{
875 struct ftrace_entry *field; 882 struct ftrace_entry *field;
876 struct trace_seq *s = &iter->seq; 883 struct trace_seq *s = &iter->seq;
@@ -883,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
883 return TRACE_TYPE_HANDLED; 890 return TRACE_TYPE_HANDLED;
884} 891}
885 892
886static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 893static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
894 struct trace_event *event)
887{ 895{
888 struct ftrace_entry *field; 896 struct ftrace_entry *field;
889 struct trace_seq *s = &iter->seq; 897 struct trace_seq *s = &iter->seq;
@@ -896,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
896 return TRACE_TYPE_HANDLED; 904 return TRACE_TYPE_HANDLED;
897} 905}
898 906
899static struct trace_event trace_fn_event = { 907static struct trace_event_functions trace_fn_funcs = {
900 .type = TRACE_FN,
901 .trace = trace_fn_trace, 908 .trace = trace_fn_trace,
902 .raw = trace_fn_raw, 909 .raw = trace_fn_raw,
903 .hex = trace_fn_hex, 910 .hex = trace_fn_hex,
904 .binary = trace_fn_bin, 911 .binary = trace_fn_bin,
905}; 912};
906 913
914static struct trace_event trace_fn_event = {
915 .type = TRACE_FN,
916 .funcs = &trace_fn_funcs,
917};
918
907/* TRACE_CTX an TRACE_WAKE */ 919/* TRACE_CTX an TRACE_WAKE */
908static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 920static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
909 char *delim) 921 char *delim)
@@ -932,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
932 return TRACE_TYPE_HANDLED; 944 return TRACE_TYPE_HANDLED;
933} 945}
934 946
935static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 947static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
948 struct trace_event *event)
936{ 949{
937 return trace_ctxwake_print(iter, "==>"); 950 return trace_ctxwake_print(iter, "==>");
938} 951}
939 952
940static enum print_line_t trace_wake_print(struct trace_iterator *iter, 953static enum print_line_t trace_wake_print(struct trace_iterator *iter,
941 int flags) 954 int flags, struct trace_event *event)
942{ 955{
943 return trace_ctxwake_print(iter, " +"); 956 return trace_ctxwake_print(iter, " +");
944} 957}
@@ -966,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
966 return TRACE_TYPE_HANDLED; 979 return TRACE_TYPE_HANDLED;
967} 980}
968 981
969static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 982static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
983 struct trace_event *event)
970{ 984{
971 return trace_ctxwake_raw(iter, 0); 985 return trace_ctxwake_raw(iter, 0);
972} 986}
973 987
974static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 988static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
989 struct trace_event *event)
975{ 990{
976 return trace_ctxwake_raw(iter, '+'); 991 return trace_ctxwake_raw(iter, '+');
977} 992}
@@ -1000,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
1000 return TRACE_TYPE_HANDLED; 1015 return TRACE_TYPE_HANDLED;
1001} 1016}
1002 1017
1003static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1018static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1019 struct trace_event *event)
1004{ 1020{
1005 return trace_ctxwake_hex(iter, 0); 1021 return trace_ctxwake_hex(iter, 0);
1006} 1022}
1007 1023
1008static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1024static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1025 struct trace_event *event)
1009{ 1026{
1010 return trace_ctxwake_hex(iter, '+'); 1027 return trace_ctxwake_hex(iter, '+');
1011} 1028}
1012 1029
1013static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1030static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1014 int flags) 1031 int flags, struct trace_event *event)
1015{ 1032{
1016 struct ctx_switch_entry *field; 1033 struct ctx_switch_entry *field;
1017 struct trace_seq *s = &iter->seq; 1034 struct trace_seq *s = &iter->seq;
@@ -1028,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1028 return TRACE_TYPE_HANDLED; 1045 return TRACE_TYPE_HANDLED;
1029} 1046}
1030 1047
1031static struct trace_event trace_ctx_event = { 1048static struct trace_event_functions trace_ctx_funcs = {
1032 .type = TRACE_CTX,
1033 .trace = trace_ctx_print, 1049 .trace = trace_ctx_print,
1034 .raw = trace_ctx_raw, 1050 .raw = trace_ctx_raw,
1035 .hex = trace_ctx_hex, 1051 .hex = trace_ctx_hex,
1036 .binary = trace_ctxwake_bin, 1052 .binary = trace_ctxwake_bin,
1037}; 1053};
1038 1054
1039static struct trace_event trace_wake_event = { 1055static struct trace_event trace_ctx_event = {
1040 .type = TRACE_WAKE, 1056 .type = TRACE_CTX,
1057 .funcs = &trace_ctx_funcs,
1058};
1059
1060static struct trace_event_functions trace_wake_funcs = {
1041 .trace = trace_wake_print, 1061 .trace = trace_wake_print,
1042 .raw = trace_wake_raw, 1062 .raw = trace_wake_raw,
1043 .hex = trace_wake_hex, 1063 .hex = trace_wake_hex,
1044 .binary = trace_ctxwake_bin, 1064 .binary = trace_ctxwake_bin,
1045}; 1065};
1046 1066
1067static struct trace_event trace_wake_event = {
1068 .type = TRACE_WAKE,
1069 .funcs = &trace_wake_funcs,
1070};
1071
1047/* TRACE_SPECIAL */ 1072/* TRACE_SPECIAL */
1048static enum print_line_t trace_special_print(struct trace_iterator *iter, 1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1049 int flags) 1074 int flags, struct trace_event *event)
1050{ 1075{
1051 struct special_entry *field; 1076 struct special_entry *field;
1052 1077
@@ -1062,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1062} 1087}
1063 1088
1064static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1065 int flags) 1090 int flags, struct trace_event *event)
1066{ 1091{
1067 struct special_entry *field; 1092 struct special_entry *field;
1068 struct trace_seq *s = &iter->seq; 1093 struct trace_seq *s = &iter->seq;
@@ -1077,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1077} 1102}
1078 1103
1079static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1080 int flags) 1105 int flags, struct trace_event *event)
1081{ 1106{
1082 struct special_entry *field; 1107 struct special_entry *field;
1083 struct trace_seq *s = &iter->seq; 1108 struct trace_seq *s = &iter->seq;
@@ -1091,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1091 return TRACE_TYPE_HANDLED; 1116 return TRACE_TYPE_HANDLED;
1092} 1117}
1093 1118
1094static struct trace_event trace_special_event = { 1119static struct trace_event_functions trace_special_funcs = {
1095 .type = TRACE_SPECIAL,
1096 .trace = trace_special_print, 1120 .trace = trace_special_print,
1097 .raw = trace_special_print, 1121 .raw = trace_special_print,
1098 .hex = trace_special_hex, 1122 .hex = trace_special_hex,
1099 .binary = trace_special_bin, 1123 .binary = trace_special_bin,
1100}; 1124};
1101 1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1102/* TRACE_STACK */ 1131/* TRACE_STACK */
1103 1132
1104static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1133static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1105 int flags) 1134 int flags, struct trace_event *event)
1106{ 1135{
1107 struct stack_entry *field; 1136 struct stack_entry *field;
1108 struct trace_seq *s = &iter->seq; 1137 struct trace_seq *s = &iter->seq;
@@ -1130,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1130 return TRACE_TYPE_PARTIAL_LINE; 1159 return TRACE_TYPE_PARTIAL_LINE;
1131} 1160}
1132 1161
1133static struct trace_event trace_stack_event = { 1162static struct trace_event_functions trace_stack_funcs = {
1134 .type = TRACE_STACK,
1135 .trace = trace_stack_print, 1163 .trace = trace_stack_print,
1136 .raw = trace_special_print, 1164 .raw = trace_special_print,
1137 .hex = trace_special_hex, 1165 .hex = trace_special_hex,
1138 .binary = trace_special_bin, 1166 .binary = trace_special_bin,
1139}; 1167};
1140 1168
1169static struct trace_event trace_stack_event = {
1170 .type = TRACE_STACK,
1171 .funcs = &trace_stack_funcs,
1172};
1173
1141/* TRACE_USER_STACK */ 1174/* TRACE_USER_STACK */
1142static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1175static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 int flags) 1176 int flags, struct trace_event *event)
1144{ 1177{
1145 struct userstack_entry *field; 1178 struct userstack_entry *field;
1146 struct trace_seq *s = &iter->seq; 1179 struct trace_seq *s = &iter->seq;
@@ -1159,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1159 return TRACE_TYPE_PARTIAL_LINE; 1192 return TRACE_TYPE_PARTIAL_LINE;
1160} 1193}
1161 1194
1162static struct trace_event trace_user_stack_event = { 1195static struct trace_event_functions trace_user_stack_funcs = {
1163 .type = TRACE_USER_STACK,
1164 .trace = trace_user_stack_print, 1196 .trace = trace_user_stack_print,
1165 .raw = trace_special_print, 1197 .raw = trace_special_print,
1166 .hex = trace_special_hex, 1198 .hex = trace_special_hex,
1167 .binary = trace_special_bin, 1199 .binary = trace_special_bin,
1168}; 1200};
1169 1201
1202static struct trace_event trace_user_stack_event = {
1203 .type = TRACE_USER_STACK,
1204 .funcs = &trace_user_stack_funcs,
1205};
1206
1170/* TRACE_BPRINT */ 1207/* TRACE_BPRINT */
1171static enum print_line_t 1208static enum print_line_t
1172trace_bprint_print(struct trace_iterator *iter, int flags) 1209trace_bprint_print(struct trace_iterator *iter, int flags,
1210 struct trace_event *event)
1173{ 1211{
1174 struct trace_entry *entry = iter->ent; 1212 struct trace_entry *entry = iter->ent;
1175 struct trace_seq *s = &iter->seq; 1213 struct trace_seq *s = &iter->seq;
@@ -1194,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1194 1232
1195 1233
1196static enum print_line_t 1234static enum print_line_t
1197trace_bprint_raw(struct trace_iterator *iter, int flags) 1235trace_bprint_raw(struct trace_iterator *iter, int flags,
1236 struct trace_event *event)
1198{ 1237{
1199 struct bprint_entry *field; 1238 struct bprint_entry *field;
1200 struct trace_seq *s = &iter->seq; 1239 struct trace_seq *s = &iter->seq;
@@ -1213,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1213 return TRACE_TYPE_PARTIAL_LINE; 1252 return TRACE_TYPE_PARTIAL_LINE;
1214} 1253}
1215 1254
1255static struct trace_event_functions trace_bprint_funcs = {
1256 .trace = trace_bprint_print,
1257 .raw = trace_bprint_raw,
1258};
1216 1259
1217static struct trace_event trace_bprint_event = { 1260static struct trace_event trace_bprint_event = {
1218 .type = TRACE_BPRINT, 1261 .type = TRACE_BPRINT,
1219 .trace = trace_bprint_print, 1262 .funcs = &trace_bprint_funcs,
1220 .raw = trace_bprint_raw,
1221}; 1263};
1222 1264
1223/* TRACE_PRINT */ 1265/* TRACE_PRINT */
1224static enum print_line_t trace_print_print(struct trace_iterator *iter, 1266static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 int flags) 1267 int flags, struct trace_event *event)
1226{ 1268{
1227 struct print_entry *field; 1269 struct print_entry *field;
1228 struct trace_seq *s = &iter->seq; 1270 struct trace_seq *s = &iter->seq;
@@ -1241,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1241 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1242} 1284}
1243 1285
1244static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1286static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1287 struct trace_event *event)
1245{ 1288{
1246 struct print_entry *field; 1289 struct print_entry *field;
1247 1290
@@ -1256,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1256 return TRACE_TYPE_PARTIAL_LINE; 1299 return TRACE_TYPE_PARTIAL_LINE;
1257} 1300}
1258 1301
1259static struct trace_event trace_print_event = { 1302static struct trace_event_functions trace_print_funcs = {
1260 .type = TRACE_PRINT,
1261 .trace = trace_print_print, 1303 .trace = trace_print_print,
1262 .raw = trace_print_raw, 1304 .raw = trace_print_raw,
1263}; 1305};
1264 1306
1307static struct trace_event trace_print_event = {
1308 .type = TRACE_PRINT,
1309 .funcs = &trace_print_funcs,
1310};
1311
1265 1312
1266static struct trace_event *events[] __initdata = { 1313static struct trace_event *events[] __initdata = {
1267 &trace_fn_event, 1314 &trace_fn_event,
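
The trace_output.c changes above split each event descriptor in two: a struct trace_event_functions that carries only the output callbacks, and a slim struct trace_event that keeps the type and points at the shared callback table, with every callback now also receiving the owning trace_event. A minimal user-space sketch of that shape, using illustrative names rather than the kernel's types:

#include <stdio.h>

struct event;                               /* forward declaration */

/* Shared table of output callbacks; several events may point at one table. */
struct event_functions {
        void (*print)(const struct event *ev, int flags);
        void (*raw)(const struct event *ev, int flags);
};

/* Per-event descriptor: identity plus a pointer to its callback table. */
struct event {
        int type;
        const char *name;
        const struct event_functions *funcs;
};

static void generic_print(const struct event *ev, int flags)
{
        printf("print: event %d (%s), flags=%d\n", ev->type, ev->name, flags);
}

static void generic_raw(const struct event *ev, int flags)
{
        printf("raw:   event %d (%s), flags=%d\n", ev->type, ev->name, flags);
}

/* One callback table shared by two different event types. */
static const struct event_functions generic_funcs = {
        .print = generic_print,
        .raw   = generic_raw,
};

static const struct event ctx_event  = { .type = 1, .name = "ctx",  .funcs = &generic_funcs };
static const struct event wake_event = { .type = 2, .name = "wake", .funcs = &generic_funcs };

int main(void)
{
        /* Callers dispatch through the table and pass the event along,
         * mirroring how the callbacks above gained a trace_event argument. */
        ctx_event.funcs->print(&ctx_event, 0);
        wake_event.funcs->raw(&wake_event, 0);
        return 0;
}
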
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a55fccfede5d..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,7 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct task_struct *prev, struct task_struct *next) 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54{ 54{
55 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
56 unsigned long flags; 56 unsigned long flags;
@@ -108,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
108} 108}
109 109
110static void 110static void
111probe_sched_wakeup(struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
112{ 112{
113 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
114 unsigned long flags; 114 unsigned long flags;
@@ -138,21 +138,21 @@ static int tracing_sched_register(void)
138{ 138{
139 int ret; 139 int ret;
140 140
141 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
142 if (ret) { 142 if (ret) {
143 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
144 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
145 return ret; 145 return ret;
146 } 146 }
147 147
148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
149 if (ret) { 149 if (ret) {
150 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
151 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
152 goto fail_deprobe; 152 goto fail_deprobe;
153 } 153 }
154 154
155 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
156 if (ret) { 156 if (ret) {
157 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
158 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -161,17 +161,17 @@ static int tracing_sched_register(void)
161 161
162 return ret; 162 return ret;
163fail_deprobe_wake_new: 163fail_deprobe_wake_new:
164 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
165fail_deprobe: 165fail_deprobe:
166 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
167 return ret; 167 return ret;
168} 168}
169 169
170static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
171{ 171{
172 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
173 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
174 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
175} 175}
176 176
177static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
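
All the sched probes above gain a leading void * argument, and each register_trace_*()/unregister_trace_*() call now passes a matching data cookie (NULL here), so the same callback can be attached several times with different private state. A small stand-alone sketch of that callback-plus-cookie convention, using made-up names:

#include <stdio.h>

/* A probe now takes an opaque cookie as its first argument. */
typedef void (*sched_probe_t)(void *data, int prev_pid, int next_pid);

struct probe_slot {
        sched_probe_t func;
        void *data;
};

static struct probe_slot slot;              /* single registered probe, for brevity */

static int register_probe(sched_probe_t func, void *data)
{
        if (slot.func)
                return -1;                  /* already registered */
        slot.func = func;
        slot.data = data;
        return 0;
}

static void fire_tracepoint(int prev_pid, int next_pid)
{
        /* The registration site decides what the cookie is; the tracepoint
         * simply hands it back to the probe on every hit. */
        if (slot.func)
                slot.func(slot.data, prev_pid, next_pid);
}

static void probe_sched_switch(void *data, int prev_pid, int next_pid)
{
        const char *tag = data ? data : "(no data)";

        printf("[%s] switch %d -> %d\n", tag, prev_pid, next_pid);
}

int main(void)
{
        /* Passing NULL mirrors register_trace_sched_switch(probe_sched_switch, NULL) above. */
        register_probe(probe_sched_switch, NULL);
        fire_tracepoint(100, 200);
        return 0;
}
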
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8052446ceeaa..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,7 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) 111probe_wakeup_sched_switch(void *ignore,
112 struct task_struct *prev, struct task_struct *next)
111{ 113{
112 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
113 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -199,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
199} 201}
200 202
201static void 203static void
202probe_wakeup(struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
203{ 205{
204 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
205 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -263,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
263{ 265{
264 int ret; 266 int ret;
265 267
266 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
267 if (ret) { 269 if (ret) {
268 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
269 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
270 return; 272 return;
271 } 273 }
272 274
273 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
274 if (ret) { 276 if (ret) {
275 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
276 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
277 goto fail_deprobe; 279 goto fail_deprobe;
278 } 280 }
279 281
280 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
281 if (ret) { 283 if (ret) {
282 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
283 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
284 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
285 } 287 }
286 288
287 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
288 if (ret) { 290 if (ret) {
289 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
290 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -311,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
311 313
312 return; 314 return;
313fail_deprobe_wake_new: 315fail_deprobe_wake_new:
314 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
315fail_deprobe: 317fail_deprobe:
316 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
317} 319}
318 320
319static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
320{ 322{
321 tracer_enabled = 0; 323 tracer_enabled = 0;
322 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
323 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
324 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
325 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
326 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
327} 329}
328 330
329static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
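
start_wakeup_tracer() and tracing_sched_register() both register several tracepoints in order and, when a later registration fails, fall through goto labels to undo only the ones that already succeeded. A compact sketch of that rollback idiom with toy registration functions (the simulated failure is just for demonstration):

#include <stdio.h>

/* Toy registration targets; each either succeeds (0) or fails (negative). */
static int register_wakeup(void)     { puts("register wakeup");     return 0; }
static int register_wakeup_new(void) { puts("register wakeup_new"); return 0; }
static int register_switch(void)     { puts("register switch");     return -1; /* simulated failure */ }

static void unregister_wakeup(void)     { puts("unregister wakeup"); }
static void unregister_wakeup_new(void) { puts("unregister wakeup_new"); }

/*
 * Register several hooks in sequence; on failure, fall through the labels to
 * undo only what already succeeded, in reverse order.
 */
static int start_tracer(void)
{
        int ret;

        ret = register_wakeup();
        if (ret)
                return ret;

        ret = register_wakeup_new();
        if (ret)
                goto fail_deprobe;

        ret = register_switch();
        if (ret)
                goto fail_deprobe_wake_new;

        return 0;

fail_deprobe_wake_new:
        unregister_wakeup_new();
fail_deprobe:
        unregister_wakeup();
        return ret;
}

int main(void)
{
        printf("start_tracer() = %d\n", start_tracer());
        return 0;
}
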
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..34e35804304b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
18extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
20 68
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
53} 101}
54 102
55enum print_line_t 103enum print_line_t
56print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
57{ 106{
58 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
59 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
68 if (!entry) 117 if (!entry)
69 goto end; 118 goto end;
70 119
71 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
72 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
73 goto end; 122 goto end;
74 } 123 }
@@ -105,7 +154,8 @@ end:
105} 154}
106 155
107enum print_line_t 156enum print_line_t
108print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
109{ 159{
110 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
111 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
123 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
124 } 174 }
125 175
126 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
127 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
128 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
129 } 179 }
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
205 kfree(call->print_fmt); 255 kfree(call->print_fmt);
206} 256}
207 257
208int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
209{ 259{
210 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
211 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
228 return ret; 278 return ret;
229} 279}
230 280
231int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
232{ 282{
233 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
234 int ret; 284 int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
243 return ret; 293 return ret;
244} 294}
245 295
246void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
247{ 297{
248 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
249 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
266 316
267 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
268 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
269 if (!event) 319 if (!event)
270 return; 320 return;
271 321
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
278 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
279} 329}
280 330
281void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
282{ 332{
283 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
284 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
297 return; 347 return;
298 348
299 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
300 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
301 if (!event) 351 if (!event)
302 return; 352 return;
303 353
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
320 return -ENOSYS; 370 return -ENOSYS;
321 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
322 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
323 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
324 if (!ret) { 374 if (!ret) {
325 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
326 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
340 sys_refcount_enter--; 390 sys_refcount_enter--;
341 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
342 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
343 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
344 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
345} 395}
346 396
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
354 return -ENOSYS; 404 return -ENOSYS;
355 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
356 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
357 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
358 if (!ret) { 408 if (!ret) {
359 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
360 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
374 sys_refcount_exit--; 424 sys_refcount_exit--;
375 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
376 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
377 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
378 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
379} 429}
380 430
@@ -434,11 +484,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
434static int sys_perf_refcount_enter; 484static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit; 485static int sys_perf_refcount_exit;
436 486
437static void perf_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
438{ 488{
439 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
440 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
441 unsigned long flags; 491 struct hlist_head *head;
442 int syscall_nr; 492 int syscall_nr;
443 int rctx; 493 int rctx;
444 int size; 494 int size;
@@ -461,14 +511,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
461 return; 511 return;
462 512
463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
464 sys_data->enter_event->id, &rctx, &flags); 514 sys_data->enter_event->event.type, regs, &rctx);
465 if (!rec) 515 if (!rec)
466 return; 516 return;
467 517
468 rec->nr = syscall_nr; 518 rec->nr = syscall_nr;
469 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
470 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 521
522 head = this_cpu_ptr(sys_data->enter_event->perf_events);
523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
472} 524}
473 525
474int perf_sysenter_enable(struct ftrace_event_call *call) 526int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -480,7 +532,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
480 532
481 mutex_lock(&syscall_trace_lock); 533 mutex_lock(&syscall_trace_lock);
482 if (!sys_perf_refcount_enter) 534 if (!sys_perf_refcount_enter)
483 ret = register_trace_sys_enter(perf_syscall_enter); 535 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
484 if (ret) { 536 if (ret) {
485 pr_info("event trace: Could not activate " 537 pr_info("event trace: Could not activate "
486 "syscall entry trace point"); 538 "syscall entry trace point");
@@ -502,15 +554,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
502 sys_perf_refcount_enter--; 554 sys_perf_refcount_enter--;
503 clear_bit(num, enabled_perf_enter_syscalls); 555 clear_bit(num, enabled_perf_enter_syscalls);
504 if (!sys_perf_refcount_enter) 556 if (!sys_perf_refcount_enter)
505 unregister_trace_sys_enter(perf_syscall_enter); 557 unregister_trace_sys_enter(perf_syscall_enter, NULL);
506 mutex_unlock(&syscall_trace_lock); 558 mutex_unlock(&syscall_trace_lock);
507} 559}
508 560
509static void perf_syscall_exit(struct pt_regs *regs, long ret) 561static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
510{ 562{
511 struct syscall_metadata *sys_data; 563 struct syscall_metadata *sys_data;
512 struct syscall_trace_exit *rec; 564 struct syscall_trace_exit *rec;
513 unsigned long flags; 565 struct hlist_head *head;
514 int syscall_nr; 566 int syscall_nr;
515 int rctx; 567 int rctx;
516 int size; 568 int size;
@@ -536,14 +588,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
536 return; 588 return;
537 589
538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 590 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
539 sys_data->exit_event->id, &rctx, &flags); 591 sys_data->exit_event->event.type, regs, &rctx);
540 if (!rec) 592 if (!rec)
541 return; 593 return;
542 594
543 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
544 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
545 597
546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 598 head = this_cpu_ptr(sys_data->exit_event->perf_events);
599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
547} 600}
548 601
549int perf_sysexit_enable(struct ftrace_event_call *call) 602int perf_sysexit_enable(struct ftrace_event_call *call)
@@ -555,7 +608,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
555 608
556 mutex_lock(&syscall_trace_lock); 609 mutex_lock(&syscall_trace_lock);
557 if (!sys_perf_refcount_exit) 610 if (!sys_perf_refcount_exit)
558 ret = register_trace_sys_exit(perf_syscall_exit); 611 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
559 if (ret) { 612 if (ret) {
560 pr_info("event trace: Could not activate " 613 pr_info("event trace: Could not activate "
561 "syscall exit trace point"); 614 "syscall exit trace point");
@@ -577,9 +630,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
577 sys_perf_refcount_exit--; 630 sys_perf_refcount_exit--;
578 clear_bit(num, enabled_perf_exit_syscalls); 631 clear_bit(num, enabled_perf_exit_syscalls);
579 if (!sys_perf_refcount_exit) 632 if (!sys_perf_refcount_exit)
580 unregister_trace_sys_exit(perf_syscall_exit); 633 unregister_trace_sys_exit(perf_syscall_exit, NULL);
581 mutex_unlock(&syscall_trace_lock); 634 mutex_unlock(&syscall_trace_lock);
582} 635}
583 636
584#endif /* CONFIG_PERF_EVENTS */ 637#endif /* CONFIG_PERF_EVENTS */
585 638
639static int syscall_enter_register(struct ftrace_event_call *event,
640 enum trace_reg type)
641{
642 switch (type) {
643 case TRACE_REG_REGISTER:
644 return reg_event_syscall_enter(event);
645 case TRACE_REG_UNREGISTER:
646 unreg_event_syscall_enter(event);
647 return 0;
648
649#ifdef CONFIG_PERF_EVENTS
650 case TRACE_REG_PERF_REGISTER:
651 return perf_sysenter_enable(event);
652 case TRACE_REG_PERF_UNREGISTER:
653 perf_sysenter_disable(event);
654 return 0;
655#endif
656 }
657 return 0;
658}
659
660static int syscall_exit_register(struct ftrace_event_call *event,
661 enum trace_reg type)
662{
663 switch (type) {
664 case TRACE_REG_REGISTER:
665 return reg_event_syscall_exit(event);
666 case TRACE_REG_UNREGISTER:
667 unreg_event_syscall_exit(event);
668 return 0;
669
670#ifdef CONFIG_PERF_EVENTS
671 case TRACE_REG_PERF_REGISTER:
672 return perf_sysexit_enable(event);
673 case TRACE_REG_PERF_UNREGISTER:
674 perf_sysexit_disable(event);
675 return 0;
676#endif
677 }
678 return 0;
679}
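
syscall_enter_register() and syscall_exit_register() collapse the ftrace and perf enable/disable entry points into a single callback that switches on the requested registration type. A hedged user-space sketch of that dispatcher shape, with placeholder backends standing in for the real reg_event_*/perf_* helpers:

#include <stdio.h>

enum reg_type {
        REG_REGISTER,
        REG_UNREGISTER,
        REG_PERF_REGISTER,
        REG_PERF_UNREGISTER,
};

struct event { const char *name; };

static int  enable_trace(struct event *ev)  { printf("trace on:  %s\n", ev->name); return 0; }
static void disable_trace(struct event *ev) { printf("trace off: %s\n", ev->name); }
static int  enable_perf(struct event *ev)   { printf("perf on:   %s\n", ev->name); return 0; }
static void disable_perf(struct event *ev)  { printf("perf off:  %s\n", ev->name); }

/* One entry point per event class: the caller says what kind of registration
 * it wants and the switch routes the request to the right backend. */
static int event_register(struct event *ev, enum reg_type type)
{
        switch (type) {
        case REG_REGISTER:
                return enable_trace(ev);
        case REG_UNREGISTER:
                disable_trace(ev);
                return 0;
        case REG_PERF_REGISTER:
                return enable_perf(ev);
        case REG_PERF_UNREGISTER:
                disable_perf(ev);
                return 0;
        }
        return 0;
}

int main(void)
{
        struct event ev = { .name = "sys_enter" };

        event_register(&ev, REG_REGISTER);
        event_register(&ev, REG_PERF_REGISTER);
        event_register(&ev, REG_PERF_UNREGISTER);
        event_register(&ev, REG_UNREGISTER);
        return 0;
}
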
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
49 49
50/* Insertion of a work */ 50/* Insertion of a work */
51static void 51static void
52probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
53 struct work_struct *work) 54 struct work_struct *work)
54{ 55{
55 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
70 71
71/* Execution of a work */ 72/* Execution of a work */
72static void 73static void
73probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
74 struct work_struct *work) 76 struct work_struct *work)
75{ 77{
76 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
90} 92}
91 93
92/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
93static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
94{ 97{
95 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
96 unsigned long flags; 99 unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
114} 117}
115 118
116/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
117static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
118{ 122{
119 /* Workqueue only executes on one cpu */ 123 /* Workqueue only executes on one cpu */
120 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
259{ 263{
260 int ret, cpu; 264 int ret, cpu;
261 265
262 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
263 if (ret) 267 if (ret)
264 goto out; 268 goto out;
265 269
266 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
267 if (ret) 271 if (ret)
268 goto no_insertion; 272 goto no_insertion;
269 273
270 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
271 if (ret) 275 if (ret)
272 goto no_execution; 276 goto no_execution;
273 277
274 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
275 if (ret) 279 if (ret)
276 goto no_creation; 280 goto no_creation;
277 281
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
283 return 0; 287 return 0;
284 288
285no_creation: 289no_creation:
286 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
287no_execution: 291no_execution:
288 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
289no_insertion: 293no_insertion:
290 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
291out: 295out:
292 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
293 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
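
With these two hunks the tracepoint core stores each probe as a (func, data) pair in an array terminated by a NULL func, and both duplicate detection and removal now compare the whole pair instead of the bare function pointer. A self-contained sketch of that array discipline, simplified to plain malloc/free with none of the kernel's RCU handling:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct tp_func {
        void (*func)(void *data);
        void *data;
};

/* Append (func, data) to a NULL-func-terminated array; reject exact duplicates.
 * Returns the (possibly unchanged) array. */
static struct tp_func *add_probe(struct tp_func *old, void (*func)(void *), void *data)
{
        size_t n = 0;
        struct tp_func *new;

        if (old)
                for (n = 0; old[n].func; n++)
                        if (old[n].func == func && old[n].data == data)
                                return old;          /* exact pair already present */

        new = malloc((n + 2) * sizeof(*new));        /* room for new entry + terminator */
        if (!new)
                return old;
        if (old)
                memcpy(new, old, n * sizeof(*new));
        new[n].func = func;
        new[n].data = data;
        new[n + 1].func = NULL;
        new[n + 1].data = NULL;
        free(old);
        return new;
}

/* Remove entries whose (func, data) pair matches; keep everything else. */
static struct tp_func *remove_probe(struct tp_func *old, void (*func)(void *), void *data)
{
        size_t n, kept = 0;
        struct tp_func *new;

        if (!old)
                return NULL;
        for (n = 0; old[n].func; n++)
                ;
        new = malloc((n + 1) * sizeof(*new));
        if (!new)
                return old;
        for (size_t i = 0; i < n; i++)
                if (old[i].func != func || old[i].data != data)
                        new[kept++] = old[i];
        new[kept].func = NULL;
        free(old);
        return new;
}

static void probe_a(void *data) { printf("probe_a(%s)\n", (char *)data); }

int main(void)
{
        static char ctx1[] = "ctx-1", ctx2[] = "ctx-2";
        struct tp_func *funcs = NULL;

        funcs = add_probe(funcs, probe_a, ctx1);
        funcs = add_probe(funcs, probe_a, ctx2);     /* same func, different data: new entry */
        funcs = add_probe(funcs, probe_a, ctx1);     /* exact duplicate: rejected */
        funcs = remove_probe(funcs, probe_a, ctx1);

        for (size_t i = 0; funcs[i].func; i++)       /* only the ctx-2 entry remains */
                funcs[i].func(funcs[i].data);
        free(funcs);
        return 0;
}
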
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which ensures that every preempt disabled section 391 * itself uses stop_machine(), which ensures that every preempt disabled section
383 * has finished. 392 * has finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
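
tracepoint_add_probe() and tracepoint_remove_probe() report failure by encoding an errno directly in the returned pointer, which the callers unpack with IS_ERR()/PTR_ERR(). A rough user-space imitation of that convention; the reserved range used below is an assumption of this sketch, not taken from the kernel headers:

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERR 4095UL   /* reserve the top addresses for error codes (sketch value) */

/* Encode a negative errno in a pointer. */
static inline void *err_ptr(long error)
{
        return (void *)error;
}

/* True if the pointer is actually an encoded error. */
static inline int is_err(const void *ptr)
{
        return (uintptr_t)ptr >= (uintptr_t)-MAX_ERR;
}

/* Recover the errno from an encoded pointer. */
static inline long ptr_err(const void *ptr)
{
        return (long)(intptr_t)ptr;
}

/* A lookup that reports "not found" through its return pointer, no out-params needed. */
static void *find_entry(int exists)
{
        static int entry = 42;

        return exists ? (void *)&entry : err_ptr(-ENOENT);
}

int main(void)
{
        void *p = find_entry(0);

        if (is_err(p))
                printf("lookup failed: errno %ld\n", -ptr_err(p));

        p = find_entry(1);
        if (!is_err(p))
                printf("lookup found value %d\n", *(int *)p);
        return 0;
}
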
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 77dabbf64b8f..327d2deb4451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1110,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1110 unsigned int cpu = (unsigned long)hcpu; 1110 unsigned int cpu = (unsigned long)hcpu;
1111 struct cpu_workqueue_struct *cwq; 1111 struct cpu_workqueue_struct *cwq;
1112 struct workqueue_struct *wq; 1112 struct workqueue_struct *wq;
1113 int ret = NOTIFY_OK; 1113 int err = 0;
1114 1114
1115 action &= ~CPU_TASKS_FROZEN; 1115 action &= ~CPU_TASKS_FROZEN;
1116 1116
@@ -1124,12 +1124,13 @@ undo:
1124 1124
1125 switch (action) { 1125 switch (action) {
1126 case CPU_UP_PREPARE: 1126 case CPU_UP_PREPARE:
1127 if (!create_workqueue_thread(cwq, cpu)) 1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1128 break; 1129 break;
1129 printk(KERN_ERR "workqueue [%s] for %i failed\n", 1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1130 wq->name, cpu); 1131 wq->name, cpu);
1131 action = CPU_UP_CANCELED; 1132 action = CPU_UP_CANCELED;
1132 ret = NOTIFY_BAD; 1133 err = -ENOMEM;
1133 goto undo; 1134 goto undo;
1134 1135
1135 case CPU_ONLINE: 1136 case CPU_ONLINE:
@@ -1150,7 +1151,7 @@ undo:
1150 cpumask_clear_cpu(cpu, cpu_populated_map); 1151 cpumask_clear_cpu(cpu, cpu_populated_map);
1151 } 1152 }
1152 1153
1153 return ret; 1154 return notifier_from_errno(err);
1154} 1155}
1155 1156
1156#ifdef CONFIG_SMP 1157#ifdef CONFIG_SMP
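
The workqueue.c hunk stops returning a bare NOTIFY_BAD and instead tracks a real errno, letting notifier_from_errno() fold it into the notifier return value so the hotplug core can recover the actual error. A toy round-trip of that encoding; the constants and bit layout below are simplified stand-ins, not the values from the kernel's notifier.h:

#include <stdio.h>
#include <errno.h>

/* Notifier return values (illustrative subset). */
#define NOTIFY_OK        0x0001
#define NOTIFY_STOP_MASK 0x8000

/* Fold a zero-or-negative errno into a notifier return value. */
static int notifier_from_err(int err)
{
        if (err)
                return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
        return NOTIFY_OK;
}

/* Recover the errno on the caller's side; 0 if the callback succeeded. */
static int notifier_to_err(int ret)
{
        if (!(ret & NOTIFY_STOP_MASK))
                return 0;
        return NOTIFY_OK - (ret & ~NOTIFY_STOP_MASK);
}

/* A callback that reports failure via the encoded return value instead of a
 * bare NOTIFY_BAD, so the caller learns why bringing the CPU up failed. */
static int cpu_callback(int cpu, int fail)
{
        int err = fail ? -ENOMEM : 0;

        printf("cpu %d callback -> err %d\n", cpu, err);
        return notifier_from_err(err);
}

int main(void)
{
        int ret = cpu_callback(3, 1);

        printf("caller sees errno %d\n", notifier_to_err(ret));
        ret = cpu_callback(4, 0);
        printf("caller sees errno %d\n", notifier_to_err(ret));
        return 0;
}
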