author    Dmitry Torokhov <dmitry.torokhov@gmail.com>    2009-09-14 00:16:56 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>    2009-09-14 00:16:56 -0400
commit    fc8e1ead9314cf0e0f1922e661428b93d3a50d88
tree      f3cb97c4769b74f6627a59769f1ed5c92a13c58a /kernel
parent    2bcaa6a4238094c5695d5b1943078388d82d3004
parent    9de48cc300fb10f7d9faa978670becf5e352462a

Merge branch 'next' into for-linus
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 146
-rw-r--r--  kernel/audit.h | 43
-rw-r--r--  kernel/audit_tree.c | 72
-rw-r--r--  kernel/audit_watch.c | 543
-rw-r--r--  kernel/auditfilter.c | 518
-rw-r--r--  kernel/auditsc.c | 33
-rw-r--r--  kernel/cgroup.c | 168
-rw-r--r--  kernel/compat.c | 11
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 260
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/exit.c | 317
-rw-r--r--  kernel/fork.c | 84
-rw-r--r--  kernel/freezer.c | 7
-rw-r--r--  kernel/futex.c | 1246
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/gcov/Kconfig | 48
-rw-r--r--  kernel/gcov/Makefile | 3
-rw-r--r--  kernel/gcov/base.c | 148
-rw-r--r--  kernel/gcov/fs.c | 673
-rw-r--r--  kernel/gcov/gcc_3_4.c | 447
-rw-r--r--  kernel/gcov/gcov.h | 128
-rw-r--r--  kernel/groups.c | 288
-rw-r--r--  kernel/hrtimer.c | 86
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/handle.c | 74
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/manage.c | 95
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/numa_migrate.c | 42
-rw-r--r--  kernel/kallsyms.c | 134
-rw-r--r--  kernel/kexec.c | 16
-rw-r--r--  kernel/kfifo.c | 4
-rw-r--r--  kernel/kmod.c | 1
-rw-r--r--  kernel/kprobes.c | 8
-rw-r--r--  kernel/kthread.c | 97
-rw-r--r--  kernel/lockdep.c | 16
-rw-r--r--  kernel/lockdep_proc.c | 3
-rw-r--r--  kernel/module.c | 126
-rw-r--r--  kernel/mutex.c | 31
-rw-r--r--  kernel/nsproxy.c | 19
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 46
-rw-r--r--  kernel/perf_counter.c | 4860
-rw-r--r--  kernel/pid.c | 17
-rw-r--r--  kernel/pid_namespace.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 7
-rw-r--r--  kernel/posix-timers.c | 7
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/hibernate.c (renamed from kernel/power/disk.c) | 34
-rw-r--r--  kernel/power/hibernate_nvs.c | 135
-rw-r--r--  kernel/power/main.c | 521
-rw-r--r--  kernel/power/power.h | 25
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 80
-rw-r--r--  kernel/power/suspend.c | 300
-rw-r--r--  kernel/power/suspend_test.c | 187
-rw-r--r--  kernel/power/swsusp.c | 198
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/printk.c | 33
-rw-r--r--  kernel/profile.c | 19
-rw-r--r--  kernel/ptrace.c | 178
-rw-r--r--  kernel/rcupreempt.c | 8
-rw-r--r--  kernel/rcutree.c | 28
-rw-r--r--  kernel/rcutree_trace.c | 64
-rw-r--r--  kernel/res_counter.c | 12
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex.c | 248
-rw-r--r--  kernel/rtmutex_common.h | 8
-rw-r--r--  kernel/sched.c | 530
-rw-r--r--  kernel/sched_cpupri.c | 25
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 61
-rw-r--r--  kernel/sched_idletask.c | 3
-rw-r--r--  kernel/sched_rt.c | 18
-rw-r--r--  kernel/signal.c | 119
-rw-r--r--  kernel/slow-work.c | 27
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 85
-rw-r--r--  kernel/sys.c | 290
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 93
-rw-r--r--  kernel/time/clockevents.c | 19
-rw-r--r--  kernel/time/clocksource.c | 23
-rw-r--r--  kernel/time/tick-broadcast.c | 9
-rw-r--r--  kernel/time/tick-oneshot.c | 17
-rw-r--r--  kernel/time/tick-sched.c | 19
-rw-r--r--  kernel/time/timekeeping.c | 9
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 145
-rw-r--r--  kernel/trace/Kconfig | 167
-rw-r--r--  kernel/trace/Makefile | 20
-rw-r--r--  kernel/trace/blktrace.c | 296
-rw-r--r--  kernel/trace/events.c | 14
-rw-r--r--  kernel/trace/ftrace.c | 901
-rw-r--r--  kernel/trace/kmemtrace.c | 12
-rw-r--r--  kernel/trace/ring_buffer.c | 1043
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 419
-rw-r--r--  kernel/trace/trace.c | 461
-rw-r--r--  kernel/trace/trace.h | 254
-rw-r--r--  kernel/trace/trace_boot.c | 5
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_event_profile.c | 24
-rw-r--r--  kernel/trace/trace_event_types.h | 15
-rw-r--r--  kernel/trace/trace_events.c | 861
-rw-r--r--  kernel/trace/trace_events_filter.c | 1213
-rw-r--r--  kernel/trace/trace_events_stage_1.h | 39
-rw-r--r--  kernel/trace/trace_events_stage_2.h | 176
-rw-r--r--  kernel/trace/trace_events_stage_3.h | 281
-rw-r--r--  kernel/trace/trace_export.c | 110
-rw-r--r--  kernel/trace/trace_functions.c | 13
-rw-r--r--  kernel/trace/trace_functions_graph.c | 78
-rw-r--r--  kernel/trace/trace_hw_branches.c | 203
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 239
-rw-r--r--  kernel/trace/trace_output.h | 34
-rw-r--r--  kernel/trace/trace_power.c | 8
-rw-r--r--  kernel/trace/trace_printk.c | 34
-rw-r--r--  kernel/trace/trace_sched_switch.c | 12
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 8
-rw-r--r--  kernel/trace/trace_selftest.c | 58
-rw-r--r--  kernel/trace/trace_stack.c | 26
-rw-r--r--  kernel/trace/trace_stat.c | 232
-rw-r--r--  kernel/trace/trace_stat.h | 2
-rw-r--r--  kernel/trace/trace_sysprof.c | 9
-rw-r--r--  kernel/trace/trace_workqueue.c | 25
-rw-r--r--  kernel/user.c | 67
-rw-r--r--  kernel/utsname.c | 13
-rw-r--r--  kernel/wait.c | 7
-rw-r--r--  kernel/workqueue.c | 11
136 files changed, 16537 insertions(+), 5450 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..2093a691f1c2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
 	    async.o
+obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -68,8 +69,9 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
@@ -93,8 +95,11 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9f3391090b3e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 static int acct_on(char *name)
 {
 	struct file *file;
+	struct vfsmount *mnt;
 	int error;
 	struct pid_namespace *ns;
 	struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
 		acct = NULL;
 	}
 
-	mnt_pin(file->f_path.mnt);
+	mnt = file->f_path.mnt;
+	mnt_pin(mnt);
 	acct_file_reopen(ns->bacct, file, ns);
 	spin_unlock(&acct_lock);
 
-	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+	mntput(mnt); /* it's pinned, now give up active reference */
 	kfree(acct);
 
 	return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a82..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -568,7 +569,7 @@ void audit_trim_trees(void)
568 if (err) 569 if (err)
569 goto skip_it; 570 goto skip_it;
570 571
571 root_mnt = collect_mounts(path.mnt, path.dentry); 572 root_mnt = collect_mounts(&path);
572 path_put(&path); 573 path_put(&path);
573 if (!root_mnt) 574 if (!root_mnt)
574 goto skip_it; 575 goto skip_it;
@@ -660,7 +661,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
660 err = kern_path(tree->pathname, 0, &path); 661 err = kern_path(tree->pathname, 0, &path);
661 if (err) 662 if (err)
662 goto Err; 663 goto Err;
663 mnt = collect_mounts(path.mnt, path.dentry); 664 mnt = collect_mounts(&path);
664 path_put(&path); 665 path_put(&path);
665 if (!mnt) { 666 if (!mnt) {
666 err = -ENOMEM; 667 err = -ENOMEM;
@@ -720,7 +721,7 @@ int audit_tag_tree(char *old, char *new)
720 err = kern_path(new, 0, &path); 721 err = kern_path(new, 0, &path);
721 if (err) 722 if (err)
722 return err; 723 return err;
723 tagged = collect_mounts(path.mnt, path.dentry); 724 tagged = collect_mounts(&path);
724 path_put(&path); 725 path_put(&path);
725 if (!tagged) 726 if (!tagged)
726 return -ENOMEM; 727 return -ENOMEM;
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
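
The auditfilter.c hunks above replace direct uses of watch->ino and watch->dev with accessor calls (audit_watch_inode(), audit_watch_dev(), audit_watch_rules()) so that struct audit_watch can become private to the new kernel/audit_watch.c. Below is a small, self-contained userspace sketch of that opaque-handle-plus-accessors idea; the field layout and the main() driver are illustrative, not kernel code.

#include <stdio.h>

/* Illustrative only: in the kernel the structure definition is private to
 * audit_watch.c, while auditfilter.c and auditsc.c see just the accessors. */
struct audit_watch {
        unsigned long ino;
        unsigned int dev;
};

static unsigned long audit_watch_inode(const struct audit_watch *w)
{
        return w->ino;          /* callers no longer touch w->ino directly */
}

static unsigned int audit_watch_dev(const struct audit_watch *w)
{
        return w->dev;
}

int main(void)
{
        struct audit_watch w = { .ino = 42, .dev = 8 };

        printf("ino=%lu dev=%u\n", audit_watch_inode(&w), audit_watch_dev(&w));
        return 0;
}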
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
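
The auditsc.c changes add a killed_trees list to struct audit_context and drain it in audit_free() and audit_syscall_exit(). A rough userspace sketch of that defer-then-drain pattern follows; every type and helper name in it is invented for illustration.

#include <stdio.h>
#include <stdlib.h>

/* Work that cannot be finished in the middle of an operation is queued on a
 * per-context list and released once, at the end of the operation. */
struct dead_tree {
        int id;
        struct dead_tree *next;
};

struct op_context {
        struct dead_tree *killed_trees;         /* collected during the operation */
};

static void defer_kill(struct op_context *ctx, int id)
{
        struct dead_tree *t = malloc(sizeof(*t));

        if (!t)
                return;
        t->id = id;
        t->next = ctx->killed_trees;
        ctx->killed_trees = t;
}

static void kill_trees(struct op_context *ctx)  /* drained at exit */
{
        while (ctx->killed_trees) {
                struct dead_tree *t = ctx->killed_trees;

                ctx->killed_trees = t->next;
                printf("releasing tree %d\n", t->id);
                free(t);
        }
}

int main(void)
{
        struct op_context ctx = { .killed_trees = NULL };

        defer_kill(&ctx, 1);
        defer_kill(&ctx, 2);
        kill_trees(&ctx);       /* analogous to the checks at syscall/process exit */
        return 0;
}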
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd3765..b6eadfe30e7b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,8 @@
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
49 51
50#include <asm/atomic.h> 52#include <asm/atomic.h>
51 53
@@ -733,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
733 * reference to css->refcnt. In general, this refcnt is expected to goes down 735 * reference to css->refcnt. In general, this refcnt is expected to goes down
734 * to zero, soon. 736 * to zero, soon.
735 * 737 *
736 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 738 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
737 */ 739 */
738DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 740DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
739 741
740static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 742static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
741{ 743{
742 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 744 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
743 wake_up_all(&cgroup_rmdir_waitq); 745 wake_up_all(&cgroup_rmdir_waitq);
744} 746}
745 747
748void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
749{
750 css_get(css);
751}
752
753void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
754{
755 cgroup_wakeup_rmdir_waiter(css->cgroup);
756 css_put(css);
757}
758
759
746static int rebind_subsystems(struct cgroupfs_root *root, 760static int rebind_subsystems(struct cgroupfs_root *root,
747 unsigned long final_bits) 761 unsigned long final_bits)
748{ 762{
@@ -842,6 +856,11 @@ static int parse_cgroupfs_options(char *data,
842 struct cgroup_sb_opts *opts) 856 struct cgroup_sb_opts *opts)
843{ 857{
844 char *token, *o = data ?: "all"; 858 char *token, *o = data ?: "all";
859 unsigned long mask = (unsigned long)-1;
860
861#ifdef CONFIG_CPUSETS
862 mask = ~(1UL << cpuset_subsys_id);
863#endif
845 864
846 opts->subsys_bits = 0; 865 opts->subsys_bits = 0;
847 opts->flags = 0; 866 opts->flags = 0;
@@ -886,6 +905,15 @@ static int parse_cgroupfs_options(char *data,
886 } 905 }
887 } 906 }
888 907
908 /*
909 * Option noprefix was introduced just for backward compatibility
910 * with the old cpuset, so we allow noprefix only if mounting just
911 * the cpuset subsystem.
912 */
913 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
914 (opts->subsys_bits & mask))
915 return -EINVAL;
916
889 /* We can't have an empty hierarchy */ 917 /* We can't have an empty hierarchy */
890 if (!opts->subsys_bits) 918 if (!opts->subsys_bits)
891 return -EINVAL; 919 return -EINVAL;
@@ -900,6 +928,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
900 struct cgroup *cgrp = &root->top_cgroup; 928 struct cgroup *cgrp = &root->top_cgroup;
901 struct cgroup_sb_opts opts; 929 struct cgroup_sb_opts opts;
902 930
931 lock_kernel();
903 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 932 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
904 mutex_lock(&cgroup_mutex); 933 mutex_lock(&cgroup_mutex);
905 934
@@ -927,6 +956,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
927 kfree(opts.release_agent); 956 kfree(opts.release_agent);
928 mutex_unlock(&cgroup_mutex); 957 mutex_unlock(&cgroup_mutex);
929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 958 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
959 unlock_kernel();
930 return ret; 960 return ret;
931} 961}
932 962
@@ -943,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
943 INIT_LIST_HEAD(&cgrp->children); 973 INIT_LIST_HEAD(&cgrp->children);
944 INIT_LIST_HEAD(&cgrp->css_sets); 974 INIT_LIST_HEAD(&cgrp->css_sets);
945 INIT_LIST_HEAD(&cgrp->release_list); 975 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list);
946 init_rwsem(&cgrp->pids_mutex); 977 init_rwsem(&cgrp->pids_mutex);
947} 978}
948static void init_cgroup_root(struct cgroupfs_root *root) 979static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1340,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1340 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1371 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1341 * is no longer empty. 1372 * is no longer empty.
1342 */ 1373 */
1343 cgroup_wakeup_rmdir_waiters(cgrp); 1374 cgroup_wakeup_rmdir_waiter(cgrp);
1344 return 0; 1375 return 0;
1345} 1376}
1346 1377
@@ -2184,12 +2215,30 @@ err:
2184 return ret; 2215 return ret;
2185} 2216}
2186 2217
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
 2227 /* The namespace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
 2231 /* How many files are using this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2187static int cmppid(const void *a, const void *b) 2237static int cmppid(const void *a, const void *b)
2188{ 2238{
2189 return *(pid_t *)a - *(pid_t *)b; 2239 return *(pid_t *)a - *(pid_t *)b;
2190} 2240}
2191 2241
2192
2193/* 2242/*
2194 * seq_file methods for the "tasks" file. The seq_file position is the 2243 * seq_file methods for the "tasks" file. The seq_file position is the
2195 * next pid to display; the seq_file iterator is a pointer to the pid 2244 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2204,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2204 * after a seek to the start). Use a binary-search to find the 2253 * after a seek to the start). Use a binary-search to find the
2205 * next pid to display, if any 2254 * next pid to display, if any
2206 */ 2255 */
2207 struct cgroup *cgrp = s->private; 2256 struct cgroup_pids *cp = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2208 int index = 0, pid = *pos; 2258 int index = 0, pid = *pos;
2209 int *iter; 2259 int *iter;
2210 2260
2211 down_read(&cgrp->pids_mutex); 2261 down_read(&cgrp->pids_mutex);
2212 if (pid) { 2262 if (pid) {
2213 int end = cgrp->pids_length; 2263 int end = cp->length;
2214 2264
2215 while (index < end) { 2265 while (index < end) {
2216 int mid = (index + end) / 2; 2266 int mid = (index + end) / 2;
2217 if (cgrp->tasks_pids[mid] == pid) { 2267 if (cp->tasks_pids[mid] == pid) {
2218 index = mid; 2268 index = mid;
2219 break; 2269 break;
2220 } else if (cgrp->tasks_pids[mid] <= pid) 2270 } else if (cp->tasks_pids[mid] <= pid)
2221 index = mid + 1; 2271 index = mid + 1;
2222 else 2272 else
2223 end = mid; 2273 end = mid;
2224 } 2274 }
2225 } 2275 }
2226 /* If we're off the end of the array, we're done */ 2276 /* If we're off the end of the array, we're done */
2227 if (index >= cgrp->pids_length) 2277 if (index >= cp->length)
2228 return NULL; 2278 return NULL;
2229 /* Update the abstract position to be the actual pid that we found */ 2279 /* Update the abstract position to be the actual pid that we found */
2230 iter = cgrp->tasks_pids + index; 2280 iter = cp->tasks_pids + index;
2231 *pos = *iter; 2281 *pos = *iter;
2232 return iter; 2282 return iter;
2233} 2283}
2234 2284
2235static void cgroup_tasks_stop(struct seq_file *s, void *v) 2285static void cgroup_tasks_stop(struct seq_file *s, void *v)
2236{ 2286{
2237 struct cgroup *cgrp = s->private; 2287 struct cgroup_pids *cp = s->private;
2288 struct cgroup *cgrp = cp->cgrp;
2238 up_read(&cgrp->pids_mutex); 2289 up_read(&cgrp->pids_mutex);
2239} 2290}
2240 2291
2241static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2242{ 2293{
2243 struct cgroup *cgrp = s->private; 2294 struct cgroup_pids *cp = s->private;
2244 int *p = v; 2295 int *p = v;
2245 int *end = cgrp->tasks_pids + cgrp->pids_length; 2296 int *end = cp->tasks_pids + cp->length;
2246 2297
2247 /* 2298 /*
2248 * Advance to the next pid in the array. If this goes off the 2299 * Advance to the next pid in the array. If this goes off the
@@ -2269,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2269 .show = cgroup_tasks_show, 2320 .show = cgroup_tasks_show,
2270}; 2321};
2271 2322
2272static void release_cgroup_pid_array(struct cgroup *cgrp) 2323static void release_cgroup_pid_array(struct cgroup_pids *cp)
2273{ 2324{
2325 struct cgroup *cgrp = cp->cgrp;
2326
2274 down_write(&cgrp->pids_mutex); 2327 down_write(&cgrp->pids_mutex);
2275 BUG_ON(!cgrp->pids_use_count); 2328 BUG_ON(!cp->use_count);
2276 if (!--cgrp->pids_use_count) { 2329 if (!--cp->use_count) {
2277 kfree(cgrp->tasks_pids); 2330 list_del(&cp->list);
2278 cgrp->tasks_pids = NULL; 2331 put_pid_ns(cp->ns);
2279 cgrp->pids_length = 0; 2332 kfree(cp->tasks_pids);
2333 kfree(cp);
2280 } 2334 }
2281 up_write(&cgrp->pids_mutex); 2335 up_write(&cgrp->pids_mutex);
2282} 2336}
2283 2337
2284static int cgroup_tasks_release(struct inode *inode, struct file *file) 2338static int cgroup_tasks_release(struct inode *inode, struct file *file)
2285{ 2339{
2286 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2340 struct seq_file *seq;
2341 struct cgroup_pids *cp;
2287 2342
2288 if (!(file->f_mode & FMODE_READ)) 2343 if (!(file->f_mode & FMODE_READ))
2289 return 0; 2344 return 0;
2290 2345
2291 release_cgroup_pid_array(cgrp); 2346 seq = file->private_data;
2347 cp = seq->private;
2348
2349 release_cgroup_pid_array(cp);
2292 return seq_release(inode, file); 2350 return seq_release(inode, file);
2293} 2351}
2294 2352
@@ -2307,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
2307static int cgroup_tasks_open(struct inode *unused, struct file *file) 2365static int cgroup_tasks_open(struct inode *unused, struct file *file)
2308{ 2366{
2309 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns;
2369 struct cgroup_pids *cp;
2310 pid_t *pidarray; 2370 pid_t *pidarray;
2311 int npids; 2371 int npids;
2312 int retval; 2372 int retval;
@@ -2333,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2333 * array if necessary 2393 * array if necessary
2334 */ 2394 */
2335 down_write(&cgrp->pids_mutex); 2395 down_write(&cgrp->pids_mutex);
2336 kfree(cgrp->tasks_pids); 2396
2337 cgrp->tasks_pids = pidarray; 2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2338 cgrp->pids_length = npids; 2398 if (ns == cp->ns)
2339 cgrp->pids_use_count++; 2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2340 up_write(&cgrp->pids_mutex); 2417 up_write(&cgrp->pids_mutex);
2341 2418
2342 file->f_op = &cgroup_tasks_operations; 2419 file->f_op = &cgroup_tasks_operations;
2343 2420
2344 retval = seq_open(file, &cgroup_tasks_seq_operations); 2421 retval = seq_open(file, &cgroup_tasks_seq_operations);
2345 if (retval) { 2422 if (retval) {
2346 release_cgroup_pid_array(cgrp); 2423 release_cgroup_pid_array(cp);
2347 return retval; 2424 return retval;
2348 } 2425 }
2349 ((struct seq_file *)file->private_data)->private = cgrp; 2426 ((struct seq_file *)file->private_data)->private = cp;
2350 return 0; 2427 return 0;
2351} 2428}
2352 2429
@@ -2679,33 +2756,42 @@ again:
2679 mutex_unlock(&cgroup_mutex); 2756 mutex_unlock(&cgroup_mutex);
2680 2757
2681 /* 2758 /*
 2759 * In general, a subsystem has no css->refcnt after pre_destroy(). But
 2760 * in racy cases a subsystem may still take css->refcnt after
 2761 * pre_destroy(), which makes rmdir return -EBUSY. That can make rmdir
 2762 * fail far too often, so we use a waitqueue for cgroup's rmdir.
 2763 * CGRP_WAIT_ON_RMDIR synchronizes rmdir with the subsystem's
 2764 * reference-count handling; see css_get/put, css_tryget() and the
 2765 * cgroup_wakeup_rmdir_waiter() implementation.
2766 */
2767 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2768
2769 /*
2682 * Call pre_destroy handlers of subsys. Notify subsystems 2770 * Call pre_destroy handlers of subsys. Notify subsystems
2683 * that rmdir() request comes. 2771 * that rmdir() request comes.
2684 */ 2772 */
2685 ret = cgroup_call_pre_destroy(cgrp); 2773 ret = cgroup_call_pre_destroy(cgrp);
2686 if (ret) 2774 if (ret) {
2775 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2687 return ret; 2776 return ret;
2777 }
2688 2778
2689 mutex_lock(&cgroup_mutex); 2779 mutex_lock(&cgroup_mutex);
2690 parent = cgrp->parent; 2780 parent = cgrp->parent;
2691 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2781 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2782 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2692 mutex_unlock(&cgroup_mutex); 2783 mutex_unlock(&cgroup_mutex);
2693 return -EBUSY; 2784 return -EBUSY;
2694 } 2785 }
2695 /*
2696 * css_put/get is provided for subsys to grab refcnt to css. In typical
2697 * case, subsystem has no reference after pre_destroy(). But, under
2698 * hierarchy management, some *temporal* refcnt can be hold.
2699 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2700 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2701 * is called when css_put() is called and refcnt goes down to 0.
2702 */
2703 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2704 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2786 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2705
2706 if (!cgroup_clear_css_refs(cgrp)) { 2787 if (!cgroup_clear_css_refs(cgrp)) {
2707 mutex_unlock(&cgroup_mutex); 2788 mutex_unlock(&cgroup_mutex);
2708 schedule(); 2789 /*
2790 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2791 * prepare_to_wait(), we need to check this flag.
2792 */
2793 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2794 schedule();
2709 finish_wait(&cgroup_rmdir_waitq, &wait); 2795 finish_wait(&cgroup_rmdir_waitq, &wait);
2710 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2796 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2711 if (signal_pending(current)) 2797 if (signal_pending(current))
@@ -3277,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
3277 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3363 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3278 check_for_release(cgrp); 3364 check_for_release(cgrp);
3279 } 3365 }
3280 cgroup_wakeup_rmdir_waiters(cgrp); 3366 cgroup_wakeup_rmdir_waiter(cgrp);
3281 } 3367 }
3282 rcu_read_unlock(); 3368 rcu_read_unlock();
3283} 3369}
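
The cgroup.c rmdir path now sets CGRP_WAIT_ON_RMDIR before calling pre_destroy(), lets cgroup_wakeup_rmdir_waiter() test-and-clear it, and re-checks the flag after prepare_to_wait() so a wakeup that arrives early is not lost. A pthreads analogue of that flag-plus-recheck idiom is sketched below; it uses a condition variable instead of a waitqueue and all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool wait_on_rmdir;              /* stand-in for CGRP_WAIT_ON_RMDIR */

static void *waker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        if (wait_on_rmdir) {            /* like test_and_clear_bit() + wake_up_all() */
                wait_on_rmdir = false;
                pthread_cond_broadcast(&cond);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        wait_on_rmdir = true;           /* set before the race with the waker can start */
        pthread_create(&t, NULL, waker, NULL);

        pthread_mutex_lock(&lock);
        while (wait_on_rmdir)           /* re-check: the wakeup may already have happened */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        puts("woken (or never needed to sleep)");
        return 0;
}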
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
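
compat_sys_rt_tgsigqueueinfo() above follows the usual compat-syscall shape: copy the 32-bit siginfo layout into the native structure, then delegate to the shared do_rt_tgsigqueueinfo(). Here is a standalone sketch of that convert-then-delegate pattern; the struct layouts and helpers below are made up for the example.

#include <stdint.h>
#include <stdio.h>

struct siginfo32 { int32_t signo; int32_t code; int32_t pid; };      /* 32-bit ABI layout */
struct siginfo_native { int signo; int code; long pid; };            /* native layout */

/* One shared implementation, used by both the native and the compat entry. */
static int do_tgsigqueueinfo(int tgid, int pid, int sig,
                             const struct siginfo_native *info)
{
        printf("tgid=%d pid=%d sig=%d sender=%ld\n", tgid, pid, sig, info->pid);
        return 0;
}

/* Compat wrapper: widen the 32-bit fields, then call the common helper. */
static int compat_tgsigqueueinfo(int tgid, int pid, int sig,
                                 const struct siginfo32 *uinfo)
{
        struct siginfo_native info = {
                .signo = uinfo->signo,
                .code  = uinfo->code,
                .pid   = uinfo->pid,
        };

        return do_tgsigqueueinfo(tgid, pid, sig, &info);
}

int main(void)
{
        struct siginfo32 si = { .signo = 10, .code = 0, .pid = 1234 };

        return compat_tgsigqueueinfo(1, 1, 10, &si);
}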
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..8ce10043e4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 232 * be accessed in the context of that task, so require no locks. 205 * by another task, so we use alloc_lock in the task_struct to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 1147 * @cs: the cpuset in which each task's spread flags need to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
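
cpuset_change_task_nodemask() above deliberately ORs the new nodes into the task's mask, rebinds, and only then installs the new mask, so the task never sees an empty mems_allowed even when the old and new node sets are disjoint. A tiny standalone sketch of that grow-then-shrink ordering, with plain bitmasks standing in for nodemask_t and an invented rebind() helper:

#include <stdio.h>

static unsigned int task_mems_allowed = 0x3;    /* old nodes {0,1} */

static void rebind(unsigned int allowed)
{
        printf("rebinding against 0x%x\n", allowed);
}

static void change_task_nodemask(unsigned int newmems)
{
        task_mems_allowed |= newmems;   /* grow: old | new, never empty */
        rebind(task_mems_allowed);
        rebind(newmems);
        task_mems_allowed = newmems;    /* shrink to exactly the new set */
}

int main(void)
{
        change_task_nodemask(0xc);      /* new nodes {2,3}, disjoint from the old ones */
        printf("final mask 0x%x\n", task_mems_allowed);
        return 0;
}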
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d707..1bb4d7e5d616 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
167 167
168/* 168/*
169 * Prepare credentials for current to perform an execve() 169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex 170 * - The caller must hold current->cred_guard_mutex
171 */ 171 */
172struct cred *prepare_exec_creds(void) 172struct cred *prepare_exec_creds(void)
173{ 173{
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
276 struct cred *new; 276 struct cred *new;
277 int ret; 277 int ret;
278 278
279 mutex_init(&p->cred_exec_mutex); 279 mutex_init(&p->cred_guard_mutex);
280 280
281 if ( 281 if (
282#ifdef CONFIG_KEYS 282#ifdef CONFIG_KEYS
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..869dc221733e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -48,7 +47,8 @@
48#include <linux/tracehook.h> 47#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
50#include <linux/init_task.h> 49#include <linux/init_task.h>
51#include <trace/sched.h> 50#include <linux/perf_counter.h>
51#include <trace/events/sched.h>
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
54#include <asm/unistd.h> 54#include <asm/unistd.h>
@@ -56,10 +56,6 @@
56#include <asm/mmu_context.h> 56#include <asm/mmu_context.h>
57#include "cred-internals.h" 57#include "cred-internals.h"
58 58
59DEFINE_TRACE(sched_process_free);
60DEFINE_TRACE(sched_process_exit);
61DEFINE_TRACE(sched_process_wait);
62
63static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
64 60
65static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p)
@@ -158,6 +154,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 154{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 156
157#ifdef CONFIG_PERF_COUNTERS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp);
159#endif
161 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 161 put_task_struct(tsk);
163} 162}
@@ -174,6 +173,7 @@ repeat:
174 atomic_dec(&__task_cred(p)->user->processes); 173 atomic_dec(&__task_cred(p)->user->processes);
175 174
176 proc_flush_task(p); 175 proc_flush_task(p);
176
177 write_lock_irq(&tasklist_lock); 177 write_lock_irq(&tasklist_lock);
178 tracehook_finish_release_task(p); 178 tracehook_finish_release_task(p);
179 __exit_signal(p); 179 __exit_signal(p);
@@ -374,9 +374,8 @@ static void set_special_pids(struct pid *pid)
374} 374}
375 375
376/* 376/*
377 * Let kernel threads use this to say that they 377 * Let kernel threads use this to say that they allow a certain signal.
378 * allow a certain signal (since daemonize() will 378 * Must not be used if kthread was cloned with CLONE_SIGHAND.
379 * have disabled all of them by default).
380 */ 379 */
381int allow_signal(int sig) 380int allow_signal(int sig)
382{ 381{
@@ -384,14 +383,14 @@ int allow_signal(int sig)
384 return -EINVAL; 383 return -EINVAL;
385 384
386 spin_lock_irq(&current->sighand->siglock); 385 spin_lock_irq(&current->sighand->siglock);
386 /* This is only needed for daemonize()'ed kthreads */
387 sigdelset(&current->blocked, sig); 387 sigdelset(&current->blocked, sig);
388 if (!current->mm) { 388 /*
389 /* Kernel threads handle their own signals. 389 * Kernel threads handle their own signals. Let the signal code
390 Let the signal code know it'll be handled, so 390 * know it'll be handled, so that they don't get converted to
391 that they don't get converted to SIGKILL or 391 * SIGKILL or just silently dropped.
392 just silently dropped */ 392 */
393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
394 }
395 recalc_sigpending(); 394 recalc_sigpending();
396 spin_unlock_irq(&current->sighand->siglock); 395 spin_unlock_irq(&current->sighand->siglock);
397 return 0; 396 return 0;
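
A small sketch (not from the patch) of the usage the rewritten allow_signal() comment describes: a kernel thread opts in to exactly one signal and handles it itself rather than letting it be dropped or converted to SIGKILL. The thread function name is hypothetical:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int demo_kthread_fn(void *data)
{
        allow_signal(SIGTERM);                  /* opt in to this one signal */

        while (!kthread_should_stop()) {
                schedule_timeout_interruptible(HZ);
                if (signal_pending(current)) {
                        flush_signals(current); /* we handle it ourselves */
                        break;
                }
        }
        return 0;
}
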
@@ -590,7 +589,7 @@ retry:
590 /* 589 /*
591 * Search in the siblings 590 * Search in the siblings
592 */ 591 */
593 list_for_each_entry(c, &p->parent->children, sibling) { 592 list_for_each_entry(c, &p->real_parent->children, sibling) {
594 if (c->mm == mm) 593 if (c->mm == mm)
595 goto assign_new_owner; 594 goto assign_new_owner;
596 } 595 }
@@ -757,7 +756,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
757 p->exit_signal = SIGCHLD; 756 p->exit_signal = SIGCHLD;
758 757
759 /* If it has exited notify the new parent about this child's death. */ 758 /* If it has exited notify the new parent about this child's death. */
760 if (!p->ptrace && 759 if (!task_ptrace(p) &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 760 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal); 761 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) { 762 if (task_detached(p)) {
@@ -782,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 781 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 782 p->real_parent = reaper;
784 if (p->parent == father) { 783 if (p->parent == father) {
785 BUG_ON(p->ptrace); 784 BUG_ON(task_ptrace(p));
786 p->parent = p->real_parent; 785 p->parent = p->real_parent;
787 } 786 }
788 reparent_thread(father, p, &dead_children); 787 reparent_thread(father, p, &dead_children);
@@ -975,16 +974,19 @@ NORET_TYPE void do_exit(long code)
975 module_put(tsk->binfmt->module); 974 module_put(tsk->binfmt->module);
976 975
977 proc_exit_connector(tsk); 976 proc_exit_connector(tsk);
977
978 /*
979 * Flush inherited counters to the parent - before the parent
980 * gets woken up by child-exit notifications.
981 */
982 perf_counter_exit_task(tsk);
983
978 exit_notify(tsk, group_dead); 984 exit_notify(tsk, group_dead);
979#ifdef CONFIG_NUMA 985#ifdef CONFIG_NUMA
980 mpol_put(tsk->mempolicy); 986 mpol_put(tsk->mempolicy);
981 tsk->mempolicy = NULL; 987 tsk->mempolicy = NULL;
982#endif 988#endif
983#ifdef CONFIG_FUTEX 989#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 990 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 991 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 992 if (unlikely(current->pi_state_cache))
@@ -1077,6 +1079,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
1077 return 0; 1079 return 0;
1078} 1080}
1079 1081
1082struct wait_opts {
1083 enum pid_type wo_type;
1084 int wo_flags;
1085 struct pid *wo_pid;
1086
1087 struct siginfo __user *wo_info;
1088 int __user *wo_stat;
1089 struct rusage __user *wo_rusage;
1090
1091 int notask_error;
1092};
1093
1080static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1094static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1081{ 1095{
1082 struct pid *pid = NULL; 1096 struct pid *pid = NULL;
@@ -1087,13 +1101,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1087 return pid; 1101 return pid;
1088} 1102}
1089 1103
1090static int eligible_child(enum pid_type type, struct pid *pid, int options, 1104static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1091 struct task_struct *p)
1092{ 1105{
1093 int err; 1106 int err;
1094 1107
1095 if (type < PIDTYPE_MAX) { 1108 if (wo->wo_type < PIDTYPE_MAX) {
1096 if (task_pid_type(p, type) != pid) 1109 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1097 return 0; 1110 return 0;
1098 } 1111 }
1099 1112
@@ -1102,8 +1115,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1102 * set; otherwise, wait for non-clone children *only*. (Note: 1115 * set; otherwise, wait for non-clone children *only*. (Note:
1103 * A "clone" child here is one that reports to its parent 1116 * A "clone" child here is one that reports to its parent
1104 * using a signal other than SIGCHLD.) */ 1117 * using a signal other than SIGCHLD.) */
1105 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1118 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1106 && !(options & __WALL)) 1119 && !(wo->wo_flags & __WALL))
1107 return 0; 1120 return 0;
1108 1121
1109 err = security_task_wait(p); 1122 err = security_task_wait(p);
@@ -1113,14 +1126,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1113 return 1; 1126 return 1;
1114} 1127}
1115 1128
1116static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1129static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1117 int why, int status, 1130 pid_t pid, uid_t uid, int why, int status)
1118 struct siginfo __user *infop,
1119 struct rusage __user *rusagep)
1120{ 1131{
1121 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1132 struct siginfo __user *infop;
1133 int retval = wo->wo_rusage
1134 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1122 1135
1123 put_task_struct(p); 1136 put_task_struct(p);
1137 infop = wo->wo_info;
1124 if (!retval) 1138 if (!retval)
1125 retval = put_user(SIGCHLD, &infop->si_signo); 1139 retval = put_user(SIGCHLD, &infop->si_signo);
1126 if (!retval) 1140 if (!retval)
@@ -1144,19 +1158,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1144 * the lock and this task is uninteresting. If we return nonzero, we have 1158 * the lock and this task is uninteresting. If we return nonzero, we have
1145 * released the lock and the system call should return. 1159 * released the lock and the system call should return.
1146 */ 1160 */
1147static int wait_task_zombie(struct task_struct *p, int options, 1161static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1148 struct siginfo __user *infop,
1149 int __user *stat_addr, struct rusage __user *ru)
1150{ 1162{
1151 unsigned long state; 1163 unsigned long state;
1152 int retval, status, traced; 1164 int retval, status, traced;
1153 pid_t pid = task_pid_vnr(p); 1165 pid_t pid = task_pid_vnr(p);
1154 uid_t uid = __task_cred(p)->uid; 1166 uid_t uid = __task_cred(p)->uid;
1167 struct siginfo __user *infop;
1155 1168
1156 if (!likely(options & WEXITED)) 1169 if (!likely(wo->wo_flags & WEXITED))
1157 return 0; 1170 return 0;
1158 1171
1159 if (unlikely(options & WNOWAIT)) { 1172 if (unlikely(wo->wo_flags & WNOWAIT)) {
1160 int exit_code = p->exit_code; 1173 int exit_code = p->exit_code;
1161 int why, status; 1174 int why, status;
1162 1175
@@ -1169,8 +1182,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1169 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1182 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1170 status = exit_code & 0x7f; 1183 status = exit_code & 0x7f;
1171 } 1184 }
1172 return wait_noreap_copyout(p, pid, uid, why, 1185 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1173 status, infop, ru);
1174 } 1186 }
1175 1187
1176 /* 1188 /*
@@ -1184,11 +1196,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
1184 } 1196 }
1185 1197
1186 traced = ptrace_reparented(p); 1198 traced = ptrace_reparented(p);
1187 1199 /*
1188 if (likely(!traced)) { 1200 * It can be ptraced but not reparented, check
1201 * !task_detached() to filter out sub-threads.
1202 */
1203 if (likely(!traced) && likely(!task_detached(p))) {
1189 struct signal_struct *psig; 1204 struct signal_struct *psig;
1190 struct signal_struct *sig; 1205 struct signal_struct *sig;
1191 struct task_cputime cputime;
1192 1206
1193 /* 1207 /*
1194 * The resource counters for the group leader are in its 1208 * The resource counters for the group leader are in its
@@ -1201,26 +1215,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1201 * p->signal fields, because they are only touched by 1215 * p->signal fields, because they are only touched by
1202 * __exit_signal, which runs with tasklist_lock 1216 * __exit_signal, which runs with tasklist_lock
1203 * write-locked anyway, and so is excluded here. We do 1217 * write-locked anyway, and so is excluded here. We do
1204 * need to protect the access to p->parent->signal fields, 1218 * need to protect the access to parent->signal fields,
1205 * as other threads in the parent group can be right 1219 * as other threads in the parent group can be right
1206 * here reaping other children at the same time. 1220 * here reaping other children at the same time.
1207 *
1208 * We use thread_group_cputime() to get times for the thread
1209 * group, which consolidates times for all threads in the
1210 * group including the group leader.
1211 */ 1221 */
1212 thread_group_cputime(p, &cputime); 1222 spin_lock_irq(&p->real_parent->sighand->siglock);
1213 spin_lock_irq(&p->parent->sighand->siglock); 1223 psig = p->real_parent->signal;
1214 psig = p->parent->signal;
1215 sig = p->signal; 1224 sig = p->signal;
1216 psig->cutime = 1225 psig->cutime =
1217 cputime_add(psig->cutime, 1226 cputime_add(psig->cutime,
1218 cputime_add(cputime.utime, 1227 cputime_add(p->utime,
1219 sig->cutime)); 1228 cputime_add(sig->utime,
1229 sig->cutime)));
1220 psig->cstime = 1230 psig->cstime =
1221 cputime_add(psig->cstime, 1231 cputime_add(psig->cstime,
1222 cputime_add(cputime.stime, 1232 cputime_add(p->stime,
1223 sig->cstime)); 1233 cputime_add(sig->stime,
1234 sig->cstime)));
1224 psig->cgtime = 1235 psig->cgtime =
1225 cputime_add(psig->cgtime, 1236 cputime_add(psig->cgtime,
1226 cputime_add(p->gtime, 1237 cputime_add(p->gtime,
@@ -1242,7 +1253,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1242 sig->oublock + sig->coublock; 1253 sig->oublock + sig->coublock;
1243 task_io_accounting_add(&psig->ioac, &p->ioac); 1254 task_io_accounting_add(&psig->ioac, &p->ioac);
1244 task_io_accounting_add(&psig->ioac, &sig->ioac); 1255 task_io_accounting_add(&psig->ioac, &sig->ioac);
1245 spin_unlock_irq(&p->parent->sighand->siglock); 1256 spin_unlock_irq(&p->real_parent->sighand->siglock);
1246 } 1257 }
1247 1258
1248 /* 1259 /*
@@ -1251,11 +1262,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
1251 */ 1262 */
1252 read_unlock(&tasklist_lock); 1263 read_unlock(&tasklist_lock);
1253 1264
1254 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1265 retval = wo->wo_rusage
1266 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1255 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1267 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1256 ? p->signal->group_exit_code : p->exit_code; 1268 ? p->signal->group_exit_code : p->exit_code;
1257 if (!retval && stat_addr) 1269 if (!retval && wo->wo_stat)
1258 retval = put_user(status, stat_addr); 1270 retval = put_user(status, wo->wo_stat);
1271
1272 infop = wo->wo_info;
1259 if (!retval && infop) 1273 if (!retval && infop)
1260 retval = put_user(SIGCHLD, &infop->si_signo); 1274 retval = put_user(SIGCHLD, &infop->si_signo);
1261 if (!retval && infop) 1275 if (!retval && infop)
@@ -1323,15 +1337,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1323 * the lock and this task is uninteresting. If we return nonzero, we have 1337 * the lock and this task is uninteresting. If we return nonzero, we have
1324 * released the lock and the system call should return. 1338 * released the lock and the system call should return.
1325 */ 1339 */
1326static int wait_task_stopped(int ptrace, struct task_struct *p, 1340static int wait_task_stopped(struct wait_opts *wo,
1327 int options, struct siginfo __user *infop, 1341 int ptrace, struct task_struct *p)
1328 int __user *stat_addr, struct rusage __user *ru)
1329{ 1342{
1343 struct siginfo __user *infop;
1330 int retval, exit_code, *p_code, why; 1344 int retval, exit_code, *p_code, why;
1331 uid_t uid = 0; /* unneeded, required by compiler */ 1345 uid_t uid = 0; /* unneeded, required by compiler */
1332 pid_t pid; 1346 pid_t pid;
1333 1347
1334 if (!(options & WUNTRACED)) 1348 /*
1349 * Traditionally we see ptrace'd stopped tasks regardless of options.
1350 */
1351 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1335 return 0; 1352 return 0;
1336 1353
1337 exit_code = 0; 1354 exit_code = 0;
@@ -1345,7 +1362,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1345 if (!exit_code) 1362 if (!exit_code)
1346 goto unlock_sig; 1363 goto unlock_sig;
1347 1364
1348 if (!unlikely(options & WNOWAIT)) 1365 if (!unlikely(wo->wo_flags & WNOWAIT))
1349 *p_code = 0; 1366 *p_code = 0;
1350 1367
1351 /* don't need the RCU readlock here as we're holding a spinlock */ 1368 /* don't need the RCU readlock here as we're holding a spinlock */
@@ -1367,14 +1384,15 @@ unlock_sig:
1367 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1384 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1368 read_unlock(&tasklist_lock); 1385 read_unlock(&tasklist_lock);
1369 1386
1370 if (unlikely(options & WNOWAIT)) 1387 if (unlikely(wo->wo_flags & WNOWAIT))
1371 return wait_noreap_copyout(p, pid, uid, 1388 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1372 why, exit_code, 1389
1373 infop, ru); 1390 retval = wo->wo_rusage
1391 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1392 if (!retval && wo->wo_stat)
1393 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1374 1394
1375 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1395 infop = wo->wo_info;
1376 if (!retval && stat_addr)
1377 retval = put_user((exit_code << 8) | 0x7f, stat_addr);
1378 if (!retval && infop) 1396 if (!retval && infop)
1379 retval = put_user(SIGCHLD, &infop->si_signo); 1397 retval = put_user(SIGCHLD, &infop->si_signo);
1380 if (!retval && infop) 1398 if (!retval && infop)
@@ -1401,15 +1419,13 @@ unlock_sig:
1401 * the lock and this task is uninteresting. If we return nonzero, we have 1419 * the lock and this task is uninteresting. If we return nonzero, we have
1402 * released the lock and the system call should return. 1420 * released the lock and the system call should return.
1403 */ 1421 */
1404static int wait_task_continued(struct task_struct *p, int options, 1422static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1405 struct siginfo __user *infop,
1406 int __user *stat_addr, struct rusage __user *ru)
1407{ 1423{
1408 int retval; 1424 int retval;
1409 pid_t pid; 1425 pid_t pid;
1410 uid_t uid; 1426 uid_t uid;
1411 1427
1412 if (!unlikely(options & WCONTINUED)) 1428 if (!unlikely(wo->wo_flags & WCONTINUED))
1413 return 0; 1429 return 0;
1414 1430
1415 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1431 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1421,7 +1437,7 @@ static int wait_task_continued(struct task_struct *p, int options,
1421 spin_unlock_irq(&p->sighand->siglock); 1437 spin_unlock_irq(&p->sighand->siglock);
1422 return 0; 1438 return 0;
1423 } 1439 }
1424 if (!unlikely(options & WNOWAIT)) 1440 if (!unlikely(wo->wo_flags & WNOWAIT))
1425 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1441 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1426 uid = __task_cred(p)->uid; 1442 uid = __task_cred(p)->uid;
1427 spin_unlock_irq(&p->sighand->siglock); 1443 spin_unlock_irq(&p->sighand->siglock);
@@ -1430,17 +1446,17 @@ static int wait_task_continued(struct task_struct *p, int options,
1430 get_task_struct(p); 1446 get_task_struct(p);
1431 read_unlock(&tasklist_lock); 1447 read_unlock(&tasklist_lock);
1432 1448
1433 if (!infop) { 1449 if (!wo->wo_info) {
1434 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1450 retval = wo->wo_rusage
1451 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1435 put_task_struct(p); 1452 put_task_struct(p);
1436 if (!retval && stat_addr) 1453 if (!retval && wo->wo_stat)
1437 retval = put_user(0xffff, stat_addr); 1454 retval = put_user(0xffff, wo->wo_stat);
1438 if (!retval) 1455 if (!retval)
1439 retval = pid; 1456 retval = pid;
1440 } else { 1457 } else {
1441 retval = wait_noreap_copyout(p, pid, uid, 1458 retval = wait_noreap_copyout(wo, p, pid, uid,
1442 CLD_CONTINUED, SIGCONT, 1459 CLD_CONTINUED, SIGCONT);
1443 infop, ru);
1444 BUG_ON(retval == 0); 1460 BUG_ON(retval == 0);
1445 } 1461 }
1446 1462
@@ -1450,19 +1466,16 @@ static int wait_task_continued(struct task_struct *p, int options,
1450/* 1466/*
1451 * Consider @p for a wait by @parent. 1467 * Consider @p for a wait by @parent.
1452 * 1468 *
1453 * -ECHILD should be in *@notask_error before the first call. 1469 * -ECHILD should be in ->notask_error before the first call.
1454 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1470 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1455 * Returns zero if the search for a child should continue; 1471 * Returns zero if the search for a child should continue;
1456 * then *@notask_error is 0 if @p is an eligible child, 1472 * then ->notask_error is 0 if @p is an eligible child,
1457 * or another error from security_task_wait(), or still -ECHILD. 1473 * or another error from security_task_wait(), or still -ECHILD.
1458 */ 1474 */
1459static int wait_consider_task(struct task_struct *parent, int ptrace, 1475static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
1460 struct task_struct *p, int *notask_error, 1476 int ptrace, struct task_struct *p)
1461 enum pid_type type, struct pid *pid, int options,
1462 struct siginfo __user *infop,
1463 int __user *stat_addr, struct rusage __user *ru)
1464{ 1477{
1465 int ret = eligible_child(type, pid, options, p); 1478 int ret = eligible_child(wo, p);
1466 if (!ret) 1479 if (!ret)
1467 return ret; 1480 return ret;
1468 1481
@@ -1474,16 +1487,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1474 * to look for security policy problems, rather 1487 * to look for security policy problems, rather
1475 * than for mysterious wait bugs. 1488 * than for mysterious wait bugs.
1476 */ 1489 */
1477 if (*notask_error) 1490 if (wo->notask_error)
1478 *notask_error = ret; 1491 wo->notask_error = ret;
1492 return 0;
1479 } 1493 }
1480 1494
1481 if (likely(!ptrace) && unlikely(p->ptrace)) { 1495 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1482 /* 1496 /*
1483 * This child is hidden by ptrace. 1497 * This child is hidden by ptrace.
1484 * We aren't allowed to see it now, but eventually we will. 1498 * We aren't allowed to see it now, but eventually we will.
1485 */ 1499 */
1486 *notask_error = 0; 1500 wo->notask_error = 0;
1487 return 0; 1501 return 0;
1488 } 1502 }
1489 1503
@@ -1494,34 +1508,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1494 * We don't reap group leaders with subthreads. 1508 * We don't reap group leaders with subthreads.
1495 */ 1509 */
1496 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1510 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1497 return wait_task_zombie(p, options, infop, stat_addr, ru); 1511 return wait_task_zombie(wo, p);
1498 1512
1499 /* 1513 /*
1500 * It's stopped or running now, so it might 1514 * It's stopped or running now, so it might
1501 * later continue, exit, or stop again. 1515 * later continue, exit, or stop again.
1502 */ 1516 */
1503 *notask_error = 0; 1517 wo->notask_error = 0;
1504 1518
1505 if (task_stopped_code(p, ptrace)) 1519 if (task_stopped_code(p, ptrace))
1506 return wait_task_stopped(ptrace, p, options, 1520 return wait_task_stopped(wo, ptrace, p);
1507 infop, stat_addr, ru);
1508 1521
1509 return wait_task_continued(p, options, infop, stat_addr, ru); 1522 return wait_task_continued(wo, p);
1510} 1523}
1511 1524
1512/* 1525/*
1513 * Do the work of do_wait() for one thread in the group, @tsk. 1526 * Do the work of do_wait() for one thread in the group, @tsk.
1514 * 1527 *
1515 * -ECHILD should be in *@notask_error before the first call. 1528 * -ECHILD should be in ->notask_error before the first call.
1516 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1529 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1517 * Returns zero if the search for a child should continue; then 1530 * Returns zero if the search for a child should continue; then
1518 * *@notask_error is 0 if there were any eligible children, 1531 * ->notask_error is 0 if there were any eligible children,
1519 * or another error from security_task_wait(), or still -ECHILD. 1532 * or another error from security_task_wait(), or still -ECHILD.
1520 */ 1533 */
1521static int do_wait_thread(struct task_struct *tsk, int *notask_error, 1534static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1522 enum pid_type type, struct pid *pid, int options,
1523 struct siginfo __user *infop, int __user *stat_addr,
1524 struct rusage __user *ru)
1525{ 1535{
1526 struct task_struct *p; 1536 struct task_struct *p;
1527 1537
@@ -1530,9 +1540,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1530 * Do not consider detached threads. 1540 * Do not consider detached threads.
1531 */ 1541 */
1532 if (!task_detached(p)) { 1542 if (!task_detached(p)) {
1533 int ret = wait_consider_task(tsk, 0, p, notask_error, 1543 int ret = wait_consider_task(wo, tsk, 0, p);
1534 type, pid, options,
1535 infop, stat_addr, ru);
1536 if (ret) 1544 if (ret)
1537 return ret; 1545 return ret;
1538 } 1546 }
@@ -1541,22 +1549,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1541 return 0; 1549 return 0;
1542} 1550}
1543 1551
1544static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, 1552static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1545 enum pid_type type, struct pid *pid, int options,
1546 struct siginfo __user *infop, int __user *stat_addr,
1547 struct rusage __user *ru)
1548{ 1553{
1549 struct task_struct *p; 1554 struct task_struct *p;
1550 1555
1551 /*
1552 * Traditionally we see ptrace'd stopped tasks regardless of options.
1553 */
1554 options |= WUNTRACED;
1555
1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1557 int ret = wait_consider_task(tsk, 1, p, notask_error, 1557 int ret = wait_consider_task(wo, tsk, 1, p);
1558 type, pid, options,
1559 infop, stat_addr, ru);
1560 if (ret) 1558 if (ret)
1561 return ret; 1559 return ret;
1562 } 1560 }
@@ -1564,65 +1562,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1564 return 0; 1562 return 0;
1565} 1563}
1566 1564
1567static long do_wait(enum pid_type type, struct pid *pid, int options, 1565static long do_wait(struct wait_opts *wo)
1568 struct siginfo __user *infop, int __user *stat_addr,
1569 struct rusage __user *ru)
1570{ 1566{
1571 DECLARE_WAITQUEUE(wait, current); 1567 DECLARE_WAITQUEUE(wait, current);
1572 struct task_struct *tsk; 1568 struct task_struct *tsk;
1573 int retval; 1569 int retval;
1574 1570
1575 trace_sched_process_wait(pid); 1571 trace_sched_process_wait(wo->wo_pid);
1576 1572
1577 add_wait_queue(&current->signal->wait_chldexit,&wait); 1573 add_wait_queue(&current->signal->wait_chldexit,&wait);
1578repeat: 1574repeat:
1579 /* 1575 /*
1580 * If there is nothing that can match our criteria just get out. 1576 * If there is nothing that can match our criteria just get out.
1581 * We will clear @retval to zero if we see any child that might later 1577 * We will clear ->notask_error to zero if we see any child that
1582 * match our criteria, even if we are not able to reap it yet. 1578 * might later match our criteria, even if we are not able to reap
1579 * it yet.
1583 */ 1580 */
1584 retval = -ECHILD; 1581 wo->notask_error = -ECHILD;
1585 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1582 if ((wo->wo_type < PIDTYPE_MAX) &&
1586 goto end; 1583 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1584 goto notask;
1587 1585
1588 current->state = TASK_INTERRUPTIBLE; 1586 set_current_state(TASK_INTERRUPTIBLE);
1589 read_lock(&tasklist_lock); 1587 read_lock(&tasklist_lock);
1590 tsk = current; 1588 tsk = current;
1591 do { 1589 do {
1592 int tsk_result = do_wait_thread(tsk, &retval, 1590 retval = do_wait_thread(wo, tsk);
1593 type, pid, options, 1591 if (retval)
1594 infop, stat_addr, ru);
1595 if (!tsk_result)
1596 tsk_result = ptrace_do_wait(tsk, &retval,
1597 type, pid, options,
1598 infop, stat_addr, ru);
1599 if (tsk_result) {
1600 /*
1601 * tasklist_lock is unlocked and we have a final result.
1602 */
1603 retval = tsk_result;
1604 goto end; 1592 goto end;
1605 }
1606 1593
1607 if (options & __WNOTHREAD) 1594 retval = ptrace_do_wait(wo, tsk);
1595 if (retval)
1596 goto end;
1597
1598 if (wo->wo_flags & __WNOTHREAD)
1608 break; 1599 break;
1609 tsk = next_thread(tsk); 1600 } while_each_thread(current, tsk);
1610 BUG_ON(tsk->signal != current->signal);
1611 } while (tsk != current);
1612 read_unlock(&tasklist_lock); 1601 read_unlock(&tasklist_lock);
1613 1602
1614 if (!retval && !(options & WNOHANG)) { 1603notask:
1604 retval = wo->notask_error;
1605 if (!retval && !(wo->wo_flags & WNOHANG)) {
1615 retval = -ERESTARTSYS; 1606 retval = -ERESTARTSYS;
1616 if (!signal_pending(current)) { 1607 if (!signal_pending(current)) {
1617 schedule(); 1608 schedule();
1618 goto repeat; 1609 goto repeat;
1619 } 1610 }
1620 } 1611 }
1621
1622end: 1612end:
1623 current->state = TASK_RUNNING; 1613 __set_current_state(TASK_RUNNING);
1624 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1614 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1625 if (infop) { 1615 if (wo->wo_info) {
1616 struct siginfo __user *infop = wo->wo_info;
1617
1626 if (retval > 0) 1618 if (retval > 0)
1627 retval = 0; 1619 retval = 0;
1628 else { 1620 else {
@@ -1651,6 +1643,7 @@ end:
1651SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1643SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1652 infop, int, options, struct rusage __user *, ru) 1644 infop, int, options, struct rusage __user *, ru)
1653{ 1645{
1646 struct wait_opts wo;
1654 struct pid *pid = NULL; 1647 struct pid *pid = NULL;
1655 enum pid_type type; 1648 enum pid_type type;
1656 long ret; 1649 long ret;
@@ -1680,7 +1673,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1680 1673
1681 if (type < PIDTYPE_MAX) 1674 if (type < PIDTYPE_MAX)
1682 pid = find_get_pid(upid); 1675 pid = find_get_pid(upid);
1683 ret = do_wait(type, pid, options, infop, NULL, ru); 1676
1677 wo.wo_type = type;
1678 wo.wo_pid = pid;
1679 wo.wo_flags = options;
1680 wo.wo_info = infop;
1681 wo.wo_stat = NULL;
1682 wo.wo_rusage = ru;
1683 ret = do_wait(&wo);
1684 put_pid(pid); 1684 put_pid(pid);
1685 1685
1686 /* avoid REGPARM breakage on x86: */ 1686 /* avoid REGPARM breakage on x86: */
@@ -1691,6 +1691,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1692 int, options, struct rusage __user *, ru) 1692 int, options, struct rusage __user *, ru)
1693{ 1693{
1694 struct wait_opts wo;
1694 struct pid *pid = NULL; 1695 struct pid *pid = NULL;
1695 enum pid_type type; 1696 enum pid_type type;
1696 long ret; 1697 long ret;
@@ -1712,7 +1713,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1712 pid = find_get_pid(upid); 1713 pid = find_get_pid(upid);
1713 } 1714 }
1714 1715
1715 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); 1716 wo.wo_type = type;
1717 wo.wo_pid = pid;
1718 wo.wo_flags = options | WEXITED;
1719 wo.wo_info = NULL;
1720 wo.wo_stat = stat_addr;
1721 wo.wo_rusage = ru;
1722 ret = do_wait(&wo);
1716 put_pid(pid); 1723 put_pid(pid);
1717 1724
1718 /* avoid REGPARM breakage on x86: */ 1725 /* avoid REGPARM breakage on x86: */
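
The exit.c changes above fold the old do_wait() parameter list into struct wait_opts; wo_flags still carries the userspace options. A small userspace illustration (not from the patch, error handling omitted) of two of those flags, WEXITED and WNOWAIT, which exercise the wait_noreap_copyout() path first and the normal reaping path second:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        siginfo_t info;
        pid_t pid = fork();

        if (pid == 0)
                _exit(42);                              /* child */

        waitid(P_PID, pid, &info, WEXITED | WNOWAIT);   /* peek, do not reap */
        printf("peeked: pid %d status %d\n", info.si_pid, info.si_status);

        waitid(P_PID, pid, &info, WEXITED);             /* now reap it */
        printf("reaped: pid %d status %d\n", info.si_pid, info.si_status);
        return 0;
}
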
diff --git a/kernel/fork.c b/kernel/fork.c
index 875ffbdd96d0..e6c04d462ab2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -61,8 +60,8 @@
61#include <linux/proc_fs.h> 60#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 61#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 62#include <linux/fs_struct.h>
64#include <trace/sched.h>
65#include <linux/magic.h> 63#include <linux/magic.h>
64#include <linux/perf_counter.h>
66 65
67#include <asm/pgtable.h> 66#include <asm/pgtable.h>
68#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -71,6 +70,8 @@
71#include <asm/cacheflush.h> 70#include <asm/cacheflush.h>
72#include <asm/tlbflush.h> 71#include <asm/tlbflush.h>
73 72
73#include <trace/events/sched.h>
74
74/* 75/*
75 * Protected counters by write_lock_irq(&tasklist_lock) 76 * Protected counters by write_lock_irq(&tasklist_lock)
76 */ 77 */
@@ -83,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
83 84
84__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 85__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
85 86
86DEFINE_TRACE(sched_process_fork);
87
88int nr_processes(void) 87int nr_processes(void)
89{ 88{
90 int cpu; 89 int cpu;
@@ -178,7 +177,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 177 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 178 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 179 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 180 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 181#endif
183 182
184 /* do the arch specific task caches init */ 183 /* do the arch specific task caches init */
@@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 567 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 568 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 569 */
571 if (tsk->clear_child_tid 570 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 571 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 572 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 573 /*
574 * We don't check the error code - if userspace has
575 * not set up a proper pointer then tough luck.
576 */
577 put_user(0, tsk->clear_child_tid);
578 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
579 1, NULL, NULL, 0);
580 }
575 tsk->clear_child_tid = NULL; 581 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 582 }
584} 583}
585 584
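
The mm_release() hunk above is the kernel half of the CLONE_CHILD_CLEARTID contract: on exit the child's TID word is zeroed and FUTEX_WAKE'd. A userspace sketch (not from the patch, error handling omitted) of the other half, roughly how thread libraries wait for a thread to die; it relies on CLONE_CHILD_SETTID storing the TID before clone() returns in the parent:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile pid_t child_tid;        /* kernel writes this, then clears it on exit */

static int demo_child(void *arg)
{
        return 0;                       /* exit straight away */
}

int main(void)
{
        size_t sz = 64 * 1024;
        char *stack = malloc(sz);

        clone(demo_child, stack + sz,
              CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
              NULL, NULL, NULL, (pid_t *)&child_tid);

        /* Sleep until mm_release() zeroes child_tid and wakes us. */
        while (child_tid != 0)
                syscall(SYS_futex, &child_tid, FUTEX_WAIT, child_tid,
                        NULL, NULL, 0);

        printf("child exited, tid word cleared\n");
        free(stack);
        return 0;
}
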
@@ -816,11 +815,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 815{
817 struct signal_struct *sig; 816 struct signal_struct *sig;
818 817
819 if (clone_flags & CLONE_THREAD) { 818 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 819 return 0;
823 }
824 820
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 821 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 822 tsk->signal = sig;
@@ -878,16 +874,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 874 kmem_cache_free(signal_cachep, sig);
879} 875}
880 876
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 877static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 878{
893 unsigned long new_flags = p->flags; 879 unsigned long new_flags = p->flags;
@@ -982,6 +968,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
982 if (!p) 968 if (!p)
983 goto fork_out; 969 goto fork_out;
984 970
971 ftrace_graph_init_task(p);
972
985 rt_mutex_init_task(p); 973 rt_mutex_init_task(p);
986 974
987#ifdef CONFIG_PROVE_LOCKING 975#ifdef CONFIG_PROVE_LOCKING
@@ -1027,7 +1015,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1027 p->vfork_done = NULL; 1015 p->vfork_done = NULL;
1028 spin_lock_init(&p->alloc_lock); 1016 spin_lock_init(&p->alloc_lock);
1029 1017
1030 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1031 init_sigpending(&p->pending); 1018 init_sigpending(&p->pending);
1032 1019
1033 p->utime = cputime_zero; 1020 p->utime = cputime_zero;
@@ -1089,12 +1076,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1089#ifdef CONFIG_DEBUG_MUTEXES 1076#ifdef CONFIG_DEBUG_MUTEXES
1090 p->blocked_on = NULL; /* not blocked yet */ 1077 p->blocked_on = NULL; /* not blocked yet */
1091#endif 1078#endif
1092 if (unlikely(current->ptrace)) 1079
1093 ptrace_fork(p, clone_flags); 1080 p->bts = NULL;
1094 1081
1095 /* Perform scheduler related setup. Assign this task to a CPU. */ 1082 /* Perform scheduler related setup. Assign this task to a CPU. */
1096 sched_fork(p, clone_flags); 1083 sched_fork(p, clone_flags);
1097 1084
1085 retval = perf_counter_init_task(p);
1086 if (retval)
1087 goto bad_fork_cleanup_policy;
1088
1098 if ((retval = audit_alloc(p))) 1089 if ((retval = audit_alloc(p)))
1099 goto bad_fork_cleanup_policy; 1090 goto bad_fork_cleanup_policy;
1100 /* copy all the process information */ 1091 /* copy all the process information */
@@ -1131,8 +1122,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1131 } 1122 }
1132 } 1123 }
1133 1124
1134 ftrace_graph_init_task(p);
1135
1136 p->pid = pid_nr(pid); 1125 p->pid = pid_nr(pid);
1137 p->tgid = p->pid; 1126 p->tgid = p->pid;
1138 if (clone_flags & CLONE_THREAD) 1127 if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1130,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if (current->nsproxy != p->nsproxy) { 1130 if (current->nsproxy != p->nsproxy) {
1142 retval = ns_cgroup_clone(p, pid); 1131 retval = ns_cgroup_clone(p, pid);
1143 if (retval) 1132 if (retval)
1144 goto bad_fork_free_graph; 1133 goto bad_fork_free_pid;
1145 } 1134 }
1146 1135
1147 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1136 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,10 +1222,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 spin_unlock(&current->sighand->siglock); 1222 spin_unlock(&current->sighand->siglock);
1234 write_unlock_irq(&tasklist_lock); 1223 write_unlock_irq(&tasklist_lock);
1235 retval = -ERESTARTNOINTR; 1224 retval = -ERESTARTNOINTR;
1236 goto bad_fork_free_graph; 1225 goto bad_fork_free_pid;
1237 } 1226 }
1238 1227
1239 if (clone_flags & CLONE_THREAD) { 1228 if (clone_flags & CLONE_THREAD) {
1229 atomic_inc(&current->signal->count);
1230 atomic_inc(&current->signal->live);
1240 p->group_leader = current->group_leader; 1231 p->group_leader = current->group_leader;
1241 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1232 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1242 } 1233 }
@@ -1266,10 +1257,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1266 write_unlock_irq(&tasklist_lock); 1257 write_unlock_irq(&tasklist_lock);
1267 proc_fork_connector(p); 1258 proc_fork_connector(p);
1268 cgroup_post_fork(p); 1259 cgroup_post_fork(p);
1260 perf_counter_fork(p);
1269 return p; 1261 return p;
1270 1262
1271bad_fork_free_graph:
1272 ftrace_graph_exit_task(p);
1273bad_fork_free_pid: 1263bad_fork_free_pid:
1274 if (pid != &init_struct_pid) 1264 if (pid != &init_struct_pid)
1275 free_pid(pid); 1265 free_pid(pid);
@@ -1281,7 +1271,8 @@ bad_fork_cleanup_mm:
1281 if (p->mm) 1271 if (p->mm)
1282 mmput(p->mm); 1272 mmput(p->mm);
1283bad_fork_cleanup_signal: 1273bad_fork_cleanup_signal:
1284 cleanup_signal(p); 1274 if (!(clone_flags & CLONE_THREAD))
1275 __cleanup_signal(p->signal);
1285bad_fork_cleanup_sighand: 1276bad_fork_cleanup_sighand:
1286 __cleanup_sighand(p->sighand); 1277 __cleanup_sighand(p->sighand);
1287bad_fork_cleanup_fs: 1278bad_fork_cleanup_fs:
@@ -1293,6 +1284,7 @@ bad_fork_cleanup_semundo:
1293bad_fork_cleanup_audit: 1284bad_fork_cleanup_audit:
1294 audit_free(p); 1285 audit_free(p);
1295bad_fork_cleanup_policy: 1286bad_fork_cleanup_policy:
1287 perf_counter_free_task(p);
1296#ifdef CONFIG_NUMA 1288#ifdef CONFIG_NUMA
1297 mpol_put(p->mempolicy); 1289 mpol_put(p->mempolicy);
1298bad_fork_cleanup_cgroup: 1290bad_fork_cleanup_cgroup:
@@ -1461,20 +1453,20 @@ void __init proc_caches_init(void)
1461{ 1453{
1462 sighand_cachep = kmem_cache_create("sighand_cache", 1454 sighand_cachep = kmem_cache_create("sighand_cache",
1463 sizeof(struct sighand_struct), 0, 1455 sizeof(struct sighand_struct), 0,
1464 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1456 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1465 sighand_ctor); 1457 SLAB_NOTRACK, sighand_ctor);
1466 signal_cachep = kmem_cache_create("signal_cache", 1458 signal_cachep = kmem_cache_create("signal_cache",
1467 sizeof(struct signal_struct), 0, 1459 sizeof(struct signal_struct), 0,
1468 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1460 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1469 files_cachep = kmem_cache_create("files_cache", 1461 files_cachep = kmem_cache_create("files_cache",
1470 sizeof(struct files_struct), 0, 1462 sizeof(struct files_struct), 0,
1471 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1463 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1472 fs_cachep = kmem_cache_create("fs_cache", 1464 fs_cachep = kmem_cache_create("fs_cache",
1473 sizeof(struct fs_struct), 0, 1465 sizeof(struct fs_struct), 0,
1474 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1466 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1475 mm_cachep = kmem_cache_create("mm_struct", 1467 mm_cachep = kmem_cache_create("mm_struct",
1476 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1468 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1469 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1470 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1479 mmap_init(); 1471 mmap_init();
1480} 1472}
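
The proc_caches_init() hunk only adds SLAB_NOTRACK so kmemcheck skips these caches. For reference, a minimal sketch (not from the patch) of creating a cache with the same flag combination; the cache name and object type are hypothetical:

#include <linux/init.h>
#include <linux/slab.h>

struct demo_item {
        int id;
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
        demo_cachep = kmem_cache_create("demo_cache", sizeof(struct demo_item),
                                        0, SLAB_PANIC | SLAB_NOTRACK, NULL);
        return 0;               /* SLAB_PANIC means no NULL check is needed */
}
core_initcall(demo_cache_init);
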
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a62..e18cfbdc7190 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
96 */ 100 */
97struct futex_q { 101struct futex_q {
98 struct plist_node list; 102 struct plist_node list;
99 /* There can only be a single waiter */ 103 /* Waiter reference */
100 wait_queue_head_t waiter; 104 struct task_struct *task;
101 105
102 /* Which hash list lock to use: */ 106 /* Which hash list lock to use: */
103 spinlock_t *lock_ptr; 107 spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
107 111
108 /* Optional priority inheritance state: */ 112 /* Optional priority inheritance state: */
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter;
111 117
112 /* Bitset for the optional bitmasked wakeup */ 118 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 119 u32 bitset;
@@ -241,6 +247,7 @@ again:
241 if (err < 0) 247 if (err < 0)
242 return err; 248 return err;
243 249
250 page = compound_head(page);
244 lock_page(page); 251 lock_page(page);
245 if (!page->mapping) { 252 if (!page->mapping) {
246 unlock_page(page); 253 unlock_page(page);
@@ -278,6 +285,44 @@ void put_futex_key(int fshared, union futex_key *key)
278 drop_futex_key_refs(key); 285 drop_futex_key_refs(key);
279} 286}
280 287
288/*
289 * fault_in_user_writeable - fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address
291 *
292 * Slow path to fixup the fault we just took in the atomic write
293 * access to @uaddr.
294 *
295 * We have no generic implementation of a non destructive write to the
296 * user address. We know that we faulted in the atomic pagefault
297 * disabled section so we can as well avoid the #PF overhead by
298 * calling get_user_pages() right away.
299 */
300static int fault_in_user_writeable(u32 __user *uaddr)
301{
302 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
303 1, 1, 0, NULL, NULL);
304 return ret < 0 ? ret : 0;
305}
306
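
A sketch (not from the patch) of the retry shape fault_in_user_writeable() enables; the wrapper name is hypothetical and hash-bucket locking is elided. The atomic op runs with pagefaults disabled, so -EFAULT simply means "make the page present and writable, then retry":

static int demo_cmpxchg_user(u32 __user *uaddr, u32 oldval, u32 newval)
{
        u32 curval;

        for (;;) {
                curval = cmpxchg_futex_value_locked(uaddr, oldval, newval);
                if (curval == -EFAULT) {
                        if (fault_in_user_writeable(uaddr))
                                return -EFAULT;         /* a real fault, give up */
                        continue;                       /* page is in, retry */
                }
                return curval == oldval ? 0 : -EAGAIN;  /* someone raced us */
        }
}
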
307/**
308 * futex_top_waiter() - Return the highest priority waiter on a futex
309 * @hb: the hash bucket the futex_q's reside in
310 * @key: the futex key (to distinguish it from other futex futex_q's)
311 *
312 * Must be called with the hb lock held.
313 */
314static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
315 union futex_key *key)
316{
317 struct futex_q *this;
318
319 plist_for_each_entry(this, &hb->chain, list) {
320 if (match_futex(&this->key, key))
321 return this;
322 }
323 return NULL;
324}
325
281static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 326static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
282{ 327{
283 u32 curval; 328 u32 curval;
@@ -539,28 +584,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
539 return 0; 584 return 0;
540} 585}
541 586
587/**
588 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
589 * @uaddr: the pi futex user address
590 * @hb: the pi futex hash bucket
591 * @key: the futex key associated with uaddr and hb
592 * @ps: the pi_state pointer where we store the result of the
593 * lookup
594 * @task: the task to perform the atomic lock work for. This will
595 * be "current" except in the case of requeue pi.
596 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
597 *
598 * Returns:
599 * 0 - ready to wait
600 * 1 - acquired the lock
601 * <0 - error
602 *
603 * The hb->lock and futex_key refs shall be held by the caller.
604 */
605static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
606 union futex_key *key,
607 struct futex_pi_state **ps,
608 struct task_struct *task, int set_waiters)
609{
610 int lock_taken, ret, ownerdied = 0;
611 u32 uval, newval, curval;
612
613retry:
614 ret = lock_taken = 0;
615
616 /*
617 * To avoid races, we attempt to take the lock here again
618 * (by doing a 0 -> TID atomic cmpxchg), while holding all
619 * the locks. It will most likely not succeed.
620 */
621 newval = task_pid_vnr(task);
622 if (set_waiters)
623 newval |= FUTEX_WAITERS;
624
625 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
626
627 if (unlikely(curval == -EFAULT))
628 return -EFAULT;
629
630 /*
631 * Detect deadlocks.
632 */
633 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
634 return -EDEADLK;
635
636 /*
637 * Surprise - we got the lock. Just return to userspace:
638 */
639 if (unlikely(!curval))
640 return 1;
641
642 uval = curval;
643
644 /*
645 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
646 * to wake at the next unlock.
647 */
648 newval = curval | FUTEX_WAITERS;
649
650 /*
651 * There are two cases, where a futex might have no owner (the
652 * owner TID is 0): OWNER_DIED. We take over the futex in this
653 * case. We also do an unconditional take over, when the owner
654 * of the futex died.
655 *
656 * This is safe as we are protected by the hash bucket lock !
657 */
658 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
659 /* Keep the OWNER_DIED bit */
660 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
661 ownerdied = 0;
662 lock_taken = 1;
663 }
664
665 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
666
667 if (unlikely(curval == -EFAULT))
668 return -EFAULT;
669 if (unlikely(curval != uval))
670 goto retry;
671
672 /*
673 * We took the lock due to owner died take over.
674 */
675 if (unlikely(lock_taken))
676 return 1;
677
678 /*
 678 * We don't have the lock. Look up the PI state (or create it if
680 * we are the first waiter):
681 */
682 ret = lookup_pi_state(uval, hb, key, ps);
683
684 if (unlikely(ret)) {
685 switch (ret) {
686 case -ESRCH:
687 /*
688 * No owner found for this futex. Check if the
689 * OWNER_DIED bit is set to figure out whether
690 * this is a robust futex or not.
691 */
692 if (get_futex_value_locked(&curval, uaddr))
693 return -EFAULT;
694
695 /*
696 * We simply start over in case of a robust
697 * futex. The code above will take the futex
698 * and return happy.
699 */
700 if (curval & FUTEX_OWNER_DIED) {
701 ownerdied = 1;
702 goto retry;
703 }
704 default:
705 break;
706 }
707 }
708
709 return ret;
710}
711
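
For orientation, a userspace sketch (not from the patch) of the protocol whose kernel side futex_lock_pi_atomic() implements: the uncontended path is a 0 -> TID compare-and-swap on the futex word, and only contention enters the kernel, which may set FUTEX_WAITERS in that same word. Helper names are hypothetical; error handling is omitted:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int demo_pi_futex;      /* 0 = unlocked, else owner TID plus flag bits */

static void demo_pi_lock(void)
{
        unsigned int tid = syscall(SYS_gettid);

        if (__sync_bool_compare_and_swap(&demo_pi_futex, 0, tid))
                return;                 /* fast path: we now own the lock */

        /* Slow path: the kernel queues us with PI boosting and fixes up the word. */
        syscall(SYS_futex, &demo_pi_futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void demo_pi_unlock(void)
{
        unsigned int tid = syscall(SYS_gettid);

        if (__sync_bool_compare_and_swap(&demo_pi_futex, tid, 0))
                return;                 /* no waiters were recorded */

        /* FUTEX_WAITERS (or owner-died bits) set: let the kernel hand it over. */
        syscall(SYS_futex, &demo_pi_futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
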
542/* 712/*
543 * The hash bucket lock must be held when this is called. 713 * The hash bucket lock must be held when this is called.
544 * Afterwards, the futex_q must not be accessed. 714 * Afterwards, the futex_q must not be accessed.
545 */ 715 */
546static void wake_futex(struct futex_q *q) 716static void wake_futex(struct futex_q *q)
547{ 717{
548 plist_del(&q->list, &q->list.plist); 718 struct task_struct *p = q->task;
719
549 /* 720 /*
550 * The lock in wake_up_all() is a crucial memory barrier after the 721 * We set q->lock_ptr = NULL _before_ we wake up the task. If
551 * plist_del() and also before assigning to q->lock_ptr. 722 * a non futex wake up happens on another CPU then the task
723 * might exit and p would dereference a non existing task
724 * struct. Prevent this by holding a reference on p across the
725 * wake up.
552 */ 726 */
553 wake_up(&q->waiter); 727 get_task_struct(p);
728
729 plist_del(&q->list, &q->list.plist);
554 /* 730 /*
555 * The waiting task can free the futex_q as soon as this is written, 731 * The waiting task can free the futex_q as soon as
556 * without taking any locks. This must come last. 732 * q->lock_ptr = NULL is written, without taking any locks. A
557 * 733 * memory barrier is required here to prevent the following
558 * A memory barrier is required here to prevent the following store to 734 * store to lock_ptr from getting ahead of the plist_del.
559 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
560 * end of wake_up() does not prevent this store from moving.
561 */ 735 */
562 smp_wmb(); 736 smp_wmb();
563 q->lock_ptr = NULL; 737 q->lock_ptr = NULL;
738
739 wake_up_state(p, TASK_NORMAL);
740 put_task_struct(p);
564} 741}
565 742
566static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 743static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +866,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
689 866
690 plist_for_each_entry_safe(this, next, head, list) { 867 plist_for_each_entry_safe(this, next, head, list) {
691 if (match_futex (&this->key, &key)) { 868 if (match_futex (&this->key, &key)) {
692 if (this->pi_state) { 869 if (this->pi_state || this->rt_waiter) {
693 ret = -EINVAL; 870 ret = -EINVAL;
694 break; 871 break;
695 } 872 }
@@ -739,7 +916,6 @@ retry:
739retry_private: 916retry_private:
740 op_ret = futex_atomic_op_inuser(op, uaddr2); 917 op_ret = futex_atomic_op_inuser(op, uaddr2);
741 if (unlikely(op_ret < 0)) { 918 if (unlikely(op_ret < 0)) {
742 u32 dummy;
743 919
744 double_unlock_hb(hb1, hb2); 920 double_unlock_hb(hb1, hb2);
745 921
@@ -757,7 +933,7 @@ retry_private:
757 goto out_put_keys; 933 goto out_put_keys;
758 } 934 }
759 935
760 ret = get_user(dummy, uaddr2); 936 ret = fault_in_user_writeable(uaddr2);
761 if (ret) 937 if (ret)
762 goto out_put_keys; 938 goto out_put_keys;
763 939
@@ -802,24 +978,194 @@ out:
802 return ret; 978 return ret;
803} 979}
804 980
805/* 981/**
806 * Requeue all waiters hashed on one physical page to another 982 * requeue_futex() - Requeue a futex_q from one hb to another
807 * physical page. 983 * @q: the futex_q to requeue
984 * @hb1: the source hash_bucket
985 * @hb2: the target hash_bucket
986 * @key2: the new key for the requeued futex_q
987 */
988static inline
989void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
990 struct futex_hash_bucket *hb2, union futex_key *key2)
991{
992
993 /*
994 * If key1 and key2 hash to the same bucket, no need to
995 * requeue.
996 */
997 if (likely(&hb1->chain != &hb2->chain)) {
998 plist_del(&q->list, &hb1->chain);
999 plist_add(&q->list, &hb2->chain);
1000 q->lock_ptr = &hb2->lock;
1001#ifdef CONFIG_DEBUG_PI_LIST
1002 q->list.plist.lock = &hb2->lock;
1003#endif
1004 }
1005 get_futex_key_refs(key2);
1006 q->key = *key2;
1007}
1008
1009/**
1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1011 * q: the futex_q
1012 * key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex
1014 *
1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1017 * to the requeue target futex so the waiter can detect the wakeup on the right
1018 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1019 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1020 * to protect access to the pi_state to fixup the owner later. Must be called
1021 * with both q->lock_ptr and hb->lock held.
1022 */
1023static inline
1024void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1025 struct futex_hash_bucket *hb)
1026{
1027 drop_futex_key_refs(&q->key);
1028 get_futex_key_refs(key);
1029 q->key = *key;
1030
1031 WARN_ON(plist_node_empty(&q->list));
1032 plist_del(&q->list, &q->list.plist);
1033
1034 WARN_ON(!q->rt_waiter);
1035 q->rt_waiter = NULL;
1036
1037 q->lock_ptr = &hb->lock;
1038#ifdef CONFIG_DEBUG_PI_LIST
1039 q->list.plist.lock = &hb->lock;
1040#endif
1041
1042 wake_up_state(q->task, TASK_NORMAL);
1043}
1044
1045/**
1046 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1047 * @pifutex: the user address of the to futex
1048 * @hb1: the from futex hash bucket, must be locked by the caller
1049 * @hb2: the to futex hash bucket, must be locked by the caller
1050 * @key1: the from futex key
1051 * @key2: the to futex key
1052 * @ps: address to store the pi_state pointer
1053 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1054 *
1055 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1056 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1057 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1058 * hb1 and hb2 must be held by the caller.
1059 *
1060 * Returns:
 1061 * 0 - failed to acquire the lock atomically
1062 * 1 - acquired the lock
1063 * <0 - error
1064 */
1065static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1066 struct futex_hash_bucket *hb1,
1067 struct futex_hash_bucket *hb2,
1068 union futex_key *key1, union futex_key *key2,
1069 struct futex_pi_state **ps, int set_waiters)
1070{
1071 struct futex_q *top_waiter = NULL;
1072 u32 curval;
1073 int ret;
1074
1075 if (get_futex_value_locked(&curval, pifutex))
1076 return -EFAULT;
1077
1078 /*
1079 * Find the top_waiter and determine if there are additional waiters.
1080 * If the caller intends to requeue more than 1 waiter to pifutex,
1081 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1082 * as we have means to handle the possible fault. If not, don't set
 1083 * the bit unnecessarily as it will force the subsequent unlock to enter
1084 * the kernel.
1085 */
1086 top_waiter = futex_top_waiter(hb1, key1);
1087
1088 /* There are no waiters, nothing for us to do. */
1089 if (!top_waiter)
1090 return 0;
1091
1092 /*
1093 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1094 * the contended case or if set_waiters is 1. The pi_state is returned
1095 * in ps in contended cases.
1096 */
1097 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1098 set_waiters);
1099 if (ret == 1)
1100 requeue_pi_wake_futex(top_waiter, key2, hb2);
1101
1102 return ret;
1103}
1104
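
Before the new requeue_pi path, a userspace sketch (not from the patch) of the plain FUTEX_CMP_REQUEUE operation that futex_requeue() has always served: wake one waiter on the condition word and requeue the rest onto the mutex word, so a broadcast does not wake everything at once. Names are hypothetical; error handling is omitted:

#include <limits.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int demo_cond;          /* waiters block on this word */
static unsigned int demo_mutex;         /* the rest get moved onto this one */

static long demo_requeue_broadcast(unsigned int expected)
{
        /* val = wake one, val2 = requeue up to INT_MAX, val3 = expected value
         * of demo_cond; the call fails with EAGAIN if the word has changed. */
        return syscall(SYS_futex, &demo_cond, FUTEX_CMP_REQUEUE,
                       1, INT_MAX, &demo_mutex, expected);
}
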
1105/**
1106 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 1107 * @uaddr1: source futex user address
 1108 * @uaddr2: target futex user address
 1109 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
 1110 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
 1111 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1112 * pi futex (pi to pi requeue is not supported)
1113 *
1114 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1115 * uaddr2 atomically on behalf of the top waiter.
1116 *
1117 * Returns:
1118 * >=0 - on success, the number of tasks requeued or woken
1119 * <0 - on error
808 */ 1120 */
809static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1121static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
810 int nr_wake, int nr_requeue, u32 *cmpval) 1122 int nr_wake, int nr_requeue, u32 *cmpval,
1123 int requeue_pi)
811{ 1124{
812 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1125 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1126 int drop_count = 0, task_count = 0, ret;
1127 struct futex_pi_state *pi_state = NULL;
813 struct futex_hash_bucket *hb1, *hb2; 1128 struct futex_hash_bucket *hb1, *hb2;
814 struct plist_head *head1; 1129 struct plist_head *head1;
815 struct futex_q *this, *next; 1130 struct futex_q *this, *next;
816 int ret, drop_count = 0; 1131 u32 curval2;
1132
1133 if (requeue_pi) {
1134 /*
1135 * requeue_pi requires a pi_state, try to allocate it now
1136 * without any locks in case it fails.
1137 */
1138 if (refill_pi_state_cache())
1139 return -ENOMEM;
1140 /*
1141 * requeue_pi must wake as many tasks as it can, up to nr_wake
1142 * + nr_requeue, since it acquires the rt_mutex prior to
1143 * returning to userspace, so as to not leave the rt_mutex with
1144 * waiters and no owner. However, second and third wake-ups
1145 * cannot be predicted as they involve race conditions with the
1146 * first wake and a fault while looking up the pi_state. Both
1147 * pthread_cond_signal() and pthread_cond_broadcast() should
1148 * use nr_wake=1.
1149 */
1150 if (nr_wake != 1)
1151 return -EINVAL;
1152 }
817 1153
818retry: 1154retry:
1155 if (pi_state != NULL) {
1156 /*
1157 * We will have to lookup the pi_state again, so free this one
1158 * to keep the accounting correct.
1159 */
1160 free_pi_state(pi_state);
1161 pi_state = NULL;
1162 }
1163
819 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1164 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
820 if (unlikely(ret != 0)) 1165 if (unlikely(ret != 0))
821 goto out; 1166 goto out;
822 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); 1167 ret = get_futex_key(uaddr2, fshared, &key2,
1168 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
823 if (unlikely(ret != 0)) 1169 if (unlikely(ret != 0))
824 goto out_put_key1; 1170 goto out_put_key1;
825 1171
@@ -854,32 +1200,106 @@ retry_private:
854 } 1200 }
855 } 1201 }
856 1202
1203 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1204 /*
1205 * Attempt to acquire uaddr2 and wake the top waiter. If we
1206 * intend to requeue waiters, force setting the FUTEX_WAITERS
1207 * bit. We force this here where we are able to easily handle
1208 * faults rather than in the requeue loop below.
1209 */
1210 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1211 &key2, &pi_state, nr_requeue);
1212
1213 /*
1214 * At this point the top_waiter has either taken uaddr2 or is
1215 * waiting on it. If the former, then the pi_state will not
1216 * exist yet, look it up one more time to ensure we have a
1217 * reference to it.
1218 */
1219 if (ret == 1) {
1220 WARN_ON(pi_state);
1221 task_count++;
1222 ret = get_futex_value_locked(&curval2, uaddr2);
1223 if (!ret)
1224 ret = lookup_pi_state(curval2, hb2, &key2,
1225 &pi_state);
1226 }
1227
1228 switch (ret) {
1229 case 0:
1230 break;
1231 case -EFAULT:
1232 double_unlock_hb(hb1, hb2);
1233 put_futex_key(fshared, &key2);
1234 put_futex_key(fshared, &key1);
1235 ret = fault_in_user_writeable(uaddr2);
1236 if (!ret)
1237 goto retry;
1238 goto out;
1239 case -EAGAIN:
1240 /* The owner was exiting, try again. */
1241 double_unlock_hb(hb1, hb2);
1242 put_futex_key(fshared, &key2);
1243 put_futex_key(fshared, &key1);
1244 cond_resched();
1245 goto retry;
1246 default:
1247 goto out_unlock;
1248 }
1249 }
1250
857 head1 = &hb1->chain; 1251 head1 = &hb1->chain;
858 plist_for_each_entry_safe(this, next, head1, list) { 1252 plist_for_each_entry_safe(this, next, head1, list) {
859 if (!match_futex (&this->key, &key1)) 1253 if (task_count - nr_wake >= nr_requeue)
1254 break;
1255
1256 if (!match_futex(&this->key, &key1))
860 continue; 1257 continue;
861 if (++ret <= nr_wake) { 1258
1259 /*
1260 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1261 * be paired with each other and no other futex ops.
1262 */
1263 if ((requeue_pi && !this->rt_waiter) ||
1264 (!requeue_pi && this->rt_waiter)) {
1265 ret = -EINVAL;
1266 break;
1267 }
1268
1269 /*
1270 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1271 * lock, we already woke the top_waiter. If not, it will be
1272 * woken by futex_unlock_pi().
1273 */
1274 if (++task_count <= nr_wake && !requeue_pi) {
862 wake_futex(this); 1275 wake_futex(this);
863 } else { 1276 continue;
864 /* 1277 }
865 * If key1 and key2 hash to the same bucket, no need to
866 * requeue.
867 */
868 if (likely(head1 != &hb2->chain)) {
869 plist_del(&this->list, &hb1->chain);
870 plist_add(&this->list, &hb2->chain);
871 this->lock_ptr = &hb2->lock;
872#ifdef CONFIG_DEBUG_PI_LIST
873 this->list.plist.lock = &hb2->lock;
874#endif
875 }
876 this->key = key2;
877 get_futex_key_refs(&key2);
878 drop_count++;
879 1278
880 if (ret - nr_wake >= nr_requeue) 1279 /*
881 break; 1280 * Requeue nr_requeue waiters and possibly one more in the case
1281 * of requeue_pi if we couldn't acquire the lock atomically.
1282 */
1283 if (requeue_pi) {
1284 /* Prepare the waiter to take the rt_mutex. */
1285 atomic_inc(&pi_state->refcount);
1286 this->pi_state = pi_state;
1287 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1288 this->rt_waiter,
1289 this->task, 1);
1290 if (ret == 1) {
1291 /* We got the lock. */
1292 requeue_pi_wake_futex(this, &key2, hb2);
1293 continue;
1294 } else if (ret) {
1295 /* -EDEADLK */
1296 this->pi_state = NULL;
1297 free_pi_state(pi_state);
1298 goto out_unlock;
1299 }
882 } 1300 }
1301 requeue_futex(this, hb1, hb2, &key2);
1302 drop_count++;
883 } 1303 }
884 1304
885out_unlock: 1305out_unlock:
@@ -899,7 +1319,9 @@ out_put_keys:
899out_put_key1: 1319out_put_key1:
900 put_futex_key(fshared, &key1); 1320 put_futex_key(fshared, &key1);
901out: 1321out:
902 return ret; 1322 if (pi_state != NULL)
1323 free_pi_state(pi_state);
1324 return ret ? ret : task_count;
903} 1325}
904 1326
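For orientation, here is a rough userspace sketch of the broadcaster side this requeue_pi path is designed for (a pthread_cond_broadcast()-style caller). It is illustrative only and not part of the patch; the names cond, mutex and cond_broadcast() are hypothetical, and the fallback #define assumes the op code this series adds to <linux/futex.h>.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <limits.h>
#include <stdint.h>

#ifndef FUTEX_CMP_REQUEUE_PI
#define FUTEX_CMP_REQUEUE_PI	12	/* assumed value from this patch series */
#endif

/*
 * Illustrative sketch only. cond holds a sequence/generation count;
 * mutex is a PI futex whose value is the owner TID (FUTEX_LOCK_PI protocol).
 */
static long cond_broadcast(uint32_t *cond, uint32_t *mutex)
{
	uint32_t expected = *cond;

	/*
	 * val    = 1         nr_wake, must be 1 for requeue_pi
	 * utime  = INT_MAX   reinterpreted as val2/nr_requeue
	 * uaddr2 = mutex     the PI futex the waiters are moved to
	 * val3   = expected  futex_requeue() returns -EAGAIN if *cond
	 *                    no longer contains this value
	 */
	return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
		       (void *)(unsigned long)INT_MAX, mutex, expected);
}

At most one waiter is woken (or handed the lock atomically by futex_proxy_trylock_atomic()); the rest are requeued onto mutex and are woken one at a time as futex_unlock_pi() releases it.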
905/* The key must be already stored in q->key. */ 1327/* The key must be already stored in q->key. */
@@ -907,8 +1329,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
907{ 1329{
908 struct futex_hash_bucket *hb; 1330 struct futex_hash_bucket *hb;
909 1331
910 init_waitqueue_head(&q->waiter);
911
912 get_futex_key_refs(&q->key); 1332 get_futex_key_refs(&q->key);
913 hb = hash_futex(&q->key); 1333 hb = hash_futex(&q->key);
914 q->lock_ptr = &hb->lock; 1334 q->lock_ptr = &hb->lock;
@@ -1097,7 +1517,7 @@ retry:
1097handle_fault: 1517handle_fault:
1098 spin_unlock(q->lock_ptr); 1518 spin_unlock(q->lock_ptr);
1099 1519
1100 ret = get_user(uval, uaddr); 1520 ret = fault_in_user_writeable(uaddr);
1101 1521
1102 spin_lock(q->lock_ptr); 1522 spin_lock(q->lock_ptr);
1103 1523
@@ -1119,35 +1539,149 @@ handle_fault:
1119 */ 1539 */
1120#define FLAGS_SHARED 0x01 1540#define FLAGS_SHARED 0x01
1121#define FLAGS_CLOCKRT 0x02 1541#define FLAGS_CLOCKRT 0x02
1542#define FLAGS_HAS_TIMEOUT 0x04
1122 1543
1123static long futex_wait_restart(struct restart_block *restart); 1544static long futex_wait_restart(struct restart_block *restart);
1124 1545
1125static int futex_wait(u32 __user *uaddr, int fshared, 1546/**
1126 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1547 * fixup_owner() - Post lock pi_state and corner case management
1548 * @uaddr: user address of the futex
1549 * @fshared: whether the futex is shared (1) or not (0)
1550 * @q: futex_q (contains pi_state and access to the rt_mutex)
1551 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1552 *
1553 * After attempting to lock an rt_mutex, this function is called to cleanup
1554 * the pi_state owner as well as handle race conditions that may allow us to
1555 * acquire the lock. Must be called with the hb lock held.
1556 *
1557 * Returns:
1558 * 1 - success, lock taken
1559 * 0 - success, lock not taken
1560 * <0 - on error (-EFAULT)
1561 */
1562static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1563 int locked)
1127{ 1564{
1128 struct task_struct *curr = current; 1565 struct task_struct *owner;
1129 struct restart_block *restart; 1566 int ret = 0;
1130 DECLARE_WAITQUEUE(wait, curr);
1131 struct futex_hash_bucket *hb;
1132 struct futex_q q;
1133 u32 uval;
1134 int ret;
1135 struct hrtimer_sleeper t;
1136 int rem = 0;
1137 1567
1138 if (!bitset) 1568 if (locked) {
1139 return -EINVAL; 1569 /*
1570 * Got the lock. We might not be the anticipated owner if we
1571 * did a lock-steal - fix up the PI-state in that case:
1572 */
1573 if (q->pi_state->owner != current)
1574 ret = fixup_pi_state_owner(uaddr, q, current, fshared);
1575 goto out;
1576 }
1140 1577
1141 q.pi_state = NULL; 1578 /*
1142 q.bitset = bitset; 1579 * Catch the rare case, where the lock was released when we were on the
1143retry: 1580 * way back before we locked the hash bucket.
1144 q.key = FUTEX_KEY_INIT; 1581 */
1145 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); 1582 if (q->pi_state->owner == current) {
1146 if (unlikely(ret != 0)) 1583 /*
1584 * Try to get the rt_mutex now. This might fail as some other
1585 * task acquired the rt_mutex after we removed ourself from the
1586 * rt_mutex waiters list.
1587 */
1588 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1589 locked = 1;
1590 goto out;
1591 }
1592
1593 /*
1594 * pi_state is incorrect, some other task did a lock steal and
1595 * we returned due to timeout or signal without taking the
1596 * rt_mutex. Too late. We can access the rt_mutex_owner without
1597 * locking, as the other task is now blocked on the hash bucket
1598 * lock. Fix the state up.
1599 */
1600 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1601 ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
1147 goto out; 1602 goto out;
1603 }
1148 1604
1149retry_private: 1605 /*
1150 hb = queue_lock(&q); 1606 * Paranoia check. If we did not take the lock, then we should not be
1607 * the owner, nor the pending owner, of the rt_mutex.
1608 */
1609 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1610 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1611 "pi-state %p\n", ret,
1612 q->pi_state->pi_mutex.owner,
1613 q->pi_state->owner);
1614
1615out:
1616 return ret ? ret : locked;
1617}
1618
1619/**
1620 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1621 * @hb: the futex hash bucket, must be locked by the caller
1622 * @q: the futex_q to queue up on
1623 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1624 */
1625static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1626 struct hrtimer_sleeper *timeout)
1627{
1628 queue_me(q, hb);
1629
1630 /*
1631 * There might have been scheduling since the queue_me(), as we
1632 * cannot hold a spinlock across the get_user() in case it
1633 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1634 * queueing ourselves into the futex hash. This code thus has to
1635 * rely on the futex_wake() code removing us from hash when it
1636 * wakes us up.
1637 */
1638 set_current_state(TASK_INTERRUPTIBLE);
1639
1640 /* Arm the timer */
1641 if (timeout) {
1642 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1643 if (!hrtimer_active(&timeout->timer))
1644 timeout->task = NULL;
1645 }
1646
1647 /*
1648 * !plist_node_empty() is safe here without any lock.
1649 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1650 */
1651 if (likely(!plist_node_empty(&q->list))) {
1652 /*
1653 * If the timer has already expired, current will already be
1654 * flagged for rescheduling. Only call schedule if there
1655 * is no timeout, or if it has yet to expire.
1656 */
1657 if (!timeout || timeout->task)
1658 schedule();
1659 }
1660 __set_current_state(TASK_RUNNING);
1661}
1662
1663/**
1664 * futex_wait_setup() - Prepare to wait on a futex
1665 * @uaddr: the futex userspace address
1666 * @val: the expected value
1667 * @fshared: whether the futex is shared (1) or not (0)
1668 * @q: the associated futex_q
1669 * @hb: storage for hash_bucket pointer to be returned to caller
1670 *
1671 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1672 * compare it with the expected value. Handle atomic faults internally.
1673 * Return with the hb lock held and a q.key reference on success, and unlocked
1674 * with no q.key reference on failure.
1675 *
1676 * Returns:
1677 * 0 - uaddr contains val and hb has been locked
1678 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1679 */
1680static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1681 struct futex_q *q, struct futex_hash_bucket **hb)
1682{
1683 u32 uval;
1684 int ret;
1151 1685
1152 /* 1686 /*
1153 * Access the page AFTER the hash-bucket is locked. 1687 * Access the page AFTER the hash-bucket is locked.
@@ -1165,95 +1699,83 @@ retry_private:
1165 * A consequence is that futex_wait() can return zero and absorb 1699 * A consequence is that futex_wait() can return zero and absorb
1166 * a wakeup when *uaddr != val on entry to the syscall. This is 1700 * a wakeup when *uaddr != val on entry to the syscall. This is
1167 * rare, but normal. 1701 * rare, but normal.
1168 *
1169 * For shared futexes, we hold the mmap semaphore, so the mapping
1170 * cannot have changed since we looked it up in get_futex_key.
1171 */ 1702 */
1703retry:
1704 q->key = FUTEX_KEY_INIT;
1705 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1706 if (unlikely(ret != 0))
1707 return ret;
1708
1709retry_private:
1710 *hb = queue_lock(q);
1711
1172 ret = get_futex_value_locked(&uval, uaddr); 1712 ret = get_futex_value_locked(&uval, uaddr);
1173 1713
1174 if (unlikely(ret)) { 1714 if (ret) {
1175 queue_unlock(&q, hb); 1715 queue_unlock(q, *hb);
1176 1716
1177 ret = get_user(uval, uaddr); 1717 ret = get_user(uval, uaddr);
1178 if (ret) 1718 if (ret)
1179 goto out_put_key; 1719 goto out;
1180 1720
1181 if (!fshared) 1721 if (!fshared)
1182 goto retry_private; 1722 goto retry_private;
1183 1723
1184 put_futex_key(fshared, &q.key); 1724 put_futex_key(fshared, &q->key);
1185 goto retry; 1725 goto retry;
1186 } 1726 }
1187 ret = -EWOULDBLOCK;
1188 if (unlikely(uval != val)) {
1189 queue_unlock(&q, hb);
1190 goto out_put_key;
1191 }
1192 1727
1193 /* Only actually queue if *uaddr contained val. */ 1728 if (uval != val) {
1194 queue_me(&q, hb); 1729 queue_unlock(q, *hb);
1730 ret = -EWOULDBLOCK;
1731 }
1195 1732
1196 /* 1733out:
1197 * There might have been scheduling since the queue_me(), as we 1734 if (ret)
1198 * cannot hold a spinlock across the get_user() in case it 1735 put_futex_key(fshared, &q->key);
1199 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1736 return ret;
1200 * queueing ourselves into the futex hash. This code thus has to 1737}
1201 * rely on the futex_wake() code removing us from hash when it
1202 * wakes us up.
1203 */
1204 1738
1205 /* add_wait_queue is the barrier after __set_current_state. */ 1739static int futex_wait(u32 __user *uaddr, int fshared,
1206 __set_current_state(TASK_INTERRUPTIBLE); 1740 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1207 add_wait_queue(&q.waiter, &wait); 1741{
1208 /* 1742 struct hrtimer_sleeper timeout, *to = NULL;
1209 * !plist_node_empty() is safe here without any lock. 1743 struct restart_block *restart;
1210 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1744 struct futex_hash_bucket *hb;
1211 */ 1745 struct futex_q q;
1212 if (likely(!plist_node_empty(&q.list))) { 1746 int ret;
1213 if (!abs_time)
1214 schedule();
1215 else {
1216 hrtimer_init_on_stack(&t.timer,
1217 clockrt ? CLOCK_REALTIME :
1218 CLOCK_MONOTONIC,
1219 HRTIMER_MODE_ABS);
1220 hrtimer_init_sleeper(&t, current);
1221 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1222 current->timer_slack_ns);
1223
1224 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1225 if (!hrtimer_active(&t.timer))
1226 t.task = NULL;
1227 1747
1228 /* 1748 if (!bitset)
1229 * the timer could have already expired, in which 1749 return -EINVAL;
1230 * case current would be flagged for rescheduling.
1231 * Don't bother calling schedule.
1232 */
1233 if (likely(t.task))
1234 schedule();
1235 1750
1236 hrtimer_cancel(&t.timer); 1751 q.pi_state = NULL;
1752 q.bitset = bitset;
1753 q.rt_waiter = NULL;
1237 1754
1238 /* Flag if a timeout occured */ 1755 if (abs_time) {
1239 rem = (t.task == NULL); 1756 to = &timeout;
1240 1757
1241 destroy_hrtimer_on_stack(&t.timer); 1758 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
1242 } 1759 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1760 hrtimer_init_sleeper(to, current);
1761 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1762 current->timer_slack_ns);
1243 } 1763 }
1244 __set_current_state(TASK_RUNNING);
1245 1764
1246 /* 1765 /* Prepare to wait on uaddr. */
1247 * NOTE: we don't remove ourselves from the waitqueue because 1766 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1248 * we are the only user of it. 1767 if (ret)
1249 */ 1768 goto out;
1769
1770 /* queue_me and wait for wakeup, timeout, or a signal. */
1771 futex_wait_queue_me(hb, &q, to);
1250 1772
1251 /* If we were woken (and unqueued), we succeeded, whatever. */ 1773 /* If we were woken (and unqueued), we succeeded, whatever. */
1252 ret = 0; 1774 ret = 0;
1253 if (!unqueue_me(&q)) 1775 if (!unqueue_me(&q))
1254 goto out_put_key; 1776 goto out_put_key;
1255 ret = -ETIMEDOUT; 1777 ret = -ETIMEDOUT;
1256 if (rem) 1778 if (to && !to->task)
1257 goto out_put_key; 1779 goto out_put_key;
1258 1780
1259 /* 1781 /*
@@ -1270,7 +1792,7 @@ retry_private:
1270 restart->futex.val = val; 1792 restart->futex.val = val;
1271 restart->futex.time = abs_time->tv64; 1793 restart->futex.time = abs_time->tv64;
1272 restart->futex.bitset = bitset; 1794 restart->futex.bitset = bitset;
1273 restart->futex.flags = 0; 1795 restart->futex.flags = FLAGS_HAS_TIMEOUT;
1274 1796
1275 if (fshared) 1797 if (fshared)
1276 restart->futex.flags |= FLAGS_SHARED; 1798 restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1804,10 @@ retry_private:
1282out_put_key: 1804out_put_key:
1283 put_futex_key(fshared, &q.key); 1805 put_futex_key(fshared, &q.key);
1284out: 1806out:
1807 if (to) {
1808 hrtimer_cancel(&to->timer);
1809 destroy_hrtimer_on_stack(&to->timer);
1810 }
1285 return ret; 1811 return ret;
1286} 1812}
1287 1813
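To make the value-check contract that futex_wait_setup() documents concrete, here is a minimal userspace wait/wake pair. It is illustrative only and not part of the patch; wait_for_flag() and set_flag() are hypothetical names, and the atomics use GCC/Clang __atomic builtins.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <time.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
		      const struct timespec *timeout)
{
	/* OR FUTEX_PRIVATE_FLAG into op for a process-private (fshared=0) futex. */
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

/* Sleep until *flag becomes non-zero. */
static void wait_for_flag(uint32_t *flag)
{
	while (__atomic_load_n(flag, __ATOMIC_ACQUIRE) == 0) {
		/*
		 * The kernel only queues us if *flag still equals 0;
		 * otherwise it returns EWOULDBLOCK and we re-check. A
		 * zero return can also absorb a spurious wakeup, hence
		 * the loop.
		 */
		if (sys_futex(flag, FUTEX_WAIT, 0, NULL) == -1 &&
		    errno != EWOULDBLOCK && errno != EINTR)
			break;
	}
}

/* Publish the flag and wake all waiters. */
static void set_flag(uint32_t *flag)
{
	__atomic_store_n(flag, 1, __ATOMIC_RELEASE);
	sys_futex(flag, FUTEX_WAKE, INT_MAX, NULL);
}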
@@ -1290,13 +1816,16 @@ static long futex_wait_restart(struct restart_block *restart)
1290{ 1816{
1291 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1817 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1292 int fshared = 0; 1818 int fshared = 0;
1293 ktime_t t; 1819 ktime_t t, *tp = NULL;
1294 1820
1295 t.tv64 = restart->futex.time; 1821 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1822 t.tv64 = restart->futex.time;
1823 tp = &t;
1824 }
1296 restart->fn = do_no_restart_syscall; 1825 restart->fn = do_no_restart_syscall;
1297 if (restart->futex.flags & FLAGS_SHARED) 1826 if (restart->futex.flags & FLAGS_SHARED)
1298 fshared = 1; 1827 fshared = 1;
1299 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1828 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
1300 restart->futex.bitset, 1829 restart->futex.bitset,
1301 restart->futex.flags & FLAGS_CLOCKRT); 1830 restart->futex.flags & FLAGS_CLOCKRT);
1302} 1831}
@@ -1312,11 +1841,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1312 int detect, ktime_t *time, int trylock) 1841 int detect, ktime_t *time, int trylock)
1313{ 1842{
1314 struct hrtimer_sleeper timeout, *to = NULL; 1843 struct hrtimer_sleeper timeout, *to = NULL;
1315 struct task_struct *curr = current;
1316 struct futex_hash_bucket *hb; 1844 struct futex_hash_bucket *hb;
1317 u32 uval, newval, curval;
1318 struct futex_q q; 1845 struct futex_q q;
1319 int ret, lock_taken, ownerdied = 0; 1846 int res, ret;
1320 1847
1321 if (refill_pi_state_cache()) 1848 if (refill_pi_state_cache())
1322 return -ENOMEM; 1849 return -ENOMEM;
@@ -1330,6 +1857,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1330 } 1857 }
1331 1858
1332 q.pi_state = NULL; 1859 q.pi_state = NULL;
1860 q.rt_waiter = NULL;
1333retry: 1861retry:
1334 q.key = FUTEX_KEY_INIT; 1862 q.key = FUTEX_KEY_INIT;
1335 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1863 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1867,15 @@ retry:
1339retry_private: 1867retry_private:
1340 hb = queue_lock(&q); 1868 hb = queue_lock(&q);
1341 1869
1342retry_locked: 1870 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1343 ret = lock_taken = 0;
1344
1345 /*
1346 * To avoid races, we attempt to take the lock here again
1347 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1348 * the locks. It will most likely not succeed.
1349 */
1350 newval = task_pid_vnr(current);
1351
1352 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1353
1354 if (unlikely(curval == -EFAULT))
1355 goto uaddr_faulted;
1356
1357 /*
1358 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1359 * situation and we return success to user space.
1360 */
1361 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1362 ret = -EDEADLK;
1363 goto out_unlock_put_key;
1364 }
1365
1366 /*
1367 * Surprise - we got the lock. Just return to userspace:
1368 */
1369 if (unlikely(!curval))
1370 goto out_unlock_put_key;
1371
1372 uval = curval;
1373
1374 /*
1375 * Set the WAITERS flag, so the owner will know it has someone
1376 * to wake at next unlock
1377 */
1378 newval = curval | FUTEX_WAITERS;
1379
1380 /*
1381 * There are two cases, where a futex might have no owner (the
1382 * owner TID is 0): OWNER_DIED. We take over the futex in this
1383 * case. We also do an unconditional take over, when the owner
1384 * of the futex died.
1385 *
1386 * This is safe as we are protected by the hash bucket lock !
1387 */
1388 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1389 /* Keep the OWNER_DIED bit */
1390 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1391 ownerdied = 0;
1392 lock_taken = 1;
1393 }
1394
1395 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1396
1397 if (unlikely(curval == -EFAULT))
1398 goto uaddr_faulted;
1399 if (unlikely(curval != uval))
1400 goto retry_locked;
1401
1402 /*
1403 * We took the lock due to owner died take over.
1404 */
1405 if (unlikely(lock_taken))
1406 goto out_unlock_put_key;
1407
1408 /*
1409 * We dont have the lock. Look up the PI state (or create it if
1410 * we are the first waiter):
1411 */
1412 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1413
1414 if (unlikely(ret)) { 1871 if (unlikely(ret)) {
1415 switch (ret) { 1872 switch (ret) {
1416 1873 case 1:
1874 /* We got the lock. */
1875 ret = 0;
1876 goto out_unlock_put_key;
1877 case -EFAULT:
1878 goto uaddr_faulted;
1417 case -EAGAIN: 1879 case -EAGAIN:
1418 /* 1880 /*
1419 * Task is exiting and we just wait for the 1881 * Task is exiting and we just wait for the
@@ -1423,25 +1885,6 @@ retry_locked:
1423 put_futex_key(fshared, &q.key); 1885 put_futex_key(fshared, &q.key);
1424 cond_resched(); 1886 cond_resched();
1425 goto retry; 1887 goto retry;
1426
1427 case -ESRCH:
1428 /*
1429 * No owner found for this futex. Check if the
1430 * OWNER_DIED bit is set to figure out whether
1431 * this is a robust futex or not.
1432 */
1433 if (get_futex_value_locked(&curval, uaddr))
1434 goto uaddr_faulted;
1435
1436 /*
1437 * We simply start over in case of a robust
1438 * futex. The code above will take the futex
1439 * and return happy.
1440 */
1441 if (curval & FUTEX_OWNER_DIED) {
1442 ownerdied = 1;
1443 goto retry_locked;
1444 }
1445 default: 1888 default:
1446 goto out_unlock_put_key; 1889 goto out_unlock_put_key;
1447 } 1890 }
@@ -1465,71 +1908,21 @@ retry_locked:
1465 } 1908 }
1466 1909
1467 spin_lock(q.lock_ptr); 1910 spin_lock(q.lock_ptr);
1468 1911 /*
1469 if (!ret) { 1912 * Fixup the pi_state owner and possibly acquire the lock if we
1470 /* 1913 * haven't already.
1471 * Got the lock. We might not be the anticipated owner 1914 */
1472 * if we did a lock-steal - fix up the PI-state in 1915 res = fixup_owner(uaddr, fshared, &q, !ret);
1473 * that case: 1916 /*
1474 */ 1917 * If fixup_owner() returned an error, propagate that. If it acquired
1475 if (q.pi_state->owner != curr) 1918 * the lock, clear our -ETIMEDOUT or -EINTR.
1476 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1919 */
1477 } else { 1920 if (res)
1478 /* 1921 ret = (res < 0) ? res : 0;
1479 * Catch the rare case, where the lock was released
1480 * when we were on the way back before we locked the
1481 * hash bucket.
1482 */
1483 if (q.pi_state->owner == curr) {
1484 /*
1485 * Try to get the rt_mutex now. This might
1486 * fail as some other task acquired the
1487 * rt_mutex after we removed ourself from the
1488 * rt_mutex waiters list.
1489 */
1490 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1491 ret = 0;
1492 else {
1493 /*
1494 * pi_state is incorrect, some other
1495 * task did a lock steal and we
1496 * returned due to timeout or signal
1497 * without taking the rt_mutex. Too
1498 * late. We can access the
1499 * rt_mutex_owner without locking, as
1500 * the other task is now blocked on
1501 * the hash bucket lock. Fix the state
1502 * up.
1503 */
1504 struct task_struct *owner;
1505 int res;
1506
1507 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1508 res = fixup_pi_state_owner(uaddr, &q, owner,
1509 fshared);
1510
1511 /* propagate -EFAULT, if the fixup failed */
1512 if (res)
1513 ret = res;
1514 }
1515 } else {
1516 /*
1517 * Paranoia check. If we did not take the lock
1518 * in the trylock above, then we should not be
1519 * the owner of the rtmutex, neither the real
1520 * nor the pending one:
1521 */
1522 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1523 printk(KERN_ERR "futex_lock_pi: ret = %d "
1524 "pi-mutex: %p pi-state %p\n", ret,
1525 q.pi_state->pi_mutex.owner,
1526 q.pi_state->owner);
1527 }
1528 }
1529 1922
1530 /* 1923 /*
1531 * If fixup_pi_state_owner() faulted and was unable to handle the 1924 * If fixup_owner() faulted and was unable to handle the fault, unlock
1532 * fault, unlock it and return the fault to userspace. 1925 * it and return the fault to userspace.
1533 */ 1926 */
1534 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 1927 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1535 rt_mutex_unlock(&q.pi_state->pi_mutex); 1928 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1930,7 @@ retry_locked:
1537 /* Unqueue and drop the lock */ 1930 /* Unqueue and drop the lock */
1538 unqueue_me_pi(&q); 1931 unqueue_me_pi(&q);
1539 1932
1540 if (to) 1933 goto out;
1541 destroy_hrtimer_on_stack(&to->timer);
1542 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1543 1934
1544out_unlock_put_key: 1935out_unlock_put_key:
1545 queue_unlock(&q, hb); 1936 queue_unlock(&q, hb);
@@ -1549,19 +1940,12 @@ out_put_key:
1549out: 1940out:
1550 if (to) 1941 if (to)
1551 destroy_hrtimer_on_stack(&to->timer); 1942 destroy_hrtimer_on_stack(&to->timer);
1552 return ret; 1943 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1553 1944
1554uaddr_faulted: 1945uaddr_faulted:
1555 /*
1556 * We have to r/w *(int __user *)uaddr, and we have to modify it
1557 * atomically. Therefore, if we continue to fault after get_user()
1558 * below, we need to handle the fault ourselves, while still holding
1559 * the mmap_sem. This can occur if the uaddr is under contention as
1560 * we have to drop the mmap_sem in order to call get_user().
1561 */
1562 queue_unlock(&q, hb); 1946 queue_unlock(&q, hb);
1563 1947
1564 ret = get_user(uval, uaddr); 1948 ret = fault_in_user_writeable(uaddr);
1565 if (ret) 1949 if (ret)
1566 goto out_put_key; 1950 goto out_put_key;
1567 1951
@@ -1572,7 +1956,6 @@ uaddr_faulted:
1572 goto retry; 1956 goto retry;
1573} 1957}
1574 1958
1575
1576/* 1959/*
1577 * Userspace attempted a TID -> 0 atomic transition, and failed. 1960 * Userspace attempted a TID -> 0 atomic transition, and failed.
1578 * This is the in-kernel slowpath: we look up the PI state (if any), 1961 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1657,23 +2040,239 @@ out:
1657 return ret; 2040 return ret;
1658 2041
1659pi_faulted: 2042pi_faulted:
1660 /*
1661 * We have to r/w *(int __user *)uaddr, and we have to modify it
1662 * atomically. Therefore, if we continue to fault after get_user()
1663 * below, we need to handle the fault ourselves, while still holding
1664 * the mmap_sem. This can occur if the uaddr is under contention as
1665 * we have to drop the mmap_sem in order to call get_user().
1666 */
1667 spin_unlock(&hb->lock); 2043 spin_unlock(&hb->lock);
1668 put_futex_key(fshared, &key); 2044 put_futex_key(fshared, &key);
1669 2045
1670 ret = get_user(uval, uaddr); 2046 ret = fault_in_user_writeable(uaddr);
1671 if (!ret) 2047 if (!ret)
1672 goto retry; 2048 goto retry;
1673 2049
1674 return ret; 2050 return ret;
1675} 2051}
1676 2052
2053/**
2054 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2055 * @hb: the hash_bucket futex_q was originally enqueued on
2056 * @q: the futex_q woken while waiting to be requeued
2057 * @key2: the futex_key of the requeue target futex
2058 * @timeout: the timeout associated with the wait (NULL if none)
2059 *
2060 * Detect if the task was woken on the initial futex as opposed to the requeue
2061 * target futex. If so, determine if it was a timeout or a signal that caused
2062 * the wakeup and return the appropriate error code to the caller. Must be
2063 * called with the hb lock held.
2064 *
2065 * Returns:
2066 * 0 - no early wakeup detected
2067 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2068 */
2069static inline
2070int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2071 struct futex_q *q, union futex_key *key2,
2072 struct hrtimer_sleeper *timeout)
2073{
2074 int ret = 0;
2075
2076 /*
2077 * With the hb lock held, we avoid races while we process the wakeup.
2078 * We only need to hold hb (and not hb2) to ensure atomicity as the
2079 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2080 * It can't be requeued from uaddr2 to something else since we don't
2081 * support a PI aware source futex for requeue.
2082 */
2083 if (!match_futex(&q->key, key2)) {
2084 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2085 /*
2086 * We were woken prior to requeue by a timeout or a signal.
2087 * Unqueue the futex_q and determine which it was.
2088 */
2089 plist_del(&q->list, &q->list.plist);
2090 drop_futex_key_refs(&q->key);
2091
2092 if (timeout && !timeout->task)
2093 ret = -ETIMEDOUT;
2094 else
2095 ret = -ERESTARTNOINTR;
2096 }
2097 return ret;
2098}
2099
2100/**
2101 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2102 * @uaddr: the futex we initially wait on (non-pi)
2103 * @fshared: whether the futexes are shared (1) or not (0). They must be
2104 * the same type, no requeueing from private to shared, etc.
2105 * @val: the expected value of uaddr
2106 * @abs_time: absolute timeout
2107 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2108 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2109 * @uaddr2: the pi futex we will take prior to returning to user-space
2110 *
2111 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2112 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2113 * complete the acquisition of the rt_mutex prior to returning to userspace.
2114 * This ensures the rt_mutex maintains an owner when it has waiters; without
2115 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2116 * need to.
2117 *
2118 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2119 * via the following:
2120 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2121 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2122 * 3) signal (before or after requeue)
2123 * 4) timeout (before or after requeue)
2124 *
2125 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
2126 *
2127 * If 2, we may then block on trying to take the rt_mutex and return via:
2128 * 5) successful lock
2129 * 6) signal
2130 * 7) timeout
2131 * 8) other lock acquisition failure
2132 *
2133 * If 6, we setup a restart_block with futex_lock_pi() as the function.
2134 *
2135 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2136 *
2137 * Returns:
2138 * 0 - On success
2139 * <0 - On error
2140 */
2141static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2142 u32 val, ktime_t *abs_time, u32 bitset,
2143 int clockrt, u32 __user *uaddr2)
2144{
2145 struct hrtimer_sleeper timeout, *to = NULL;
2146 struct rt_mutex_waiter rt_waiter;
2147 struct rt_mutex *pi_mutex = NULL;
2148 struct futex_hash_bucket *hb;
2149 union futex_key key2;
2150 struct futex_q q;
2151 int res, ret;
2152
2153 if (!bitset)
2154 return -EINVAL;
2155
2156 if (abs_time) {
2157 to = &timeout;
2158 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2159 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2160 hrtimer_init_sleeper(to, current);
2161 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2162 current->timer_slack_ns);
2163 }
2164
2165 /*
2166 * The waiter is allocated on our stack, manipulated by the requeue
2167 * code while we sleep on uaddr.
2168 */
2169 debug_rt_mutex_init_waiter(&rt_waiter);
2170 rt_waiter.task = NULL;
2171
2172 q.pi_state = NULL;
2173 q.bitset = bitset;
2174 q.rt_waiter = &rt_waiter;
2175
2176 key2 = FUTEX_KEY_INIT;
2177 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2178 if (unlikely(ret != 0))
2179 goto out;
2180
2181 /* Prepare to wait on uaddr. */
2182 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2183 if (ret)
2184 goto out_key2;
2185
2186 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2187 futex_wait_queue_me(hb, &q, to);
2188
2189 spin_lock(&hb->lock);
2190 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2191 spin_unlock(&hb->lock);
2192 if (ret)
2193 goto out_put_keys;
2194
2195 /*
2196 * In order for us to be here, we know our q.key == key2, and since
2197 * we took the hb->lock above, we also know that futex_requeue() has
2198 * completed and we no longer have to concern ourselves with a wakeup
2199 * race with the atomic proxy lock acquisition by the requeue code.
2200 */
2201
2202 /* Check if the requeue code acquired the second futex for us. */
2203 if (!q.rt_waiter) {
2204 /*
2205 * Got the lock. We might not be the anticipated owner if we
2206 * did a lock-steal - fix up the PI-state in that case.
2207 */
2208 if (q.pi_state && (q.pi_state->owner != current)) {
2209 spin_lock(q.lock_ptr);
2210 ret = fixup_pi_state_owner(uaddr2, &q, current,
2211 fshared);
2212 spin_unlock(q.lock_ptr);
2213 }
2214 } else {
2215 /*
2216 * We have been woken up by futex_unlock_pi(), a timeout, or a
2217 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2218 * the pi_state.
2219 */
2220 WARN_ON(!q.pi_state);
2221 pi_mutex = &q.pi_state->pi_mutex;
2222 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2223 debug_rt_mutex_free_waiter(&rt_waiter);
2224
2225 spin_lock(q.lock_ptr);
2226 /*
2227 * Fixup the pi_state owner and possibly acquire the lock if we
2228 * haven't already.
2229 */
2230 res = fixup_owner(uaddr2, fshared, &q, !ret);
2231 /*
2232 * If fixup_owner() returned an error, propagate that. If it
2233 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2234 */
2235 if (res)
2236 ret = (res < 0) ? res : 0;
2237
2238 /* Unqueue and drop the lock. */
2239 unqueue_me_pi(&q);
2240 }
2241
2242 /*
2243 * If fixup_pi_state_owner() faulted and was unable to handle the
2244 * fault, unlock the rt_mutex and return the fault to userspace.
2245 */
2246 if (ret == -EFAULT) {
2247 if (rt_mutex_owner(pi_mutex) == current)
2248 rt_mutex_unlock(pi_mutex);
2249 } else if (ret == -EINTR) {
2250 /*
2251 * We've already been requeued, but we have no way to
2252 * restart by calling futex_lock_pi() directly. We
2253 * could restart the syscall, but that will look at
2254 * the user space value and return right away. So we
2255 * drop back with EWOULDBLOCK to tell user space that
2256 * "val" has been changed. That's the same as what a
2257 * restart of the syscall would do in
2258 * futex_wait_setup().
2259 */
2260 ret = -EWOULDBLOCK;
2261 }
2262
2263out_put_keys:
2264 put_futex_key(fshared, &q.key);
2265out_key2:
2266 put_futex_key(fshared, &key2);
2267
2268out:
2269 if (to) {
2270 hrtimer_cancel(&to->timer);
2271 destroy_hrtimer_on_stack(&to->timer);
2272 }
2273 return ret;
2274}
2275
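The waiter half of the protocol described in the comment above might look roughly like this from userspace. It is illustrative only and not part of the patch; cond_wait_requeue_pi(), cond, seq, mutex and abs_timeout are hypothetical names, and the fallback #define assumes the op code this series adds to <linux/futex.h>.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <time.h>

#ifndef FUTEX_WAIT_REQUEUE_PI
#define FUTEX_WAIT_REQUEUE_PI	11	/* assumed value from this patch series */
#endif

/*
 * Illustrative sketch only. cond holds a sequence count sampled while
 * holding mutex; mutex is a PI futex (owner TID in its value). On a
 * successful return the calling thread owns *mutex, exactly as if it
 * had done FUTEX_LOCK_PI on it.
 */
static long cond_wait_requeue_pi(uint32_t *cond, uint32_t seq,
				 uint32_t *mutex,
				 const struct timespec *abs_timeout)
{
	/*
	 * val3 is ignored: do_futex() forces it to FUTEX_BITSET_MATCH_ANY
	 * for this op. The timeout, if non-NULL, is absolute (no
	 * relative-to-absolute conversion is done for this command).
	 */
	return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, seq,
		       abs_timeout, mutex, 0);
}

The two ops are meant to be used strictly as a pair: futex_requeue() returns -EINVAL if it finds a waiter that is not set up for requeue_pi, and vice versa.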
1677/* 2276/*
1678 * Support for robust futexes: the kernel cleans up held futexes at 2277 * Support for robust futexes: the kernel cleans up held futexes at
1679 * thread exit time. 2278 * thread exit time.
@@ -1896,7 +2495,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1896 fshared = 1; 2495 fshared = 1;
1897 2496
1898 clockrt = op & FUTEX_CLOCK_REALTIME; 2497 clockrt = op & FUTEX_CLOCK_REALTIME;
1899 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2498 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
1900 return -ENOSYS; 2499 return -ENOSYS;
1901 2500
1902 switch (cmd) { 2501 switch (cmd) {
@@ -1911,10 +2510,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1911 ret = futex_wake(uaddr, fshared, val, val3); 2510 ret = futex_wake(uaddr, fshared, val, val3);
1912 break; 2511 break;
1913 case FUTEX_REQUEUE: 2512 case FUTEX_REQUEUE:
1914 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2513 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
1915 break; 2514 break;
1916 case FUTEX_CMP_REQUEUE: 2515 case FUTEX_CMP_REQUEUE:
1917 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2516 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2517 0);
1918 break; 2518 break;
1919 case FUTEX_WAKE_OP: 2519 case FUTEX_WAKE_OP:
1920 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2520 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2531,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1931 if (futex_cmpxchg_enabled) 2531 if (futex_cmpxchg_enabled)
1932 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2532 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
1933 break; 2533 break;
2534 case FUTEX_WAIT_REQUEUE_PI:
2535 val3 = FUTEX_BITSET_MATCH_ANY;
2536 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2537 clockrt, uaddr2);
2538 break;
2539 case FUTEX_CMP_REQUEUE_PI:
2540 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2541 1);
2542 break;
1934 default: 2543 default:
1935 ret = -ENOSYS; 2544 ret = -ENOSYS;
1936 } 2545 }
@@ -1948,7 +2557,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1948 int cmd = op & FUTEX_CMD_MASK; 2557 int cmd = op & FUTEX_CMD_MASK;
1949 2558
1950 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2559 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
1951 cmd == FUTEX_WAIT_BITSET)) { 2560 cmd == FUTEX_WAIT_BITSET ||
2561 cmd == FUTEX_WAIT_REQUEUE_PI)) {
1952 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2562 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1953 return -EFAULT; 2563 return -EFAULT;
1954 if (!timespec_valid(&ts)) 2564 if (!timespec_valid(&ts))
@@ -1960,11 +2570,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1960 tp = &t; 2570 tp = &t;
1961 } 2571 }
1962 /* 2572 /*
1963 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2573 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
1964 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2574 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1965 */ 2575 */
1966 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2576 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
1967 cmd == FUTEX_WAKE_OP) 2577 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
1968 val2 = (u32) (unsigned long) utime; 2578 val2 = (u32) (unsigned long) utime;
1969 2579
1970 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2580 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000000..22e9dcfaa3d3
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
1menu "GCOV-based kernel profiling"
2
3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS
6 default n
7 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage
9 measurements).
10
11 If unsure, say N.
12
13 Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
14 for the entire kernel. To enable profiling for specific files or
15 directories, add a line similar to the following to the respective
16 Makefile:
17
18 For a single file (e.g. main.o):
19 GCOV_PROFILE_main.o := y
20
21 For all files in one directory:
22 GCOV_PROFILE := y
23
24 To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
25 is specified, use:
26
27 GCOV_PROFILE_main.o := n
28 and:
29 GCOV_PROFILE := n
30
31 Note that the debugfs filesystem has to be mounted to access
32 profiling data.
33
34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL
37 depends on S390 || X86
38 default n
39 ---help---
40 This option activates profiling for the entire kernel.
41
42 If unsure, say N.
43
44 Note that a kernel compiled with profiling flags will be significantly
45 larger and run slower. Also be sure to exclude files from profiling
46 which are not linked to the kernel image to prevent linker errors.
47
48endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000000..3f761001d517
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000000..9b22d03cc581
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
1/*
2 * This code maintains a list of active profiling data structures.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 */
15
16#define pr_fmt(fmt) "gcov: " fmt
17
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include "gcov.h"
22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock);
26
27/*
28 * __gcov_init is called by gcc-generated constructor code for each object
29 * file compiled with -fprofile-arcs.
30 */
31void __gcov_init(struct gcov_info *info)
32{
33 static unsigned int gcov_version;
34
35 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) {
37 gcov_version = info->version;
38 /*
39 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports.
41 */
42 pr_info("version magic: 0x%x\n", gcov_version);
43 }
44 /*
45 * Add new profiling data structure to list and inform event
46 * listener.
47 */
48 info->next = gcov_info_head;
49 gcov_info_head = info;
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84/**
85 * gcov_enable_events - enable event reporting through gcov_event()
86 *
87 * Turn on reporting of profiling data load/unload-events through the
88 * gcov_event() callback. Also replay all previous events once. This function
89 * is needed because some events are potentially generated too early for the
90 * callback implementation to handle them initially.
91 */
92void gcov_enable_events(void)
93{
94 struct gcov_info *info;
95
96 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1;
98 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next)
100 gcov_event(GCOV_ADD, info);
101 mutex_unlock(&gcov_lock);
102}
103
104#ifdef CONFIG_MODULES
105static inline int within(void *addr, void *start, unsigned long size)
106{
107 return ((addr >= start) && (addr < start + size));
108}
109
110/* Update list and generate events when modules are unloaded. */
111static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data)
113{
114 struct module *mod = data;
115 struct gcov_info *info;
116 struct gcov_info *prev;
117
118 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK;
120 mutex_lock(&gcov_lock);
121 prev = NULL;
122 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) {
124 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev)
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info);
131 } else
132 prev = info;
133 }
134 mutex_unlock(&gcov_lock);
135
136 return NOTIFY_OK;
137}
138
139static struct notifier_block gcov_nb = {
140 .notifier_call = gcov_module_notifier,
141};
142
143static int __init gcov_init(void)
144{
145 return register_module_notifier(&gcov_nb);
146}
147device_initcall(gcov_init);
148#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000000..ef3c3f88a7a3
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
1/*
2 * This code exports profiling data as debugfs files to userspace.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 * Yi CDL Yang
15 */
16
17#define pr_fmt(fmt) "gcov: " fmt
18
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/debugfs.h>
22#include <linux/fs.h>
23#include <linux/list.h>
24#include <linux/string.h>
25#include <linux/slab.h>
26#include <linux/mutex.h>
27#include <linux/seq_file.h>
28#include "gcov.h"
29
30/**
31 * struct gcov_node - represents a debugfs entry
32 * @list: list head for child node list
33 * @children: child nodes
34 * @all: list head for list of all nodes
35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory
37 * @ghost: when an object file containing profiling data is unloaded we keep a
38 * copy of the profiling data here to allow collecting coverage data
39 * for cleanup code. Such a node is called a "ghost".
40 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links
42 * @name: data file basename
43 *
44 * struct gcov_node represents an entity within the gcov/ subdirectory
45 * of debugfs. There are directory and data file nodes. The latter represent
46 * the actual synthesized data file plus any associated symbolic links which
47 * are needed by the gcov tool to work correctly.
48 */
49struct gcov_node {
50 struct list_head list;
51 struct list_head children;
52 struct list_head all;
53 struct gcov_node *parent;
54 struct gcov_info *info;
55 struct gcov_info *ghost;
56 struct dentry *dentry;
57 struct dentry **links;
58 char name[0];
59};
60
61static const char objtree[] = OBJTREE;
62static const char srctree[] = SRCTREE;
63static struct gcov_node root_node;
64static struct dentry *reset_dentry;
65static LIST_HEAD(all_head);
66static DEFINE_MUTEX(node_lock);
67
68/* If non-zero, keep copies of profiling data for unloaded modules. */
69static int gcov_persist = 1;
70
71static int __init gcov_persist_setup(char *str)
72{
73 unsigned long val;
74
75 if (strict_strtoul(str, 0, &val)) {
76 pr_warning("invalid gcov_persist parameter '%s'\n", str);
77 return 0;
78 }
79 gcov_persist = val;
80 pr_info("setting gcov_persist to %d\n", gcov_persist);
81
82 return 1;
83}
84__setup("gcov_persist=", gcov_persist_setup);
85
86/*
87 * seq_file.start() implementation for gcov data files. Note that the
88 * gcov_iterator interface is designed to be more restrictive than seq_file
89 * (no start from arbitrary position, etc.), to simplify the iterator
90 * implementation.
91 */
92static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
93{
94 loff_t i;
95
96 gcov_iter_start(seq->private);
97 for (i = 0; i < *pos; i++) {
98 if (gcov_iter_next(seq->private))
99 return NULL;
100 }
101 return seq->private;
102}
103
104/* seq_file.next() implementation for gcov data files. */
105static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
106{
107 struct gcov_iterator *iter = data;
108
109 if (gcov_iter_next(iter))
110 return NULL;
111 (*pos)++;
112
113 return iter;
114}
115
116/* seq_file.show() implementation for gcov data files. */
117static int gcov_seq_show(struct seq_file *seq, void *data)
118{
119 struct gcov_iterator *iter = data;
120
121 if (gcov_iter_write(iter, seq))
122 return -EINVAL;
123 return 0;
124}
125
126static void gcov_seq_stop(struct seq_file *seq, void *data)
127{
128 /* Unused. */
129}
130
131static const struct seq_operations gcov_seq_ops = {
132 .start = gcov_seq_start,
133 .next = gcov_seq_next,
134 .show = gcov_seq_show,
135 .stop = gcov_seq_stop,
136};
137
138/*
139 * Return the profiling data set for a given node. This can either be the
140 * original profiling data structure or a duplicate (also called "ghost")
141 * in case the associated object file has been unloaded.
142 */
143static struct gcov_info *get_node_info(struct gcov_node *node)
144{
145 if (node->info)
146 return node->info;
147
148 return node->ghost;
149}
150
151/*
152 * open() implementation for gcov data files. Create a copy of the profiling
153 * data set and initialize the iterator and seq_file interface.
154 */
155static int gcov_seq_open(struct inode *inode, struct file *file)
156{
157 struct gcov_node *node = inode->i_private;
158 struct gcov_iterator *iter;
159 struct seq_file *seq;
160 struct gcov_info *info;
161 int rc = -ENOMEM;
162
163 mutex_lock(&node_lock);
164 /*
165 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access.
167 */
168 info = gcov_info_dup(get_node_info(node));
169 if (!info)
170 goto out_unlock;
171 iter = gcov_iter_new(info);
172 if (!iter)
173 goto err_free_info;
174 rc = seq_open(file, &gcov_seq_ops);
175 if (rc)
176 goto err_free_iter_info;
177 seq = file->private_data;
178 seq->private = iter;
179out_unlock:
180 mutex_unlock(&node_lock);
181 return rc;
182
183err_free_iter_info:
184 gcov_iter_free(iter);
185err_free_info:
186 gcov_info_free(info);
187 goto out_unlock;
188}
189
190/*
191 * release() implementation for gcov data files. Release resources allocated
192 * by open().
193 */
194static int gcov_seq_release(struct inode *inode, struct file *file)
195{
196 struct gcov_iterator *iter;
197 struct gcov_info *info;
198 struct seq_file *seq;
199
200 seq = file->private_data;
201 iter = seq->private;
202 info = gcov_iter_get_info(iter);
203 gcov_iter_free(iter);
204 gcov_info_free(info);
205 seq_release(inode, file);
206
207 return 0;
208}
209
210/*
211 * Find a node by the associated data file name. Needs to be called with
212 * node_lock held.
213 */
214static struct gcov_node *get_node_by_name(const char *name)
215{
216 struct gcov_node *node;
217 struct gcov_info *info;
218
219 list_for_each_entry(node, &all_head, all) {
220 info = get_node_info(node);
221 if (info && (strcmp(info->filename, name) == 0))
222 return node;
223 }
224
225 return NULL;
226}
227
228static void remove_node(struct gcov_node *node);
229
230/*
231 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is
233 * a "ghost" node), remove the debugfs node as well.
234 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos)
237{
238 struct seq_file *seq;
239 struct gcov_info *info;
240 struct gcov_node *node;
241
242 seq = file->private_data;
243 info = gcov_iter_get_info(seq->private);
244 mutex_lock(&node_lock);
245 node = get_node_by_name(info->filename);
246 if (node) {
247 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost)
249 remove_node(node);
250 else
251 gcov_info_reset(node->info);
252 }
253 /* Reset counts for open file. */
254 gcov_info_reset(info);
255 mutex_unlock(&node_lock);
256
257 return len;
258}
259
260/*
261 * Given a string <path> representing a file path of format:
262 * path/to/file.gcda
263 * construct and return a new string:
264 * <dir/>path/to/file.<ext>
265 */
266static char *link_target(const char *dir, const char *path, const char *ext)
267{
268 char *target;
269 char *old_ext;
270 char *copy;
271
272 copy = kstrdup(path, GFP_KERNEL);
273 if (!copy)
274 return NULL;
275 old_ext = strrchr(copy, '.');
276 if (old_ext)
277 *old_ext = '\0';
278 if (dir)
279 target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
280 else
281 target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
282 kfree(copy);
283
284 return target;
285}
286
287/*
288 * Construct a string representing the symbolic link target for the given
289 * gcov data file name and link type. Depending on the link type and the
290 * location of the data file, the link target can either point to a
291 * subdirectory of srctree, objtree or in an external location.
292 */
293static char *get_link_target(const char *filename, const struct gcov_link *ext)
294{
295 const char *rel;
296 char *result;
297
298 if (strncmp(filename, objtree, strlen(objtree)) == 0) {
299 rel = filename + strlen(objtree) + 1;
300 if (ext->dir == SRC_TREE)
301 result = link_target(srctree, rel, ext->ext);
302 else
303 result = link_target(objtree, rel, ext->ext);
304 } else {
305 /* External compilation. */
306 result = link_target(NULL, filename, ext->ext);
307 }
308
309 return result;
310}
311
312#define SKEW_PREFIX ".tmp_"
313
314/*
315 * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
316 * for filename skewing caused by the mod-versioning mechanism.
317 */
318static const char *deskew(const char *basename)
319{
320 if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
321 return basename + sizeof(SKEW_PREFIX) - 1;
322 return basename;
323}
324
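A user-space sketch of the two path helpers above, with strdup()/asprintf() standing in for kstrdup()/kasprintf(); the example paths are made up and only illustrate the expected output:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SKEW_PREFIX ".tmp_"

/* Same logic as the kernel deskew() above. */
static const char *deskew(const char *basename)
{
        if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
                return basename + sizeof(SKEW_PREFIX) - 1;
        return basename;
}

/* User-space port of link_target(): swap the extension, prefix a directory. */
static char *link_target(const char *dir, const char *path, const char *ext)
{
        char *target = NULL;
        char *copy = strdup(path);
        char *old_ext;

        if (!copy)
                return NULL;
        old_ext = strrchr(copy, '.');
        if (old_ext)
                *old_ext = '\0';
        if (asprintf(&target, "%s%s%s.%s", dir ? dir : "", dir ? "/" : "",
                     copy, ext) < 0)
                target = NULL;
        free(copy);
        return target;
}

int main(void)
{
        char *t = link_target("/usr/src/linux", "kernel/gcov/.tmp_fs.gcda",
                              "gcno");

        if (t) {
                /* prints: /usr/src/linux/kernel/gcov/.tmp_fs.gcno -> fs.gcno */
                printf("%s -> %s\n", t, deskew(strrchr(t, '/') + 1));
                free(t);
        }
        return 0;
}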
325/*
326 * Create links to additional files (usually .c and .gcno files) which the
327 * gcov tool expects to find in the same directory as the gcov data file.
328 */
329static void add_links(struct gcov_node *node, struct dentry *parent)
330{
331 char *basename;
332 char *target;
333 int num;
334 int i;
335
336 for (num = 0; gcov_link[num].ext; num++)
337 /* Nothing. */;
338 node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
339 if (!node->links)
340 return;
341 for (i = 0; i < num; i++) {
342 target = get_link_target(get_node_info(node)->filename,
343 &gcov_link[i]);
344 if (!target)
345 goto out_err;
346 basename = strrchr(target, '/');
347 if (!basename)
348 goto out_err;
349 basename++;
350 node->links[i] = debugfs_create_symlink(deskew(basename),
351 parent, target);
352 if (!node->links[i])
353 goto out_err;
354 kfree(target);
355 }
356
357 return;
358out_err:
359 kfree(target);
360 while (i-- > 0)
361 debugfs_remove(node->links[i]);
362 kfree(node->links);
363 node->links = NULL;
364}
365
366static const struct file_operations gcov_data_fops = {
367 .open = gcov_seq_open,
368 .release = gcov_seq_release,
369 .read = seq_read,
370 .llseek = seq_lseek,
371 .write = gcov_seq_write,
372};
373
374/* Basic initialization of a new node. */
375static void init_node(struct gcov_node *node, struct gcov_info *info,
376 const char *name, struct gcov_node *parent)
377{
378 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all);
381 node->info = info;
382 node->parent = parent;
383 if (name)
384 strcpy(node->name, name);
385}
386
387/*
388 * Create a new node and associated debugfs entry. Needs to be called with
389 * node_lock held.
390 */
391static struct gcov_node *new_node(struct gcov_node *parent,
392 struct gcov_info *info, const char *name)
393{
394 struct gcov_node *node;
395
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) {
398 pr_warning("out of memory\n");
399 return NULL;
400 }
401 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */
403 if (info) {
404 node->dentry = debugfs_create_file(deskew(node->name), 0600,
405 parent->dentry, node, &gcov_data_fops);
406 } else
407 node->dentry = debugfs_create_dir(node->name, parent->dentry);
408 if (!node->dentry) {
409 pr_warning("could not create file\n");
410 kfree(node);
411 return NULL;
412 }
413 if (info)
414 add_links(node, parent->dentry);
415 list_add(&node->list, &parent->children);
416 list_add(&node->all, &all_head);
417
418 return node;
419}
420
421/* Remove symbolic links associated with node. */
422static void remove_links(struct gcov_node *node)
423{
424 int i;
425
426 if (!node->links)
427 return;
428 for (i = 0; gcov_link[i].ext; i++)
429 debugfs_remove(node->links[i]);
430 kfree(node->links);
431 node->links = NULL;
432}
433
434/*
435 * Remove node from all lists and debugfs and release associated resources.
436 * Needs to be called with node_lock held.
437 */
438static void release_node(struct gcov_node *node)
439{
440 list_del(&node->list);
441 list_del(&node->all);
442 debugfs_remove(node->dentry);
443 remove_links(node);
444 if (node->ghost)
445 gcov_info_free(node->ghost);
446 kfree(node);
447}
448
449/* Release node and empty parents. Needs to be called with node_lock held. */
450static void remove_node(struct gcov_node *node)
451{
452 struct gcov_node *parent;
453
454 while ((node != &root_node) && list_empty(&node->children)) {
455 parent = node->parent;
456 release_node(node);
457 node = parent;
458 }
459}
460
461/*
462 * Find child node with given basename. Needs to be called with node_lock
463 * held.
464 */
465static struct gcov_node *get_child_by_name(struct gcov_node *parent,
466 const char *name)
467{
468 struct gcov_node *node;
469
470 list_for_each_entry(node, &parent->children, list) {
471 if (strcmp(node->name, name) == 0)
472 return node;
473 }
474
475 return NULL;
476}
477
478/*
479 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes.
481 */
482static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos)
484{
485 struct gcov_node *node;
486
487 mutex_lock(&node_lock);
488restart:
489 list_for_each_entry(node, &all_head, all) {
490 if (node->info)
491 gcov_info_reset(node->info);
492 else if (list_empty(&node->children)) {
493 remove_node(node);
494 /* Several nodes may have gone - restart loop. */
495 goto restart;
496 }
497 }
498 mutex_unlock(&node_lock);
499
500 return len;
501}
502
503/* read() implementation for reset file. Unused. */
504static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
505 loff_t *pos)
506{
507 /* Allow read operation so that a recursive copy won't fail. */
508 return 0;
509}
510
511static const struct file_operations gcov_reset_fops = {
512 .write = reset_write,
513 .read = reset_read,
514};
515
516/*
517 * Create a node for a given profiling data set and add it to all lists and
518 * debugfs. Needs to be called with node_lock held.
519 */
520static void add_node(struct gcov_info *info)
521{
522 char *filename;
523 char *curr;
524 char *next;
525 struct gcov_node *parent;
526 struct gcov_node *node;
527
528 filename = kstrdup(info->filename, GFP_KERNEL);
529 if (!filename)
530 return;
531 parent = &root_node;
532 /* Create directory nodes along the path. */
533 for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
534 if (curr == next)
535 continue;
536 *next = 0;
537 if (strcmp(curr, ".") == 0)
538 continue;
539 if (strcmp(curr, "..") == 0) {
540 if (!parent->parent)
541 goto err_remove;
542 parent = parent->parent;
543 continue;
544 }
545 node = get_child_by_name(parent, curr);
546 if (!node) {
547 node = new_node(parent, NULL, curr);
548 if (!node)
549 goto err_remove;
550 }
551 parent = node;
552 }
553 /* Create file node. */
554 node = new_node(parent, info, curr);
555 if (!node)
556 goto err_remove;
557out:
558 kfree(filename);
559 return;
560
561err_remove:
562 remove_node(parent);
563 goto out;
564}
565
566/*
567 * The profiling data set associated with this node is being unloaded. Store a
568 * copy of the profiling data and turn this node into a "ghost".
569 */
570static int ghost_node(struct gcov_node *node)
571{
572 node->ghost = gcov_info_dup(node->info);
573 if (!node->ghost) {
574 pr_warning("could not save data for '%s' (out of memory)\n",
575 node->info->filename);
576 return -ENOMEM;
577 }
578 node->info = NULL;
579
580 return 0;
581}
582
583/*
584 * Profiling data for this node has been loaded again. Add profiling data
585 * from previous instantiation and turn this node into a regular node.
586 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info)
588{
589 if (gcov_info_is_compatible(node->ghost, info))
590 gcov_info_add(info, node->ghost);
591 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n",
593 info->filename);
594 }
595 gcov_info_free(node->ghost);
596 node->ghost = NULL;
597 node->info = info;
598}
599
600/*
601 * Callback to create/remove profiling files when code compiled with
602 * -fprofile-arcs is loaded/unloaded.
603 */
604void gcov_event(enum gcov_action action, struct gcov_info *info)
605{
606 struct gcov_node *node;
607
608 mutex_lock(&node_lock);
609 node = get_node_by_name(info->filename);
610 switch (action) {
611 case GCOV_ADD:
612 /* Add new node or revive ghost. */
613 if (!node) {
614 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break;
624 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */
626 if (!node) {
627 pr_warning("could not remove '%s' (not found)\n",
628 info->filename);
629 break;
630 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break;
637 }
638 mutex_unlock(&node_lock);
639}
640
641/* Create debugfs entries. */
642static __init int gcov_fs_init(void)
643{
644 int rc = -EIO;
645
646 init_node(&root_node, NULL, NULL, NULL);
647 /*
648 * /sys/kernel/debug/gcov will be parent for the reset control file
649 * and all profiling files.
650 */
651 root_node.dentry = debugfs_create_dir("gcov", NULL);
652 if (!root_node.dentry)
653 goto err_remove;
654 /*
655 * Create reset file which resets all profiling counts when written
656 * to.
657 */
658 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
659 NULL, &gcov_reset_fops);
660 if (!reset_dentry)
661 goto err_remove;
662 /* Replay previous events to get our fs hierarchy up-to-date. */
663 gcov_enable_events();
664 return 0;
665
666err_remove:
667 pr_err("init failed\n");
668 if (root_node.dentry)
669 debugfs_remove(root_node.dentry);
670
671 return rc;
672}
673device_initcall(gcov_fs_init);
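To make the debugfs interface above concrete, here is a minimal user-space sketch that exercises it. It assumes debugfs is mounted at /sys/kernel/debug and that the kernel was built with gcov profiling enabled; the .gcda path used below is hypothetical and depends on the build tree layout.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        /* Writing anything to "reset" zeroes all profiling counts. */
        fd = open("/sys/kernel/debug/gcov/reset", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "1", 1) != 1)
                        perror("write reset");
                close(fd);
        }

        /* Data files are served via seq_file; reading returns .gcda content.
         * The path below is a hypothetical example mirroring a build dir. */
        fd = open("/sys/kernel/debug/gcov/usr/src/linux/kernel/gcov/fs.gcda",
                  O_RDONLY);
        if (fd < 0) {
                perror("open gcov data file");
                return 1;
        }
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        close(fd);
        return 0;
}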
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000000..ae5bb4260033
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 3.4. Future versions of gcc may change the gcov
4 * format (as happened before), so all format-specific information needs
5 * to be kept modular and easily exchangeable.
6 *
7 * This file is based on gcc-internal definitions. Functions and data
8 * structures are defined to be compatible with gcc counterparts.
9 * For a better understanding, refer to gcc source: gcc/gcov-io.h.
10 *
11 * Copyright IBM Corp. 2009
12 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 *
14 * Uses gcc-internal data definitions.
15 */
16
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/string.h>
20#include <linux/seq_file.h>
21#include <linux/vmalloc.h>
22#include "gcov.h"
23
24/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
27 { 0, NULL},
28};
29
30/*
31 * Determine whether a counter is active. Based on gcc magic. Doesn't change
32 * at run-time.
33 */
34static int counter_active(struct gcov_info *info, unsigned int type)
35{
36 return (1 << type) & info->ctr_mask;
37}
38
39/* Determine number of active counters. Based on gcc magic. */
40static unsigned int num_counter_active(struct gcov_info *info)
41{
42 unsigned int i;
43 unsigned int result = 0;
44
45 for (i = 0; i < GCOV_COUNTERS; i++) {
46 if (counter_active(info, i))
47 result++;
48 }
49 return result;
50}
51
52/**
53 * gcov_info_reset - reset profiling data to zero
54 * @info: profiling data set
55 */
56void gcov_info_reset(struct gcov_info *info)
57{
58 unsigned int active = num_counter_active(info);
59 unsigned int i;
60
61 for (i = 0; i < active; i++) {
62 memset(info->counts[i].values, 0,
63 info->counts[i].num * sizeof(gcov_type));
64 }
65}
66
67/**
68 * gcov_info_is_compatible - check if profiling data can be added
69 * @info1: first profiling data set
70 * @info2: second profiling data set
71 *
72 * Returns non-zero if profiling data can be added, zero otherwise.
73 */
74int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
75{
76 return (info1->stamp == info2->stamp);
77}
78
79/**
80 * gcov_info_add - add up profiling data
81 * @dest: profiling data set to which data is added
82 * @source: profiling data set which is added
83 *
84 * Adds profiling counts of @source to @dest.
85 */
86void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
87{
88 unsigned int i;
89 unsigned int j;
90
91 for (i = 0; i < num_counter_active(dest); i++) {
92 for (j = 0; j < dest->counts[i].num; j++) {
93 dest->counts[i].values[j] +=
94 source->counts[i].values[j];
95 }
96 }
97}
98
99/* Get size of function info entry. Based on gcc magic. */
100static size_t get_fn_size(struct gcov_info *info)
101{
102 size_t size;
103
104 size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
105 sizeof(unsigned int);
106 if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
107 size = ALIGN(size, __alignof__(struct gcov_fn_info));
108 return size;
109}
110
111/* Get address of function info entry. Based on gcc magic. */
112static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
113{
114 return (struct gcov_fn_info *)
115 ((char *) info->functions + fn * get_fn_size(info));
116}
117
118/**
119 * gcov_info_dup - duplicate profiling data set
120 * @info: profiling data set to duplicate
121 *
122 * Return newly allocated duplicate on success, %NULL on error.
123 */
124struct gcov_info *gcov_info_dup(struct gcov_info *info)
125{
126 struct gcov_info *dup;
127 unsigned int i;
128 unsigned int active;
129
130 /* Duplicate gcov_info. */
131 active = num_counter_active(info);
132 dup = kzalloc(sizeof(struct gcov_info) +
133 sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
134 if (!dup)
135 return NULL;
136 dup->version = info->version;
137 dup->stamp = info->stamp;
138 dup->n_functions = info->n_functions;
139 dup->ctr_mask = info->ctr_mask;
140 /* Duplicate filename. */
141 dup->filename = kstrdup(info->filename, GFP_KERNEL);
142 if (!dup->filename)
143 goto err_free;
144 /* Duplicate table of functions. */
145 dup->functions = kmemdup(info->functions, info->n_functions *
146 get_fn_size(info), GFP_KERNEL);
147 if (!dup->functions)
148 goto err_free;
149 /* Duplicate counter arrays. */
150 for (i = 0; i < active ; i++) {
151 struct gcov_ctr_info *ctr = &info->counts[i];
152 size_t size = ctr->num * sizeof(gcov_type);
153
154 dup->counts[i].num = ctr->num;
155 dup->counts[i].merge = ctr->merge;
156 dup->counts[i].values = vmalloc(size);
157 if (!dup->counts[i].values)
158 goto err_free;
159 memcpy(dup->counts[i].values, ctr->values, size);
160 }
161 return dup;
162
163err_free:
164 gcov_info_free(dup);
165 return NULL;
166}
167
168/**
169 * gcov_info_free - release memory for profiling data set duplicate
170 * @info: profiling data set duplicate to free
171 */
172void gcov_info_free(struct gcov_info *info)
173{
174 unsigned int active = num_counter_active(info);
175 unsigned int i;
176
177 for (i = 0; i < active ; i++)
178 vfree(info->counts[i].values);
179 kfree(info->functions);
180 kfree(info->filename);
181 kfree(info);
182}
183
184/**
185 * struct type_info - iterator helper array
186 * @ctr_type: counter type
187 * @offset: index of the first value of the current function for this type
188 *
189 * This array is needed to convert the in-memory data format into the in-file
190 * data format:
191 *
192 * In-memory:
193 * for each counter type
194 * for each function
195 * values
196 *
197 * In-file:
198 * for each function
199 * for each counter type
200 * values
201 *
202 * See gcc source gcc/gcov-io.h for more information on data organization.
203 */
204struct type_info {
205 int ctr_type;
206 unsigned int offset;
207};
208
209/**
210 * struct gcov_iterator - specifies current file position in logical records
211 * @info: associated profiling data
212 * @record: record type
213 * @function: function number
214 * @type: counter type
215 * @count: index into values array
216 * @num_types: number of counter types
217 * @type_info: helper array to get values-array offset for current function
218 */
219struct gcov_iterator {
220 struct gcov_info *info;
221
222 int record;
223 unsigned int function;
224 unsigned int type;
225 unsigned int count;
226
227 int num_types;
228 struct type_info type_info[0];
229};
230
231static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
232{
233 return get_fn_info(iter->info, iter->function);
234}
235
236static struct type_info *get_type(struct gcov_iterator *iter)
237{
238 return &iter->type_info[iter->type];
239}
240
241/**
242 * gcov_iter_new - allocate and initialize profiling data iterator
243 * @info: profiling data set to be iterated
244 *
245 * Return file iterator on success, %NULL otherwise.
246 */
247struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
248{
249 struct gcov_iterator *iter;
250
251 iter = kzalloc(sizeof(struct gcov_iterator) +
252 num_counter_active(info) * sizeof(struct type_info),
253 GFP_KERNEL);
254 if (iter)
255 iter->info = info;
256
257 return iter;
258}
259
260/**
261 * gcov_iter_free - release memory for iterator
262 * @iter: file iterator to free
263 */
264void gcov_iter_free(struct gcov_iterator *iter)
265{
266 kfree(iter);
267}
268
269/**
270 * gcov_iter_get_info - return profiling data set for given file iterator
271 * @iter: file iterator
272 */
273struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
274{
275 return iter->info;
276}
277
278/**
279 * gcov_iter_start - reset file iterator to starting position
280 * @iter: file iterator
281 */
282void gcov_iter_start(struct gcov_iterator *iter)
283{
284 int i;
285
286 iter->record = 0;
287 iter->function = 0;
288 iter->type = 0;
289 iter->count = 0;
290 iter->num_types = 0;
291 for (i = 0; i < GCOV_COUNTERS; i++) {
292 if (counter_active(iter->info, i)) {
293 iter->type_info[iter->num_types].ctr_type = i;
294 iter->type_info[iter->num_types++].offset = 0;
295 }
296 }
297}
298
299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6
307#define RECORD_COUNT_TAG 7
308#define RECORD_COUNT_LEN 8
309#define RECORD_COUNT 9
310
311/**
312 * gcov_iter_next - advance file iterator to next logical record
313 * @iter: file iterator
314 *
315 * Return zero if new position is valid, non-zero if iterator has reached end.
316 */
317int gcov_iter_next(struct gcov_iterator *iter)
318{
319 switch (iter->record) {
320 case RECORD_FILE_MAGIC:
321 case RECORD_GCOV_VERSION:
322 case RECORD_FUNCTION_TAG:
323 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG:
326 /* Advance to next record */
327 iter->record++;
328 break;
329 case RECORD_COUNT:
330 /* Advance to next count */
331 iter->count++;
332 /* fall through */
333 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9;
336 break;
337 }
338 /* Advance to next counter type */
339 get_type(iter)->offset += iter->count;
340 iter->count = 0;
341 iter->type++;
342 /* fall through */
343 case RECORD_FUNCTION_CHECK:
344 if (iter->type < iter->num_types) {
345 iter->record = 7;
346 break;
347 }
348 /* Advance to next function */
349 iter->type = 0;
350 iter->function++;
351 /* fall through */
352 case RECORD_TIME_STAMP:
353 if (iter->function < iter->info->n_functions)
354 iter->record = 3;
355 else
356 iter->record = -1;
357 break;
358 }
359 /* Check for EOF. */
360 if (iter->record == -1)
361 return -EINVAL;
362 else
363 return 0;
364}
365
366/**
367 * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
368 * @seq: seq_file handle
369 * @v: value to be stored
370 *
371 * Number format defined by gcc: numbers are recorded in the 32 bit
372 * unsigned binary form of the endianness of the machine generating the
373 * file.
374 */
375static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
376{
377 return seq_write(seq, &v, sizeof(v));
378}
379
380/**
381 * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
382 * @seq: seq_file handle
383 * @v: value to be stored
384 *
385 * Number format defined by gcc: numbers are recorded in the 32 bit
386 * unsigned binary form of the endianness of the machine generating the
387 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
388 * first.
389 */
390static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
391{
392 u32 data[2];
393
394 data[0] = (v & 0xffffffffUL);
395 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data));
397}
398
399/**
400 * gcov_iter_write - write data for current pos to seq_file
401 * @iter: file iterator
402 * @seq: seq_file handle
403 *
404 * Return zero on success, non-zero otherwise.
405 */
406int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
407{
408 int rc = -EINVAL;
409
410 switch (iter->record) {
411 case RECORD_FILE_MAGIC:
412 rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
413 break;
414 case RECORD_GCOV_VERSION:
415 rc = seq_write_gcov_u32(seq, iter->info->version);
416 break;
417 case RECORD_TIME_STAMP:
418 rc = seq_write_gcov_u32(seq, iter->info->stamp);
419 break;
420 case RECORD_FUNCTION_TAG:
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break;
423 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2);
425 break;
426 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break;
429 case RECORD_FUNCTION_CHECK:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
431 break;
432 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq,
434 GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
435 break;
436 case RECORD_COUNT_LEN:
437 rc = seq_write_gcov_u32(seq,
438 get_func(iter)->n_ctrs[iter->type] * 2);
439 break;
440 case RECORD_COUNT:
441 rc = seq_write_gcov_u64(seq,
442 iter->info->counts[iter->type].
443 values[iter->count + get_type(iter)->offset]);
444 break;
445 }
446 return rc;
447}
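As a quick check of the on-disk number format implemented by seq_write_gcov_u64() above, this stand-alone sketch splits a 64-bit value into two 32-bit words, low part first; gcov_u64_to_words() is a made-up helper name, not part of the patch.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void gcov_u64_to_words(uint64_t v, uint32_t out[2])
{
        out[0] = (uint32_t)(v & 0xffffffffUL);  /* low word is written first */
        out[1] = (uint32_t)(v >> 32);
}

int main(void)
{
        uint32_t w[2];

        gcov_u64_to_words(0x1122334455667788ULL, w);
        printf("low=0x%08" PRIx32 " high=0x%08" PRIx32 "\n", w[0], w[1]);
        /* prints: low=0x55667788 high=0x11223344 */
        return 0;
}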
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000000..060073ebf7a6
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
1/*
2 * Profiling infrastructure declarations.
3 *
4 * This file is based on gcc-internal definitions. Data structures are
5 * defined to be compatible with gcc counterparts. For a better
6 * understanding, refer to gcc source: gcc/gcov-io.h.
7 *
8 * Copyright IBM Corp. 2009
9 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#ifndef GCOV_H
15#define GCOV_H GCOV_H
16
17#include <linux/types.h>
18
19/*
20 * Profiling data types used for gcc 3.4 and above - these are defined by
21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible.
23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
30
31#if BITS_PER_LONG >= 64
32typedef long gcov_type;
33#else
34typedef long long gcov_type;
35#endif
36
37/**
38 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier
40 * @checksum: function checksum
41 * @n_ctrs: number of values per counter type belonging to this function
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66
67/**
68 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation
70 * @next: list head for a singly-linked list
71 * @stamp: time stamp
72 * @filename: name of the associated gcov data file
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91
92/* Base interface. */
93enum gcov_action {
94 GCOV_ADD,
95 GCOV_REMOVE,
96};
97
98void gcov_event(enum gcov_action action, struct gcov_info *info);
99void gcov_enable_events(void);
100
101/* Iterator control. */
102struct seq_file;
103struct gcov_iterator;
104
105struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
106void gcov_iter_free(struct gcov_iterator *iter);
107void gcov_iter_start(struct gcov_iterator *iter);
108int gcov_iter_next(struct gcov_iterator *iter);
109int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
110struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
111
112/* gcov_info control. */
113void gcov_info_reset(struct gcov_info *info);
114int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
115void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
116struct gcov_info *gcov_info_dup(struct gcov_info *info);
117void gcov_info_free(struct gcov_info *info);
118
119struct gcov_link {
120 enum {
121 OBJ_TREE,
122 SRC_TREE,
123 } dir;
124 const char *ext;
125};
126extern const struct gcov_link gcov_link[];
127
128#endif /* GCOV_H */
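The counter tags used by the iterator code are pure arithmetic on GCOV_TAG_COUNTER_BASE. A small stand-alone sketch of that formula, with the values copied from the header above:

#include <stdio.h>

#define GCOV_TAG_COUNTER_BASE   ((unsigned int) 0x01a10000)
#define GCOV_TAG_FOR_COUNTER(count) \
        (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))

int main(void)
{
        unsigned int i;

        /* GCOV_COUNTERS is 5 for the gcc 3.4 format handled here. */
        for (i = 0; i < 5; i++)
                printf("counter %u -> tag 0x%08x\n", i,
                       GCOV_TAG_FOR_COUNTER(i));
        /* counter 0 -> tag 0x01a10000, counter 1 -> tag 0x01a30000, ... */
        return 0;
}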
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
262
263/*
264 * Check whether the given group ID is our fsgid/egid or one of our supplementary groups.
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
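groups_search() above relies on the group list being sorted, which set_groups() guarantees by calling groups_sort() first. A stand-alone sketch of the same search logic over a plain sorted array (plain unsigned ints stand in for gid_t and struct group_info):

#include <stdio.h>

static int gid_search(const unsigned int *gids, unsigned int ngroups,
                      unsigned int grp)
{
        unsigned int left = 0, right = ngroups;

        while (left < right) {
                unsigned int mid = (left + right) / 2;

                if (grp > gids[mid])
                        left = mid + 1;
                else if (grp < gids[mid])
                        right = mid;
                else
                        return 1;       /* found */
        }
        return 0;                       /* not a member */
}

int main(void)
{
        unsigned int groups[] = { 4, 20, 24, 27, 100 };  /* already sorted */

        /* prints: 1 0 */
        printf("%d %d\n", gid_search(groups, 5, 27), gid_search(groups, 5, 5));
        return 0;
}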
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c19583..49da79ab8486 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/timer.h>
46 48
47#include <asm/uaccess.h> 49#include <asm/uaccess.h>
48 50
@@ -189,21 +191,65 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
189 } 191 }
190} 192}
191 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
192/* 234/*
193 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
194 */ 236 */
195static inline struct hrtimer_clock_base * 237static inline struct hrtimer_clock_base *
196switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) 238switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
239 int pinned)
197{ 240{
198 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
199 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
243 int this_cpu = smp_processor_id();
244 int cpu = hrtimer_get_target(this_cpu, pinned);
200 245
201 new_cpu_base = &__get_cpu_var(hrtimer_bases); 246again:
247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
202 new_base = &new_cpu_base->clock_base[base->index]; 248 new_base = &new_cpu_base->clock_base[base->index];
203 249
204 if (base != new_base) { 250 if (base != new_base) {
205 /* 251 /*
206 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move timer to new_base.
207 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
208 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
209 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -218,6 +264,14 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
218 timer->base = NULL; 264 timer->base = NULL;
219 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
220 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
267
268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
269 cpu = this_cpu;
270 spin_unlock(&new_base->cpu_base->lock);
271 spin_lock(&base->cpu_base->lock);
272 timer->base = base;
273 goto again;
274 }
221 timer->base = new_base; 275 timer->base = new_base;
222 } 276 }
223 return new_base; 277 return new_base;
@@ -235,7 +289,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
235 return base; 289 return base;
236} 290}
237 291
238# define switch_hrtimer_base(t, b) (b) 292# define switch_hrtimer_base(t, b, p) (b)
239 293
240#endif /* !CONFIG_SMP */ 294#endif /* !CONFIG_SMP */
241 295
@@ -332,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
332 return res; 386 return res;
333} 387}
334 388
389EXPORT_SYMBOL_GPL(ktime_add_safe);
390
335#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 391#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
336 392
337static struct debug_obj_descr hrtimer_debug_descr; 393static struct debug_obj_descr hrtimer_debug_descr;
@@ -907,9 +963,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
907 ret = remove_hrtimer(timer, base); 963 ret = remove_hrtimer(timer, base);
908 964
909 /* Switch the timer base, if necessary: */ 965 /* Switch the timer base, if necessary: */
910 new_base = switch_hrtimer_base(timer, base); 966 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
911 967
912 if (mode == HRTIMER_MODE_REL) { 968 if (mode & HRTIMER_MODE_REL) {
913 tim = ktime_add_safe(tim, new_base->get_time()); 969 tim = ktime_add_safe(tim, new_base->get_time());
914 /* 970 /*
915 * CONFIG_TIME_LOW_RES is a temporary way for architectures 971 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -1226,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1226 1282
1227 expires_next.tv64 = KTIME_MAX; 1283 expires_next.tv64 = KTIME_MAX;
1228 1284
1285 spin_lock(&cpu_base->lock);
1286 /*
1287 * We set expires_next to KTIME_MAX here with cpu_base->lock
1288 * held to prevent that a timer is enqueued in our queue via
1289 * the migration code. This does not affect enqueueing of
1290 * timers which run their callback and need to be requeued on
1291 * this CPU.
1292 */
1293 cpu_base->expires_next.tv64 = KTIME_MAX;
1294
1229 base = cpu_base->clock_base; 1295 base = cpu_base->clock_base;
1230 1296
1231 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1297 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1232 ktime_t basenow; 1298 ktime_t basenow;
1233 struct rb_node *node; 1299 struct rb_node *node;
1234 1300
1235 spin_lock(&cpu_base->lock);
1236
1237 basenow = ktime_add(now, base->offset); 1301 basenow = ktime_add(now, base->offset);
1238 1302
1239 while ((node = base->first)) { 1303 while ((node = base->first)) {
@@ -1266,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1266 1330
1267 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1268 } 1332 }
1269 spin_unlock(&cpu_base->lock);
1270 base++; 1333 base++;
1271 } 1334 }
1272 1335
1336 /*
1337 * Store the new expiry value so the migration code can verify
1338 * against it.
1339 */
1273 cpu_base->expires_next = expires_next; 1340 cpu_base->expires_next = expires_next;
1341 spin_unlock(&cpu_base->lock);
1274 1342
1275 /* Reprogramming necessary ? */ 1343 /* Reprogramming necessary ? */
1276 if (expires_next.tv64 != KTIME_MAX) { 1344 if (expires_next.tv64 != KTIME_MAX) {
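One detail in the hrtimer hunks above is easy to miss: the check changes from "mode == HRTIMER_MODE_REL" to "mode & HRTIMER_MODE_REL" because the mode argument can now carry an additional PINNED bit. A stand-alone sketch of why the bitmask test is needed; the numeric values here are illustrative, not copied from the header:

#include <stdio.h>

enum demo_hrtimer_mode {
        DEMO_HRTIMER_MODE_ABS    = 0x0,
        DEMO_HRTIMER_MODE_REL    = 0x1,
        DEMO_HRTIMER_MODE_PINNED = 0x2,
};

int main(void)
{
        unsigned int mode = DEMO_HRTIMER_MODE_REL | DEMO_HRTIMER_MODE_PINNED;

        printf("equality test: %d\n", mode == DEMO_HRTIMER_MODE_REL);      /* 0 */
        printf("bitmask test:  %d\n", !!(mode & DEMO_HRTIMER_MODE_REL));    /* 1 */
        printf("pinned bit:    %d\n", !!(mode & DEMO_HRTIMER_MODE_PINNED)); /* 1 */
        return 0;
}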
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
359 359
360 spin_lock(&desc->lock); 360 spin_lock(&desc->lock);
361 mask_ack_irq(desc, irq); 361 mask_ack_irq(desc, irq);
362 desc = irq_remap_to_desc(irq, desc);
363 362
364 if (unlikely(desc->status & IRQ_INPROGRESS)) 363 if (unlikely(desc->status & IRQ_INPROGRESS))
365 goto out_unlock; 364 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
438 desc->status &= ~IRQ_INPROGRESS; 437 desc->status &= ~IRQ_INPROGRESS;
439out: 438out:
440 desc->chip->eoi(irq); 439 desc->chip->eoi(irq);
441 desc = irq_remap_to_desc(irq, desc);
442 440
443 spin_unlock(&desc->lock); 441 spin_unlock(&desc->lock);
444} 442}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 !desc->action)) { 473 !desc->action)) {
476 desc->status |= (IRQ_PENDING | IRQ_MASKED); 474 desc->status |= (IRQ_PENDING | IRQ_MASKED);
477 mask_ack_irq(desc, irq); 475 mask_ack_irq(desc, irq);
478 desc = irq_remap_to_desc(irq, desc);
479 goto out_unlock; 476 goto out_unlock;
480 } 477 }
481 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
483 /* Start handling the irq */ 480 /* Start handling the irq */
484 if (desc->chip->ack) 481 if (desc->chip->ack)
485 desc->chip->ack(irq); 482 desc->chip->ack(irq);
486 desc = irq_remap_to_desc(irq, desc);
487 483
488 /* Mark the IRQ currently in progress.*/ 484 /* Mark the IRQ currently in progress.*/
489 desc->status |= IRQ_INPROGRESS; 485 desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 if (!noirqdebug) 540 if (!noirqdebug)
545 note_interrupt(irq, desc, action_ret); 541 note_interrupt(irq, desc, action_ret);
546 542
547 if (desc->chip->eoi) { 543 if (desc->chip->eoi)
548 desc->chip->eoi(irq); 544 desc->chip->eoi(irq);
549 desc = irq_remap_to_desc(irq, desc);
550 }
551} 545}
552 546
553void 547void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
582 576
583 /* Uninstall? */ 577 /* Uninstall? */
584 if (handle == handle_bad_irq) { 578 if (handle == handle_bad_irq) {
585 if (desc->chip != &no_irq_chip) { 579 if (desc->chip != &no_irq_chip)
586 mask_ack_irq(desc, irq); 580 mask_ack_irq(desc, irq);
587 desc = irq_remap_to_desc(irq, desc);
588 }
589 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
590 desc->depth = 1; 582 desc->depth = 1;
591 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744f..065205bdd920 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,14 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/slab.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/random.h> 16#include <linux/random.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 19#include <linux/rculist.h>
19#include <linux/hash.h> 20#include <linux/hash.h>
20#include <trace/irq.h>
21#include <linux/bootmem.h> 21#include <linux/bootmem.h>
22#include <trace/events/irq.h>
22 23
23#include "internals.h" 24#include "internals.h"
24 25
@@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
44#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 45#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
45static void __init init_irq_default_affinity(void) 46static void __init init_irq_default_affinity(void)
46{ 47{
47 alloc_bootmem_cpumask_var(&irq_default_affinity); 48 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
48 cpumask_setall(irq_default_affinity); 49 cpumask_setall(irq_default_affinity);
49} 50}
50#else 51#else
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
82}; 83};
83 84
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
85{ 86{
86 int node;
87 void *ptr; 87 void *ptr;
88 88
89 node = cpu_to_node(cpu); 89 if (slab_is_available())
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92 else
93 ptr = alloc_bootmem_node(NODE_DATA(node),
94 nr * sizeof(*desc->kstat_irqs));
91 95
92 /* 96 /*
 93 * don't overwrite if we cannot get a new one 97 * don't overwrite if we cannot get a new one
94 * init_copy_kstat_irqs() could still use old one 98 * init_copy_kstat_irqs() could still use old one
95 */ 99 */
96 if (ptr) { 100 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", 101 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
98 cpu, node);
99 desc->kstat_irqs = ptr; 102 desc->kstat_irqs = ptr;
100 } 103 }
101} 104}
102 105
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 106static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{ 107{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106 109
107 spin_lock_init(&desc->lock); 110 spin_lock_init(&desc->lock);
108 desc->irq = irq; 111 desc->irq = irq;
109#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
110 desc->cpu = cpu; 113 desc->node = node;
111#endif 114#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 115 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, cpu, nr_cpu_ids); 116 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) { 117 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n"); 118 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1); 119 BUG_ON(1);
117 } 120 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) { 121 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); 122 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1); 123 BUG_ON(1);
121 } 124 }
122 arch_init_chip_data(desc, cpu); 125 init_desc_masks(desc);
126 arch_init_chip_data(desc, node);
123} 127}
124 128
125/* 129/*
@@ -146,6 +150,7 @@ int __init early_irq_init(void)
146{ 150{
147 struct irq_desc *desc; 151 struct irq_desc *desc;
148 int legacy_count; 152 int legacy_count;
153 int node;
149 int i; 154 int i;
150 155
151 init_irq_default_affinity(); 156 init_irq_default_affinity();
@@ -156,20 +161,21 @@ int __init early_irq_init(void)
156 161
157 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
158 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node;
159 165
160 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
161 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
162 168
163 /* allocate based on nr_cpu_ids */ 169 /* allocate based on nr_cpu_ids */
164 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ 170 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
165 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * 171 sizeof(int), GFP_NOWAIT, node);
166 sizeof(int));
167 172
168 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
169 desc[i].irq = i; 174 desc[i].irq = i;
170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
172 init_alloc_desc_masks(&desc[i], 0, true); 177 alloc_desc_masks(&desc[i], node, true);
178 init_desc_masks(&desc[i]);
173 irq_desc_ptrs[i] = desc + i; 179 irq_desc_ptrs[i] = desc + i;
174 } 180 }
175 181
@@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
187 return NULL; 193 return NULL;
188} 194}
189 195
190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 196struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
191{ 197{
192 struct irq_desc *desc; 198 struct irq_desc *desc;
193 unsigned long flags; 199 unsigned long flags;
194 int node;
195 200
196 if (irq >= nr_irqs) { 201 if (irq >= nr_irqs) {
197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", 202 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
210 if (desc) 215 if (desc)
211 goto out_unlock; 216 goto out_unlock;
212 217
213 node = cpu_to_node(cpu); 218 if (slab_is_available())
214 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 219 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
215 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", 220 else
216 irq, cpu, node); 221 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
222
223 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
217 if (!desc) { 224 if (!desc) {
218 printk(KERN_ERR "can not alloc irq_desc\n"); 225 printk(KERN_ERR "can not alloc irq_desc\n");
219 BUG_ON(1); 226 BUG_ON(1);
220 } 227 }
221 init_one_irq_desc(irq, desc, cpu); 228 init_one_irq_desc(irq, desc, node);
222 229
223 irq_desc_ptrs[irq] = desc; 230 irq_desc_ptrs[irq] = desc;
224 231
@@ -256,7 +263,8 @@ int __init early_irq_init(void)
256 263
257 for (i = 0; i < count; i++) { 264 for (i = 0; i < count; i++) {
258 desc[i].irq = i; 265 desc[i].irq = i;
259 init_alloc_desc_masks(&desc[i], 0, true); 266 alloc_desc_masks(&desc[i], 0, true);
267 init_desc_masks(&desc[i]);
260 desc[i].kstat_irqs = kstat_irqs_all[i]; 268 desc[i].kstat_irqs = kstat_irqs_all[i];
261 } 269 }
262 return arch_early_irq_init(); 270 return arch_early_irq_init();
@@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
267 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 275 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
268} 276}
269 277
270struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 278struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
271{ 279{
272 return irq_to_desc(irq); 280 return irq_to_desc(irq);
273} 281}
@@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
348 "but no thread function available.", irq, action->name); 356 "but no thread function available.", irq, action->name);
349} 357}
350 358
351DEFINE_TRACE(irq_handler_entry);
352DEFINE_TRACE(irq_handler_exit);
353
354/** 359/**
355 * handle_IRQ_event - irq action chain handler 360 * handle_IRQ_event - irq action chain handler
356 * @irq: the interrupt number 361 * @irq: the interrupt number
@@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
453 /* 458 /*
454 * No locking required for CPU-local interrupts: 459 * No locking required for CPU-local interrupts:
455 */ 460 */
456 if (desc->chip->ack) { 461 if (desc->chip->ack)
457 desc->chip->ack(irq); 462 desc->chip->ack(irq);
458 /* get new one */
459 desc = irq_remap_to_desc(irq, desc);
460 }
461 if (likely(!(desc->status & IRQ_DISABLED))) { 463 if (likely(!(desc->status & IRQ_DISABLED))) {
462 action_ret = handle_IRQ_event(irq, desc->action); 464 action_ret = handle_IRQ_event(irq, desc->action);
463 if (!noirqdebug) 465 if (!noirqdebug)
@@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
468 } 470 }
469 471
470 spin_lock(&desc->lock); 472 spin_lock(&desc->lock);
471 if (desc->chip->ack) { 473 if (desc->chip->ack)
472 desc->chip->ack(irq); 474 desc->chip->ack(irq);
473 desc = irq_remap_to_desc(irq, desc);
474 }
475 /* 475 /*
476 * REPLAY is when Linux resends an IRQ that was dropped earlier 476 * REPLAY is when Linux resends an IRQ that was dropped earlier
477 * WAITING is used by probe to mark irqs that are being tested 477 * WAITING is used by probe to mark irqs that are being tested
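The handle.c hunks above repeat one pattern: allocate with kzalloc_node() once the slab allocator is available, otherwise fall back to the boot memory allocator. A kernel-context sketch of that pattern; early_kzalloc_node() is a made-up name for illustration and is not part of the patch:

#include <linux/bootmem.h>
#include <linux/slab.h>

static void *early_kzalloc_node(size_t size, int node)
{
        if (slab_is_available())
                return kzalloc_node(size, GFP_ATOMIC, node);

        /* The bootmem allocator returns zeroed memory, so no memset here. */
        return alloc_bootmem_node(NODE_DATA(node), size);
}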
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..e70ed5592eb9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 17
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22 22
@@ -42,6 +42,8 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void irq_set_thread_affinity(struct irq_desc *desc);
46
45/* 47/*
46 * Debugging printout: 48 * Debugging printout:
47 */ 49 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243..0ec9ed831737 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
 85 * @desc: irq descriptor whose affinity has changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -109,17 +117,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109 spin_lock_irqsave(&desc->lock, flags); 117 spin_lock_irqsave(&desc->lock, flags);
110 118
111#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
112 if (desc->status & IRQ_MOVE_PCNTXT) 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 desc->chip->set_affinity(irq, cpumask); 121 if (!desc->chip->set_affinity(irq, cpumask)) {
122 cpumask_copy(desc->affinity, cpumask);
123 irq_set_thread_affinity(desc);
124 }
125 }
114 else { 126 else {
115 desc->status |= IRQ_MOVE_PENDING; 127 desc->status |= IRQ_MOVE_PENDING;
116 cpumask_copy(desc->pending_mask, cpumask); 128 cpumask_copy(desc->pending_mask, cpumask);
117 } 129 }
118#else 130#else
119 cpumask_copy(desc->affinity, cpumask); 131 if (!desc->chip->set_affinity(irq, cpumask)) {
120 desc->chip->set_affinity(irq, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
133 irq_set_thread_affinity(desc);
134 }
121#endif 135#endif
122 irq_set_thread_affinity(desc, cpumask);
123 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
124 spin_unlock_irqrestore(&desc->lock, flags); 137 spin_unlock_irqrestore(&desc->lock, flags);
125 return 0; 138 return 0;
@@ -171,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
171 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
172 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
173 if (!ret) 186 if (!ret)
174 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
175 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
176 189
177 return ret; 190 return ret;
@@ -438,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
438 return -1; 451 return -1;
439} 452}
440 453
454#ifdef CONFIG_SMP
455/*
456 * Check whether we need to change the affinity of the interrupt thread.
457 */
458static void
459irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
460{
461 cpumask_var_t mask;
462
463 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
464 return;
465
466 /*
467 * In case we are out of memory we set IRQTF_AFFINITY again and
468 * try again next time
469 */
470 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
471 set_bit(IRQTF_AFFINITY, &action->thread_flags);
472 return;
473 }
474
475 spin_lock_irq(&desc->lock);
476 cpumask_copy(mask, desc->affinity);
477 spin_unlock_irq(&desc->lock);
478
479 set_cpus_allowed_ptr(current, mask);
480 free_cpumask_var(mask);
481}
482#else
483static inline void
484irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
485#endif
486
441/* 487/*
442 * Interrupt handler thread 488 * Interrupt handler thread
443 */ 489 */
@@ -453,6 +499,8 @@ static int irq_thread(void *data)
453 499
454 while (!irq_wait_for_interrupt(action)) { 500 while (!irq_wait_for_interrupt(action)) {
455 501
502 irq_thread_check_affinity(desc, action);
503
456 atomic_inc(&desc->threads_active); 504 atomic_inc(&desc->threads_active);
457 505
458 spin_lock_irq(&desc->lock); 506 spin_lock_irq(&desc->lock);
@@ -559,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
559 */ 607 */
560 get_task_struct(t); 608 get_task_struct(t);
561 new->thread = t; 609 new->thread = t;
562 wake_up_process(t);
563 } 610 }
564 611
565 /* 612 /*
@@ -642,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
642 (int)(new->flags & IRQF_TRIGGER_MASK)); 689 (int)(new->flags & IRQF_TRIGGER_MASK));
643 } 690 }
644 691
692 new->irq = irq;
645 *old_ptr = new; 693 *old_ptr = new;
646 694
647 /* Reset broken irq detection when installing new handler */ 695 /* Reset broken irq detection when installing new handler */
@@ -659,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
659 707
660 spin_unlock_irqrestore(&desc->lock, flags); 708 spin_unlock_irqrestore(&desc->lock, flags);
661 709
662 new->irq = irq; 710 /*
711 * Strictly no need to wake it up, but hung_task complains
712 * when no hard interrupt wakes the thread up.
713 */
714 if (new->thread)
715 wake_up_process(new->thread);
716
663 register_irq_proc(irq, desc); 717 register_irq_proc(irq, desc);
664 new->dir = NULL; 718 new->dir = NULL;
665 register_handler_proc(irq, new); 719 register_handler_proc(irq, new);
@@ -713,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
713{ 767{
714 struct irq_desc *desc = irq_to_desc(irq); 768 struct irq_desc *desc = irq_to_desc(irq);
715 struct irqaction *action, **action_ptr; 769 struct irqaction *action, **action_ptr;
716 struct task_struct *irqthread;
717 unsigned long flags; 770 unsigned long flags;
718 771
719 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 772 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -761,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
761 desc->chip->disable(irq); 814 desc->chip->disable(irq);
762 } 815 }
763 816
764 irqthread = action->thread;
765 action->thread = NULL;
766
767 spin_unlock_irqrestore(&desc->lock, flags); 817 spin_unlock_irqrestore(&desc->lock, flags);
768 818
769 unregister_handler_proc(irq, action); 819 unregister_handler_proc(irq, action);
@@ -771,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
771 /* Make sure it's not being used on another CPU: */ 821 /* Make sure it's not being used on another CPU: */
772 synchronize_irq(irq); 822 synchronize_irq(irq);
773 823
774 if (irqthread) {
775 if (!test_bit(IRQTF_DIED, &action->thread_flags))
776 kthread_stop(irqthread);
777 put_task_struct(irqthread);
778 }
779
780#ifdef CONFIG_DEBUG_SHIRQ 824#ifdef CONFIG_DEBUG_SHIRQ
781 /* 825 /*
782 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 826 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -792,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
792 local_irq_restore(flags); 836 local_irq_restore(flags);
793 } 837 }
794#endif 838#endif
839
840 if (action->thread) {
841 if (!test_bit(IRQTF_DIED, &action->thread_flags))
842 kthread_stop(action->thread);
843 put_task_struct(action->thread);
844 }
845
795 return action; 846 return action;
796} 847}
797 848
@@ -851,7 +902,7 @@ EXPORT_SYMBOL(free_irq);
851 * still called in hard interrupt context and has to check 902 * still called in hard interrupt context and has to check
852 * whether the interrupt originates from the device. If yes it 903 * whether the interrupt originates from the device. If yes it
853 * needs to disable the interrupt on the device and return 904 * needs to disable the interrupt on the device and return
854 * IRQ_THREAD_WAKE which will wake up the handler thread and run 905 * IRQ_WAKE_THREAD which will wake up the handler thread and run
855 * @thread_fn. This split handler design is necessary to support 906 * @thread_fn. This split handler design is necessary to support
856 * shared interrupts. 907 * shared interrupts.
857 * 908 *
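
Note on the affinity changes above: irq_set_affinity() and the irq thread now split the work — the setter only records the new mask and sets the IRQTF_AFFINITY thread flag, because it runs under desc->lock and possibly in hard interrupt context where set_cpus_allowed_ptr() must not be called, and irq_thread_check_affinity() applies the mask from thread context. The snippet below is a minimal userspace sketch of that defer-to-thread pattern using pthreads and C11 atomics; every name in it (set_affinity, thread_check_affinity, irq_thread) is invented for illustration and is not kernel API.

/* Userspace sketch of the IRQTF_AFFINITY pattern: the non-sleeping side only
 * records the request and sets a flag; the thread applies the new affinity
 * the next time it runs, where sleeping is allowed.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t desc_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long desc_affinity;		/* protected by desc_lock */
static atomic_bool affinity_pending = false;	/* analogue of IRQTF_AFFINITY */
static atomic_bool should_stop = false;

/* Called from a context that must not sleep: just record the request. */
static void set_affinity(unsigned long mask)
{
	pthread_mutex_lock(&desc_lock);
	desc_affinity = mask;
	pthread_mutex_unlock(&desc_lock);
	atomic_store(&affinity_pending, true);
}

/* Runs in thread context; the real code calls set_cpus_allowed_ptr() here. */
static void thread_check_affinity(void)
{
	unsigned long mask;

	if (!atomic_exchange(&affinity_pending, false))
		return;

	pthread_mutex_lock(&desc_lock);
	mask = desc_affinity;
	pthread_mutex_unlock(&desc_lock);

	printf("thread applies affinity mask %#lx\n", mask);
}

static void *irq_thread(void *unused)
{
	(void)unused;
	while (!atomic_load(&should_stop)) {
		thread_check_affinity();
		usleep(10000);	/* stands in for irq_wait_for_interrupt() */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, irq_thread, NULL);
	set_affinity(0x3);
	usleep(50000);
	atomic_store(&should_stop, true);
	pthread_join(t, NULL);
	return 0;
}
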
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3#include <linux/interrupt.h>
4
5#include "internals.h"
3 6
4void move_masked_irq(int irq) 7void move_masked_irq(int irq)
5{ 8{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
39 * masking the irqs. 42 * masking the irqs.
40 */ 43 */
41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 45 < nr_cpu_ids))
43 cpumask_and(desc->affinity, 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
44 desc->pending_mask, cpu_online_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
45 desc->chip->set_affinity(irq, desc->affinity); 48 irq_set_thread_affinity(desc);
46 } 49 }
50
47 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
48} 52}
49 53
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
15 15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc, 16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int node, int nr)
19{ 19{
20 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, node, nr);
21 21
22 if (desc->kstat_irqs != old_desc->kstat_irqs) 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
34} 34}
35 35
36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
37 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int node)
38{ 38{
39 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) { 40 if (!alloc_desc_masks(desc, node, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " 41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc); 49 init_copy_desc_masks(old_desc, desc);
50 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, node);
51 return true; 51 return true;
52} 52}
53 53
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
59} 59}
60 60
61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, 61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
62 int cpu) 62 int node)
63{ 63{
64 struct irq_desc *desc; 64 struct irq_desc *desc;
65 unsigned int irq; 65 unsigned int irq;
66 unsigned long flags; 66 unsigned long flags;
67 int node;
68 67
69 irq = old_desc->irq; 68 irq = old_desc->irq;
70 69
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
77 goto out_unlock; 76 goto out_unlock;
78 77
79 node = cpu_to_node(cpu);
80 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 78 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
81 if (!desc) { 79 if (!desc) {
82 printk(KERN_ERR "irq %d: can not get new irq_desc " 80 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 desc = old_desc; 83 desc = old_desc;
86 goto out_unlock; 84 goto out_unlock;
87 } 85 }
88 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { 86 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
89 /* still use old one */ 87 /* still use old one */
90 kfree(desc); 88 kfree(desc);
91 desc = old_desc; 89 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
97 95
98 /* free the old one */ 96 /* free the old one */
99 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
100 spin_unlock(&old_desc->lock);
101 kfree(old_desc); 98 kfree(old_desc);
102 spin_lock(&desc->lock);
103 99
104 return desc; 100 return desc;
105 101
@@ -109,24 +105,14 @@ out_unlock:
109 return desc; 105 return desc;
110} 106}
111 107
112struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
113{ 109{
114 int old_cpu; 110 /* those static or target node is -1, do not move them */
115 int node, old_node; 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
116
117 /* those all static, do move them */
118 if (desc->irq < NR_IRQS_LEGACY)
119 return desc; 112 return desc;
120 113
121 old_cpu = desc->cpu; 114 if (desc->node != node)
122 if (old_cpu != cpu) { 115 desc = __real_move_irq_desc(desc, node);
123 node = cpu_to_node(cpu);
124 old_node = cpu_to_node(old_cpu);
125 if (old_node != node)
126 desc = __real_move_irq_desc(desc, cpu);
127 else
128 desc->cpu = cpu;
129 }
130 116
131 return desc; 117 return desc;
132} 118}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc7..3a29dbe7898e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/*
34 * These will be re-linked against their real values
35 * during the second link stage.
36 */
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 37extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak)); 38extern const u8 kallsyms_names[] __attribute__((weak));
36 39
37/* tell the compiler that the count isn't in the small data section if the arch 40/*
38 * has one (eg: FRV) 41 * Tell the compiler that the count isn't in the small data section if the arch
42 * has one (eg: FRV).
39 */ 43 */
40extern const unsigned long kallsyms_num_syms 44extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 45__attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
75 return is_kernel_text(addr) || is_kernel_inittext(addr); 79 return is_kernel_text(addr) || is_kernel_inittext(addr);
76} 80}
77 81
78/* expand a compressed symbol data into the resulting uncompressed string, 82/*
79 given the offset to where the symbol is in the compressed stream */ 83 * Expand a compressed symbol data into the resulting uncompressed string,
84 * given the offset to where the symbol is in the compressed stream.
85 */
80static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 86static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
81{ 87{
82 int len, skipped_first = 0; 88 int len, skipped_first = 0;
83 const u8 *tptr, *data; 89 const u8 *tptr, *data;
84 90
85 /* get the compressed symbol length from the first symbol byte */ 91 /* Get the compressed symbol length from the first symbol byte. */
86 data = &kallsyms_names[off]; 92 data = &kallsyms_names[off];
87 len = *data; 93 len = *data;
88 data++; 94 data++;
89 95
90 /* update the offset to return the offset for the next symbol on 96 /*
91 * the compressed stream */ 97 * Update the offset to return the offset for the next symbol on
98 * the compressed stream.
99 */
92 off += len + 1; 100 off += len + 1;
93 101
94 /* for every byte on the compressed symbol data, copy the table 102 /*
95 entry for that byte */ 103 * For every byte on the compressed symbol data, copy the table
96 while(len) { 104 * entry for that byte.
97 tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; 105 */
106 while (len) {
107 tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
98 data++; 108 data++;
99 len--; 109 len--;
100 110
101 while (*tptr) { 111 while (*tptr) {
102 if(skipped_first) { 112 if (skipped_first) {
103 *result = *tptr; 113 *result = *tptr;
104 result++; 114 result++;
105 } else 115 } else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
110 120
111 *result = '\0'; 121 *result = '\0';
112 122
113 /* return to offset to the next symbol */ 123 /* Return to offset to the next symbol. */
114 return off; 124 return off;
115} 125}
116 126
117/* get symbol type information. This is encoded as a single char at the 127/*
118 * begining of the symbol name */ 128 * Get symbol type information. This is encoded as a single char at the
129 * beginning of the symbol name.
130 */
119static char kallsyms_get_symbol_type(unsigned int off) 131static char kallsyms_get_symbol_type(unsigned int off)
120{ 132{
121 /* get just the first code, look it up in the token table, and return the 133 /*
122 * first char from this token */ 134 * Get just the first code, look it up in the token table,
123 return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; 135 * and return the first char from this token.
136 */
137 return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
124} 138}
125 139
126 140
127/* find the offset on the compressed stream given and index in the 141/*
128 * kallsyms array */ 142 * Find the offset on the compressed stream given and index in the
143 * kallsyms array.
144 */
129static unsigned int get_symbol_offset(unsigned long pos) 145static unsigned int get_symbol_offset(unsigned long pos)
130{ 146{
131 const u8 *name; 147 const u8 *name;
132 int i; 148 int i;
133 149
134 /* use the closest marker we have. We have markers every 256 positions, 150 /*
135 * so that should be close enough */ 151 * Use the closest marker we have. We have markers every 256 positions,
136 name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; 152 * so that should be close enough.
153 */
154 name = &kallsyms_names[kallsyms_markers[pos >> 8]];
137 155
138 /* sequentially scan all the symbols up to the point we're searching for. 156 /*
139 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we 157 * Sequentially scan all the symbols up to the point we're searching
140 * just need to add the len to the current pointer for every symbol we 158 * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
141 * wish to skip */ 159 * so we just need to add the len to the current pointer for every
142 for(i = 0; i < (pos&0xFF); i++) 160 * symbol we wish to skip.
161 */
162 for (i = 0; i < (pos & 0xFF); i++)
143 name = name + (*name) + 1; 163 name = name + (*name) + 1;
144 164
145 return name - kallsyms_names; 165 return name - kallsyms_names;
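
The comment reflowed above describes how get_symbol_offset() stays cheap: kallsyms_markers[] records the stream offset of every 256th symbol, so a lookup jumps to the nearest marker and then skips at most 255 length-prefixed entries. Below is a toy, self-contained version of that lookup over invented sample data, with the marker spacing reduced to 4 so the stream stays short.

/* Toy get_symbol_offset(): markers record the stream offset of every
 * MARKER_STEP-th symbol; we jump to the closest marker and skip the rest.
 * Stream format is [len][len bytes], as in kallsyms_names.  The data below
 * is made up for the example.
 */
#include <stdio.h>

#define MARKER_STEP 4	/* the kernel uses 256 (pos >> 8, pos & 0xFF) */

static const unsigned char names[] = {
	3, 'f','o','o',  2, 'h','i',  4, 'i','n','i','t',  1, 'x',
	3, 'b','a','r',  2, 'o','k',
};
/* Stream offsets of symbol 0 and symbol 4. */
static const unsigned int markers[] = { 0, 14 };

static unsigned int symbol_offset(unsigned int pos)
{
	const unsigned char *name = &names[markers[pos / MARKER_STEP]];
	unsigned int i;

	for (i = 0; i < pos % MARKER_STEP; i++)
		name = name + *name + 1;	/* skip one [len][data] entry */

	return (unsigned int)(name - names);
}

int main(void)
{
	printf("symbol 2 starts at offset %u\n", symbol_offset(2));	/* 7  */
	printf("symbol 5 starts at offset %u\n", symbol_offset(5));	/* 18 */
	return 0;
}
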
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
190 /* This kernel should never had been booted. */ 210 /* This kernel should never had been booted. */
191 BUG_ON(!kallsyms_addresses); 211 BUG_ON(!kallsyms_addresses);
192 212
193 /* do a binary search on the sorted kallsyms_addresses array */ 213 /* Do a binary search on the sorted kallsyms_addresses array. */
194 low = 0; 214 low = 0;
195 high = kallsyms_num_syms; 215 high = kallsyms_num_syms;
196 216
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
203 } 223 }
204 224
205 /* 225 /*
206 * search for the first aliased symbol. Aliased 226 * Search for the first aliased symbol. Aliased
207 * symbols are symbols with the same address 227 * symbols are symbols with the same address.
208 */ 228 */
209 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) 229 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
210 --low; 230 --low;
211 231
212 symbol_start = kallsyms_addresses[low]; 232 symbol_start = kallsyms_addresses[low];
213 233
214 /* Search for next non-aliased symbol */ 234 /* Search for next non-aliased symbol. */
215 for (i = low + 1; i < kallsyms_num_syms; i++) { 235 for (i = low + 1; i < kallsyms_num_syms; i++) {
216 if (kallsyms_addresses[i] > symbol_start) { 236 if (kallsyms_addresses[i] > symbol_start) {
217 symbol_end = kallsyms_addresses[i]; 237 symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
219 } 239 }
220 } 240 }
221 241
222 /* if we found no next symbol, we use the end of the section */ 242 /* If we found no next symbol, we use the end of the section. */
223 if (!symbol_end) { 243 if (!symbol_end) {
224 if (is_kernel_inittext(addr)) 244 if (is_kernel_inittext(addr))
225 symbol_end = (unsigned long)_einittext; 245 symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
252 272
253/* 273/*
254 * Lookup an address 274 * Lookup an address
255 * - modname is set to NULL if it's in the kernel 275 * - modname is set to NULL if it's in the kernel.
256 * - we guarantee that the returned name is valid until we reschedule even if 276 * - We guarantee that the returned name is valid until we reschedule even if.
257 * it resides in a module 277 * It resides in a module.
258 * - we also guarantee that modname will be valid until rescheduled 278 * - We also guarantee that modname will be valid until rescheduled.
259 */ 279 */
260const char *kallsyms_lookup(unsigned long addr, 280const char *kallsyms_lookup(unsigned long addr,
261 unsigned long *symbolsize, 281 unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
276 return namebuf; 296 return namebuf;
277 } 297 }
278 298
279 /* see if it's in a module */ 299 /* See if it's in a module. */
280 return module_address_lookup(addr, symbolsize, offset, modname, 300 return module_address_lookup(addr, symbolsize, offset, modname,
281 namebuf); 301 namebuf);
282} 302}
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
294 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 314 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
295 return 0; 315 return 0;
296 } 316 }
297 /* see if it's in a module */ 317 /* See if it's in a module. */
298 return lookup_module_symbol_name(addr, symname); 318 return lookup_module_symbol_name(addr, symname);
299} 319}
300 320
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
313 modname[0] = '\0'; 333 modname[0] = '\0';
314 return 0; 334 return 0;
315 } 335 }
316 /* see if it's in a module */ 336 /* See if it's in a module. */
317 return lookup_module_symbol_attrs(addr, size, offset, modname, name); 337 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
318} 338}
319 339
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
342 362
343 return len; 363 return len;
344} 364}
365EXPORT_SYMBOL_GPL(sprint_symbol);
345 366
346/* Look up a kernel symbol and print it to the kernel messages. */ 367/* Look up a kernel symbol and print it to the kernel messages. */
347void __print_symbol(const char *fmt, unsigned long address) 368void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
352 373
353 printk(fmt, buffer); 374 printk(fmt, buffer);
354} 375}
376EXPORT_SYMBOL(__print_symbol);
355 377
356/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 378/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
357struct kallsym_iter 379struct kallsym_iter {
358{
359 loff_t pos; 380 loff_t pos;
360 unsigned long value; 381 unsigned long value;
361 unsigned int nameoff; /* If iterating in core kernel symbols */ 382 unsigned int nameoff; /* If iterating in core kernel symbols. */
362 char type; 383 char type;
363 char name[KSYM_NAME_LEN]; 384 char name[KSYM_NAME_LEN];
364 char module_name[MODULE_NAME_LEN]; 385 char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
404 iter->pos = pos; 425 iter->pos = pos;
405 return get_ksymbol_mod(iter); 426 return get_ksymbol_mod(iter);
406 } 427 }
407 428
408 /* If we're not on the desired position, reset to new position. */ 429 /* If we're not on the desired position, reset to new position. */
409 if (pos != iter->pos) 430 if (pos != iter->pos)
410 reset_iter(iter, pos); 431 reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
439{ 460{
440 struct kallsym_iter *iter = m->private; 461 struct kallsym_iter *iter = m->private;
441 462
442 /* Some debugging symbols have no name. Ignore them. */ 463 /* Some debugging symbols have no name. Ignore them. */
443 if (!iter->name[0]) 464 if (!iter->name[0])
444 return 0; 465 return 0;
445 466
446 if (iter->module_name[0]) { 467 if (iter->module_name[0]) {
447 char type; 468 char type;
448 469
449 /* Label it "global" if it is exported, 470 /*
450 * "local" if not exported. */ 471 * Label it "global" if it is exported,
472 * "local" if not exported.
473 */
451 type = iter->exported ? toupper(iter->type) : 474 type = iter->exported ? toupper(iter->type) :
452 tolower(iter->type); 475 tolower(iter->type);
453 seq_printf(m, "%0*lx %c %s\t[%s]\n", 476 seq_printf(m, "%0*lx %c %s\t[%s]\n",
454 (int)(2*sizeof(void*)), 477 (int)(2 * sizeof(void *)),
455 iter->value, type, iter->name, iter->module_name); 478 iter->value, type, iter->name, iter->module_name);
456 } else 479 } else
457 seq_printf(m, "%0*lx %c %s\n", 480 seq_printf(m, "%0*lx %c %s\n",
458 (int)(2*sizeof(void*)), 481 (int)(2 * sizeof(void *)),
459 iter->value, iter->type, iter->name); 482 iter->value, iter->type, iter->name);
460 return 0; 483 return 0;
461} 484}
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
469 492
470static int kallsyms_open(struct inode *inode, struct file *file) 493static int kallsyms_open(struct inode *inode, struct file *file)
471{ 494{
472 /* We keep iterator in m->private, since normal case is to 495 /*
496 * We keep iterator in m->private, since normal case is to
473 * s_start from where we left off, so we avoid doing 497 * s_start from where we left off, so we avoid doing
474 * using get_symbol_offset for every symbol */ 498 * using get_symbol_offset for every symbol.
499 */
475 struct kallsym_iter *iter; 500 struct kallsym_iter *iter;
476 int ret; 501 int ret;
477 502
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
500 proc_create("kallsyms", 0444, NULL, &kallsyms_operations); 525 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
501 return 0; 526 return 0;
502} 527}
503__initcall(kallsyms_init); 528device_initcall(kallsyms_init);
504
505EXPORT_SYMBOL(__print_symbol);
506EXPORT_SYMBOL_GPL(sprint_symbol);
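
For reference, the name stream that kallsyms_expand_symbol() walks stores each symbol as [len][len token indices], and each index expands through kallsyms_token_index/kallsyms_token_table into a short string. The decoder below is a simplified standalone illustration of that format over invented sample tables; it omits the leading type character that the real code skips via skipped_first.

/* Toy decoder for the kallsyms name stream: [len][len token indices], where
 * each index selects a NUL-terminated string in a token table.  The tables
 * and stream below are invented sample data, not kernel output.
 */
#include <stdio.h>

/* token_table holds the strings back to back; token_index points into it. */
static const char token_table[] = "prin\0tk\0sys_\0open\0";
static const unsigned short token_index[] = { 0, 5, 8, 13 };

/* Compressed stream: two symbols, "printk" and "sys_open". */
static const unsigned char names[] = {
	2, 0, 1,	/* len=2: "prin" + "tk"   */
	2, 2, 3,	/* len=2: "sys_" + "open" */
};

/* Expand the symbol at offset 'off'; return the offset of the next one. */
static unsigned int expand_symbol(unsigned int off, char *result)
{
	int len = names[off++];

	while (len--) {
		const char *tok = &token_table[token_index[names[off++]]];

		while (*tok)
			*result++ = *tok++;
	}
	*result = '\0';
	return off;
}

int main(void)
{
	char buf[64];
	unsigned int off = 0;

	off = expand_symbol(off, buf);
	printf("%s\n", buf);		/* printk   */
	expand_symbol(off, buf);
	printf("%s\n", buf);		/* sys_open */
	return 0;
}
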
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e4983770913b..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
@@ -1448,17 +1448,17 @@ int kernel_kexec(void)
1448 goto Restore_console; 1448 goto Restore_console;
1449 } 1449 }
1450 suspend_console(); 1450 suspend_console();
1451 error = device_suspend(PMSG_FREEZE); 1451 error = dpm_suspend_start(PMSG_FREEZE);
1452 if (error) 1452 if (error)
1453 goto Resume_console; 1453 goto Resume_console;
1454 /* At this point, device_suspend() has been called, 1454 /* At this point, dpm_suspend_start() has been called,
1455 * but *not* device_power_down(). We *must* 1455 * but *not* dpm_suspend_noirq(). We *must* call
1456 * device_power_down() now. Otherwise, drivers for 1456 * dpm_suspend_noirq() now. Otherwise, drivers for
1457 * some devices (e.g. interrupt controllers) become 1457 * some devices (e.g. interrupt controllers) become
1458 * desynchronized with the actual state of the 1458 * desynchronized with the actual state of the
1459 * hardware at resume time, and evil weirdness ensues. 1459 * hardware at resume time, and evil weirdness ensues.
1460 */ 1460 */
1461 error = device_power_down(PMSG_FREEZE); 1461 error = dpm_suspend_noirq(PMSG_FREEZE);
1462 if (error) 1462 if (error)
1463 goto Resume_devices; 1463 goto Resume_devices;
1464 error = disable_nonboot_cpus(); 1464 error = disable_nonboot_cpus();
@@ -1486,9 +1486,9 @@ int kernel_kexec(void)
1486 local_irq_enable(); 1486 local_irq_enable();
1487 Enable_cpus: 1487 Enable_cpus:
1488 enable_nonboot_cpus(); 1488 enable_nonboot_cpus();
1489 device_power_up(PMSG_RESTORE); 1489 dpm_resume_noirq(PMSG_RESTORE);
1490 Resume_devices: 1490 Resume_devices:
1491 device_resume(PMSG_RESTORE); 1491 dpm_resume_end(PMSG_RESTORE);
1492 Resume_console: 1492 Resume_console:
1493 resume_console(); 1493 resume_console();
1494 thaw_processes(); 1494 thaw_processes();
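
The parse_crashkernel_mem() hunk adds the *cur check so the scan for ' ' or '@' stops at the terminating NUL instead of running past the end of strings that end right after the size. A minimal standalone sketch of the fixed scan is below; parse_mem_spec() is an invented stand-in that handles neither memory suffix scaling nor the full "size:range@offset" grammar.

/* Sketch of the fixed scan in parse_crashkernel_mem(): stop at NUL, space,
 * or '@'.  Without the *cur test, input that ends right after the size
 * (no offset, no trailing space) would be scanned past its terminator.
 * Size suffixes such as 'M' are skipped here, not scaled.
 */
#include <stdio.h>
#include <stdlib.h>

static int parse_mem_spec(const char *spec,
			  unsigned long long *size,
			  unsigned long long *base)
{
	char *cur;

	*base = 0;
	*size = strtoull(spec, &cur, 10);
	if (cur == spec)
		return -1;			/* no number at all */

	/* Skip up to an optional "@offset", but never past the NUL. */
	while (*cur && *cur != ' ' && *cur != '@')
		cur++;
	if (*cur == '@')
		*base = strtoull(cur + 1, &cur, 10);
	return 0;
}

int main(void)
{
	unsigned long long size, base;

	if (!parse_mem_spec("64M", &size, &base))
		printf("size=%llu base=%llu\n", size, base);
	if (!parse_mem_spec("64@16", &size, &base))
		printf("size=%llu base=%llu\n", size, base);
	return 0;
}
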
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
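
kfifo's "let the indices wrap" technique keeps the in and out counters free-running and derives the buffer offset with index & (size - 1), which is only correct when size is a power of two; that is why the allocation path rounds the size up and why the open-coded test becomes is_power_of_2(). The helpers below are small userspace equivalents for illustration, not the kernel's implementations.

/* Why kfifo rounds its size up to a power of two: with size == 2^n the
 * buffer offset is simply (counter & (size - 1)), even after the unsigned
 * counters wrap around.  Minimal userspace helpers, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned int n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int size = 100;

	if (!is_power_of_2(size))
		size = roundup_pow_of_two(size);	/* 100 -> 128 */

	/* Free-running index: masking keeps working across the wrap. */
	unsigned int in = 0xfffffff0u;
	printf("size=%u offset=%u\n", size, in & (size - 1));
	return 0;
}
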
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..385c31a1bdbf 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..0540948e29ab 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
237{ 237{
238 struct kprobe_insn_page *kip; 238 struct kprobe_insn_page *kip;
239 struct hlist_node *pos, *next; 239 struct hlist_node *pos, *next;
240 int safety;
241 240
242 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
243 mutex_unlock(&kprobe_insn_mutex); 242 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 243 return -EAGAIN;
248 244
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 694 p->addr = addr;
699 695
700 preempt_disable(); 696 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 697 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 698 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 699 preempt_enable();
704 return -EINVAL; 700 return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf8519abf..eb8751aa0418 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,11 +9,12 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
16#include <trace/sched.h> 17#include <trace/events/sched.h>
17 18
18#define KTHREAD_NICE_LEVEL (-5) 19#define KTHREAD_NICE_LEVEL (-5)
19 20
@@ -21,15 +22,11 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 22static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 23struct task_struct *kthreadd_task;
23 24
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
27struct kthread_create_info 25struct kthread_create_info
28{ 26{
29 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
30 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
31 void *data; 29 void *data;
32 struct completion started;
33 30
34 /* Result passed back to kthread_create() from kthreadd. */ 31 /* Result passed back to kthread_create() from kthreadd. */
35 struct task_struct *result; 32 struct task_struct *result;
@@ -38,17 +35,13 @@ struct kthread_create_info
38 struct list_head list; 35 struct list_head list;
39}; 36};
40 37
41struct kthread_stop_info 38struct kthread {
42{ 39 int should_stop;
43 struct task_struct *k; 40 struct completion exited;
44 int err;
45 struct completion done;
46}; 41};
47 42
48/* Thread stopping is done by setthing this var: lock serializes 43#define to_kthread(tsk) \
49 * multiple kthread_stop calls. */ 44 container_of((tsk)->vfork_done, struct kthread, exited)
50static DEFINE_MUTEX(kthread_stop_lock);
51static struct kthread_stop_info kthread_stop_info;
52 45
53/** 46/**
54 * kthread_should_stop - should this kthread return now? 47 * kthread_should_stop - should this kthread return now?
@@ -59,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
59 */ 52 */
60int kthread_should_stop(void) 53int kthread_should_stop(void)
61{ 54{
62 return (kthread_stop_info.k == current); 55 return to_kthread(current)->should_stop;
63} 56}
64EXPORT_SYMBOL(kthread_should_stop); 57EXPORT_SYMBOL(kthread_should_stop);
65 58
66static int kthread(void *_create) 59static int kthread(void *_create)
67{ 60{
61 /* Copy data: it's on kthread's stack */
68 struct kthread_create_info *create = _create; 62 struct kthread_create_info *create = _create;
69 int (*threadfn)(void *data); 63 int (*threadfn)(void *data) = create->threadfn;
70 void *data; 64 void *data = create->data;
71 int ret = -EINTR; 65 struct kthread self;
66 int ret;
72 67
73 /* Copy data: it's on kthread's stack */ 68 self.should_stop = 0;
74 threadfn = create->threadfn; 69 init_completion(&self.exited);
75 data = create->data; 70 current->vfork_done = &self.exited;
76 71
77 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
78 __set_current_state(TASK_UNINTERRUPTIBLE); 73 __set_current_state(TASK_UNINTERRUPTIBLE);
79 create->result = current; 74 create->result = current;
80 complete(&create->started); 75 complete(&create->done);
81 schedule(); 76 schedule();
82 77
83 if (!kthread_should_stop()) 78 ret = -EINTR;
79 if (!self.should_stop)
84 ret = threadfn(data); 80 ret = threadfn(data);
85 81
86 /* It might have exited on its own, w/o kthread_stop. Check. */ 82 /* we can't just return, we must preserve "self" on stack */
87 if (kthread_should_stop()) { 83 do_exit(ret);
88 kthread_stop_info.err = ret;
89 complete(&kthread_stop_info.done);
90 }
91 return 0;
92} 84}
93 85
94static void create_kthread(struct kthread_create_info *create) 86static void create_kthread(struct kthread_create_info *create)
@@ -97,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
97 89
98 /* We want our own signal handler (we take no signals by default). */ 90 /* We want our own signal handler (we take no signals by default). */
99 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 91 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
100 if (pid < 0) 92 if (pid < 0) {
101 create->result = ERR_PTR(pid); 93 create->result = ERR_PTR(pid);
102 else 94 complete(&create->done);
103 wait_for_completion(&create->started); 95 }
104 complete(&create->done);
105} 96}
106 97
107/** 98/**
@@ -132,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
132 123
133 create.threadfn = threadfn; 124 create.threadfn = threadfn;
134 create.data = data; 125 create.data = data;
135 init_completion(&create.started);
136 init_completion(&create.done); 126 init_completion(&create.done);
137 127
138 spin_lock(&kthread_create_lock); 128 spin_lock(&kthread_create_lock);
@@ -190,40 +180,34 @@ EXPORT_SYMBOL(kthread_bind);
190 * @k: thread created by kthread_create(). 180 * @k: thread created by kthread_create().
191 * 181 *
192 * Sets kthread_should_stop() for @k to return true, wakes it, and 182 * Sets kthread_should_stop() for @k to return true, wakes it, and
193 * waits for it to exit. Your threadfn() must not call do_exit() 183 * waits for it to exit. This can also be called after kthread_create()
194 * itself if you use this function! This can also be called after 184 * instead of calling wake_up_process(): the thread will exit without
195 * kthread_create() instead of calling wake_up_process(): the thread 185 * calling threadfn().
196 * will exit without calling threadfn(). 186 *
187 * If threadfn() may call do_exit() itself, the caller must ensure
188 * task_struct can't go away.
197 * 189 *
198 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 190 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
199 * was never called. 191 * was never called.
200 */ 192 */
201int kthread_stop(struct task_struct *k) 193int kthread_stop(struct task_struct *k)
202{ 194{
195 struct kthread *kthread;
203 int ret; 196 int ret;
204 197
205 mutex_lock(&kthread_stop_lock);
206
207 /* It could exit after stop_info.k set, but before wake_up_process. */
208 get_task_struct(k);
209
210 trace_sched_kthread_stop(k); 198 trace_sched_kthread_stop(k);
199 get_task_struct(k);
211 200
212 /* Must init completion *before* thread sees kthread_stop_info.k */ 201 kthread = to_kthread(k);
213 init_completion(&kthread_stop_info.done); 202 barrier(); /* it might have exited */
214 smp_wmb(); 203 if (k->vfork_done != NULL) {
204 kthread->should_stop = 1;
205 wake_up_process(k);
206 wait_for_completion(&kthread->exited);
207 }
208 ret = k->exit_code;
215 209
216 /* Now set kthread_should_stop() to true, and wake it up. */
217 kthread_stop_info.k = k;
218 wake_up_process(k);
219 put_task_struct(k); 210 put_task_struct(k);
220
221 /* Once it dies, reset stop ptr, gather result and we're done. */
222 wait_for_completion(&kthread_stop_info.done);
223 kthread_stop_info.k = NULL;
224 ret = kthread_stop_info.err;
225 mutex_unlock(&kthread_stop_lock);
226
227 trace_sched_kthread_stop_ret(ret); 211 trace_sched_kthread_stop_ret(ret);
228 212
229 return ret; 213 return ret;
@@ -239,6 +223,7 @@ int kthreadd(void *unused)
239 ignore_signals(tsk); 223 ignore_signals(tsk);
240 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
241 set_cpus_allowed_ptr(tsk, cpu_all_mask); 225 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map);
242 227
243 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 228 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
244 229
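
The kthread rework above drops the global kthread_stop_info/kthread_stop_lock pair: each thread now carries its own struct kthread (reached through vfork_done), so kthread_stop() simply sets should_stop, wakes the thread and waits on its per-thread exited completion. The sketch below mimics that handshake in userspace with pthreads; struct worker, worker_stop() and the semaphore standing in for the completion are illustrative only, and the per-thread state is passed explicitly instead of being found through the task.

/* Userspace analogue of the reworked kthread_stop() handshake: the stop flag
 * and the "exited" completion are per-thread state, and the stopper waits on
 * that completion instead of on a global structure.  Names are illustrative.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct worker {
	atomic_int should_stop;
	sem_t exited;			/* stands in for struct completion */
	int exit_code;
};

static void *worker_fn(void *arg)
{
	struct worker *self = arg;
	int ret = 0;

	while (!atomic_load(&self->should_stop)) {
		/* ... do the thread's work ... */
		usleep(10000);
	}

	self->exit_code = ret;
	sem_post(&self->exited);	/* complete(&self.exited) analogue */
	return NULL;
}

static int worker_stop(struct worker *w, pthread_t t)
{
	atomic_store(&w->should_stop, 1);	/* kthread->should_stop = 1 */
	sem_wait(&w->exited);			/* wait_for_completion()    */
	pthread_join(t, NULL);
	return w->exit_code;
}

int main(void)
{
	struct worker w = { 0 };
	pthread_t t;

	sem_init(&w.exited, 0, 0);
	pthread_create(&t, NULL, worker_fn, &w);
	usleep(50000);
	printf("worker returned %d\n", worker_stop(&w, t));
	sem_destroy(&w.exited);
	return 0;
}
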
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index accb40cdb12a..8bbeef996c76 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <trace/lockdep.h>
46 45
47#include <asm/sections.h> 46#include <asm/sections.h>
48 47
49#include "lockdep_internals.h" 48#include "lockdep_internals.h"
50 49
50#define CREATE_TRACE_POINTS
51#include <trace/events/lockdep.h>
52
51#ifdef CONFIG_PROVE_LOCKING 53#ifdef CONFIG_PROVE_LOCKING
52int prove_locking = 1; 54int prove_locking = 1;
53module_param(prove_locking, int, 0644); 55module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2935} 2937}
2936EXPORT_SYMBOL_GPL(lock_set_class); 2938EXPORT_SYMBOL_GPL(lock_set_class);
2937 2939
2938DEFINE_TRACE(lock_acquire);
2939
2940/* 2940/*
2941 * We are not always called with irqs disabled - do that here, 2941 * We are not always called with irqs disabled - do that here,
2942 * and also avoid lockdep recursion: 2942 * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963} 2963}
2964EXPORT_SYMBOL_GPL(lock_acquire); 2964EXPORT_SYMBOL_GPL(lock_acquire);
2965 2965
2966DEFINE_TRACE(lock_release);
2967
2968void lock_release(struct lockdep_map *lock, int nested, 2966void lock_release(struct lockdep_map *lock, int nested,
2969 unsigned long ip) 2967 unsigned long ip)
2970{ 2968{
@@ -3105,6 +3103,8 @@ found_it:
3105 hlock->holdtime_stamp = now; 3103 hlock->holdtime_stamp = now;
3106 } 3104 }
3107 3105
3106 trace_lock_acquired(lock, ip, waittime);
3107
3108 stats = get_lock_stats(hlock_class(hlock)); 3108 stats = get_lock_stats(hlock_class(hlock));
3109 if (waittime) { 3109 if (waittime) {
3110 if (hlock->read) 3110 if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
3120 lock->ip = ip; 3120 lock->ip = ip;
3121} 3121}
3122 3122
3123DEFINE_TRACE(lock_contended);
3124
3125void lock_contended(struct lockdep_map *lock, unsigned long ip) 3123void lock_contended(struct lockdep_map *lock, unsigned long ip)
3126{ 3124{
3127 unsigned long flags; 3125 unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3143} 3141}
3144EXPORT_SYMBOL_GPL(lock_contended); 3142EXPORT_SYMBOL_GPL(lock_contended);
3145 3143
3146DEFINE_TRACE(lock_acquired);
3147
3148void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3144void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3149{ 3145{
3150 unsigned long flags; 3146 unsigned long flags;
3151 3147
3152 trace_lock_acquired(lock, ip);
3153
3154 if (unlikely(!lock_stat)) 3148 if (unlikely(!lock_stat))
3155 return; 3149 return;
3156 3150
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..e94caa666dba 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 758 &proc_lockdep_stats_operations);
759 759
760#ifdef CONFIG_LOCK_STAT 760#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 761 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
762 &proc_lock_stat_operations);
762#endif 763#endif
763 764
764 return 0; 765 return 0;
diff --git a/kernel/module.c b/kernel/module.c
index e797812a4d95..eccb561dd8a3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
18*/ 18*/
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
23#include <linux/fs.h> 24#include <linux/fs.h>
@@ -52,6 +53,7 @@
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/async.h> 54#include <linux/async.h>
54#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h>
55 57
56#if 0 58#if 0
57#define DEBUGP printk 59#define DEBUGP printk
@@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex); 74EXPORT_SYMBOL_GPL(module_mutex);
73static LIST_HEAD(modules); 75static LIST_HEAD(modules);
74 76
77/* Block module loading/unloading? */
78int modules_disabled = 0;
79
75/* Waiting for a module to finish initializing? */ 80/* Waiting for a module to finish initializing? */
76static DECLARE_WAIT_QUEUE_HEAD(module_wq); 81static DECLARE_WAIT_QUEUE_HEAD(module_wq);
77 82
@@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
429 unsigned long extra; 434 unsigned long extra;
430 unsigned int i; 435 unsigned int i;
431 void *ptr; 436 void *ptr;
437 int cpu;
432 438
433 if (align > PAGE_SIZE) { 439 if (align > PAGE_SIZE) {
434 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 440 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
458 if (!split_block(i, size)) 464 if (!split_block(i, size))
459 return NULL; 465 return NULL;
460 466
467 /* add the per-cpu scanning areas */
468 for_each_possible_cpu(cpu)
469 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
470 GFP_KERNEL);
471
461 /* Mark allocated */ 472 /* Mark allocated */
462 pcpu_size[i] = -pcpu_size[i]; 473 pcpu_size[i] = -pcpu_size[i];
463 return ptr; 474 return ptr;
@@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme)
472{ 483{
473 unsigned int i; 484 unsigned int i;
474 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 485 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
486 int cpu;
475 487
476 /* First entry is core kernel percpu data. */ 488 /* First entry is core kernel percpu data. */
477 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 489 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme)
483 BUG(); 495 BUG();
484 496
485 free: 497 free:
498 /* remove the per-cpu scanning areas */
499 for_each_possible_cpu(cpu)
500 kmemleak_free(freeme + per_cpu_offset(cpu));
501
486 /* Merge with previous? */ 502 /* Merge with previous? */
487 if (pcpu_size[i-1] >= 0) { 503 if (pcpu_size[i-1] >= 0) {
488 pcpu_size[i-1] += pcpu_size[i]; 504 pcpu_size[i-1] += pcpu_size[i];
@@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
777 char name[MODULE_NAME_LEN]; 793 char name[MODULE_NAME_LEN];
778 int ret, forced = 0; 794 int ret, forced = 0;
779 795
780 if (!capable(CAP_SYS_MODULE)) 796 if (!capable(CAP_SYS_MODULE) || modules_disabled)
781 return -EPERM; 797 return -EPERM;
782 798
783 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 799 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -893,16 +909,18 @@ void __symbol_put(const char *symbol)
893} 909}
894EXPORT_SYMBOL(__symbol_put); 910EXPORT_SYMBOL(__symbol_put);
895 911
912/* Note this assumes addr is a function, which it currently always is. */
896void symbol_put_addr(void *addr) 913void symbol_put_addr(void *addr)
897{ 914{
898 struct module *modaddr; 915 struct module *modaddr;
916 unsigned long a = (unsigned long)dereference_function_descriptor(addr);
899 917
900 if (core_kernel_text((unsigned long)addr)) 918 if (core_kernel_text(a))
901 return; 919 return;
902 920
903 /* module_text_address is safe here: we're supposed to have reference 921 /* module_text_address is safe here: we're supposed to have reference
904 * to module from symbol_get, so it can't go away. */ 922 * to module from symbol_get, so it can't go away. */
905 modaddr = __module_text_address((unsigned long)addr); 923 modaddr = __module_text_address(a);
906 BUG_ON(!modaddr); 924 BUG_ON(!modaddr);
907 module_put(modaddr); 925 module_put(modaddr);
908} 926}
@@ -1052,7 +1070,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1052{ 1070{
1053 const unsigned long *crc; 1071 const unsigned long *crc;
1054 1072
1055 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1073 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1074 &crc, true, false))
1056 BUG(); 1075 BUG();
1057 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1076 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1058} 1077}
@@ -1489,9 +1508,6 @@ static void free_module(struct module *mod)
1489 /* Free any allocated parameters. */ 1508 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp); 1509 destroy_params(mod->kp, mod->num_kp);
1491 1510
1492 /* release any pointers to mcount in this module */
1493 ftrace_release(mod->module_core, mod->core_size);
1494
1495 /* This may be NULL, but that's OK */ 1511 /* This may be NULL, but that's OK */
1496 module_free(mod, mod->module_init); 1512 module_free(mod, mod->module_init);
1497 kfree(mod->args); 1513 kfree(mod->args);
@@ -1878,6 +1894,36 @@ static void *module_alloc_update_bounds(unsigned long size)
1878 return ret; 1894 return ret;
1879} 1895}
1880 1896
1897#ifdef CONFIG_DEBUG_KMEMLEAK
1898static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1899 Elf_Shdr *sechdrs, char *secstrings)
1900{
1901 unsigned int i;
1902
1903 /* only scan the sections containing data */
1904 kmemleak_scan_area(mod->module_core, (unsigned long)mod -
1905 (unsigned long)mod->module_core,
1906 sizeof(struct module), GFP_KERNEL);
1907
1908 for (i = 1; i < hdr->e_shnum; i++) {
1909 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1910 continue;
1911 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
1912 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1913 continue;
1914
1915 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
1916 (unsigned long)mod->module_core,
1917 sechdrs[i].sh_size, GFP_KERNEL);
1918 }
1919}
1920#else
1921static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1922 Elf_Shdr *sechdrs, char *secstrings)
1923{
1924}
1925#endif
1926
1881/* Allocate and load the module: note that size of section 0 is always 1927/* Allocate and load the module: note that size of section 0 is always
1882 zero, and we rely on this for optional sections. */ 1928 zero, and we rely on this for optional sections. */
1883static noinline struct module *load_module(void __user *umod, 1929static noinline struct module *load_module(void __user *umod,
@@ -1892,11 +1938,9 @@ static noinline struct module *load_module(void __user *umod,
1892 unsigned int symindex = 0; 1938 unsigned int symindex = 0;
1893 unsigned int strindex = 0; 1939 unsigned int strindex = 0;
1894 unsigned int modindex, versindex, infoindex, pcpuindex; 1940 unsigned int modindex, versindex, infoindex, pcpuindex;
1895 unsigned int num_mcount;
1896 struct module *mod; 1941 struct module *mod;
1897 long err = 0; 1942 long err = 0;
1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1943 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1899 unsigned long *mseg;
1900 mm_segment_t old_fs; 1944 mm_segment_t old_fs;
1901 1945
1902 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1946 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2050,6 +2094,12 @@ static noinline struct module *load_module(void __user *umod,
2050 2094
2051 /* Do the allocs. */ 2095 /* Do the allocs. */
2052 ptr = module_alloc_update_bounds(mod->core_size); 2096 ptr = module_alloc_update_bounds(mod->core_size);
2097 /*
2098 * The pointer to this block is stored in the module structure
2099 * which is inside the block. Just mark it as not being a
2100 * leak.
2101 */
2102 kmemleak_not_leak(ptr);
2053 if (!ptr) { 2103 if (!ptr) {
2054 err = -ENOMEM; 2104 err = -ENOMEM;
2055 goto free_percpu; 2105 goto free_percpu;
@@ -2058,6 +2108,13 @@ static noinline struct module *load_module(void __user *umod,
2058 mod->module_core = ptr; 2108 mod->module_core = ptr;
2059 2109
2060 ptr = module_alloc_update_bounds(mod->init_size); 2110 ptr = module_alloc_update_bounds(mod->init_size);
2111 /*
2112 * The pointer to this block is stored in the module structure
2113 * which is inside the block. This block doesn't need to be
2114 * scanned as it contains data and code that will be freed
2115 * after the module is initialized.
2116 */
2117 kmemleak_ignore(ptr);
2061 if (!ptr && mod->init_size) { 2118 if (!ptr && mod->init_size) {
2062 err = -ENOMEM; 2119 err = -ENOMEM;
2063 goto free_core; 2120 goto free_core;
@@ -2088,6 +2145,7 @@ static noinline struct module *load_module(void __user *umod,
2088 } 2145 }
2089 /* Module has been moved. */ 2146 /* Module has been moved. */
2090 mod = (void *)sechdrs[modindex].sh_addr; 2147 mod = (void *)sechdrs[modindex].sh_addr;
2148 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2091 2149
2092#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2150#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2093 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2151 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
@@ -2161,6 +2219,10 @@ static noinline struct module *load_module(void __user *umod,
2161 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, 2219 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2162 "__kcrctab_unused_gpl"); 2220 "__kcrctab_unused_gpl");
2163#endif 2221#endif
2222#ifdef CONFIG_CONSTRUCTORS
2223 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2224 sizeof(*mod->ctors), &mod->num_ctors);
2225#endif
2164 2226
2165#ifdef CONFIG_MARKERS 2227#ifdef CONFIG_MARKERS
2166 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", 2228 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2172,7 +2234,19 @@ static noinline struct module *load_module(void __user *umod,
2172 sizeof(*mod->tracepoints), 2234 sizeof(*mod->tracepoints),
2173 &mod->num_tracepoints); 2235 &mod->num_tracepoints);
2174#endif 2236#endif
2175 2237#ifdef CONFIG_EVENT_TRACING
2238 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2239 "_ftrace_events",
2240 sizeof(*mod->trace_events),
2241 &mod->num_trace_events);
2242#endif
2243#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2244 /* sechdrs[0].sh_size is always zero */
2245 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2246 "__mcount_loc",
2247 sizeof(*mod->ftrace_callsites),
2248 &mod->num_ftrace_callsites);
2249#endif
2176#ifdef CONFIG_MODVERSIONS 2250#ifdef CONFIG_MODVERSIONS
2177 if ((mod->num_syms && !mod->crcs) 2251 if ((mod->num_syms && !mod->crcs)
2178 || (mod->num_gpl_syms && !mod->gpl_crcs) 2252 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2311,6 @@ static noinline struct module *load_module(void __user *umod,
2237 dynamic_debug_setup(debug, num_debug); 2311 dynamic_debug_setup(debug, num_debug);
2238 } 2312 }
2239 2313
2240 /* sechdrs[0].sh_size is always zero */
2241 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2242 sizeof(*mseg), &num_mcount);
2243 ftrace_init_module(mod, mseg, mseg + num_mcount);
2244
2245 err = module_finalize(hdr, sechdrs, mod); 2314 err = module_finalize(hdr, sechdrs, mod);
2246 if (err < 0) 2315 if (err < 0)
2247 goto cleanup; 2316 goto cleanup;
@@ -2286,7 +2355,8 @@ static noinline struct module *load_module(void __user *umod,
2286 if (err < 0) 2355 if (err < 0)
2287 goto unlink; 2356 goto unlink;
2288 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2357 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2289 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2358 if (mod->sect_attrs)
2359 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2290 2360
2291 /* Get rid of temporary copy */ 2361 /* Get rid of temporary copy */
2292 vfree(hdr); 2362 vfree(hdr);
@@ -2302,7 +2372,6 @@ static noinline struct module *load_module(void __user *umod,
2302 cleanup: 2372 cleanup:
2303 kobject_del(&mod->mkobj.kobj); 2373 kobject_del(&mod->mkobj.kobj);
2304 kobject_put(&mod->mkobj.kobj); 2374 kobject_put(&mod->mkobj.kobj);
2305 ftrace_release(mod->module_core, mod->core_size);
2306 free_unload: 2375 free_unload:
2307 module_unload_free(mod); 2376 module_unload_free(mod);
2308#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2377#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
@@ -2328,6 +2397,17 @@ static noinline struct module *load_module(void __user *umod,
2328 goto free_hdr; 2397 goto free_hdr;
2329} 2398}
2330 2399
2400/* Call module constructors. */
2401static void do_mod_ctors(struct module *mod)
2402{
2403#ifdef CONFIG_CONSTRUCTORS
2404 unsigned long i;
2405
2406 for (i = 0; i < mod->num_ctors; i++)
2407 mod->ctors[i]();
2408#endif
2409}
2410
2331/* This is where the real work happens */ 2411/* This is where the real work happens */
2332SYSCALL_DEFINE3(init_module, void __user *, umod, 2412SYSCALL_DEFINE3(init_module, void __user *, umod,
2333 unsigned long, len, const char __user *, uargs) 2413 unsigned long, len, const char __user *, uargs)
@@ -2336,7 +2416,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2336 int ret = 0; 2416 int ret = 0;
2337 2417
2338 /* Must have permission */ 2418 /* Must have permission */
2339 if (!capable(CAP_SYS_MODULE)) 2419 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2340 return -EPERM; 2420 return -EPERM;
2341 2421
2342 /* Only one module load at a time, please */ 2422 /* Only one module load at a time, please */
@@ -2356,6 +2436,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2356 blocking_notifier_call_chain(&module_notify_list, 2436 blocking_notifier_call_chain(&module_notify_list,
2357 MODULE_STATE_COMING, mod); 2437 MODULE_STATE_COMING, mod);
2358 2438
2439 do_mod_ctors(mod);
2359 /* Start the module */ 2440 /* Start the module */
2360 if (mod->init != NULL) 2441 if (mod->init != NULL)
2361 ret = do_one_initcall(mod->init); 2442 ret = do_one_initcall(mod->init);
@@ -2374,9 +2455,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2374 return ret; 2455 return ret;
2375 } 2456 }
2376 if (ret > 0) { 2457 if (ret > 0) {
2377 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2458 printk(KERN_WARNING
2378 "it should follow 0/-E convention\n" 2459"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2379 KERN_WARNING "%s: loading module anyway...\n", 2460"%s: loading module anyway...\n",
2380 __func__, mod->name, ret, 2461 __func__, mod->name, ret,
2381 __func__); 2462 __func__);
2382 dump_stack(); 2463 dump_stack();
@@ -2394,6 +2475,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2394 mutex_lock(&module_mutex); 2475 mutex_lock(&module_mutex);
2395 /* Drop initial reference. */ 2476 /* Drop initial reference. */
2396 module_put(mod); 2477 module_put(mod);
2478 trim_init_extable(mod);
2397 module_free(mod, mod->module_init); 2479 module_free(mod, mod->module_init);
2398 mod->module_init = NULL; 2480 mod->module_init = NULL;
2399 mod->init_size = 0; 2481 mod->init_size = 0;
@@ -2837,7 +2919,7 @@ void print_modules(void)
2837 struct module *mod; 2919 struct module *mod;
2838 char buf[8]; 2920 char buf[8];
2839 2921
2840 printk("Modules linked in:"); 2922 printk(KERN_DEFAULT "Modules linked in:");
2841 /* Most callers should already have preempt disabled, but make sure */ 2923 /* Most callers should already have preempt disabled, but make sure */
2842 preempt_disable(); 2924 preempt_disable();
2843 list_for_each_entry_rcu(mod, &modules, list) 2925 list_for_each_entry_rcu(mod, &modules, list)
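
Among the changes above, do_mod_ctors() simply walks the module's .ctors section, an array of void (*)(void) pointers collected by the toolchain, and calls each constructor before mod->init() runs. The fragment below shows the same calling pattern with a hand-built array; ctor_a, ctor_b and module_init_fn are invented stand-ins, not anything the kernel defines.

/* Sketch of what do_mod_ctors() does: the .ctors section is just an array of
 * void (*)(void) pointers, called in order before the module's init routine.
 * Here the array is built by hand instead of being collected by the linker.
 */
#include <stdio.h>

typedef void (*ctor_t)(void);

static void ctor_a(void) { puts("ctor_a"); }
static void ctor_b(void) { puts("ctor_b"); }

static ctor_t ctors[] = { ctor_a, ctor_b };
static unsigned long num_ctors = sizeof(ctors) / sizeof(ctors[0]);

static void do_ctors(void)
{
	unsigned long i;

	for (i = 0; i < num_ctors; i++)
		ctors[i]();
}

static int module_init_fn(void)
{
	puts("init");
	return 0;
}

int main(void)
{
	do_ctors();			/* constructors run first ... */
	return module_init_fn();	/* ... then the init routine  */
}
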
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..947b3ad551f8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
249 249
250 /* didnt get the lock, go to sleep: */ 250 /* didnt get the lock, go to sleep: */
251 spin_unlock_mutex(&lock->wait_lock, flags); 251 spin_unlock_mutex(&lock->wait_lock, flags);
252 __schedule(); 252 preempt_enable_no_resched();
253 schedule();
254 preempt_disable();
253 spin_lock_mutex(&lock->wait_lock, flags); 255 spin_lock_mutex(&lock->wait_lock, flags);
254 } 256 }
255 257
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 473
472 return ret; 474 return ret;
473} 475}
474
475EXPORT_SYMBOL(mutex_trylock); 476EXPORT_SYMBOL(mutex_trylock);
477
478/**
479 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
480 * @cnt: the atomic counter to decrement
481 * @lock: the mutex to take and hold if the count reaches 0
482 *
483 * Returns true with the mutex held if the decrement hit 0, false otherwise.
484 */
485int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
486{
487 /* dec if we can't possibly hit 0 */
488 if (atomic_add_unless(cnt, -1, 1))
489 return 0;
490 /* we might hit 0, so take the lock */
491 mutex_lock(lock);
492 if (!atomic_dec_and_test(cnt)) {
493 /* when we actually did the dec, we didn't hit 0 */
494 mutex_unlock(lock);
495 return 0;
496 }
497 /* we hit 0, and we hold the lock */
498 return 1;
499}
500EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
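A hedged usage sketch for the new atomic_dec_and_mutex_lock() helper: the typical pattern drops a reference lock-free and only takes the mutex on the final put. The widget structure, list and lock names are made up for illustration.

/* Hypothetical refcounted object kept on a mutex-protected list. */
struct widget {
	atomic_t refcount;
	struct list_head node;
};

static DEFINE_MUTEX(widget_list_lock);

static void widget_put(struct widget *w)
{
	/* Fast path: not the last reference, no lock taken. */
	if (!atomic_dec_and_mutex_lock(&w->refcount, &widget_list_lock))
		return;

	/* Slow path: the count hit 0 and widget_list_lock is now held. */
	list_del(&w->node);
	mutex_unlock(&widget_list_lock);
	kfree(w);
}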
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0c..09b4ff9711b2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29/* 29static inline struct nsproxy *create_nsproxy(void)
30 * creates a copy of "orig" with refcount 1.
31 */
32static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
33{ 30{
34 struct nsproxy *ns; 31 struct nsproxy *nsproxy;
35 32
36 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); 33 nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
37 if (ns) { 34 if (nsproxy)
38 memcpy(ns, orig, sizeof(struct nsproxy)); 35 atomic_set(&nsproxy->count, 1);
39 atomic_set(&ns->count, 1); 36 return nsproxy;
40 }
41 return ns;
42} 37}
43 38
44/* 39/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
52 struct nsproxy *new_nsp; 47 struct nsproxy *new_nsp;
53 int err; 48 int err;
54 49
55 new_nsp = clone_nsproxy(tsk->nsproxy); 50 new_nsp = create_nsproxy();
56 if (!new_nsp) 51 if (!new_nsp)
57 return ERR_PTR(-ENOMEM); 52 return ERR_PTR(-ENOMEM);
58 53
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/params.c b/kernel/params.c
index de273ec85bd2..7f6912ced2ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
30#if 0 27#if 0
31#define DEBUGP printk 28#define DEBUGP printk
32#else 29#else
@@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)
220 return -ENOSPC; 217 return -ENOSPC;
221 } 218 }
222 219
223 if (kp->perm & KPARAM_KMALLOCED) 220 if (kp->flags & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg); 221 kfree(*(char **)kp->arg);
225 222
 226 /* This is a hack. We can't strdup in early boot, and we 223 /* This is a hack. We can't strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */ 224 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) { 225 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED; 226 kp->flags |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg) 228 if (!kp->arg)
232 return -ENOMEM; 229 return -ENOMEM;
@@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
241 return sprintf(buffer, "%s", *((char **)kp->arg)); 238 return sprintf(buffer, "%s", *((char **)kp->arg));
242} 239}
243 240
241/* Actually could be a bool or an int, for historical reasons. */
244int param_set_bool(const char *val, struct kernel_param *kp) 242int param_set_bool(const char *val, struct kernel_param *kp)
245{ 243{
244 bool v;
245
246 /* No equals means "set"... */ 246 /* No equals means "set"... */
247 if (!val) val = "1"; 247 if (!val) val = "1";
248 248
249 /* One of =[yYnN01] */ 249 /* One of =[yYnN01] */
250 switch (val[0]) { 250 switch (val[0]) {
251 case 'y': case 'Y': case '1': 251 case 'y': case 'Y': case '1':
252 *(int *)kp->arg = 1; 252 v = true;
253 return 0; 253 break;
254 case 'n': case 'N': case '0': 254 case 'n': case 'N': case '0':
255 *(int *)kp->arg = 0; 255 v = false;
256 return 0; 256 break;
257 default:
258 return -EINVAL;
257 } 259 }
258 return -EINVAL; 260
261 if (kp->flags & KPARAM_ISBOOL)
262 *(bool *)kp->arg = v;
263 else
264 *(int *)kp->arg = v;
265 return 0;
259} 266}
260 267
261int param_get_bool(char *buffer, struct kernel_param *kp) 268int param_get_bool(char *buffer, struct kernel_param *kp)
262{ 269{
270 bool val;
271 if (kp->flags & KPARAM_ISBOOL)
272 val = *(bool *)kp->arg;
273 else
274 val = *(int *)kp->arg;
275
263 /* Y and N chosen as being relatively non-coder friendly */ 276 /* Y and N chosen as being relatively non-coder friendly */
264 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); 277 return sprintf(buffer, "%c", val ? 'Y' : 'N');
265} 278}
266 279
280/* This one must be bool. */
267int param_set_invbool(const char *val, struct kernel_param *kp) 281int param_set_invbool(const char *val, struct kernel_param *kp)
268{ 282{
269 int boolval, ret; 283 int ret;
284 bool boolval;
270 struct kernel_param dummy; 285 struct kernel_param dummy;
271 286
272 dummy.arg = &boolval; 287 dummy.arg = &boolval;
288 dummy.flags = KPARAM_ISBOOL;
273 ret = param_set_bool(val, &dummy); 289 ret = param_set_bool(val, &dummy);
274 if (ret == 0) 290 if (ret == 0)
275 *(int *)kp->arg = !boolval; 291 *(bool *)kp->arg = !boolval;
276 return ret; 292 return ret;
277} 293}
278 294
279int param_get_invbool(char *buffer, struct kernel_param *kp) 295int param_get_invbool(char *buffer, struct kernel_param *kp)
280{ 296{
281 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); 297 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
282} 298}
283 299
284/* We break the rule and mangle the string. */ 300/* We break the rule and mangle the string. */
@@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
591 unsigned int i; 607 unsigned int i;
592 608
593 for (i = 0; i < num; i++) 609 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED) 610 if (params[i].flags & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg); 611 kfree(*(char **)params[i].arg);
596} 612}
597 613
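With KPARAM_ISBOOL in place a module can back a bool parameter with an actual bool rather than an int; a minimal sketch with a made-up parameter name, assuming module_param() accepts bool storage in this tree.

/* Hypothetical bool module parameter: param_set_bool() accepts y/Y/1 and
 * n/N/0, and param_get_bool() reports the value as Y or N via sysfs. */
static bool enable_debug;
module_param(enable_debug, bool, 0644);
MODULE_PARM_DESC(enable_debug, "Enable extra debug output (default: N)");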
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..f274e1959885
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4860 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * 0 - not paranoid
50 * 1 - disallow cpu counters to unpriv
51 * 2 - disallow kernel profiling to unpriv
52 */
53int sysctl_perf_counter_paranoid __read_mostly;
54
55static inline bool perf_paranoid_cpu(void)
56{
57 return sysctl_perf_counter_paranoid > 0;
58}
59
60static inline bool perf_paranoid_kernel(void)
61{
62 return sysctl_perf_counter_paranoid > 1;
63}
64
65int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
66
67/*
68 * max perf counter sample rate
69 */
70int sysctl_perf_counter_sample_rate __read_mostly = 100000;
71
72static atomic64_t perf_counter_id;
73
74/*
75 * Lock for (sysadmin-configurable) counter reservations:
76 */
77static DEFINE_SPINLOCK(perf_resource_lock);
78
79/*
80 * Architecture provided APIs - weak aliases:
81 */
82extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
83{
84 return NULL;
85}
86
87void __weak hw_perf_disable(void) { barrier(); }
88void __weak hw_perf_enable(void) { barrier(); }
89
90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
92
93int __weak
94hw_perf_group_sched_in(struct perf_counter *group_leader,
95 struct perf_cpu_context *cpuctx,
96 struct perf_counter_context *ctx, int cpu)
97{
98 return 0;
99}
100
101void __weak perf_counter_print_debug(void) { }
102
103static DEFINE_PER_CPU(int, disable_count);
104
105void __perf_disable(void)
106{
107 __get_cpu_var(disable_count)++;
108}
109
110bool __perf_enable(void)
111{
112 return !--__get_cpu_var(disable_count);
113}
114
115void perf_disable(void)
116{
117 __perf_disable();
118 hw_perf_disable();
119}
120
121void perf_enable(void)
122{
123 if (__perf_enable())
124 hw_perf_enable();
125}
126
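A short sketch of how the pair above nests through the per-CPU disable_count, which is why callers such as __perf_counter_remove_from_context() can bracket list surgery without knowing whether the PMU is already disabled further up the call chain:

/* Illustration only: nesting behaviour of perf_disable()/perf_enable(). */
static void example_nested_disable(void)
{
	perf_disable();  /* disable_count 0 -> 1: hw_perf_disable() runs  */
	perf_disable();  /* disable_count 1 -> 2: hardware stays disabled */
	/* ... counter lists can be reorganized safely here ... */
	perf_enable();   /* disable_count 2 -> 1: hardware still disabled */
	perf_enable();   /* disable_count 1 -> 0: hw_perf_enable() runs   */
}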
127static void get_ctx(struct perf_counter_context *ctx)
128{
129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
130}
131
132static void free_ctx(struct rcu_head *head)
133{
134 struct perf_counter_context *ctx;
135
136 ctx = container_of(head, struct perf_counter_context, rcu_head);
137 kfree(ctx);
138}
139
140static void put_ctx(struct perf_counter_context *ctx)
141{
142 if (atomic_dec_and_test(&ctx->refcount)) {
143 if (ctx->parent_ctx)
144 put_ctx(ctx->parent_ctx);
145 if (ctx->task)
146 put_task_struct(ctx->task);
147 call_rcu(&ctx->rcu_head, free_ctx);
148 }
149}
150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
173/*
174 * Get the perf_counter_context for a task and lock it.
 175 * This has to cope with the fact that until it is locked,
176 * the context could get moved to another task.
177 */
178static struct perf_counter_context *
179perf_lock_task_context(struct task_struct *task, unsigned long *flags)
180{
181 struct perf_counter_context *ctx;
182
183 rcu_read_lock();
184 retry:
185 ctx = rcu_dereference(task->perf_counter_ctxp);
186 if (ctx) {
187 /*
188 * If this context is a clone of another, it might
189 * get swapped for another underneath us by
190 * perf_counter_task_sched_out, though the
191 * rcu_read_lock() protects us from any context
192 * getting freed. Lock the context and check if it
193 * got swapped before we could get the lock, and retry
194 * if so. If we locked the right context, then it
195 * can't get swapped on us any more.
196 */
197 spin_lock_irqsave(&ctx->lock, *flags);
198 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
199 spin_unlock_irqrestore(&ctx->lock, *flags);
200 goto retry;
201 }
202
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
206 }
207 }
208 rcu_read_unlock();
209 return ctx;
210}
211
212/*
213 * Get the context for a task and increment its pin_count so it
214 * can't get swapped to another task. This also increments its
215 * reference count so that the context can't get freed.
216 */
217static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
218{
219 struct perf_counter_context *ctx;
220 unsigned long flags;
221
222 ctx = perf_lock_task_context(task, &flags);
223 if (ctx) {
224 ++ctx->pin_count;
225 spin_unlock_irqrestore(&ctx->lock, flags);
226 }
227 return ctx;
228}
229
230static void perf_unpin_context(struct perf_counter_context *ctx)
231{
232 unsigned long flags;
233
234 spin_lock_irqsave(&ctx->lock, flags);
235 --ctx->pin_count;
236 spin_unlock_irqrestore(&ctx->lock, flags);
237 put_ctx(ctx);
238}
239
240/*
 241 * Add a counter to the lists for its context.
242 * Must be called with ctx->mutex and ctx->lock held.
243 */
244static void
245list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246{
247 struct perf_counter *group_leader = counter->group_leader;
248
249 /*
250 * Depending on whether it is a standalone or sibling counter,
251 * add it straight to the context's counter list, or to the group
252 * leader's sibling list:
253 */
254 if (group_leader == counter)
255 list_add_tail(&counter->list_entry, &ctx->counter_list);
256 else {
257 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
258 group_leader->nr_siblings++;
259 }
260
261 list_add_rcu(&counter->event_entry, &ctx->event_list);
262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
265}
266
267/*
268 * Remove a counter from the lists for its context.
269 * Must be called with ctx->mutex and ctx->lock held.
270 */
271static void
272list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
273{
274 struct perf_counter *sibling, *tmp;
275
276 if (list_empty(&counter->list_entry))
277 return;
278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
281
282 list_del_init(&counter->list_entry);
283 list_del_rcu(&counter->event_entry);
284
285 if (counter->group_leader != counter)
286 counter->group_leader->nr_siblings--;
287
288 /*
289 * If this was a group counter with sibling counters then
290 * upgrade the siblings to singleton counters by adding them
291 * to the context list directly:
292 */
293 list_for_each_entry_safe(sibling, tmp,
294 &counter->sibling_list, list_entry) {
295
296 list_move_tail(&sibling->list_entry, &ctx->counter_list);
297 sibling->group_leader = sibling;
298 }
299}
300
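An informal picture of the structure the two list helpers above maintain (counter names are illustrative):

/*
 * ctx->counter_list:  [leader A] -> [counter B] -> [leader C]
 *                        |                            |
 *  A->sibling_list:   [A1] -> [A2]   C->sibling_list: [C1]
 *
 * list_del_counter(A, ctx) unlinks A and then moves A1 and A2 onto
 * ctx->counter_list as standalone counters (each becomes its own
 * group_leader), so the group's members are not silently lost.
 */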
301static void
302counter_sched_out(struct perf_counter *counter,
303 struct perf_cpu_context *cpuctx,
304 struct perf_counter_context *ctx)
305{
306 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
307 return;
308
309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
314 counter->tstamp_stopped = ctx->time;
315 counter->pmu->disable(counter);
316 counter->oncpu = -1;
317
318 if (!is_software_counter(counter))
319 cpuctx->active_oncpu--;
320 ctx->nr_active--;
321 if (counter->attr.exclusive || !cpuctx->active_oncpu)
322 cpuctx->exclusive = 0;
323}
324
325static void
326group_sched_out(struct perf_counter *group_counter,
327 struct perf_cpu_context *cpuctx,
328 struct perf_counter_context *ctx)
329{
330 struct perf_counter *counter;
331
332 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
333 return;
334
335 counter_sched_out(group_counter, cpuctx, ctx);
336
337 /*
338 * Schedule out siblings (if any):
339 */
340 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
341 counter_sched_out(counter, cpuctx, ctx);
342
343 if (group_counter->attr.exclusive)
344 cpuctx->exclusive = 0;
345}
346
347/*
348 * Cross CPU call to remove a performance counter
349 *
350 * We disable the counter on the hardware level first. After that we
351 * remove it from the context list.
352 */
353static void __perf_counter_remove_from_context(void *info)
354{
355 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
356 struct perf_counter *counter = info;
357 struct perf_counter_context *ctx = counter->ctx;
358
359 /*
360 * If this is a task context, we need to check whether it is
361 * the current task context of this cpu. If not it has been
362 * scheduled out before the smp call arrived.
363 */
364 if (ctx->task && cpuctx->task_ctx != ctx)
365 return;
366
367 spin_lock(&ctx->lock);
368 /*
369 * Protect the list operation against NMI by disabling the
370 * counters on a global level.
371 */
372 perf_disable();
373
374 counter_sched_out(counter, cpuctx, ctx);
375
376 list_del_counter(counter, ctx);
377
378 if (!ctx->task) {
379 /*
380 * Allow more per task counters with respect to the
381 * reservation:
382 */
383 cpuctx->max_pertask =
384 min(perf_max_counters - ctx->nr_counters,
385 perf_max_counters - perf_reserved_percpu);
386 }
387
388 perf_enable();
389 spin_unlock(&ctx->lock);
390}
391
392
393/*
394 * Remove the counter from a task's (or a CPU's) list of counters.
395 *
396 * Must be called with ctx->mutex held.
397 *
398 * CPU counters are removed with a smp call. For task counters we only
399 * call when the task is on a CPU.
400 *
401 * If counter->ctx is a cloned context, callers must make sure that
402 * every task struct that counter->ctx->task could possibly point to
403 * remains valid. This is OK when called from perf_release since
404 * that only calls us on the top-level context, which can't be a clone.
405 * When called from perf_counter_exit_task, it's OK because the
406 * context has been detached from its task.
407 */
408static void perf_counter_remove_from_context(struct perf_counter *counter)
409{
410 struct perf_counter_context *ctx = counter->ctx;
411 struct task_struct *task = ctx->task;
412
413 if (!task) {
414 /*
415 * Per cpu counters are removed via an smp call and
 416 * the removal is always successful.
417 */
418 smp_call_function_single(counter->cpu,
419 __perf_counter_remove_from_context,
420 counter, 1);
421 return;
422 }
423
424retry:
425 task_oncpu_function_call(task, __perf_counter_remove_from_context,
426 counter);
427
428 spin_lock_irq(&ctx->lock);
429 /*
430 * If the context is active we need to retry the smp call.
431 */
432 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
433 spin_unlock_irq(&ctx->lock);
434 goto retry;
435 }
436
437 /*
 438 * The lock prevents this context from being scheduled in, so we
 439 * can remove the counter safely if the call above did not
440 * succeed.
441 */
442 if (!list_empty(&counter->list_entry)) {
443 list_del_counter(counter, ctx);
444 }
445 spin_unlock_irq(&ctx->lock);
446}
447
448static inline u64 perf_clock(void)
449{
450 return cpu_clock(smp_processor_id());
451}
452
453/*
454 * Update the record of the current time in a context.
455 */
456static void update_context_time(struct perf_counter_context *ctx)
457{
458 u64 now = perf_clock();
459
460 ctx->time += now - ctx->timestamp;
461 ctx->timestamp = now;
462}
463
464/*
465 * Update the total_time_enabled and total_time_running fields for a counter.
466 */
467static void update_counter_times(struct perf_counter *counter)
468{
469 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end;
471
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
473 return;
474
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
476
477 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
478 run_end = counter->tstamp_stopped;
479 else
480 run_end = ctx->time;
481
482 counter->total_time_running = run_end - counter->tstamp_running;
483}
484
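A worked example of this bookkeeping, with made-up timestamps:

/*
 * Made-up timeline for one counter (values in ctx->time units):
 *
 *   t=100  added to the context     -> tstamp_enabled = tstamp_running
 *                                      = tstamp_stopped = 100
 *   t=120  scheduled in             -> tstamp_running += 120 - 100 = 120
 *   t=170  scheduled out (INACTIVE) -> tstamp_stopped = 170
 *   t=200  update_counter_times()   -> total_time_enabled = 200 - 100 = 100
 *                                      total_time_running = 170 - 120 = 50
 *
 * i.e. the counter was enabled for 100 units but only on the PMU for 50.
 */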
485/*
486 * Update total_time_enabled and total_time_running for all counters in a group.
487 */
488static void update_group_times(struct perf_counter *leader)
489{
490 struct perf_counter *counter;
491
492 update_counter_times(leader);
493 list_for_each_entry(counter, &leader->sibling_list, list_entry)
494 update_counter_times(counter);
495}
496
497/*
498 * Cross CPU call to disable a performance counter
499 */
500static void __perf_counter_disable(void *info)
501{
502 struct perf_counter *counter = info;
503 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
504 struct perf_counter_context *ctx = counter->ctx;
505
506 /*
507 * If this is a per-task counter, need to check whether this
508 * counter's task is the current task on this cpu.
509 */
510 if (ctx->task && cpuctx->task_ctx != ctx)
511 return;
512
513 spin_lock(&ctx->lock);
514
515 /*
516 * If the counter is on, turn it off.
517 * If it is in error state, leave it in error state.
518 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx);
521 update_counter_times(counter);
522 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx);
524 else
525 counter_sched_out(counter, cpuctx, ctx);
526 counter->state = PERF_COUNTER_STATE_OFF;
527 }
528
529 spin_unlock(&ctx->lock);
530}
531
532/*
533 * Disable a counter.
534 *
535 * If counter->ctx is a cloned context, callers must make sure that
536 * every task struct that counter->ctx->task could possibly point to
 537 * remains valid. This condition is satisfied when called through
538 * perf_counter_for_each_child or perf_counter_for_each because they
539 * hold the top-level counter's child_mutex, so any descendant that
540 * goes to exit will block in sync_child_counter.
541 * When called from perf_pending_counter it's OK because counter->ctx
542 * is the current context on this CPU and preemption is disabled,
543 * hence we can't get into perf_counter_task_sched_out for this context.
544 */
545static void perf_counter_disable(struct perf_counter *counter)
546{
547 struct perf_counter_context *ctx = counter->ctx;
548 struct task_struct *task = ctx->task;
549
550 if (!task) {
551 /*
552 * Disable the counter on the cpu that it's on
553 */
554 smp_call_function_single(counter->cpu, __perf_counter_disable,
555 counter, 1);
556 return;
557 }
558
559 retry:
560 task_oncpu_function_call(task, __perf_counter_disable, counter);
561
562 spin_lock_irq(&ctx->lock);
563 /*
564 * If the counter is still active, we need to retry the cross-call.
565 */
566 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
567 spin_unlock_irq(&ctx->lock);
568 goto retry;
569 }
570
571 /*
572 * Since we have the lock this context can't be scheduled
573 * in, so we can change the state safely.
574 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF;
578 }
579
580 spin_unlock_irq(&ctx->lock);
581}
582
583static int
584counter_sched_in(struct perf_counter *counter,
585 struct perf_cpu_context *cpuctx,
586 struct perf_counter_context *ctx,
587 int cpu)
588{
589 if (counter->state <= PERF_COUNTER_STATE_OFF)
590 return 0;
591
592 counter->state = PERF_COUNTER_STATE_ACTIVE;
593 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
594 /*
595 * The new state must be visible before we turn it on in the hardware:
596 */
597 smp_wmb();
598
599 if (counter->pmu->enable(counter)) {
600 counter->state = PERF_COUNTER_STATE_INACTIVE;
601 counter->oncpu = -1;
602 return -EAGAIN;
603 }
604
605 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
606
607 if (!is_software_counter(counter))
608 cpuctx->active_oncpu++;
609 ctx->nr_active++;
610
611 if (counter->attr.exclusive)
612 cpuctx->exclusive = 1;
613
614 return 0;
615}
616
617static int
618group_sched_in(struct perf_counter *group_counter,
619 struct perf_cpu_context *cpuctx,
620 struct perf_counter_context *ctx,
621 int cpu)
622{
623 struct perf_counter *counter, *partial_group;
624 int ret;
625
626 if (group_counter->state == PERF_COUNTER_STATE_OFF)
627 return 0;
628
629 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
630 if (ret)
631 return ret < 0 ? ret : 0;
632
633 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
634 return -EAGAIN;
635
636 /*
637 * Schedule in siblings as one group (if any):
638 */
639 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
640 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
641 partial_group = counter;
642 goto group_error;
643 }
644 }
645
646 return 0;
647
648group_error:
649 /*
650 * Groups can be scheduled in as one unit only, so undo any
651 * partial group before returning:
652 */
653 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
654 if (counter == partial_group)
655 break;
656 counter_sched_out(counter, cpuctx, ctx);
657 }
658 counter_sched_out(group_counter, cpuctx, ctx);
659
660 return -EAGAIN;
661}
662
663/*
664 * Return 1 for a group consisting entirely of software counters,
665 * 0 if the group contains any hardware counters.
666 */
667static int is_software_only_group(struct perf_counter *leader)
668{
669 struct perf_counter *counter;
670
671 if (!is_software_counter(leader))
672 return 0;
673
674 list_for_each_entry(counter, &leader->sibling_list, list_entry)
675 if (!is_software_counter(counter))
676 return 0;
677
678 return 1;
679}
680
681/*
682 * Work out whether we can put this counter group on the CPU now.
683 */
684static int group_can_go_on(struct perf_counter *counter,
685 struct perf_cpu_context *cpuctx,
686 int can_add_hw)
687{
688 /*
689 * Groups consisting entirely of software counters can always go on.
690 */
691 if (is_software_only_group(counter))
692 return 1;
693 /*
694 * If an exclusive group is already on, no other hardware
695 * counters can go on.
696 */
697 if (cpuctx->exclusive)
698 return 0;
699 /*
700 * If this group is exclusive and there are already
701 * counters on the CPU, it can't go on.
702 */
703 if (counter->attr.exclusive && cpuctx->active_oncpu)
704 return 0;
705 /*
706 * Otherwise, try to add it if all previous groups were able
707 * to go on.
708 */
709 return can_add_hw;
710}
711
712static void add_counter_to_ctx(struct perf_counter *counter,
713 struct perf_counter_context *ctx)
714{
715 list_add_counter(counter, ctx);
716 counter->tstamp_enabled = ctx->time;
717 counter->tstamp_running = ctx->time;
718 counter->tstamp_stopped = ctx->time;
719}
720
721/*
722 * Cross CPU call to install and enable a performance counter
723 *
724 * Must be called with ctx->mutex held
725 */
726static void __perf_install_in_context(void *info)
727{
728 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
729 struct perf_counter *counter = info;
730 struct perf_counter_context *ctx = counter->ctx;
731 struct perf_counter *leader = counter->group_leader;
732 int cpu = smp_processor_id();
733 int err;
734
735 /*
736 * If this is a task context, we need to check whether it is
737 * the current task context of this cpu. If not it has been
738 * scheduled out before the smp call arrived.
739 * Or possibly this is the right context but it isn't
740 * on this cpu because it had no counters.
741 */
742 if (ctx->task && cpuctx->task_ctx != ctx) {
743 if (cpuctx->task_ctx || ctx->task != current)
744 return;
745 cpuctx->task_ctx = ctx;
746 }
747
748 spin_lock(&ctx->lock);
749 ctx->is_active = 1;
750 update_context_time(ctx);
751
752 /*
753 * Protect the list operation against NMI by disabling the
754 * counters on a global level. NOP for non NMI based counters.
755 */
756 perf_disable();
757
758 add_counter_to_ctx(counter, ctx);
759
760 /*
761 * Don't put the counter on if it is disabled or if
762 * it is in a group and the group isn't on.
763 */
764 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
765 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
766 goto unlock;
767
768 /*
769 * An exclusive counter can't go on if there are already active
770 * hardware counters, and no hardware counter can go on if there
771 * is already an exclusive counter on.
772 */
773 if (!group_can_go_on(counter, cpuctx, 1))
774 err = -EEXIST;
775 else
776 err = counter_sched_in(counter, cpuctx, ctx, cpu);
777
778 if (err) {
779 /*
780 * This counter couldn't go on. If it is in a group
781 * then we have to pull the whole group off.
782 * If the counter group is pinned then put it in error state.
783 */
784 if (leader != counter)
785 group_sched_out(leader, cpuctx, ctx);
786 if (leader->attr.pinned) {
787 update_group_times(leader);
788 leader->state = PERF_COUNTER_STATE_ERROR;
789 }
790 }
791
792 if (!err && !ctx->task && cpuctx->max_pertask)
793 cpuctx->max_pertask--;
794
795 unlock:
796 perf_enable();
797
798 spin_unlock(&ctx->lock);
799}
800
801/*
802 * Attach a performance counter to a context
803 *
804 * First we add the counter to the list with the hardware enable bit
805 * in counter->hw_config cleared.
806 *
807 * If the counter is attached to a task which is on a CPU we use a smp
808 * call to enable it in the task context. The task might have been
809 * scheduled away, but we check this in the smp call again.
810 *
811 * Must be called with ctx->mutex held.
812 */
813static void
814perf_install_in_context(struct perf_counter_context *ctx,
815 struct perf_counter *counter,
816 int cpu)
817{
818 struct task_struct *task = ctx->task;
819
820 if (!task) {
821 /*
822 * Per cpu counters are installed via an smp call and
 823 * the install is always successful.
824 */
825 smp_call_function_single(cpu, __perf_install_in_context,
826 counter, 1);
827 return;
828 }
829
830retry:
831 task_oncpu_function_call(task, __perf_install_in_context,
832 counter);
833
834 spin_lock_irq(&ctx->lock);
835 /*
836 * we need to retry the smp call.
837 */
838 if (ctx->is_active && list_empty(&counter->list_entry)) {
839 spin_unlock_irq(&ctx->lock);
840 goto retry;
841 }
842
843 /*
 844 * The lock prevents this context from being scheduled in, so we
 845 * can add the counter safely if the call above did not
846 * succeed.
847 */
848 if (list_empty(&counter->list_entry))
849 add_counter_to_ctx(counter, ctx);
850 spin_unlock_irq(&ctx->lock);
851}
852
853/*
854 * Cross CPU call to enable a performance counter
855 */
856static void __perf_counter_enable(void *info)
857{
858 struct perf_counter *counter = info;
859 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
860 struct perf_counter_context *ctx = counter->ctx;
861 struct perf_counter *leader = counter->group_leader;
862 int err;
863
864 /*
865 * If this is a per-task counter, need to check whether this
866 * counter's task is the current task on this cpu.
867 */
868 if (ctx->task && cpuctx->task_ctx != ctx) {
869 if (cpuctx->task_ctx || ctx->task != current)
870 return;
871 cpuctx->task_ctx = ctx;
872 }
873
874 spin_lock(&ctx->lock);
875 ctx->is_active = 1;
876 update_context_time(ctx);
877
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE;
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882
883 /*
884 * If the counter is in a group and isn't the group leader,
885 * then don't put it on unless the group is on.
886 */
887 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
888 goto unlock;
889
890 if (!group_can_go_on(counter, cpuctx, 1)) {
891 err = -EEXIST;
892 } else {
893 perf_disable();
894 if (counter == leader)
895 err = group_sched_in(counter, cpuctx, ctx,
896 smp_processor_id());
897 else
898 err = counter_sched_in(counter, cpuctx, ctx,
899 smp_processor_id());
900 perf_enable();
901 }
902
903 if (err) {
904 /*
905 * If this counter can't go on and it's part of a
906 * group, then the whole group has to come off.
907 */
908 if (leader != counter)
909 group_sched_out(leader, cpuctx, ctx);
910 if (leader->attr.pinned) {
911 update_group_times(leader);
912 leader->state = PERF_COUNTER_STATE_ERROR;
913 }
914 }
915
916 unlock:
917 spin_unlock(&ctx->lock);
918}
919
920/*
921 * Enable a counter.
922 *
923 * If counter->ctx is a cloned context, callers must make sure that
924 * every task struct that counter->ctx->task could possibly point to
925 * remains valid. This condition is satisfied when called through
926 * perf_counter_for_each_child or perf_counter_for_each as described
927 * for perf_counter_disable.
928 */
929static void perf_counter_enable(struct perf_counter *counter)
930{
931 struct perf_counter_context *ctx = counter->ctx;
932 struct task_struct *task = ctx->task;
933
934 if (!task) {
935 /*
936 * Enable the counter on the cpu that it's on
937 */
938 smp_call_function_single(counter->cpu, __perf_counter_enable,
939 counter, 1);
940 return;
941 }
942
943 spin_lock_irq(&ctx->lock);
944 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
945 goto out;
946
947 /*
948 * If the counter is in error state, clear that first.
949 * That way, if we see the counter in error state below, we
950 * know that it has gone back into error state, as distinct
951 * from the task having been scheduled away before the
952 * cross-call arrived.
953 */
954 if (counter->state == PERF_COUNTER_STATE_ERROR)
955 counter->state = PERF_COUNTER_STATE_OFF;
956
957 retry:
958 spin_unlock_irq(&ctx->lock);
959 task_oncpu_function_call(task, __perf_counter_enable, counter);
960
961 spin_lock_irq(&ctx->lock);
962
963 /*
964 * If the context is active and the counter is still off,
965 * we need to retry the cross-call.
966 */
967 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
968 goto retry;
969
970 /*
971 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely.
973 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) {
975 counter->state = PERF_COUNTER_STATE_INACTIVE;
976 counter->tstamp_enabled =
977 ctx->time - counter->total_time_enabled;
978 }
979 out:
980 spin_unlock_irq(&ctx->lock);
981}
982
983static int perf_counter_refresh(struct perf_counter *counter, int refresh)
984{
985 /*
986 * not supported on inherited counters
987 */
988 if (counter->attr.inherit)
989 return -EINVAL;
990
991 atomic_add(refresh, &counter->event_limit);
992 perf_counter_enable(counter);
993
994 return 0;
995}
996
997void __perf_counter_sched_out(struct perf_counter_context *ctx,
998 struct perf_cpu_context *cpuctx)
999{
1000 struct perf_counter *counter;
1001
1002 spin_lock(&ctx->lock);
1003 ctx->is_active = 0;
1004 if (likely(!ctx->nr_counters))
1005 goto out;
1006 update_context_time(ctx);
1007
1008 perf_disable();
1009 if (ctx->nr_active) {
1010 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1011 if (counter != counter->group_leader)
1012 counter_sched_out(counter, cpuctx, ctx);
1013 else
1014 group_sched_out(counter, cpuctx, ctx);
1015 }
1016 }
1017 perf_enable();
1018 out:
1019 spin_unlock(&ctx->lock);
1020}
1021
1022/*
1023 * Test whether two contexts are equivalent, i.e. whether they
1024 * have both been cloned from the same version of the same context
1025 * and they both have the same number of enabled counters.
1026 * If the number of enabled counters is the same, then the set
1027 * of enabled counters should be the same, because these are both
1028 * inherited contexts, therefore we can't access individual counters
1029 * in them directly with an fd; we can only enable/disable all
1030 * counters via prctl, or enable/disable all counters in a family
1031 * via ioctl, which will have the same effect on both contexts.
1032 */
1033static int context_equiv(struct perf_counter_context *ctx1,
1034 struct perf_counter_context *ctx2)
1035{
1036 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1037 && ctx1->parent_gen == ctx2->parent_gen
1038 && !ctx1->pin_count && !ctx2->pin_count;
1039}
1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
1089#define list_next_entry(pos, member) \
1090 list_entry(pos->member.next, typeof(*pos), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1116/*
1117 * Called from scheduler to remove the counters of the current task,
1118 * with interrupts disabled.
1119 *
1120 * We stop each counter and update the counter value in counter->count.
1121 *
1122 * This does not protect us against NMI, but disable()
1123 * sets the disabled bit in the control field of counter _before_
 1124 * accessing the counter control register. If an NMI hits, then it will
1125 * not restart the counter.
1126 */
1127void perf_counter_task_sched_out(struct task_struct *task,
1128 struct task_struct *next, int cpu)
1129{
1130 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1131 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1132 struct perf_counter_context *next_ctx;
1133 struct perf_counter_context *parent;
1134 struct pt_regs *regs;
1135 int do_switch = 1;
1136
1137 regs = task_pt_regs(task);
1138 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1139
1140 if (likely(!ctx || !cpuctx->task_ctx))
1141 return;
1142
1143 update_context_time(ctx);
1144
1145 rcu_read_lock();
1146 parent = rcu_dereference(ctx->parent_ctx);
1147 next_ctx = next->perf_counter_ctxp;
1148 if (parent && next_ctx &&
1149 rcu_dereference(next_ctx->parent_ctx) == parent) {
1150 /*
1151 * Looks like the two contexts are clones, so we might be
1152 * able to optimize the context switch. We lock both
1153 * contexts and check that they are clones under the
1154 * lock (including re-checking that neither has been
1155 * uncloned in the meantime). It doesn't matter which
1156 * order we take the locks because no other cpu could
1157 * be trying to lock both of these tasks.
1158 */
1159 spin_lock(&ctx->lock);
1160 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1161 if (context_equiv(ctx, next_ctx)) {
1162 /*
1163 * XXX do we need a memory barrier of sorts
1164 * wrt to rcu_dereference() of perf_counter_ctxp
1165 */
1166 task->perf_counter_ctxp = next_ctx;
1167 next->perf_counter_ctxp = ctx;
1168 ctx->task = next;
1169 next_ctx->task = task;
1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1173 }
1174 spin_unlock(&next_ctx->lock);
1175 spin_unlock(&ctx->lock);
1176 }
1177 rcu_read_unlock();
1178
1179 if (do_switch) {
1180 __perf_counter_sched_out(ctx, cpuctx);
1181 cpuctx->task_ctx = NULL;
1182 }
1183}
1184
1185/*
1186 * Called with IRQs disabled
1187 */
1188static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1189{
1190 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1191
1192 if (!cpuctx->task_ctx)
1193 return;
1194
1195 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1196 return;
1197
1198 __perf_counter_sched_out(ctx, cpuctx);
1199 cpuctx->task_ctx = NULL;
1200}
1201
1202/*
1203 * Called with IRQs disabled
1204 */
1205static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1206{
1207 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1208}
1209
1210static void
1211__perf_counter_sched_in(struct perf_counter_context *ctx,
1212 struct perf_cpu_context *cpuctx, int cpu)
1213{
1214 struct perf_counter *counter;
1215 int can_add_hw = 1;
1216
1217 spin_lock(&ctx->lock);
1218 ctx->is_active = 1;
1219 if (likely(!ctx->nr_counters))
1220 goto out;
1221
1222 ctx->timestamp = perf_clock();
1223
1224 perf_disable();
1225
1226 /*
1227 * First go through the list and put on any pinned groups
1228 * in order to give them the best chance of going on.
1229 */
1230 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1231 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1232 !counter->attr.pinned)
1233 continue;
1234 if (counter->cpu != -1 && counter->cpu != cpu)
1235 continue;
1236
1237 if (counter != counter->group_leader)
1238 counter_sched_in(counter, cpuctx, ctx, cpu);
1239 else {
1240 if (group_can_go_on(counter, cpuctx, 1))
1241 group_sched_in(counter, cpuctx, ctx, cpu);
1242 }
1243
1244 /*
1245 * If this pinned group hasn't been scheduled,
1246 * put it in error state.
1247 */
1248 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1249 update_group_times(counter);
1250 counter->state = PERF_COUNTER_STATE_ERROR;
1251 }
1252 }
1253
1254 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1255 /*
1256 * Ignore counters in OFF or ERROR state, and
1257 * ignore pinned counters since we did them already.
1258 */
1259 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1260 counter->attr.pinned)
1261 continue;
1262
1263 /*
1264 * Listen to the 'cpu' scheduling filter constraint
1265 * of counters:
1266 */
1267 if (counter->cpu != -1 && counter->cpu != cpu)
1268 continue;
1269
1270 if (counter != counter->group_leader) {
1271 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1272 can_add_hw = 0;
1273 } else {
1274 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1275 if (group_sched_in(counter, cpuctx, ctx, cpu))
1276 can_add_hw = 0;
1277 }
1278 }
1279 }
1280 perf_enable();
1281 out:
1282 spin_unlock(&ctx->lock);
1283}
1284
1285/*
1286 * Called from scheduler to add the counters of the current task
1287 * with interrupts disabled.
1288 *
1289 * We restore the counter value and then enable it.
1290 *
1291 * This does not protect us against NMI, but enable()
1292 * sets the enabled bit in the control field of counter _before_
 1293 * accessing the counter control register. If an NMI hits, then it will
1294 * keep the counter running.
1295 */
1296void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1297{
1298 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1299 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1300
1301 if (likely(!ctx))
1302 return;
1303 if (cpuctx->task_ctx == ctx)
1304 return;
1305 __perf_counter_sched_in(ctx, cpuctx, cpu);
1306 cpuctx->task_ctx = ctx;
1307}
1308
1309static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1310{
1311 struct perf_counter_context *ctx = &cpuctx->ctx;
1312
1313 __perf_counter_sched_in(ctx, cpuctx, cpu);
1314}
1315
1316#define MAX_INTERRUPTS (~0ULL)
1317
1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1319
1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1321{
1322 struct hw_perf_counter *hwc = &counter->hw;
1323 u64 period, sample_period;
1324 s64 delta;
1325
1326 events *= hwc->sample_period;
1327 period = div64_u64(events, counter->attr.sample_freq);
1328
1329 delta = (s64)(period - hwc->sample_period);
1330 delta = (delta + 7) / 8; /* low pass filter */
1331
1332 sample_period = hwc->sample_period + delta;
1333
1334 if (!sample_period)
1335 sample_period = 1;
1336
1337 hwc->sample_period = sample_period;
1338}
1339
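A worked example of the adjustment above, using made-up numbers:

/*
 * Example: attr.sample_freq = 1000 Hz, current sample_period = 100000,
 * and the counter actually fired about 2000 times in the last second,
 * so perf_ctx_adjust_freq() calls perf_adjust_period(counter, 2000):
 *
 *   events        = 2000 * 100000              = 200000000
 *   period        = 200000000 / 1000           = 200000  (ideal period)
 *   delta         = (200000 - 100000 + 7) / 8  = 12500   (low-pass filter)
 *   sample_period = 100000 + 12500             = 112500
 *
 * so each tick moves the period roughly 1/8th of the way toward the
 * value that would produce the requested sampling frequency.
 */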
1340static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1341{
1342 struct perf_counter *counter;
1343 struct hw_perf_counter *hwc;
1344 u64 interrupts, freq;
1345
1346 spin_lock(&ctx->lock);
1347 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1348 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1349 continue;
1350
1351 hwc = &counter->hw;
1352
1353 interrupts = hwc->interrupts;
1354 hwc->interrupts = 0;
1355
1356 /*
1357 * unthrottle counters on the tick
1358 */
1359 if (interrupts == MAX_INTERRUPTS) {
1360 perf_log_throttle(counter, 1);
1361 counter->pmu->unthrottle(counter);
1362 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1363 }
1364
1365 if (!counter->attr.freq || !counter->attr.sample_freq)
1366 continue;
1367
1368 /*
1369 * if the specified freq < HZ then we need to skip ticks
1370 */
1371 if (counter->attr.sample_freq < HZ) {
1372 freq = counter->attr.sample_freq;
1373
1374 hwc->freq_count += freq;
1375 hwc->freq_interrupts += interrupts;
1376
1377 if (hwc->freq_count < HZ)
1378 continue;
1379
1380 interrupts = hwc->freq_interrupts;
1381 hwc->freq_interrupts = 0;
1382 hwc->freq_count -= HZ;
1383 } else
1384 freq = HZ;
1385
1386 perf_adjust_period(counter, freq * interrupts);
1387
1388 /*
1389 * In order to avoid being stalled by an (accidental) huge
1390 * sample period, force reset the sample period if we didn't
1391 * get any events in this freq period.
1392 */
1393 if (!interrupts) {
1394 perf_disable();
1395 counter->pmu->disable(counter);
1396 atomic64_set(&hwc->period_left, 0);
1397 counter->pmu->enable(counter);
1398 perf_enable();
1399 }
1400 }
1401 spin_unlock(&ctx->lock);
1402}
1403
1404/*
1405 * Round-robin a context's counters:
1406 */
1407static void rotate_ctx(struct perf_counter_context *ctx)
1408{
1409 struct perf_counter *counter;
1410
1411 if (!ctx->nr_counters)
1412 return;
1413
1414 spin_lock(&ctx->lock);
1415 /*
1416 * Rotate the first entry last (works just fine for group counters too):
1417 */
1418 perf_disable();
1419 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1420 list_move_tail(&counter->list_entry, &ctx->counter_list);
1421 break;
1422 }
1423 perf_enable();
1424
1425 spin_unlock(&ctx->lock);
1426}
1427
1428void perf_counter_task_tick(struct task_struct *curr, int cpu)
1429{
1430 struct perf_cpu_context *cpuctx;
1431 struct perf_counter_context *ctx;
1432
1433 if (!atomic_read(&nr_counters))
1434 return;
1435
1436 cpuctx = &per_cpu(perf_cpu_context, cpu);
1437 ctx = curr->perf_counter_ctxp;
1438
1439 perf_ctx_adjust_freq(&cpuctx->ctx);
1440 if (ctx)
1441 perf_ctx_adjust_freq(ctx);
1442
1443 perf_counter_cpu_sched_out(cpuctx);
1444 if (ctx)
1445 __perf_counter_task_sched_out(ctx);
1446
1447 rotate_ctx(&cpuctx->ctx);
1448 if (ctx)
1449 rotate_ctx(ctx);
1450
1451 perf_counter_cpu_sched_in(cpuctx, cpu);
1452 if (ctx)
1453 perf_counter_task_sched_in(curr, cpu);
1454}
1455
1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1502 * Cross CPU call to read the hardware counter
1503 */
1504static void __perf_counter_read(void *info)
1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1507 struct perf_counter *counter = info;
1508 struct perf_counter_context *ctx = counter->ctx;
1509 unsigned long flags;
1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1521 local_irq_save(flags);
1522 if (ctx->is_active)
1523 update_context_time(ctx);
1524 counter->pmu->read(counter);
1525 update_counter_times(counter);
1526 local_irq_restore(flags);
1527}
1528
1529static u64 perf_counter_read(struct perf_counter *counter)
1530{
1531 /*
1532 * If counter is enabled and currently active on a CPU, update the
1533 * value in the counter structure:
1534 */
1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1536 smp_call_function_single(counter->oncpu,
1537 __perf_counter_read, counter, 1);
1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1539 update_counter_times(counter);
1540 }
1541
1542 return atomic64_read(&counter->count);
1543}
1544
1545/*
1546 * Initialize the perf_counter context in a task_struct:
1547 */
1548static void
1549__perf_counter_init_context(struct perf_counter_context *ctx,
1550 struct task_struct *task)
1551{
1552 memset(ctx, 0, sizeof(*ctx));
1553 spin_lock_init(&ctx->lock);
1554 mutex_init(&ctx->mutex);
1555 INIT_LIST_HEAD(&ctx->counter_list);
1556 INIT_LIST_HEAD(&ctx->event_list);
1557 atomic_set(&ctx->refcount, 1);
1558 ctx->task = task;
1559}
1560
1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1562{
1563 struct perf_counter_context *ctx;
1564 struct perf_cpu_context *cpuctx;
1565 struct task_struct *task;
1566 unsigned long flags;
1567 int err;
1568
1569 /*
1570 * If cpu is not a wildcard then this is a percpu counter:
1571 */
1572 if (cpu != -1) {
1573 /* Must be root to operate on a CPU counter: */
1574 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1575 return ERR_PTR(-EACCES);
1576
1577 if (cpu < 0 || cpu > num_possible_cpus())
1578 return ERR_PTR(-EINVAL);
1579
1580 /*
1581 * We could be clever and allow to attach a counter to an
1582 * offline CPU and activate it when the CPU comes up, but
1583 * that's for later.
1584 */
1585 if (!cpu_isset(cpu, cpu_online_map))
1586 return ERR_PTR(-ENODEV);
1587
1588 cpuctx = &per_cpu(perf_cpu_context, cpu);
1589 ctx = &cpuctx->ctx;
1590 get_ctx(ctx);
1591
1592 return ctx;
1593 }
1594
1595 rcu_read_lock();
1596 if (!pid)
1597 task = current;
1598 else
1599 task = find_task_by_vpid(pid);
1600 if (task)
1601 get_task_struct(task);
1602 rcu_read_unlock();
1603
1604 if (!task)
1605 return ERR_PTR(-ESRCH);
1606
1607 /*
1608 * Can't attach counters to a dying task.
1609 */
1610 err = -ESRCH;
1611 if (task->flags & PF_EXITING)
1612 goto errout;
1613
1614 /* Reuse ptrace permission checks for now. */
1615 err = -EACCES;
1616 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1617 goto errout;
1618
1619 retry:
1620 ctx = perf_lock_task_context(task, &flags);
1621 if (ctx) {
1622 unclone_ctx(ctx);
1623 spin_unlock_irqrestore(&ctx->lock, flags);
1624 }
1625
1626 if (!ctx) {
1627 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1628 err = -ENOMEM;
1629 if (!ctx)
1630 goto errout;
1631 __perf_counter_init_context(ctx, task);
1632 get_ctx(ctx);
1633 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1634 /*
1635 * We raced with some other task; use
1636 * the context they set.
1637 */
1638 kfree(ctx);
1639 goto retry;
1640 }
1641 get_task_struct(task);
1642 }
1643
1644 put_task_struct(task);
1645 return ctx;
1646
1647 errout:
1648 put_task_struct(task);
1649 return ERR_PTR(err);
1650}
1651
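For context, a hedged user-space sketch of the open path that lands in find_get_context(); the syscall number, header and constant names below are assumptions about this tree's uapi (they mirror how counter->attr is used above) rather than something shown in this diff. pid = 0, cpu = -1 asks for a per-task counter on the current task; pid = -1, cpu = N asks for a CPU-wide counter, which is refused with EACCES for unprivileged users once the paranoia level disallows it.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

/* Hypothetical helper: open one hardware cycle counter. */
static int open_cycles_counter(pid_t pid, int cpu)
{
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_HARDWARE;
	attr.size   = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	return syscall(__NR_perf_counter_open, &attr, pid, cpu,
		       -1 /* group_fd */, 0 /* flags */);
}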
1652static void free_counter_rcu(struct rcu_head *head)
1653{
1654 struct perf_counter *counter;
1655
1656 counter = container_of(head, struct perf_counter, rcu_head);
1657 if (counter->ns)
1658 put_pid_ns(counter->ns);
1659 kfree(counter);
1660}
1661
1662static void perf_pending_sync(struct perf_counter *counter);
1663
1664static void free_counter(struct perf_counter *counter)
1665{
1666 perf_pending_sync(counter);
1667
1668 if (!counter->parent) {
1669 atomic_dec(&nr_counters);
1670 if (counter->attr.mmap)
1671 atomic_dec(&nr_mmap_counters);
1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1677
1678 if (counter->destroy)
1679 counter->destroy(counter);
1680
1681 put_ctx(counter->ctx);
1682 call_rcu(&counter->rcu_head, free_counter_rcu);
1683}
1684
1685/*
1686 * Called when the last reference to the file is gone.
1687 */
1688static int perf_release(struct inode *inode, struct file *file)
1689{
1690 struct perf_counter *counter = file->private_data;
1691 struct perf_counter_context *ctx = counter->ctx;
1692
1693 file->private_data = NULL;
1694
1695 WARN_ON_ONCE(ctx->parent_ctx);
1696 mutex_lock(&ctx->mutex);
1697 perf_counter_remove_from_context(counter);
1698 mutex_unlock(&ctx->mutex);
1699
1700 mutex_lock(&counter->owner->perf_counter_mutex);
1701 list_del_init(&counter->owner_entry);
1702 mutex_unlock(&counter->owner->perf_counter_mutex);
1703 put_task_struct(counter->owner);
1704
1705 free_counter(counter);
1706
1707 return 0;
1708}
1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
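A worked example of the size computation, with made-up settings:

/*
 * Example: a group leader with 2 siblings read with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID |
 *               PERF_FORMAT_GROUP:
 *
 *   entry = 8 (value) + 8 (id)                   = 16 bytes per counter
 *   size  = 8 (time_enabled) + 8 (nr, for GROUP) = 16 bytes of header
 *   nr    = 1 leader + 2 siblings                = 3 counters
 *   total = 16 + 16 * 3                          = 64 bytes
 *
 * which matches what perf_counter_read_group() below writes out.
 */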
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
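
/*
 * What userspace sees for a read() on a counter fd, as emitted by
 * perf_counter_read_one() above (illustrative sketch only -- the ABI is
 * just a sequence of u64s selected by attr.read_format):
 *
 *	u64 value;		-- sum over the counter and its inherited children
 *	u64 time_enabled;	-- only if PERF_FORMAT_TOTAL_TIME_ENABLED
 *	u64 time_running;	-- only if PERF_FORMAT_TOTAL_TIME_RUNNING
 *	u64 id;			-- only if PERF_FORMAT_ID
 *
 * With PERF_FORMAT_GROUP, perf_counter_read_group() instead emits the
 * group size, the optional time fields of the leader, and then one
 * { value [, id] } pair per group member, leader first.
 */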
1828
1829/*
1830 * Read the performance counter - simple non-blocking version for now
1831 */
1832static ssize_t
1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1834{
1835 u64 read_format = counter->attr.read_format;
1836 int ret;
1837
1838 /*
1839 * Return end-of-file for a read on a counter that is in
1840 * error state (i.e. because it was pinned but it couldn't be
1841 * scheduled on to the CPU at some point).
1842 */
1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1844 return 0;
1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1850 mutex_lock(&counter->child_mutex);
1851 if (read_format & PERF_FORMAT_GROUP)
1852 ret = perf_counter_read_group(counter, read_format, buf);
1853 else
1854 ret = perf_counter_read_one(counter, read_format, buf);
1855 mutex_unlock(&counter->child_mutex);
1856
1857 return ret;
1858}
1859
1860static ssize_t
1861perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1862{
1863 struct perf_counter *counter = file->private_data;
1864
1865 return perf_read_hw(counter, buf, count);
1866}
1867
1868static unsigned int perf_poll(struct file *file, poll_table *wait)
1869{
1870 struct perf_counter *counter = file->private_data;
1871 struct perf_mmap_data *data;
1872	unsigned int events = POLLHUP;
1873
1874 rcu_read_lock();
1875 data = rcu_dereference(counter->data);
1876 if (data)
1877 events = atomic_xchg(&data->poll, 0);
1878 rcu_read_unlock();
1879
1880 poll_wait(file, &counter->waitq, wait);
1881
1882 return events;
1883}
1884
1885static void perf_counter_reset(struct perf_counter *counter)
1886{
1887 (void)perf_counter_read(counter);
1888 atomic64_set(&counter->count, 0);
1889 perf_counter_update_userpage(counter);
1890}
1891
1892/*
1893 * Holding the top-level counter's child_mutex means that any
1894 * descendant process that has inherited this counter will block
1895 * in sync_child_counter if it goes to exit, thus satisfying the
1896 * task existence requirements of perf_counter_enable/disable.
1897 */
1898static void perf_counter_for_each_child(struct perf_counter *counter,
1899 void (*func)(struct perf_counter *))
1900{
1901 struct perf_counter *child;
1902
1903 WARN_ON_ONCE(counter->ctx->parent_ctx);
1904 mutex_lock(&counter->child_mutex);
1905 func(counter);
1906 list_for_each_entry(child, &counter->child_list, child_list)
1907 func(child);
1908 mutex_unlock(&counter->child_mutex);
1909}
1910
1911static void perf_counter_for_each(struct perf_counter *counter,
1912 void (*func)(struct perf_counter *))
1913{
1914 struct perf_counter_context *ctx = counter->ctx;
1915 struct perf_counter *sibling;
1916
1917 WARN_ON_ONCE(ctx->parent_ctx);
1918 mutex_lock(&ctx->mutex);
1919 counter = counter->group_leader;
1920
1921 perf_counter_for_each_child(counter, func);
1922 func(counter);
1923 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1924 perf_counter_for_each_child(counter, func);
1925 mutex_unlock(&ctx->mutex);
1926}
1927
1928static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1929{
1930 struct perf_counter_context *ctx = counter->ctx;
1931 unsigned long size;
1932 int ret = 0;
1933 u64 value;
1934
1935 if (!counter->attr.sample_period)
1936 return -EINVAL;
1937
1938 size = copy_from_user(&value, arg, sizeof(value));
1939	if (size)
1940 return -EFAULT;
1941
1942 if (!value)
1943 return -EINVAL;
1944
1945 spin_lock_irq(&ctx->lock);
1946 if (counter->attr.freq) {
1947 if (value > sysctl_perf_counter_sample_rate) {
1948 ret = -EINVAL;
1949 goto unlock;
1950 }
1951
1952 counter->attr.sample_freq = value;
1953 } else {
1954 counter->attr.sample_period = value;
1955 counter->hw.sample_period = value;
1956 }
1957unlock:
1958 spin_unlock_irq(&ctx->lock);
1959
1960 return ret;
1961}
1962
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{
1965 struct perf_counter *counter = file->private_data;
1966 void (*func)(struct perf_counter *);
1967 u32 flags = arg;
1968
1969 switch (cmd) {
1970 case PERF_COUNTER_IOC_ENABLE:
1971 func = perf_counter_enable;
1972 break;
1973 case PERF_COUNTER_IOC_DISABLE:
1974 func = perf_counter_disable;
1975 break;
1976 case PERF_COUNTER_IOC_RESET:
1977 func = perf_counter_reset;
1978 break;
1979
1980 case PERF_COUNTER_IOC_REFRESH:
1981 return perf_counter_refresh(counter, arg);
1982
1983 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg);
1985
1986 default:
1987 return -ENOTTY;
1988 }
1989
1990 if (flags & PERF_IOC_FLAG_GROUP)
1991 perf_counter_for_each(counter, func);
1992 else
1993 perf_counter_for_each_child(counter, func);
1994
1995 return 0;
1996}
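
/*
 * Illustrative userspace use of the ioctls handled above (sketch only;
 * 'fd' is assumed to be a perf counter file descriptor obtained elsewhere):
 *
 *	u64 period = 100000;
 *
 *	ioctl(fd, PERF_COUNTER_IOC_PERIOD, &period);		-- new sample period
 *	ioctl(fd, PERF_COUNTER_IOC_RESET, 0);			-- not the whole group
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);	-- whole group
 *
 * Without PERF_IOC_FLAG_GROUP the operation covers the counter and its
 * inherited children; with it, perf_counter_for_each() also walks the
 * group leader's siblings.
 */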
1997
1998int perf_counter_task_enable(void)
1999{
2000 struct perf_counter *counter;
2001
2002 mutex_lock(&current->perf_counter_mutex);
2003 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2004 perf_counter_for_each_child(counter, perf_counter_enable);
2005 mutex_unlock(&current->perf_counter_mutex);
2006
2007 return 0;
2008}
2009
2010int perf_counter_task_disable(void)
2011{
2012 struct perf_counter *counter;
2013
2014 mutex_lock(&current->perf_counter_mutex);
2015 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2016 perf_counter_for_each_child(counter, perf_counter_disable);
2017 mutex_unlock(&current->perf_counter_mutex);
2018
2019 return 0;
2020}
2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
2034/*
2035 * Callers need to ensure there can be no nesting of this function, otherwise
2036 * the seqlock logic goes bad. We can not serialize this because the arch
2037 * code calls this from NMI context.
2038 */
2039void perf_counter_update_userpage(struct perf_counter *counter)
2040{
2041 struct perf_counter_mmap_page *userpg;
2042 struct perf_mmap_data *data;
2043
2044 rcu_read_lock();
2045 data = rcu_dereference(counter->data);
2046 if (!data)
2047 goto unlock;
2048
2049 userpg = data->user_page;
2050
2051 /*
2052 * Disable preemption so as to not let the corresponding user-space
2053 * spin too long if we get preempted.
2054 */
2055 preempt_disable();
2056 ++userpg->lock;
2057 barrier();
2058 userpg->index = perf_counter_index(counter);
2059 userpg->offset = atomic64_read(&counter->count);
2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
2069 barrier();
2070 ++userpg->lock;
2071 preempt_enable();
2072unlock:
2073 rcu_read_unlock();
2074}
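
/*
 * The matching userspace side treats ->lock as a sequence count: the
 * update above bumps it to an odd value, writes the fields, and bumps it
 * back to even.  A reader therefore retries while the value is odd or
 * changes underneath it, roughly (sketch):
 *
 *	u32 seq;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		... read pc->index, pc->offset, pc->time_enabled, ... ;
 *		barrier();
 *	} while (pc->lock != seq || (seq & 1));
 */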
2075
2076static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2077{
2078 struct perf_counter *counter = vma->vm_file->private_data;
2079 struct perf_mmap_data *data;
2080 int ret = VM_FAULT_SIGBUS;
2081
2082 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2083 if (vmf->pgoff == 0)
2084 ret = 0;
2085 return ret;
2086 }
2087
2088 rcu_read_lock();
2089 data = rcu_dereference(counter->data);
2090 if (!data)
2091 goto unlock;
2092
2093 if (vmf->pgoff == 0) {
2094 vmf->page = virt_to_page(data->user_page);
2095 } else {
2096 int nr = vmf->pgoff - 1;
2097
2098 if ((unsigned)nr > data->nr_pages)
2099 goto unlock;
2100
2101 if (vmf->flags & FAULT_FLAG_WRITE)
2102 goto unlock;
2103
2104 vmf->page = virt_to_page(data->data_pages[nr]);
2105 }
2106
2107 get_page(vmf->page);
2108 vmf->page->mapping = vma->vm_file->f_mapping;
2109 vmf->page->index = vmf->pgoff;
2110
2111 ret = 0;
2112unlock:
2113 rcu_read_unlock();
2114
2115 return ret;
2116}
2117
2118static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2119{
2120 struct perf_mmap_data *data;
2121 unsigned long size;
2122 int i;
2123
2124 WARN_ON(atomic_read(&counter->mmap_count));
2125
2126 size = sizeof(struct perf_mmap_data);
2127 size += nr_pages * sizeof(void *);
2128
2129 data = kzalloc(size, GFP_KERNEL);
2130 if (!data)
2131 goto fail;
2132
2133 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2134 if (!data->user_page)
2135 goto fail_user_page;
2136
2137 for (i = 0; i < nr_pages; i++) {
2138 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2139 if (!data->data_pages[i])
2140 goto fail_data_pages;
2141 }
2142
2143 data->nr_pages = nr_pages;
2144 atomic_set(&data->lock, -1);
2145
2146 rcu_assign_pointer(counter->data, data);
2147
2148 return 0;
2149
2150fail_data_pages:
2151 for (i--; i >= 0; i--)
2152 free_page((unsigned long)data->data_pages[i]);
2153
2154 free_page((unsigned long)data->user_page);
2155
2156fail_user_page:
2157 kfree(data);
2158
2159fail:
2160 return -ENOMEM;
2161}
2162
2163static void perf_mmap_free_page(unsigned long addr)
2164{
2165 struct page *page = virt_to_page((void *)addr);
2166
2167 page->mapping = NULL;
2168 __free_page(page);
2169}
2170
2171static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2172{
2173 struct perf_mmap_data *data;
2174 int i;
2175
2176 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2177
2178 perf_mmap_free_page((unsigned long)data->user_page);
2179 for (i = 0; i < data->nr_pages; i++)
2180 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2181
2182 kfree(data);
2183}
2184
2185static void perf_mmap_data_free(struct perf_counter *counter)
2186{
2187 struct perf_mmap_data *data = counter->data;
2188
2189 WARN_ON(atomic_read(&counter->mmap_count));
2190
2191 rcu_assign_pointer(counter->data, NULL);
2192 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2193}
2194
2195static void perf_mmap_open(struct vm_area_struct *vma)
2196{
2197 struct perf_counter *counter = vma->vm_file->private_data;
2198
2199 atomic_inc(&counter->mmap_count);
2200}
2201
2202static void perf_mmap_close(struct vm_area_struct *vma)
2203{
2204 struct perf_counter *counter = vma->vm_file->private_data;
2205
2206 WARN_ON_ONCE(counter->ctx->parent_ctx);
2207 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2208 struct user_struct *user = current_user();
2209
2210 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2211 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2212 perf_mmap_data_free(counter);
2213 mutex_unlock(&counter->mmap_mutex);
2214 }
2215}
2216
2217static struct vm_operations_struct perf_mmap_vmops = {
2218 .open = perf_mmap_open,
2219 .close = perf_mmap_close,
2220 .fault = perf_mmap_fault,
2221 .page_mkwrite = perf_mmap_fault,
2222};
2223
2224static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2225{
2226 struct perf_counter *counter = file->private_data;
2227 unsigned long user_locked, user_lock_limit;
2228 struct user_struct *user = current_user();
2229 unsigned long locked, lock_limit;
2230 unsigned long vma_size;
2231 unsigned long nr_pages;
2232 long user_extra, extra;
2233 int ret = 0;
2234
2235 if (!(vma->vm_flags & VM_SHARED))
2236 return -EINVAL;
2237
2238 vma_size = vma->vm_end - vma->vm_start;
2239 nr_pages = (vma_size / PAGE_SIZE) - 1;
2240
2241 /*
2242	 * If we have data pages, ensure their number is a power of two so
2243	 * we can use bitmasks instead of modulo.
2244 */
2245 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2246 return -EINVAL;
2247
2248 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2249 return -EINVAL;
2250
2251 if (vma->vm_pgoff != 0)
2252 return -EINVAL;
2253
2254 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex);
2256 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL;
2259 goto unlock;
2260 }
2261
2262 user_extra = nr_pages + 1;
2263 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2264
2265 /*
2266 * Increase the limit linearly with more CPUs:
2267 */
2268 user_lock_limit *= num_online_cpus();
2269
2270 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2271
2272 extra = 0;
2273 if (user_locked > user_lock_limit)
2274 extra = user_locked - user_lock_limit;
2275
2276 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2277 lock_limit >>= PAGE_SHIFT;
2278 locked = vma->vm_mm->locked_vm + extra;
2279
2280 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2281 ret = -EPERM;
2282 goto unlock;
2283 }
2284
2285 WARN_ON(counter->data);
2286 ret = perf_mmap_data_alloc(counter, nr_pages);
2287 if (ret)
2288 goto unlock;
2289
2290 atomic_set(&counter->mmap_count, 1);
2291 atomic_long_add(user_extra, &user->locked_vm);
2292 vma->vm_mm->locked_vm += extra;
2293 counter->data->nr_locked = extra;
2294 if (vma->vm_flags & VM_WRITE)
2295 counter->data->writable = 1;
2296
2297unlock:
2298 mutex_unlock(&counter->mmap_mutex);
2299
2300 vma->vm_flags |= VM_RESERVED;
2301 vma->vm_ops = &perf_mmap_vmops;
2302
2303 return ret;
2304}
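
/*
 * Layout enforced above: the mapping must be MAP_SHARED, start at file
 * offset 0 and span 1 + 2^n pages.  Page 0 is the perf_counter_mmap_page
 * control page kept up to date by perf_counter_update_userpage(); the
 * remaining 2^n pages are the sample ring buffer, so head/offset
 * arithmetic can use masking instead of modulo.  Minimal userspace sketch
 * (error handling omitted; page_size from sysconf(_SC_PAGESIZE)):
 *
 *	size_t len = (1 + 8) * page_size;	-- control page + 8 data pages
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Mapping with PROT_WRITE marks the buffer writable, which lets userspace
 * report consumption via data_tail (see perf_output_space()).
 */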
2305
2306static int perf_fasync(int fd, struct file *filp, int on)
2307{
2308 struct inode *inode = filp->f_path.dentry->d_inode;
2309 struct perf_counter *counter = filp->private_data;
2310 int retval;
2311
2312 mutex_lock(&inode->i_mutex);
2313 retval = fasync_helper(fd, filp, on, &counter->fasync);
2314 mutex_unlock(&inode->i_mutex);
2315
2316 if (retval < 0)
2317 return retval;
2318
2319 return 0;
2320}
2321
2322static const struct file_operations perf_fops = {
2323 .release = perf_release,
2324 .read = perf_read,
2325 .poll = perf_poll,
2326 .unlocked_ioctl = perf_ioctl,
2327 .compat_ioctl = perf_ioctl,
2328 .mmap = perf_mmap,
2329 .fasync = perf_fasync,
2330};
2331
2332/*
2333 * Perf counter wakeup
2334 *
2335 * If there's data, ensure we set the poll() state and publish everything
2336 * to user-space before waking everybody up.
2337 */
2338
2339void perf_counter_wakeup(struct perf_counter *counter)
2340{
2341 wake_up_all(&counter->waitq);
2342
2343 if (counter->pending_kill) {
2344 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2345 counter->pending_kill = 0;
2346 }
2347}
2348
2349/*
2350 * Pending wakeups
2351 *
2352 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2353 *
2354 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2355 * single linked list and use cmpxchg() to add entries lockless.
2356 */
2357
2358static void perf_pending_counter(struct perf_pending_entry *entry)
2359{
2360 struct perf_counter *counter = container_of(entry,
2361 struct perf_counter, pending);
2362
2363 if (counter->pending_disable) {
2364 counter->pending_disable = 0;
2365 __perf_counter_disable(counter);
2366 }
2367
2368 if (counter->pending_wakeup) {
2369 counter->pending_wakeup = 0;
2370 perf_counter_wakeup(counter);
2371 }
2372}
2373
2374#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2375
2376static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2377 PENDING_TAIL,
2378};
2379
2380static void perf_pending_queue(struct perf_pending_entry *entry,
2381 void (*func)(struct perf_pending_entry *))
2382{
2383 struct perf_pending_entry **head;
2384
2385 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2386 return;
2387
2388 entry->func = func;
2389
2390 head = &get_cpu_var(perf_pending_head);
2391
2392 do {
2393 entry->next = *head;
2394 } while (cmpxchg(head, entry->next, entry) != entry->next);
2395
2396 set_perf_counter_pending();
2397
2398 put_cpu_var(perf_pending_head);
2399}
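
/*
 * The initial cmpxchg() on entry->next does double duty: the entry is only
 * queued if ->next is still NULL, so an entry that is already in flight is
 * not queued twice, and a successful cmpxchg() marks it busy by pointing
 * ->next at PENDING_TAIL (or, after the push loop, at its successor).
 * __perf_pending_run() sets ->next back to NULL once the callback has run,
 * which is the condition perf_not_pending() waits for.
 */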
2400
2401static int __perf_pending_run(void)
2402{
2403 struct perf_pending_entry *list;
2404 int nr = 0;
2405
2406 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2407 while (list != PENDING_TAIL) {
2408 void (*func)(struct perf_pending_entry *);
2409 struct perf_pending_entry *entry = list;
2410
2411 list = list->next;
2412
2413 func = entry->func;
2414 entry->next = NULL;
2415 /*
2416 * Ensure we observe the unqueue before we issue the wakeup,
2417 * so that we won't be waiting forever.
2418 * -- see perf_not_pending().
2419 */
2420 smp_wmb();
2421
2422 func(entry);
2423 nr++;
2424 }
2425
2426 return nr;
2427}
2428
2429static inline int perf_not_pending(struct perf_counter *counter)
2430{
2431 /*
2432 * If we flush on whatever cpu we run, there is a chance we don't
2433 * need to wait.
2434 */
2435 get_cpu();
2436 __perf_pending_run();
2437 put_cpu();
2438
2439 /*
2440 * Ensure we see the proper queue state before going to sleep
2441	 * so that we do not miss the wakeup. -- see __perf_pending_run()
2442 */
2443 smp_rmb();
2444 return counter->pending.next == NULL;
2445}
2446
2447static void perf_pending_sync(struct perf_counter *counter)
2448{
2449 wait_event(counter->waitq, perf_not_pending(counter));
2450}
2451
2452void perf_counter_do_pending(void)
2453{
2454 __perf_pending_run();
2455}
2456
2457/*
2458 * Callchain support -- arch specific
2459 */
2460
2461__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2462{
2463 return NULL;
2464}
2465
2466/*
2467 * Output
2468 */
2469
2470struct perf_output_handle {
2471 struct perf_counter *counter;
2472 struct perf_mmap_data *data;
2473 unsigned long head;
2474 unsigned long offset;
2475 int nmi;
2476 int sample;
2477 int locked;
2478 unsigned long flags;
2479};
2480
2481static bool perf_output_space(struct perf_mmap_data *data,
2482 unsigned int offset, unsigned int head)
2483{
2484 unsigned long tail;
2485 unsigned long mask;
2486
2487 if (!data->writable)
2488 return true;
2489
2490 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2491 /*
2492	 * Userspace could choose to issue an mb() before updating the tail
2493	 * pointer, so that all reads will be completed before the write is
2494	 * issued.
2495 */
2496 tail = ACCESS_ONCE(data->user_page->data_tail);
2497 smp_rmb();
2498
2499 offset = (offset - tail) & mask;
2500 head = (head - tail) & mask;
2501
2502 if ((int)(head - offset) < 0)
2503 return false;
2504
2505 return true;
2506}
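
/*
 * Note the asymmetry: a read-only mapping (!data->writable) has no way to
 * publish a data_tail, so it is treated as always having room and old,
 * unread records are simply overwritten.  Only a PROT_WRITE mapping gets
 * the "no room, account a lost record" behaviour that perf_output_begin()
 * implements below.
 */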
2507
2508static void perf_output_wakeup(struct perf_output_handle *handle)
2509{
2510	atomic_set(&handle->data->poll, POLLIN);
2511
2512 if (handle->nmi) {
2513 handle->counter->pending_wakeup = 1;
2514 perf_pending_queue(&handle->counter->pending,
2515 perf_pending_counter);
2516 } else
2517 perf_counter_wakeup(handle->counter);
2518}
2519
2520/*
2521 * Curious locking construct.
2522 *
2523 * We need to ensure a later event doesn't publish a head when a former
2524 * event isn't done writing. However since we need to deal with NMIs we
2525 * cannot fully serialize things.
2526 *
2527 * What we do is serialize between CPUs so we only have to deal with NMI
2528 * nesting on a single CPU.
2529 *
2530 * We only publish the head (and generate a wakeup) when the outer-most
2531 * event completes.
2532 */
2533static void perf_output_lock(struct perf_output_handle *handle)
2534{
2535 struct perf_mmap_data *data = handle->data;
2536 int cpu;
2537
2538 handle->locked = 0;
2539
2540 local_irq_save(handle->flags);
2541 cpu = smp_processor_id();
2542
2543 if (in_nmi() && atomic_read(&data->lock) == cpu)
2544 return;
2545
2546 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2547 cpu_relax();
2548
2549 handle->locked = 1;
2550}
2551
2552static void perf_output_unlock(struct perf_output_handle *handle)
2553{
2554 struct perf_mmap_data *data = handle->data;
2555 unsigned long head;
2556 int cpu;
2557
2558 data->done_head = data->head;
2559
2560 if (!handle->locked)
2561 goto out;
2562
2563again:
2564 /*
2565 * The xchg implies a full barrier that ensures all writes are done
2566 * before we publish the new head, matched by a rmb() in userspace when
2567 * reading this position.
2568 */
2569 while ((head = atomic_long_xchg(&data->done_head, 0)))
2570 data->user_page->data_head = head;
2571
2572 /*
2573 * NMI can happen here, which means we can miss a done_head update.
2574 */
2575
2576 cpu = atomic_xchg(&data->lock, -1);
2577 WARN_ON_ONCE(cpu != smp_processor_id());
2578
2579 /*
2580	 * Therefore we have to validate that we did not in fact miss one.
2581 */
2582 if (unlikely(atomic_long_read(&data->done_head))) {
2583 /*
2584 * Since we had it locked, we can lock it again.
2585 */
2586 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2587 cpu_relax();
2588
2589 goto again;
2590 }
2591
2592 if (atomic_xchg(&data->wakeup, 0))
2593 perf_output_wakeup(handle);
2594out:
2595 local_irq_restore(handle->flags);
2596}
2597
2598static void perf_output_copy(struct perf_output_handle *handle,
2599 const void *buf, unsigned int len)
2600{
2601 unsigned int pages_mask;
2602 unsigned int offset;
2603 unsigned int size;
2604 void **pages;
2605
2606 offset = handle->offset;
2607 pages_mask = handle->data->nr_pages - 1;
2608 pages = handle->data->data_pages;
2609
2610 do {
2611 unsigned int page_offset;
2612 int nr;
2613
2614 nr = (offset >> PAGE_SHIFT) & pages_mask;
2615 page_offset = offset & (PAGE_SIZE - 1);
2616 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2617
2618 memcpy(pages[nr] + page_offset, buf, size);
2619
2620 len -= size;
2621 buf += size;
2622 offset += size;
2623 } while (len);
2624
2625 handle->offset = offset;
2626
2627 /*
2628 * Check we didn't copy past our reservation window, taking the
2629 * possible unsigned int wrap into account.
2630 */
2631 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2632}
2633
2634#define perf_output_put(handle, x) \
2635 perf_output_copy((handle), &(x), sizeof(x))
2636
2637static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample)
2640{
2641 struct perf_mmap_data *data;
2642 unsigned int offset, head;
2643 int have_lost;
2644 struct {
2645 struct perf_event_header header;
2646 u64 id;
2647 u64 lost;
2648 } lost_event;
2649
2650 /*
2651 * For inherited counters we send all the output towards the parent.
2652 */
2653 if (counter->parent)
2654 counter = counter->parent;
2655
2656 rcu_read_lock();
2657 data = rcu_dereference(counter->data);
2658 if (!data)
2659 goto out;
2660
2661 handle->data = data;
2662 handle->counter = counter;
2663 handle->nmi = nmi;
2664 handle->sample = sample;
2665
2666 if (!data->nr_pages)
2667 goto fail;
2668
2669 have_lost = atomic_read(&data->lost);
2670 if (have_lost)
2671 size += sizeof(lost_event);
2672
2673 perf_output_lock(handle);
2674
2675 do {
2676 offset = head = atomic_long_read(&data->head);
2677 head += size;
2678 if (unlikely(!perf_output_space(data, offset, head)))
2679 goto fail;
2680 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2681
2682 handle->offset = offset;
2683 handle->head = head;
2684
2685 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2686 atomic_set(&data->wakeup, 1);
2687
2688 if (have_lost) {
2689 lost_event.header.type = PERF_EVENT_LOST;
2690 lost_event.header.misc = 0;
2691 lost_event.header.size = sizeof(lost_event);
2692 lost_event.id = counter->id;
2693 lost_event.lost = atomic_xchg(&data->lost, 0);
2694
2695 perf_output_put(handle, lost_event);
2696 }
2697
2698 return 0;
2699
2700fail:
2701 atomic_inc(&data->lost);
2702 perf_output_unlock(handle);
2703out:
2704 rcu_read_unlock();
2705
2706 return -ENOSPC;
2707}
2708
2709static void perf_output_end(struct perf_output_handle *handle)
2710{
2711 struct perf_counter *counter = handle->counter;
2712 struct perf_mmap_data *data = handle->data;
2713
2714 int wakeup_events = counter->attr.wakeup_events;
2715
2716 if (handle->sample && wakeup_events) {
2717 int events = atomic_inc_return(&data->events);
2718 if (events >= wakeup_events) {
2719 atomic_sub(wakeup_events, &data->events);
2720 atomic_set(&data->wakeup, 1);
2721 }
2722 }
2723
2724 perf_output_unlock(handle);
2725 rcu_read_unlock();
2726}
2727
2728static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2729{
2730 /*
2731 * only top level counters have the pid namespace they were created in
2732 */
2733 if (counter->parent)
2734 counter = counter->parent;
2735
2736 return task_tgid_nr_ns(p, counter->ns);
2737}
2738
2739static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2740{
2741 /*
2742 * only top level counters have the pid namespace they were created in
2743 */
2744 if (counter->parent)
2745 counter = counter->parent;
2746
2747 return task_pid_nr_ns(p, counter->ns);
2748}
2749
2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
2823void perf_counter_output(struct perf_counter *counter, int nmi,
2824 struct perf_sample_data *data)
2825{
2826 int ret;
2827 u64 sample_type = counter->attr.sample_type;
2828 struct perf_output_handle handle;
2829 struct perf_event_header header;
2830 u64 ip;
2831 struct {
2832 u32 pid, tid;
2833 } tid_entry;
2834 struct perf_callchain_entry *callchain = NULL;
2835 int callchain_size = 0;
2836 u64 time;
2837 struct {
2838 u32 cpu, reserved;
2839 } cpu_entry;
2840
2841 header.type = PERF_EVENT_SAMPLE;
2842 header.size = sizeof(header);
2843
2844 header.misc = 0;
2845 header.misc |= perf_misc_flags(data->regs);
2846
2847 if (sample_type & PERF_SAMPLE_IP) {
2848 ip = perf_instruction_pointer(data->regs);
2849 header.size += sizeof(ip);
2850 }
2851
2852 if (sample_type & PERF_SAMPLE_TID) {
2853 /* namespace issues */
2854 tid_entry.pid = perf_counter_pid(counter, current);
2855 tid_entry.tid = perf_counter_tid(counter, current);
2856
2857 header.size += sizeof(tid_entry);
2858 }
2859
2860 if (sample_type & PERF_SAMPLE_TIME) {
2861 /*
2862 * Maybe do better on x86 and provide cpu_clock_nmi()
2863 */
2864 time = sched_clock();
2865
2866 header.size += sizeof(u64);
2867 }
2868
2869 if (sample_type & PERF_SAMPLE_ADDR)
2870 header.size += sizeof(u64);
2871
2872 if (sample_type & PERF_SAMPLE_ID)
2873 header.size += sizeof(u64);
2874
2875 if (sample_type & PERF_SAMPLE_STREAM_ID)
2876 header.size += sizeof(u64);
2877
2878 if (sample_type & PERF_SAMPLE_CPU) {
2879 header.size += sizeof(cpu_entry);
2880
2881 cpu_entry.cpu = raw_smp_processor_id();
2882 cpu_entry.reserved = 0;
2883 }
2884
2885 if (sample_type & PERF_SAMPLE_PERIOD)
2886 header.size += sizeof(u64);
2887
2888 if (sample_type & PERF_SAMPLE_READ)
2889 header.size += perf_counter_read_size(counter);
2890
2891 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2892 callchain = perf_callchain(data->regs);
2893
2894 if (callchain) {
2895 callchain_size = (1 + callchain->nr) * sizeof(u64);
2896 header.size += callchain_size;
2897 } else
2898 header.size += sizeof(u64);
2899 }
2900
2901 if (sample_type & PERF_SAMPLE_RAW) {
2902 int size = sizeof(u32);
2903
2904 if (data->raw)
2905 size += data->raw->size;
2906 else
2907 size += sizeof(u32);
2908
2909 WARN_ON_ONCE(size & (sizeof(u64)-1));
2910 header.size += size;
2911 }
2912
2913 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2914 if (ret)
2915 return;
2916
2917 perf_output_put(&handle, header);
2918
2919 if (sample_type & PERF_SAMPLE_IP)
2920 perf_output_put(&handle, ip);
2921
2922 if (sample_type & PERF_SAMPLE_TID)
2923 perf_output_put(&handle, tid_entry);
2924
2925 if (sample_type & PERF_SAMPLE_TIME)
2926 perf_output_put(&handle, time);
2927
2928 if (sample_type & PERF_SAMPLE_ADDR)
2929 perf_output_put(&handle, data->addr);
2930
2931 if (sample_type & PERF_SAMPLE_ID) {
2932 u64 id = primary_counter_id(counter);
2933
2934 perf_output_put(&handle, id);
2935 }
2936
2937 if (sample_type & PERF_SAMPLE_STREAM_ID)
2938 perf_output_put(&handle, counter->id);
2939
2940 if (sample_type & PERF_SAMPLE_CPU)
2941 perf_output_put(&handle, cpu_entry);
2942
2943 if (sample_type & PERF_SAMPLE_PERIOD)
2944 perf_output_put(&handle, data->period);
2945
2946 if (sample_type & PERF_SAMPLE_READ)
2947 perf_output_read(&handle, counter);
2948
2949 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2950 if (callchain)
2951 perf_output_copy(&handle, callchain, callchain_size);
2952 else {
2953 u64 nr = 0;
2954 perf_output_put(&handle, nr);
2955 }
2956 }
2957
2958 if (sample_type & PERF_SAMPLE_RAW) {
2959 if (data->raw) {
2960 perf_output_put(&handle, data->raw->size);
2961 perf_output_copy(&handle, data->raw->data, data->raw->size);
2962 } else {
2963 struct {
2964 u32 size;
2965 u32 data;
2966 } raw = {
2967 .size = sizeof(u32),
2968 .data = 0,
2969 };
2970 perf_output_put(&handle, raw);
2971 }
2972 }
2973
2974 perf_output_end(&handle);
2975}
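
/*
 * The resulting PERF_EVENT_SAMPLE record thus consists of the header
 * followed by exactly the fields selected in attr.sample_type, in the
 * order emitted above: IP, TID, TIME, ADDR, ID, STREAM_ID, CPU, PERIOD,
 * the READ block, the callchain (its u64 length first, zero when no
 * callchain is available) and the raw data (its u32 size first, with the
 * total expected to stay u64 aligned).
 */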
2976
2977/*
2978 * read event
2979 */
2980
2981struct perf_read_event {
2982 struct perf_event_header header;
2983
2984 u32 pid;
2985 u32 tid;
2986};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
3010
3011 perf_output_end(&handle);
3012}
3013
3014/*
3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
3018 */
3019
3020struct perf_task_event {
3021 struct task_struct *task;
3022 struct perf_counter_context *task_ctx;
3023
3024 struct {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 ppid;
3029 u32 tid;
3030 u32 ptid;
3031 } event;
3032};
3033
3034static void perf_counter_task_output(struct perf_counter *counter,
3035 struct perf_task_event *task_event)
3036{
3037 struct perf_output_handle handle;
3038 int size = task_event->event.header.size;
3039 struct task_struct *task = task_event->task;
3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3041
3042 if (ret)
3043 return;
3044
3045 task_event->event.pid = perf_counter_pid(counter, task);
3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
3050
3051 perf_output_put(&handle, task_event->event);
3052 perf_output_end(&handle);
3053}
3054
3055static int perf_counter_task_match(struct perf_counter *counter)
3056{
3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3058 return 1;
3059
3060 return 0;
3061}
3062
3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3064 struct perf_task_event *task_event)
3065{
3066 struct perf_counter *counter;
3067
3068 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3069 return;
3070
3071 rcu_read_lock();
3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3073 if (perf_counter_task_match(counter))
3074 perf_counter_task_output(counter, task_event);
3075 }
3076 rcu_read_unlock();
3077}
3078
3079static void perf_counter_task_event(struct perf_task_event *task_event)
3080{
3081 struct perf_cpu_context *cpuctx;
3082 struct perf_counter_context *ctx = task_event->task_ctx;
3083
3084 cpuctx = &get_cpu_var(perf_cpu_context);
3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3086 put_cpu_var(perf_cpu_context);
3087
3088 rcu_read_lock();
3089 if (!ctx)
3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3091 if (ctx)
3092 perf_counter_task_ctx(ctx, task_event);
3093 rcu_read_unlock();
3094}
3095
3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
3099{
3100 struct perf_task_event task_event;
3101
3102 if (!atomic_read(&nr_comm_counters) &&
3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
3105 return;
3106
3107 task_event = (struct perf_task_event){
3108 .task = task,
3109 .task_ctx = task_ctx,
3110 .event = {
3111 .header = {
3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3113 .misc = 0,
3114 .size = sizeof(task_event.event),
3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
3120 },
3121 };
3122
3123 perf_counter_task_event(&task_event);
3124}
3125
3126void perf_counter_fork(struct task_struct *task)
3127{
3128 perf_counter_task(task, NULL, 1);
3129}
3130
3131/*
3132 * comm tracking
3133 */
3134
3135struct perf_comm_event {
3136 struct task_struct *task;
3137 char *comm;
3138 int comm_size;
3139
3140 struct {
3141 struct perf_event_header header;
3142
3143 u32 pid;
3144 u32 tid;
3145 } event;
3146};
3147
3148static void perf_counter_comm_output(struct perf_counter *counter,
3149 struct perf_comm_event *comm_event)
3150{
3151 struct perf_output_handle handle;
3152 int size = comm_event->event.header.size;
3153 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3154
3155 if (ret)
3156 return;
3157
3158 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3159 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3160
3161 perf_output_put(&handle, comm_event->event);
3162 perf_output_copy(&handle, comm_event->comm,
3163 comm_event->comm_size);
3164 perf_output_end(&handle);
3165}
3166
3167static int perf_counter_comm_match(struct perf_counter *counter)
3168{
3169 if (counter->attr.comm)
3170 return 1;
3171
3172 return 0;
3173}
3174
3175static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3176 struct perf_comm_event *comm_event)
3177{
3178 struct perf_counter *counter;
3179
3180 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3181 return;
3182
3183 rcu_read_lock();
3184 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3185 if (perf_counter_comm_match(counter))
3186 perf_counter_comm_output(counter, comm_event);
3187 }
3188 rcu_read_unlock();
3189}
3190
3191static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3192{
3193 struct perf_cpu_context *cpuctx;
3194 struct perf_counter_context *ctx;
3195 unsigned int size;
3196 char comm[TASK_COMM_LEN];
3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
3201
3202 comm_event->comm = comm;
3203 comm_event->comm_size = size;
3204
3205 comm_event->event.header.size = sizeof(comm_event->event) + size;
3206
3207 cpuctx = &get_cpu_var(perf_cpu_context);
3208 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3209 put_cpu_var(perf_cpu_context);
3210
3211 rcu_read_lock();
3212 /*
3213	 * It doesn't really matter which of the child contexts the
3214	 * event ends up in.
3215 */
3216 ctx = rcu_dereference(current->perf_counter_ctxp);
3217 if (ctx)
3218 perf_counter_comm_ctx(ctx, comm_event);
3219 rcu_read_unlock();
3220}
3221
3222void perf_counter_comm(struct task_struct *task)
3223{
3224 struct perf_comm_event comm_event;
3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
3229 if (!atomic_read(&nr_comm_counters))
3230 return;
3231
3232 comm_event = (struct perf_comm_event){
3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
3236 .event = {
3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
3244 },
3245 };
3246
3247 perf_counter_comm_event(&comm_event);
3248}
3249
3250/*
3251 * mmap tracking
3252 */
3253
3254struct perf_mmap_event {
3255 struct vm_area_struct *vma;
3256
3257 const char *file_name;
3258 int file_size;
3259
3260 struct {
3261 struct perf_event_header header;
3262
3263 u32 pid;
3264 u32 tid;
3265 u64 start;
3266 u64 len;
3267 u64 pgoff;
3268 } event;
3269};
3270
3271static void perf_counter_mmap_output(struct perf_counter *counter,
3272 struct perf_mmap_event *mmap_event)
3273{
3274 struct perf_output_handle handle;
3275 int size = mmap_event->event.header.size;
3276 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3277
3278 if (ret)
3279 return;
3280
3281 mmap_event->event.pid = perf_counter_pid(counter, current);
3282 mmap_event->event.tid = perf_counter_tid(counter, current);
3283
3284 perf_output_put(&handle, mmap_event->event);
3285 perf_output_copy(&handle, mmap_event->file_name,
3286 mmap_event->file_size);
3287 perf_output_end(&handle);
3288}
3289
3290static int perf_counter_mmap_match(struct perf_counter *counter,
3291 struct perf_mmap_event *mmap_event)
3292{
3293 if (counter->attr.mmap)
3294 return 1;
3295
3296 return 0;
3297}
3298
3299static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3300 struct perf_mmap_event *mmap_event)
3301{
3302 struct perf_counter *counter;
3303
3304 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3305 return;
3306
3307 rcu_read_lock();
3308 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3309 if (perf_counter_mmap_match(counter, mmap_event))
3310 perf_counter_mmap_output(counter, mmap_event);
3311 }
3312 rcu_read_unlock();
3313}
3314
3315static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_cpu_context *cpuctx;
3318 struct perf_counter_context *ctx;
3319 struct vm_area_struct *vma = mmap_event->vma;
3320 struct file *file = vma->vm_file;
3321 unsigned int size;
3322 char tmp[16];
3323 char *buf = NULL;
3324 const char *name;
3325
3326 memset(tmp, 0, sizeof(tmp));
3327
3328 if (file) {
3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3335 if (!buf) {
3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
3337 goto got_name;
3338 }
3339 name = d_path(&file->f_path, buf, PATH_MAX);
3340 if (IS_ERR(name)) {
3341 name = strncpy(tmp, "//toolong", sizeof(tmp));
3342 goto got_name;
3343 }
3344 } else {
3345 if (arch_vma_name(mmap_event->vma)) {
3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
3348 goto got_name;
3349 }
3350
3351 if (!vma->vm_mm) {
3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3353 goto got_name;
3354 }
3355
3356 name = strncpy(tmp, "//anon", sizeof(tmp));
3357 goto got_name;
3358 }
3359
3360got_name:
3361 size = ALIGN(strlen(name)+1, sizeof(u64));
3362
3363 mmap_event->file_name = name;
3364 mmap_event->file_size = size;
3365
3366 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3367
3368 cpuctx = &get_cpu_var(perf_cpu_context);
3369 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3370 put_cpu_var(perf_cpu_context);
3371
3372 rcu_read_lock();
3373 /*
3374	 * It doesn't really matter which of the child contexts the
3375	 * event ends up in.
3376 */
3377 ctx = rcu_dereference(current->perf_counter_ctxp);
3378 if (ctx)
3379 perf_counter_mmap_ctx(ctx, mmap_event);
3380 rcu_read_unlock();
3381
3382 kfree(buf);
3383}
3384
3385void __perf_counter_mmap(struct vm_area_struct *vma)
3386{
3387 struct perf_mmap_event mmap_event;
3388
3389 if (!atomic_read(&nr_mmap_counters))
3390 return;
3391
3392 mmap_event = (struct perf_mmap_event){
3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
3396 .event = {
3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
3404 .start = vma->vm_start,
3405 .len = vma->vm_end - vma->vm_start,
3406 .pgoff = vma->vm_pgoff,
3407 },
3408 };
3409
3410 perf_counter_mmap_event(&mmap_event);
3411}
3412
3413/*
3414 * IRQ throttle logging
3415 */
3416
3417static void perf_log_throttle(struct perf_counter *counter, int enable)
3418{
3419 struct perf_output_handle handle;
3420 int ret;
3421
3422 struct {
3423 struct perf_event_header header;
3424 u64 time;
3425 u64 id;
3426 u64 stream_id;
3427 } throttle_event = {
3428 .header = {
3429 .type = PERF_EVENT_THROTTLE,
3430 .misc = 0,
3431 .size = sizeof(throttle_event),
3432 },
3433 .time = sched_clock(),
3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
3436 };
3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3442 if (ret)
3443 return;
3444
3445 perf_output_put(&handle, throttle_event);
3446 perf_output_end(&handle);
3447}
3448
3449/*
3450 * Generic counter overflow handling, sampling.
3451 */
3452
3453int perf_counter_overflow(struct perf_counter *counter, int nmi,
3454 struct perf_sample_data *data)
3455{
3456 int events = atomic_read(&counter->event_limit);
3457 int throttle = counter->pmu->unthrottle != NULL;
3458 struct hw_perf_counter *hwc = &counter->hw;
3459 int ret = 0;
3460
3461 if (!throttle) {
3462 hwc->interrupts++;
3463 } else {
3464 if (hwc->interrupts != MAX_INTERRUPTS) {
3465 hwc->interrupts++;
3466 if (HZ * hwc->interrupts >
3467 (u64)sysctl_perf_counter_sample_rate) {
3468 hwc->interrupts = MAX_INTERRUPTS;
3469 perf_log_throttle(counter, 0);
3470 ret = 1;
3471 }
3472 } else {
3473 /*
3474	 * Keep re-disabling the counter even though we disabled it on the
3475	 * previous pass - just in case we raced with a
3476 * sched-in and the counter got enabled again:
3477 */
3478 ret = 1;
3479 }
3480 }
3481
3482 if (counter->attr.freq) {
3483 u64 now = sched_clock();
3484 s64 delta = now - hwc->freq_stamp;
3485
3486 hwc->freq_stamp = now;
3487
3488 if (delta > 0 && delta < TICK_NSEC)
3489 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3490 }
3491
3492 /*
3493 * XXX event_limit might not quite work as expected on inherited
3494 * counters
3495 */
3496
3497 counter->pending_kill = POLL_IN;
3498 if (events && atomic_dec_and_test(&counter->event_limit)) {
3499 ret = 1;
3500 counter->pending_kill = POLL_HUP;
3501 if (nmi) {
3502 counter->pending_disable = 1;
3503 perf_pending_queue(&counter->pending,
3504 perf_pending_counter);
3505 } else
3506 perf_counter_disable(counter);
3507 }
3508
3509 perf_counter_output(counter, nmi, data);
3510 return ret;
3511}
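
/*
 * The throttling test above, HZ * hwc->interrupts > sysctl_perf_counter_sample_rate,
 * is equivalent to "more than sysctl_perf_counter_sample_rate / HZ overflow
 * interrupts in the current interval".  Purely as an illustration: with
 * HZ == 1000 and a sample rate sysctl of 100000, the 101st overflow taken
 * before hwc->interrupts is reset again on the next timer tick marks the
 * counter MAX_INTERRUPTS, logs a throttle event and makes the caller stop
 * the counter until it is unthrottled.
 */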
3512
3513/*
3514 * Generic software counter infrastructure
3515 */
3516
3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
3521 * sign as trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3525{
3526 struct hw_perf_counter *hwc = &counter->hw;
3527 u64 period = hwc->last_period;
3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3532
3533again:
3534 old = val = atomic64_read(&hwc->period_left);
3535 if (val < 0)
3536 return 0;
3537
3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3543
3544 return nr;
3545}
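
/*
 * Worked example of the arithmetic above, with illustrative numbers: for a
 * period of 100 and period_left at +20 (the boundary was crossed and 20
 * events were counted past it), nr = (100 + 20) / 100 = 1 overflow is
 * reported and period_left becomes 20 - 1 * 100 = -80, back inside
 * [-sample_period, 0).  Had period_left reached +250, nr would be 3 and
 * period_left would end up at -50, so missed periods show up as multiple
 * overflows from perf_swcounter_overflow().
 */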
3546
3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3549{
3550 struct hw_perf_counter *hwc = &counter->hw;
3551 u64 overflow;
3552
3553 data->period = counter->hw.last_period;
3554 overflow = perf_swcounter_set_period(counter);
3555
3556 if (hwc->interrupts == MAX_INTERRUPTS)
3557 return;
3558
3559 for (; overflow; overflow--) {
3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3568}
3569
3570static void perf_swcounter_unthrottle(struct perf_counter *counter)
3571{
3572 /*
3573	 * Nothing to do; we already reset hwc->interrupts.
3574 */
3575}
3576
3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3578 int nmi, struct perf_sample_data *data)
3579{
3580 struct hw_perf_counter *hwc = &counter->hw;
3581
3582 atomic64_add(nr, &counter->count);
3583
3584 if (!hwc->sample_period)
3585 return;
3586
3587 if (!data->regs)
3588 return;
3589
3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3591 perf_swcounter_overflow(counter, nmi, data);
3592}
3593
3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3595{
3596 /*
3597 * The counter is active, we're good!
3598 */
3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3600 return 1;
3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3606 return 0;
3607
3608 /*
3609	 * The counter is inactive; if the context is active we're part
3610	 * of a group that didn't make it onto the 'pmu', so we're
3611	 * not counting.
3612 */
3613 if (counter->ctx->is_active)
3614 return 0;
3615
3616 /*
3617	 * We're inactive and the context is too: the task is scheduled
3618	 * out, so we're counting events that happen to us, like
3619	 * migration events.
3620 */
3621 return 1;
3622}
3623
3624static int perf_swcounter_match(struct perf_counter *counter,
3625 enum perf_type_id type,
3626 u32 event, struct pt_regs *regs)
3627{
3628 if (!perf_swcounter_is_counting(counter))
3629 return 0;
3630
3631 if (counter->attr.type != type)
3632 return 0;
3633 if (counter->attr.config != event)
3634 return 0;
3635
3636 if (regs) {
3637 if (counter->attr.exclude_user && user_mode(regs))
3638 return 0;
3639
3640 if (counter->attr.exclude_kernel && !user_mode(regs))
3641 return 0;
3642 }
3643
3644 return 1;
3645}
3646
3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3648 enum perf_type_id type,
3649 u32 event, u64 nr, int nmi,
3650 struct perf_sample_data *data)
3651{
3652 struct perf_counter *counter;
3653
3654 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3655 return;
3656
3657 rcu_read_lock();
3658 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3659 if (perf_swcounter_match(counter, type, event, data->regs))
3660 perf_swcounter_add(counter, nr, nmi, data);
3661 }
3662 rcu_read_unlock();
3663}
3664
3665static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3666{
3667 if (in_nmi())
3668 return &cpuctx->recursion[3];
3669
3670 if (in_irq())
3671 return &cpuctx->recursion[2];
3672
3673 if (in_softirq())
3674 return &cpuctx->recursion[1];
3675
3676 return &cpuctx->recursion[0];
3677}
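
/*
 * The four recursion slots map onto the four contexts a software counter
 * event can fire from -- task, softirq, hardirq and NMI.  An event raised
 * while one is already being processed at the same level is dropped by
 * do_perf_swcounter_event() below, while an NMI interrupting an
 * IRQ-context event still gets counted because it uses a different slot.
 */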
3678
3679static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3680 u64 nr, int nmi,
3681 struct perf_sample_data *data)
3682{
3683 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3684 int *recursion = perf_swcounter_recursion_context(cpuctx);
3685 struct perf_counter_context *ctx;
3686
3687 if (*recursion)
3688 goto out;
3689
3690 (*recursion)++;
3691 barrier();
3692
3693 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3694 nr, nmi, data);
3695 rcu_read_lock();
3696 /*
3697	 * It doesn't really matter which of the child contexts the
3698	 * event ends up in.
3699 */
3700 ctx = rcu_dereference(current->perf_counter_ctxp);
3701 if (ctx)
3702 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3703 rcu_read_unlock();
3704
3705 barrier();
3706 (*recursion)--;
3707
3708out:
3709 put_cpu_var(perf_cpu_context);
3710}
3711
3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3713 struct pt_regs *regs, u64 addr)
3714{
3715 struct perf_sample_data data = {
3716 .regs = regs,
3717 .addr = addr,
3718 };
3719
3720 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3721}
3722
3723static void perf_swcounter_read(struct perf_counter *counter)
3724{
3725}
3726
3727static int perf_swcounter_enable(struct perf_counter *counter)
3728{
3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3735 return 0;
3736}
3737
3738static void perf_swcounter_disable(struct perf_counter *counter)
3739{
3740}
3741
3742static const struct pmu perf_ops_generic = {
3743 .enable = perf_swcounter_enable,
3744 .disable = perf_swcounter_disable,
3745 .read = perf_swcounter_read,
3746 .unthrottle = perf_swcounter_unthrottle,
3747};
3748
3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3785 * Software counter: cpu wall time clock
3786 */
3787
3788static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3789{
3790 int cpu = raw_smp_processor_id();
3791 s64 prev;
3792 u64 now;
3793
3794 now = cpu_clock(cpu);
3795 prev = atomic64_read(&counter->hw.prev_count);
3796 atomic64_set(&counter->hw.prev_count, now);
3797 atomic64_add(now - prev, &counter->count);
3798}
3799
3800static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3801{
3802 struct hw_perf_counter *hwc = &counter->hw;
3803 int cpu = raw_smp_processor_id();
3804
3805 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3806 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3807 hwc->hrtimer.function = perf_swcounter_hrtimer;
3808 if (hwc->sample_period) {
3809 u64 period = max_t(u64, 10000, hwc->sample_period);
3810 __hrtimer_start_range_ns(&hwc->hrtimer,
3811 ns_to_ktime(period), 0,
3812 HRTIMER_MODE_REL, 0);
3813 }
3814
3815 return 0;
3816}
3817
3818static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3819{
3820 if (counter->hw.sample_period)
3821 hrtimer_cancel(&counter->hw.hrtimer);
3822 cpu_clock_perf_counter_update(counter);
3823}
3824
3825static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3826{
3827 cpu_clock_perf_counter_update(counter);
3828}
3829
3830static const struct pmu perf_ops_cpu_clock = {
3831 .enable = cpu_clock_perf_counter_enable,
3832 .disable = cpu_clock_perf_counter_disable,
3833 .read = cpu_clock_perf_counter_read,
3834};
3835
3836/*
3837 * Software counter: task time clock
3838 */
3839
3840static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3841{
3842 u64 prev;
3843 s64 delta;
3844
3845 prev = atomic64_xchg(&counter->hw.prev_count, now);
3846 delta = now - prev;
3847 atomic64_add(delta, &counter->count);
3848}
3849
3850static int task_clock_perf_counter_enable(struct perf_counter *counter)
3851{
3852 struct hw_perf_counter *hwc = &counter->hw;
3853 u64 now;
3854
3855 now = counter->ctx->time;
3856
3857 atomic64_set(&hwc->prev_count, now);
3858 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3859 hwc->hrtimer.function = perf_swcounter_hrtimer;
3860 if (hwc->sample_period) {
3861 u64 period = max_t(u64, 10000, hwc->sample_period);
3862 __hrtimer_start_range_ns(&hwc->hrtimer,
3863 ns_to_ktime(period), 0,
3864 HRTIMER_MODE_REL, 0);
3865 }
3866
3867 return 0;
3868}
3869
3870static void task_clock_perf_counter_disable(struct perf_counter *counter)
3871{
3872 if (counter->hw.sample_period)
3873 hrtimer_cancel(&counter->hw.hrtimer);
3874 task_clock_perf_counter_update(counter, counter->ctx->time);
3875
3876}
3877
3878static void task_clock_perf_counter_read(struct perf_counter *counter)
3879{
3880 u64 time;
3881
3882 if (!in_nmi()) {
3883 update_context_time(counter->ctx);
3884 time = counter->ctx->time;
3885 } else {
3886 u64 now = perf_clock();
3887 u64 delta = now - counter->ctx->timestamp;
3888 time = counter->ctx->time + delta;
3889 }
3890
3891 task_clock_perf_counter_update(counter, time);
3892}
3893
3894static const struct pmu perf_ops_task_clock = {
3895 .enable = task_clock_perf_counter_enable,
3896 .disable = task_clock_perf_counter_disable,
3897 .read = task_clock_perf_counter_read,
3898};
3899
3900#ifdef CONFIG_EVENT_PROFILE
3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3903{
3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3909 struct perf_sample_data data = {
3910 .regs = get_irq_regs(),
3911 .addr = addr,
3912 .raw = &raw,
3913 };
3914
3915 if (!data.regs)
3916 data.regs = task_pt_regs(current);
3917
3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3919}
3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3921
3922extern int ftrace_profile_enable(int);
3923extern void ftrace_profile_disable(int);
3924
3925static void tp_perf_counter_destroy(struct perf_counter *counter)
3926{
3927 ftrace_profile_disable(counter->attr.config);
3928}
3929
3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3931{
3932 /*
3933	 * Raw tracepoint data is a severe data leak; only allow root to
3934	 * have it.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3939
3940 if (ftrace_profile_enable(counter->attr.config))
3941 return NULL;
3942
3943 counter->destroy = tp_perf_counter_destroy;
3944
3945 return &perf_ops_generic;
3946}
3947#else
3948static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3949{
3950 return NULL;
3951}
3952#endif
3953
3954atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3966{
3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3969
3970 /*
3971 * Software counters (currently) can't in general distinguish
3972 * between user, kernel and hypervisor events.
3973 * However, context switches and cpu migrations are considered
3974 * to be kernel events, and page faults are never hypervisor
3975 * events.
3976 */
3977 switch (event) {
3978 case PERF_COUNT_SW_CPU_CLOCK:
3979 pmu = &perf_ops_cpu_clock;
3980
3981 break;
3982 case PERF_COUNT_SW_TASK_CLOCK:
3983 /*
3984 * If the user instantiates this as a per-cpu counter,
3985 * use the cpu_clock counter instead.
3986 */
3987 if (counter->ctx->task)
3988 pmu = &perf_ops_task_clock;
3989 else
3990 pmu = &perf_ops_cpu_clock;
3991
3992 break;
3993 case PERF_COUNT_SW_PAGE_FAULTS:
3994 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
4002 pmu = &perf_ops_generic;
4003 break;
4004 }
4005
4006 return pmu;
4007}
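
/*
 * Hypothetical sketch (not from the original source): the generic software
 * pmu selected above is driven by perf_swcounter_event() calls placed at the
 * event sites themselves.  An architecture's page-fault handler, for example,
 * might report a fault roughly like this; the function name and its placement
 * are assumptions made for illustration only.
 */
#include <linux/perf_counter.h>
#include <linux/ptrace.h>

static void example_report_page_fault(struct pt_regs *regs,
				      unsigned long address)
{
	/*
	 * One PERF_COUNT_SW_PAGE_FAULTS event, not in NMI context,
	 * with the faulting address passed as the event's addr field.
	 */
	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
}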
4008
4009/*
4010 * Allocate and initialize a counter structure
4011 */
4012static struct perf_counter *
4013perf_counter_alloc(struct perf_counter_attr *attr,
4014 int cpu,
4015 struct perf_counter_context *ctx,
4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
4018 gfp_t gfpflags)
4019{
4020 const struct pmu *pmu;
4021 struct perf_counter *counter;
4022 struct hw_perf_counter *hwc;
4023 long err;
4024
4025 counter = kzalloc(sizeof(*counter), gfpflags);
4026 if (!counter)
4027 return ERR_PTR(-ENOMEM);
4028
4029 /*
4030 * Single counters are their own group leaders, with an
4031 * empty sibling list:
4032 */
4033 if (!group_leader)
4034 group_leader = counter;
4035
4036 mutex_init(&counter->child_mutex);
4037 INIT_LIST_HEAD(&counter->child_list);
4038
4039 INIT_LIST_HEAD(&counter->list_entry);
4040 INIT_LIST_HEAD(&counter->event_entry);
4041 INIT_LIST_HEAD(&counter->sibling_list);
4042 init_waitqueue_head(&counter->waitq);
4043
4044 mutex_init(&counter->mmap_mutex);
4045
4046 counter->cpu = cpu;
4047 counter->attr = *attr;
4048 counter->group_leader = group_leader;
4049 counter->pmu = NULL;
4050 counter->ctx = ctx;
4051 counter->oncpu = -1;
4052
4053 counter->parent = parent_counter;
4054
4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4056 counter->id = atomic64_inc_return(&perf_counter_id);
4057
4058 counter->state = PERF_COUNTER_STATE_INACTIVE;
4059
4060 if (attr->disabled)
4061 counter->state = PERF_COUNTER_STATE_OFF;
4062
4063 pmu = NULL;
4064
4065 hwc = &counter->hw;
4066 hwc->sample_period = attr->sample_period;
4067 if (attr->freq && attr->sample_freq)
4068 hwc->sample_period = 1;
4069
4070 atomic64_set(&hwc->period_left, hwc->sample_period);
4071
4072 /*
4073 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4074 */
4075 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4076 goto done;
4077
4078 switch (attr->type) {
4079 case PERF_TYPE_RAW:
4080 case PERF_TYPE_HARDWARE:
4081 case PERF_TYPE_HW_CACHE:
4082 pmu = hw_perf_counter_init(counter);
4083 break;
4084
4085 case PERF_TYPE_SOFTWARE:
4086 pmu = sw_perf_counter_init(counter);
4087 break;
4088
4089 case PERF_TYPE_TRACEPOINT:
4090 pmu = tp_perf_counter_init(counter);
4091 break;
4092
4093 default:
4094 break;
4095 }
4096done:
4097 err = 0;
4098 if (!pmu)
4099 err = -EINVAL;
4100 else if (IS_ERR(pmu))
4101 err = PTR_ERR(pmu);
4102
4103 if (err) {
4104 if (counter->ns)
4105 put_pid_ns(counter->ns);
4106 kfree(counter);
4107 return ERR_PTR(err);
4108 }
4109
4110 counter->pmu = pmu;
4111
4112 if (!counter->parent) {
4113 atomic_inc(&nr_counters);
4114 if (counter->attr.mmap)
4115 atomic_inc(&nr_mmap_counters);
4116 if (counter->attr.comm)
4117 atomic_inc(&nr_comm_counters);
4118 if (counter->attr.task)
4119 atomic_inc(&nr_task_counters);
4120 }
4121
4122 return counter;
4123}
4124
4125static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4126 struct perf_counter_attr *attr)
4127{
4128 int ret;
4129 u32 size;
4130
4131 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4132 return -EFAULT;
4133
4134 /*
4135 * zero the full structure, so that a short copy from user space leaves the rest zeroed.
4136 */
4137 memset(attr, 0, sizeof(*attr));
4138
4139 ret = get_user(size, &uattr->size);
4140 if (ret)
4141 return ret;
4142
4143 if (size > PAGE_SIZE) /* silly large */
4144 goto err_size;
4145
4146 if (!size) /* abi compat */
4147 size = PERF_ATTR_SIZE_VER0;
4148
4149 if (size < PERF_ATTR_SIZE_VER0)
4150 goto err_size;
4151
4152 /*
4153 * If we're handed a bigger struct than we know of,
4154 * ensure all the unknown bits are 0.
4155 */
4156 if (size > sizeof(*attr)) {
4157 unsigned long val;
4158 unsigned long __user *addr;
4159 unsigned long __user *end;
4160
4161 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4162 sizeof(unsigned long));
4163 end = PTR_ALIGN((void __user *)uattr + size,
4164 sizeof(unsigned long));
4165
4166 for (; addr < end; addr += sizeof(unsigned long)) {
4167 ret = get_user(val, addr);
4168 if (ret)
4169 return ret;
4170 if (val)
4171 goto err_size;
4172 }
4173 }
4174
4175 ret = copy_from_user(attr, uattr, size);
4176 if (ret)
4177 return -EFAULT;
4178
4179 /*
4180 * If the type exists, the corresponding creation will verify
4181 * the attr->config.
4182 */
4183 if (attr->type >= PERF_TYPE_MAX)
4184 return -EINVAL;
4185
4186 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4187 return -EINVAL;
4188
4189 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4190 return -EINVAL;
4191
4192 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4193 return -EINVAL;
4194
4195out:
4196 return ret;
4197
4198err_size:
4199 put_user(sizeof(*attr), &uattr->size);
4200 ret = -E2BIG;
4201 goto out;
4202}
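
/*
 * Hypothetical user-space sketch (not from the original source) of the size
 * handshake implemented above: the caller zeroes the structure, sets .size to
 * the layout it was built against, and may rely on the kernel accepting
 * smaller (older) or larger, zero-padded (newer) layouts; on -E2BIG the
 * kernel writes the size it supports back into uattr->size.  The helper name
 * below is an assumption for the sketch.
 */
#include <string.h>
#include <linux/perf_counter.h>

static void example_init_attr(struct perf_counter_attr *attr)
{
	memset(attr, 0, sizeof(*attr));	/* newer/unknown fields stay zero */
	attr->size = sizeof(*attr);	/* advertise our view of the layout */
	attr->type = PERF_TYPE_SOFTWARE;
	attr->config = PERF_COUNT_SW_CONTEXT_SWITCHES;
	attr->disabled = 1;		/* create disabled; enable explicitly later */
}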
4203
4204/**
4205 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
4206 *
4207 * @attr_uptr: event type attributes for monitoring/sampling
4208 * @pid: target pid
4209 * @cpu: target cpu
4210 * @group_fd: group leader counter fd
4211 */
4212SYSCALL_DEFINE5(perf_counter_open,
4213 struct perf_counter_attr __user *, attr_uptr,
4214 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4215{
4216 struct perf_counter *counter, *group_leader;
4217 struct perf_counter_attr attr;
4218 struct perf_counter_context *ctx;
4219 struct file *counter_file = NULL;
4220 struct file *group_file = NULL;
4221 int fput_needed = 0;
4222 int fput_needed2 = 0;
4223 int ret;
4224
4225 /* for future expandability... */
4226 if (flags)
4227 return -EINVAL;
4228
4229 ret = perf_copy_attr(attr_uptr, &attr);
4230 if (ret)
4231 return ret;
4232
4233 if (!attr.exclude_kernel) {
4234 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4235 return -EACCES;
4236 }
4237
4238 if (attr.freq) {
4239 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4240 return -EINVAL;
4241 }
4242
4243 /*
4244 * Get the target context (task or percpu):
4245 */
4246 ctx = find_get_context(pid, cpu);
4247 if (IS_ERR(ctx))
4248 return PTR_ERR(ctx);
4249
4250 /*
4251 * Look up the group leader (we will attach this counter to it):
4252 */
4253 group_leader = NULL;
4254 if (group_fd != -1) {
4255 ret = -EINVAL;
4256 group_file = fget_light(group_fd, &fput_needed);
4257 if (!group_file)
4258 goto err_put_context;
4259 if (group_file->f_op != &perf_fops)
4260 goto err_put_context;
4261
4262 group_leader = group_file->private_data;
4263 /*
4264 * Do not allow a recursive hierarchy (the group leader we attach to
4265 * must itself be a leader, not a sibling in another group):
4266 */
4267 if (group_leader->group_leader != group_leader)
4268 goto err_put_context;
4269 /*
4270 * Do not allow attaching to a group in a different
4271 * task or CPU context:
4272 */
4273 if (group_leader->ctx != ctx)
4274 goto err_put_context;
4275 /*
4276 * Only a group leader can be exclusive or pinned
4277 */
4278 if (attr.exclusive || attr.pinned)
4279 goto err_put_context;
4280 }
4281
4282 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4283 NULL, GFP_KERNEL);
4284 ret = PTR_ERR(counter);
4285 if (IS_ERR(counter))
4286 goto err_put_context;
4287
4288 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4289 if (ret < 0)
4290 goto err_free_put_context;
4291
4292 counter_file = fget_light(ret, &fput_needed2);
4293 if (!counter_file)
4294 goto err_free_put_context;
4295
4296 counter->filp = counter_file;
4297 WARN_ON_ONCE(ctx->parent_ctx);
4298 mutex_lock(&ctx->mutex);
4299 perf_install_in_context(ctx, counter, cpu);
4300 ++ctx->generation;
4301 mutex_unlock(&ctx->mutex);
4302
4303 counter->owner = current;
4304 get_task_struct(current);
4305 mutex_lock(&current->perf_counter_mutex);
4306 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4307 mutex_unlock(&current->perf_counter_mutex);
4308
4309 fput_light(counter_file, fput_needed2);
4310
4311out_fput:
4312 fput_light(group_file, fput_needed);
4313
4314 return ret;
4315
4316err_free_put_context:
4317 kfree(counter);
4318
4319err_put_context:
4320 put_ctx(ctx);
4321
4322 goto out_fput;
4323}
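
/*
 * Hypothetical user-space sketch (not from the original source) of the
 * syscall above: open a task-clock software counter on the current task, run
 * some work, then read back the 64-bit count (nanoseconds for this counter).
 * __NR_perf_counter_open is assumed to be provided by the kernel headers in
 * use; no specific syscall number is hard-coded here.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	/* pid == 0: current task, cpu == -1: any cpu, no group leader, flags must be 0 */
	fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... the workload to be measured runs here ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", (unsigned long long)count);

	close(fd);
	return 0;
}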
4324
4325/*
4326 * inherit a counter from parent task to child task:
4327 */
4328static struct perf_counter *
4329inherit_counter(struct perf_counter *parent_counter,
4330 struct task_struct *parent,
4331 struct perf_counter_context *parent_ctx,
4332 struct task_struct *child,
4333 struct perf_counter *group_leader,
4334 struct perf_counter_context *child_ctx)
4335{
4336 struct perf_counter *child_counter;
4337
4338 /*
4339 * Instead of creating recursive hierarchies of counters,
4340 * we link inherited counters back to the original parent,
4341 * which is guaranteed to have a filp that we use as the
4342 * reference count:
4343 */
4344 if (parent_counter->parent)
4345 parent_counter = parent_counter->parent;
4346
4347 child_counter = perf_counter_alloc(&parent_counter->attr,
4348 parent_counter->cpu, child_ctx,
4349 group_leader, parent_counter,
4350 GFP_KERNEL);
4351 if (IS_ERR(child_counter))
4352 return child_counter;
4353 get_ctx(child_ctx);
4354
4355 /*
4356 * Make the child state follow the state of the parent counter,
4357 * not its attr.disabled bit. We hold the parent's mutex,
4358 * so we won't race with perf_counter_{en, dis}able_family.
4359 */
4360 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4361 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4362 else
4363 child_counter->state = PERF_COUNTER_STATE_OFF;
4364
4365 if (parent_counter->attr.freq)
4366 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4367
4368 /*
4369 * Link it up in the child's context:
4370 */
4371 add_counter_to_ctx(child_counter, child_ctx);
4372
4373 /*
4374 * Get a reference to the parent filp - we will fput it
4375 * when the child counter exits. This is safe to do because
4376 * we are in the parent and we know that the filp still
4377 * exists and has a nonzero count:
4378 */
4379 atomic_long_inc(&parent_counter->filp->f_count);
4380
4381 /*
4382 * Link this into the parent counter's child list
4383 */
4384 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4385 mutex_lock(&parent_counter->child_mutex);
4386 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4387 mutex_unlock(&parent_counter->child_mutex);
4388
4389 return child_counter;
4390}
4391
4392static int inherit_group(struct perf_counter *parent_counter,
4393 struct task_struct *parent,
4394 struct perf_counter_context *parent_ctx,
4395 struct task_struct *child,
4396 struct perf_counter_context *child_ctx)
4397{
4398 struct perf_counter *leader;
4399 struct perf_counter *sub;
4400 struct perf_counter *child_ctr;
4401
4402 leader = inherit_counter(parent_counter, parent, parent_ctx,
4403 child, NULL, child_ctx);
4404 if (IS_ERR(leader))
4405 return PTR_ERR(leader);
4406 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4407 child_ctr = inherit_counter(sub, parent, parent_ctx,
4408 child, leader, child_ctx);
4409 if (IS_ERR(child_ctr))
4410 return PTR_ERR(child_ctr);
4411 }
4412 return 0;
4413}
4414
4415static void sync_child_counter(struct perf_counter *child_counter,
4416 struct task_struct *child)
4417{
4418 struct perf_counter *parent_counter = child_counter->parent;
4419 u64 child_val;
4420
4421 if (child_counter->attr.inherit_stat)
4422 perf_counter_read_event(child_counter, child);
4423
4424 child_val = atomic64_read(&child_counter->count);
4425
4426 /*
4427 * Add back the child's count to the parent's count:
4428 */
4429 atomic64_add(child_val, &parent_counter->count);
4430 atomic64_add(child_counter->total_time_enabled,
4431 &parent_counter->child_total_time_enabled);
4432 atomic64_add(child_counter->total_time_running,
4433 &parent_counter->child_total_time_running);
4434
4435 /*
4436 * Remove this counter from the parent's list
4437 */
4438 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4439 mutex_lock(&parent_counter->child_mutex);
4440 list_del_init(&child_counter->child_list);
4441 mutex_unlock(&parent_counter->child_mutex);
4442
4443 /*
4444 * Release the parent counter, if this was the last
4445 * reference to it.
4446 */
4447 fput(parent_counter->filp);
4448}
4449
4450static void
4451__perf_counter_exit_task(struct perf_counter *child_counter,
4452 struct perf_counter_context *child_ctx,
4453 struct task_struct *child)
4454{
4455 struct perf_counter *parent_counter;
4456
4457 update_counter_times(child_counter);
4458 perf_counter_remove_from_context(child_counter);
4459
4460 parent_counter = child_counter->parent;
4461 /*
4462 * It can happen that parent exits first, and has counters
4463 * that are still around due to the child reference. These
4464 * counters need to be zapped - but otherwise linger.
4465 */
4466 if (parent_counter) {
4467 sync_child_counter(child_counter, child);
4468 free_counter(child_counter);
4469 }
4470}
4471
4472/*
4473 * When a child task exits, feed back counter values to parent counters.
4474 */
4475void perf_counter_exit_task(struct task_struct *child)
4476{
4477 struct perf_counter *child_counter, *tmp;
4478 struct perf_counter_context *child_ctx;
4479 unsigned long flags;
4480
4481 if (likely(!child->perf_counter_ctxp)) {
4482 perf_counter_task(child, NULL, 0);
4483 return;
4484 }
4485
4486 local_irq_save(flags);
4487 /*
4488 * We can't reschedule here because interrupts are disabled,
4489 * and either child is current or it is a task that can't be
4490 * scheduled, so we are now safe from rescheduling changing
4491 * our context.
4492 */
4493 child_ctx = child->perf_counter_ctxp;
4494 __perf_counter_task_sched_out(child_ctx);
4495
4496 /*
4497 * Take the context lock here so that if find_get_context is
4498 * reading child->perf_counter_ctxp, we wait until it has
4499 * incremented the context's refcount before we do put_ctx below.
4500 */
4501 spin_lock(&child_ctx->lock);
4502 child->perf_counter_ctxp = NULL;
4503 /*
4504 * If this context is a clone; unclone it so it can't get
4505 * swapped to another process while we're removing all
4506 * the counters from it.
4507 */
4508 unclone_ctx(child_ctx);
4509 spin_unlock_irqrestore(&child_ctx->lock, flags);
4510
4511 /*
4512 * Report the task dead after unscheduling the counters so that we
4513 * won't get any samples after PERF_EVENT_EXIT. We can however still
4514 * get a few PERF_EVENT_READ events.
4515 */
4516 perf_counter_task(child, child_ctx, 0);
4517
4518 /*
4519 * We can recurse on the same lock type through:
4520 *
4521 * __perf_counter_exit_task()
4522 * sync_child_counter()
4523 * fput(parent_counter->filp)
4524 * perf_release()
4525 * mutex_lock(&ctx->mutex)
4526 *
4527 * But since it's the parent context, it won't be the same instance.
4528 */
4529 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4530
4531again:
4532 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4533 list_entry)
4534 __perf_counter_exit_task(child_counter, child_ctx, child);
4535
4536 /*
4537 * If the last counter was a group counter, it will have appended all
4538 * its siblings to the list, but we obtained 'tmp' before that which
4539 * will still point to the list head terminating the iteration.
4540 */
4541 if (!list_empty(&child_ctx->counter_list))
4542 goto again;
4543
4544 mutex_unlock(&child_ctx->mutex);
4545
4546 put_ctx(child_ctx);
4547}
4548
4549/*
4550 * Free an unexposed, unused context, as created by inheritance in
4551 * perf_counter_init_task() below; used by fork() in case of failure.
4552 */
4553void perf_counter_free_task(struct task_struct *task)
4554{
4555 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4556 struct perf_counter *counter, *tmp;
4557
4558 if (!ctx)
4559 return;
4560
4561 mutex_lock(&ctx->mutex);
4562again:
4563 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4564 struct perf_counter *parent = counter->parent;
4565
4566 if (WARN_ON_ONCE(!parent))
4567 continue;
4568
4569 mutex_lock(&parent->child_mutex);
4570 list_del_init(&counter->child_list);
4571 mutex_unlock(&parent->child_mutex);
4572
4573 fput(parent->filp);
4574
4575 list_del_counter(counter, ctx);
4576 free_counter(counter);
4577 }
4578
4579 if (!list_empty(&ctx->counter_list))
4580 goto again;
4581
4582 mutex_unlock(&ctx->mutex);
4583
4584 put_ctx(ctx);
4585}
4586
4587/*
4588 * Initialize the perf_counter context in task_struct
4589 */
4590int perf_counter_init_task(struct task_struct *child)
4591{
4592 struct perf_counter_context *child_ctx, *parent_ctx;
4593 struct perf_counter_context *cloned_ctx;
4594 struct perf_counter *counter;
4595 struct task_struct *parent = current;
4596 int inherited_all = 1;
4597 int ret = 0;
4598
4599 child->perf_counter_ctxp = NULL;
4600
4601 mutex_init(&child->perf_counter_mutex);
4602 INIT_LIST_HEAD(&child->perf_counter_list);
4603
4604 if (likely(!parent->perf_counter_ctxp))
4605 return 0;
4606
4607 /*
4608 * This is executed from the parent task context, so inherit
4609 * counters that have been marked for cloning.
4610 * First allocate and initialize a context for the child.
4611 */
4612
4613 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4614 if (!child_ctx)
4615 return -ENOMEM;
4616
4617 __perf_counter_init_context(child_ctx, child);
4618 child->perf_counter_ctxp = child_ctx;
4619 get_task_struct(child);
4620
4621 /*
4622 * If the parent's context is a clone, pin it so it won't get
4623 * swapped under us.
4624 */
4625 parent_ctx = perf_pin_task_context(parent);
4626
4627 /*
4628 * No need to check if parent_ctx != NULL here; since we saw
4629 * it non-NULL earlier, the only reason for it to become NULL
4630 * is if we exit, and since we're currently in the middle of
4631 * a fork we can't be exiting at the same time.
4632 */
4633
4634 /*
4635 * Lock the parent list. No need to lock the child - not PID
4636 * hashed yet and not running, so nobody can access it.
4637 */
4638 mutex_lock(&parent_ctx->mutex);
4639
4640 /*
4641 * We don't have to disable NMIs - we are only looking at
4642 * the list, not manipulating it:
4643 */
4644 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4645 if (counter != counter->group_leader)
4646 continue;
4647
4648 if (!counter->attr.inherit) {
4649 inherited_all = 0;
4650 continue;
4651 }
4652
4653 ret = inherit_group(counter, parent, parent_ctx,
4654 child, child_ctx);
4655 if (ret) {
4656 inherited_all = 0;
4657 break;
4658 }
4659 }
4660
4661 if (inherited_all) {
4662 /*
4663 * Mark the child context as a clone of the parent
4664 * context, or of whatever the parent is a clone of.
4665 * Note that if the parent is a clone, it could get
4666 * uncloned at any point, but that doesn't matter
4667 * because the list of counters and the generation
4668 * count can't have changed since we took the mutex.
4669 */
4670 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4671 if (cloned_ctx) {
4672 child_ctx->parent_ctx = cloned_ctx;
4673 child_ctx->parent_gen = parent_ctx->parent_gen;
4674 } else {
4675 child_ctx->parent_ctx = parent_ctx;
4676 child_ctx->parent_gen = parent_ctx->generation;
4677 }
4678 get_ctx(child_ctx->parent_ctx);
4679 }
4680
4681 mutex_unlock(&parent_ctx->mutex);
4682
4683 perf_unpin_context(parent_ctx);
4684
4685 return ret;
4686}
4687
4688static void __cpuinit perf_counter_init_cpu(int cpu)
4689{
4690 struct perf_cpu_context *cpuctx;
4691
4692 cpuctx = &per_cpu(perf_cpu_context, cpu);
4693 __perf_counter_init_context(&cpuctx->ctx, NULL);
4694
4695 spin_lock(&perf_resource_lock);
4696 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4697 spin_unlock(&perf_resource_lock);
4698
4699 hw_perf_counter_setup(cpu);
4700}
4701
4702#ifdef CONFIG_HOTPLUG_CPU
4703static void __perf_counter_exit_cpu(void *info)
4704{
4705 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4706 struct perf_counter_context *ctx = &cpuctx->ctx;
4707 struct perf_counter *counter, *tmp;
4708
4709 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4710 __perf_counter_remove_from_context(counter);
4711}
4712static void perf_counter_exit_cpu(int cpu)
4713{
4714 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4715 struct perf_counter_context *ctx = &cpuctx->ctx;
4716
4717 mutex_lock(&ctx->mutex);
4718 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4719 mutex_unlock(&ctx->mutex);
4720}
4721#else
4722static inline void perf_counter_exit_cpu(int cpu) { }
4723#endif
4724
4725static int __cpuinit
4726perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4727{
4728 unsigned int cpu = (long)hcpu;
4729
4730 switch (action) {
4731
4732 case CPU_UP_PREPARE:
4733 case CPU_UP_PREPARE_FROZEN:
4734 perf_counter_init_cpu(cpu);
4735 break;
4736
4737 case CPU_ONLINE:
4738 case CPU_ONLINE_FROZEN:
4739 hw_perf_counter_setup_online(cpu);
4740 break;
4741
4742 case CPU_DOWN_PREPARE:
4743 case CPU_DOWN_PREPARE_FROZEN:
4744 perf_counter_exit_cpu(cpu);
4745 break;
4746
4747 default:
4748 break;
4749 }
4750
4751 return NOTIFY_OK;
4752}
4753
4754/*
4755 * This has to have a higher priority than migration_notifier in sched.c.
4756 */
4757static struct notifier_block __cpuinitdata perf_cpu_nb = {
4758 .notifier_call = perf_cpu_notify,
4759 .priority = 20,
4760};
4761
4762void __init perf_counter_init(void)
4763{
4764 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4765 (void *)(long)smp_processor_id());
4766 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4767 (void *)(long)smp_processor_id());
4768 register_cpu_notifier(&perf_cpu_nb);
4769}
4770
4771static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4772{
4773 return sprintf(buf, "%d\n", perf_reserved_percpu);
4774}
4775
4776static ssize_t
4777perf_set_reserve_percpu(struct sysdev_class *class,
4778 const char *buf,
4779 size_t count)
4780{
4781 struct perf_cpu_context *cpuctx;
4782 unsigned long val;
4783 int err, cpu, mpt;
4784
4785 err = strict_strtoul(buf, 10, &val);
4786 if (err)
4787 return err;
4788 if (val > perf_max_counters)
4789 return -EINVAL;
4790
4791 spin_lock(&perf_resource_lock);
4792 perf_reserved_percpu = val;
4793 for_each_online_cpu(cpu) {
4794 cpuctx = &per_cpu(perf_cpu_context, cpu);
4795 spin_lock_irq(&cpuctx->ctx.lock);
4796 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4797 perf_max_counters - perf_reserved_percpu);
4798 cpuctx->max_pertask = mpt;
4799 spin_unlock_irq(&cpuctx->ctx.lock);
4800 }
4801 spin_unlock(&perf_resource_lock);
4802
4803 return count;
4804}
4805
4806static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4807{
4808 return sprintf(buf, "%d\n", perf_overcommit);
4809}
4810
4811static ssize_t
4812perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4813{
4814 unsigned long val;
4815 int err;
4816
4817 err = strict_strtoul(buf, 10, &val);
4818 if (err)
4819 return err;
4820 if (val > 1)
4821 return -EINVAL;
4822
4823 spin_lock(&perf_resource_lock);
4824 perf_overcommit = val;
4825 spin_unlock(&perf_resource_lock);
4826
4827 return count;
4828}
4829
4830static SYSDEV_CLASS_ATTR(
4831 reserve_percpu,
4832 0644,
4833 perf_show_reserve_percpu,
4834 perf_set_reserve_percpu
4835 );
4836
4837static SYSDEV_CLASS_ATTR(
4838 overcommit,
4839 0644,
4840 perf_show_overcommit,
4841 perf_set_overcommit
4842 );
4843
4844static struct attribute *perfclass_attrs[] = {
4845 &attr_reserve_percpu.attr,
4846 &attr_overcommit.attr,
4847 NULL
4848};
4849
4850static struct attribute_group perfclass_attr_group = {
4851 .attrs = perfclass_attrs,
4852 .name = "perf_counters",
4853};
4854
4855static int __init perf_counter_sysfs_init(void)
4856{
4857 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4858 &perfclass_attr_group);
4859}
4860device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd281..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */ 380 */
381struct task_struct *find_task_by_pid_type_ns(int type, int nr, 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 struct pid_namespace *ns)
383{ 382{
384 return pid_task(find_pid_ns(nr, ns), type); 383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
385} 384}
386 385
387EXPORT_SYMBOL(find_task_by_pid_type_ns);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 386struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 387{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399} 389}
400EXPORT_SYMBOL(find_task_by_pid_ns);
401 390
402struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 391struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 392{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858d..821722ae58a7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
67 return NULL; 67 return NULL;
68} 68}
69 69
70static struct pid_namespace *create_pid_namespace(unsigned int level) 70static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
71{ 71{
72 struct pid_namespace *ns; 72 struct pid_namespace *ns;
73 unsigned int level = parent_pid_ns->level + 1;
73 int i; 74 int i;
74 75
75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 76 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
86 87
87 kref_init(&ns->kref); 88 kref_init(&ns->kref);
88 ns->level = level; 89 ns->level = level;
90 ns->parent = get_pid_ns(parent_pid_ns);
89 91
90 set_bit(0, ns->pidmap[0].page); 92 set_bit(0, ns->pidmap[0].page);
91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 93 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
114 116
115struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 117struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
116{ 118{
117 struct pid_namespace *new_ns;
118
119 BUG_ON(!old_ns);
120 new_ns = get_pid_ns(old_ns);
121 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
122 goto out; 120 return get_pid_ns(old_ns);
123
124 new_ns = ERR_PTR(-EINVAL);
125 if (flags & CLONE_THREAD) 121 if (flags & CLONE_THREAD)
126 goto out_put; 122 return ERR_PTR(-EINVAL);
127 123 return create_pid_namespace(old_ns);
128 new_ns = create_pid_namespace(old_ns->level + 1);
129 if (!IS_ERR(new_ns))
130 new_ns->parent = get_pid_ns(old_ns);
131
132out_put:
133 put_pid_ns(old_ns);
134out:
135 return new_ns;
136} 124}
137 125
138void free_pid_ns(struct kref *kref) 126void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96b..72067cbdb37f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
116 116
117 Turning OFF this setting is NOT recommended! If in doubt, say Y. 117 Turning OFF this setting is NOT recommended! If in doubt, say Y.
118 118
119config HIBERNATION_NVS
120 bool
121
119config HIBERNATION 122config HIBERNATION
120 bool "Hibernation (aka 'suspend to disk')" 123 bool "Hibernation (aka 'suspend to disk')"
121 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 124 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
125 select HIBERNATION_NVS if HAS_IOMEM
122 ---help--- 126 ---help---
123 Enable the suspend to disk (STD) functionality, which is usually 127 Enable the suspend to disk (STD) functionality, which is usually
124 called "hibernation" in user interfaces. STD checkpoints the 128 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 720ea4f781bd..c3b81c30e5d5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -6,6 +6,9 @@ endif
6obj-$(CONFIG_PM) += main.o 6obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 7obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
10 13
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c
index 5cb080e7eebd..81d2e7464893 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/hibernate.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * kernel/power/disk.c - Suspend-to-disk support. 2 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
7 * 8 *
8 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
9 *
10 */ 10 */
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -215,13 +215,13 @@ static int create_image(int platform_mode)
215 if (error) 215 if (error)
216 return error; 216 return error;
217 217
218 /* At this point, device_suspend() has been called, but *not* 218 /* At this point, dpm_suspend_start() has been called, but *not*
219 * device_power_down(). We *must* call device_power_down() now. 219 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
220 * Otherwise, drivers for some devices (e.g. interrupt controllers) 220 * Otherwise, drivers for some devices (e.g. interrupt controllers)
221 * become desynchronized with the actual state of the hardware 221 * become desynchronized with the actual state of the hardware
222 * at resume time, and evil weirdness ensues. 222 * at resume time, and evil weirdness ensues.
223 */ 223 */
224 error = device_power_down(PMSG_FREEZE); 224 error = dpm_suspend_noirq(PMSG_FREEZE);
225 if (error) { 225 if (error) {
226 printk(KERN_ERR "PM: Some devices failed to power down, " 226 printk(KERN_ERR "PM: Some devices failed to power down, "
227 "aborting hibernation\n"); 227 "aborting hibernation\n");
@@ -262,7 +262,7 @@ static int create_image(int platform_mode)
262 262
263 Power_up: 263 Power_up:
264 sysdev_resume(); 264 sysdev_resume();
265 /* NOTE: device_power_up() is just a resume() for devices 265 /* NOTE: dpm_resume_noirq() is just a resume() for devices
266 * that suspended with irqs off ... no overall powerup. 266 * that suspended with irqs off ... no overall powerup.
267 */ 267 */
268 268
@@ -275,7 +275,7 @@ static int create_image(int platform_mode)
275 Platform_finish: 275 Platform_finish:
276 platform_finish(platform_mode); 276 platform_finish(platform_mode);
277 277
278 device_power_up(in_suspend ? 278 dpm_resume_noirq(in_suspend ?
279 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 279 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
280 280
281 return error; 281 return error;
@@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode)
304 goto Close; 304 goto Close;
305 305
306 suspend_console(); 306 suspend_console();
307 error = device_suspend(PMSG_FREEZE); 307 error = dpm_suspend_start(PMSG_FREEZE);
308 if (error) 308 if (error)
309 goto Recover_platform; 309 goto Recover_platform;
310 310
@@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 device_resume(in_suspend ? 318 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 320 resume_console();
321 Close: 321 Close:
@@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode)
339{ 339{
340 int error; 340 int error;
341 341
342 error = device_power_down(PMSG_QUIESCE); 342 error = dpm_suspend_noirq(PMSG_QUIESCE);
343 if (error) { 343 if (error) {
344 printk(KERN_ERR "PM: Some devices failed to power down, " 344 printk(KERN_ERR "PM: Some devices failed to power down, "
345 "aborting resume\n"); 345 "aborting resume\n");
@@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode)
394 Cleanup: 394 Cleanup:
395 platform_restore_cleanup(platform_mode); 395 platform_restore_cleanup(platform_mode);
396 396
397 device_power_up(PMSG_RECOVER); 397 dpm_resume_noirq(PMSG_RECOVER);
398 398
399 return error; 399 return error;
400} 400}
@@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode)
414 414
415 pm_prepare_console(); 415 pm_prepare_console();
416 suspend_console(); 416 suspend_console();
417 error = device_suspend(PMSG_QUIESCE); 417 error = dpm_suspend_start(PMSG_QUIESCE);
418 if (!error) { 418 if (!error) {
419 error = resume_target_kernel(platform_mode); 419 error = resume_target_kernel(platform_mode);
420 device_resume(PMSG_RECOVER); 420 dpm_resume_end(PMSG_RECOVER);
421 } 421 }
422 resume_console(); 422 resume_console();
423 pm_restore_console(); 423 pm_restore_console();
@@ -447,14 +447,14 @@ int hibernation_platform_enter(void)
447 447
448 entering_platform_hibernation = true; 448 entering_platform_hibernation = true;
449 suspend_console(); 449 suspend_console();
450 error = device_suspend(PMSG_HIBERNATE); 450 error = dpm_suspend_start(PMSG_HIBERNATE);
451 if (error) { 451 if (error) {
452 if (hibernation_ops->recover) 452 if (hibernation_ops->recover)
453 hibernation_ops->recover(); 453 hibernation_ops->recover();
454 goto Resume_devices; 454 goto Resume_devices;
455 } 455 }
456 456
457 error = device_power_down(PMSG_HIBERNATE); 457 error = dpm_suspend_noirq(PMSG_HIBERNATE);
458 if (error) 458 if (error)
459 goto Resume_devices; 459 goto Resume_devices;
460 460
@@ -479,11 +479,11 @@ int hibernation_platform_enter(void)
479 Platofrm_finish: 479 Platofrm_finish:
480 hibernation_ops->finish(); 480 hibernation_ops->finish();
481 481
482 device_power_up(PMSG_RESTORE); 482 dpm_resume_noirq(PMSG_RESTORE);
483 483
484 Resume_devices: 484 Resume_devices:
485 entering_platform_hibernation = false; 485 entering_platform_hibernation = false;
486 device_resume(PMSG_RESTORE); 486 dpm_resume_end(PMSG_RESTORE);
487 resume_console(); 487 resume_console();
488 488
489 Close: 489 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 000000000000..39ac698ef836
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/suspend.h>
14
15/*
16 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that.
19 */
20
21struct nvs_page {
22 unsigned long phys_start;
23 unsigned int size;
24 void *kaddr;
25 void *data;
26 struct list_head node;
27};
28
29static LIST_HEAD(nvs_list);
30
31/**
32 * hibernate_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region
34 * @size - size of the region
35 *
36 * The NVS region need not be page-aligned (both ends) and we arrange
37 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages.
39 */
40int hibernate_nvs_register(unsigned long start, unsigned long size)
41{
42 struct nvs_page *entry, *next;
43
44 while (size > 0) {
45 unsigned int nr_bytes;
46
47 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
48 if (!entry)
49 goto Error;
50
51 list_add_tail(&entry->node, &nvs_list);
52 entry->phys_start = start;
53 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
54 entry->size = (size < nr_bytes) ? size : nr_bytes;
55
56 start += entry->size;
57 size -= entry->size;
58 }
59 return 0;
60
61 Error:
62 list_for_each_entry_safe(entry, next, &nvs_list, node) {
63 list_del(&entry->node);
64 kfree(entry);
65 }
66 return -ENOMEM;
67}
68
69/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions
71 */
72void hibernate_nvs_free(void)
73{
74 struct nvs_page *entry;
75
76 list_for_each_entry(entry, &nvs_list, node)
77 if (entry->data) {
78 free_page((unsigned long)entry->data);
79 entry->data = NULL;
80 if (entry->kaddr) {
81 iounmap(entry->kaddr);
82 entry->kaddr = NULL;
83 }
84 }
85}
86
87/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
89 */
90int hibernate_nvs_alloc(void)
91{
92 struct nvs_page *entry;
93
94 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) {
97 hibernate_nvs_free();
98 return -ENOMEM;
99 }
100 }
101 return 0;
102}
103
104/**
105 * hibernate_nvs_save - save NVS memory regions
106 */
107void hibernate_nvs_save(void)
108{
109 struct nvs_page *entry;
110
111 printk(KERN_INFO "PM: Saving platform NVS memory\n");
112
113 list_for_each_entry(entry, &nvs_list, node)
114 if (entry->data) {
115 entry->kaddr = ioremap(entry->phys_start, entry->size);
116 memcpy(entry->data, entry->kaddr, entry->size);
117 }
118}
119
120/**
121 * hibernate_nvs_restore - restore NVS memory regions
122 *
123 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region.
125 */
126void hibernate_nvs_restore(void)
127{
128 struct nvs_page *entry;
129
130 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
131
132 list_for_each_entry(entry, &nvs_list, node)
133 if (entry->data)
134 memcpy(entry->kaddr, entry->data, entry->size);
135}
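
/*
 * Hypothetical sketch (not from the original source) of the intended calling
 * sequence for platform code (ACPI is the expected in-tree user): register
 * the firmware NVS range at init time, allocate backing pages and save the
 * contents before the hibernation image is created, and restore them after
 * resume.  The region address/size and the hook names below are assumptions;
 * the declarations are expected to come from <linux/suspend.h>.
 */
#include <linux/init.h>
#include <linux/suspend.h>

#define EXAMPLE_NVS_START	0xdf000000UL	/* hypothetical firmware NVS region */
#define EXAMPLE_NVS_SIZE	0x00010000UL

static int __init example_platform_nvs_init(void)
{
	/* tell the hibernation core which physical range must be preserved */
	return hibernate_nvs_register(EXAMPLE_NVS_START, EXAMPLE_NVS_SIZE);
}

static int example_pre_snapshot(void)
{
	int error = hibernate_nvs_alloc();	/* RAM pages for the copy */

	if (!error)
		hibernate_nvs_save();		/* copy the NVS contents away */
	return error;
}

static void example_post_restore(void)
{
	hibernate_nvs_restore();		/* write the saved contents back */
	hibernate_nvs_free();
}
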
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 868028280d13..f710e36930cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
12#include <linux/suspend.h>
13#include <linux/kobject.h> 11#include <linux/kobject.h>
14#include <linux/string.h> 12#include <linux/string.h>
15#include <linux/delay.h>
16#include <linux/errno.h>
17#include <linux/kmod.h>
18#include <linux/init.h>
19#include <linux/console.h>
20#include <linux/cpu.h>
21#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
22#include <linux/freezer.h>
23#include <linux/vmstat.h>
24#include <linux/syscalls.h>
25 14
26#include "power.h" 15#include "power.h"
27 16
@@ -119,373 +108,6 @@ power_attr(pm_test);
119 108
120#endif /* CONFIG_PM_SLEEP */ 109#endif /* CONFIG_PM_SLEEP */
121 110
122#ifdef CONFIG_SUSPEND
123
124static int suspend_test(int level)
125{
126#ifdef CONFIG_PM_DEBUG
127 if (pm_test_level == level) {
128 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
129 mdelay(5000);
130 return 1;
131 }
132#endif /* !CONFIG_PM_DEBUG */
133 return 0;
134}
135
136#ifdef CONFIG_PM_TEST_SUSPEND
137
138/*
139 * We test the system suspend code by setting an RTC wakealarm a short
140 * time in the future, then suspending. Suspending the devices won't
141 * normally take long ... some systems only need a few milliseconds.
142 *
143 * The time it takes is system-specific though, so when we test this
144 * during system bootup we allow a LOT of time.
145 */
146#define TEST_SUSPEND_SECONDS 5
147
148static unsigned long suspend_test_start_time;
149
150static void suspend_test_start(void)
151{
152 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
153 * What we want is a hardware counter that will work correctly even
154 * during the irqs-are-off stages of the suspend/resume cycle...
155 */
156 suspend_test_start_time = jiffies;
157}
158
159static void suspend_test_finish(const char *label)
160{
161 long nj = jiffies - suspend_test_start_time;
162 unsigned msec;
163
164 msec = jiffies_to_msecs(abs(nj));
165 pr_info("PM: %s took %d.%03d seconds\n", label,
166 msec / 1000, msec % 1000);
167
168 /* Warning on suspend means the RTC alarm period needs to be
169 * larger -- the system was sooo slooowwww to suspend that the
170 * alarm (should have) fired before the system went to sleep!
171 *
172 * Warning on either suspend or resume also means the system
173 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk...
175 */
176 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
177}
178
179#else
180
181static void suspend_test_start(void)
182{
183}
184
185static void suspend_test_finish(const char *label)
186{
187}
188
189#endif
190
191/* This is just an arbitrary number */
192#define FREE_PAGE_NUMBER (100)
193
194static struct platform_suspend_ops *suspend_ops;
195
196/**
197 * suspend_set_ops - Set the global suspend method table.
198 * @ops: Pointer to ops structure.
199 */
200
201void suspend_set_ops(struct platform_suspend_ops *ops)
202{
203 mutex_lock(&pm_mutex);
204 suspend_ops = ops;
205 mutex_unlock(&pm_mutex);
206}
207
208/**
209 * suspend_valid_only_mem - generic memory-only valid callback
210 *
211 * Platform drivers that implement mem suspend only and only need
212 * to check for that in their .valid callback can use this instead
213 * of rolling their own .valid callback.
214 */
215int suspend_valid_only_mem(suspend_state_t state)
216{
217 return state == PM_SUSPEND_MEM;
218}
219
220/**
221 * suspend_prepare - Do prep work before entering low-power state.
222 *
223 * This is common code that is called for each state that we're entering.
224 * Run suspend notifiers, allocate a console and stop all processes.
225 */
226static int suspend_prepare(void)
227{
228 int error;
229 unsigned int free_pages;
230
231 if (!suspend_ops || !suspend_ops->enter)
232 return -EPERM;
233
234 pm_prepare_console();
235
236 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
237 if (error)
238 goto Finish;
239
240 error = usermodehelper_disable();
241 if (error)
242 goto Finish;
243
244 if (suspend_freeze_processes()) {
245 error = -EAGAIN;
246 goto Thaw;
247 }
248
249 free_pages = global_page_state(NR_FREE_PAGES);
250 if (free_pages < FREE_PAGE_NUMBER) {
251 pr_debug("PM: free some memory\n");
252 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
253 if (nr_free_pages() < FREE_PAGE_NUMBER) {
254 error = -ENOMEM;
255 printk(KERN_ERR "PM: No enough memory\n");
256 }
257 }
258 if (!error)
259 return 0;
260
261 Thaw:
262 suspend_thaw_processes();
263 usermodehelper_enable();
264 Finish:
265 pm_notifier_call_chain(PM_POST_SUSPEND);
266 pm_restore_console();
267 return error;
268}
269
270/* default implementation */
271void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
272{
273 local_irq_disable();
274}
275
276/* default implementation */
277void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
278{
279 local_irq_enable();
280}
281
282/**
283 * suspend_enter - enter the desired system sleep state.
284 * @state: state to enter
285 *
286 * This function should be called after devices have been suspended.
287 */
288static int suspend_enter(suspend_state_t state)
289{
290 int error;
291
292 if (suspend_ops->prepare) {
293 error = suspend_ops->prepare();
294 if (error)
295 return error;
296 }
297
298 error = device_power_down(PMSG_SUSPEND);
299 if (error) {
300 printk(KERN_ERR "PM: Some devices failed to power down\n");
301 goto Platfrom_finish;
302 }
303
304 if (suspend_ops->prepare_late) {
305 error = suspend_ops->prepare_late();
306 if (error)
307 goto Power_up_devices;
308 }
309
310 if (suspend_test(TEST_PLATFORM))
311 goto Platform_wake;
312
313 error = disable_nonboot_cpus();
314 if (error || suspend_test(TEST_CPUS))
315 goto Enable_cpus;
316
317 arch_suspend_disable_irqs();
318 BUG_ON(!irqs_disabled());
319
320 error = sysdev_suspend(PMSG_SUSPEND);
321 if (!error) {
322 if (!suspend_test(TEST_CORE))
323 error = suspend_ops->enter(state);
324 sysdev_resume();
325 }
326
327 arch_suspend_enable_irqs();
328 BUG_ON(irqs_disabled());
329
330 Enable_cpus:
331 enable_nonboot_cpus();
332
333 Platform_wake:
334 if (suspend_ops->wake)
335 suspend_ops->wake();
336
337 Power_up_devices:
338 device_power_up(PMSG_RESUME);
339
340 Platfrom_finish:
341 if (suspend_ops->finish)
342 suspend_ops->finish();
343
344 return error;
345}
346
347/**
348 * suspend_devices_and_enter - suspend devices and enter the desired system
349 * sleep state.
350 * @state: state to enter
351 */
352int suspend_devices_and_enter(suspend_state_t state)
353{
354 int error;
355
356 if (!suspend_ops)
357 return -ENOSYS;
358
359 if (suspend_ops->begin) {
360 error = suspend_ops->begin(state);
361 if (error)
362 goto Close;
363 }
364 suspend_console();
365 suspend_test_start();
366 error = device_suspend(PMSG_SUSPEND);
367 if (error) {
368 printk(KERN_ERR "PM: Some devices failed to suspend\n");
369 goto Recover_platform;
370 }
371 suspend_test_finish("suspend devices");
372 if (suspend_test(TEST_DEVICES))
373 goto Recover_platform;
374
375 suspend_enter(state);
376
377 Resume_devices:
378 suspend_test_start();
379 device_resume(PMSG_RESUME);
380 suspend_test_finish("resume devices");
381 resume_console();
382 Close:
383 if (suspend_ops->end)
384 suspend_ops->end();
385 return error;
386
387 Recover_platform:
388 if (suspend_ops->recover)
389 suspend_ops->recover();
390 goto Resume_devices;
391}
392
393/**
394 * suspend_finish - Do final work before exiting suspend sequence.
395 *
396 * Call platform code to clean up, restart processes, and free the
397 * console that we've allocated. This is not called for suspend-to-disk.
398 */
399static void suspend_finish(void)
400{
401 suspend_thaw_processes();
402 usermodehelper_enable();
403 pm_notifier_call_chain(PM_POST_SUSPEND);
404 pm_restore_console();
405}
406
407
408
409
410static const char * const pm_states[PM_SUSPEND_MAX] = {
411 [PM_SUSPEND_STANDBY] = "standby",
412 [PM_SUSPEND_MEM] = "mem",
413};
414
415static inline int valid_state(suspend_state_t state)
416{
417 /* All states need lowlevel support and need to be valid
418 * to the lowlevel implementation, no valid callback
419 * implies that none are valid. */
420 if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
421 return 0;
422 return 1;
423}
424
425
426/**
427 * enter_state - Do common work of entering low-power state.
428 * @state: pm_state structure for state we're entering.
429 *
430 * Make sure we're the only ones trying to enter a sleep state. Fail
431 * if someone has beat us to it, since we don't want anything weird to
432 * happen when we wake up.
433 * Then, do the setup for suspend, enter the state, and cleaup (after
434 * we've woken up).
435 */
436static int enter_state(suspend_state_t state)
437{
438 int error;
439
440 if (!valid_state(state))
441 return -ENODEV;
442
443 if (!mutex_trylock(&pm_mutex))
444 return -EBUSY;
445
446 printk(KERN_INFO "PM: Syncing filesystems ... ");
447 sys_sync();
448 printk("done.\n");
449
450 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
451 error = suspend_prepare();
452 if (error)
453 goto Unlock;
454
455 if (suspend_test(TEST_FREEZER))
456 goto Finish;
457
458 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
459 error = suspend_devices_and_enter(state);
460
461 Finish:
462 pr_debug("PM: Finishing wakeup.\n");
463 suspend_finish();
464 Unlock:
465 mutex_unlock(&pm_mutex);
466 return error;
467}
468
469
470/**
471 * pm_suspend - Externally visible function for suspending system.
472 * @state: Enumerated value of state to enter.
473 *
474 * Determine whether or not value is within range, get state
475 * structure, and enter (above).
476 */
477
478int pm_suspend(suspend_state_t state)
479{
480 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
481 return enter_state(state);
482 return -EINVAL;
483}
484
485EXPORT_SYMBOL(pm_suspend);
486
487#endif /* CONFIG_SUSPEND */
488
489struct kobject *power_kobj; 111struct kobject *power_kobj;
490 112
491/** 113/**
@@ -498,7 +120,6 @@ struct kobject *power_kobj;
498 * store() accepts one of those strings, translates it into the 120 * store() accepts one of those strings, translates it into the
499 * proper enumerated value, and initiates a suspend transition. 121 * proper enumerated value, and initiates a suspend transition.
500 */ 122 */
501
502static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 123static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
503 char *buf) 124 char *buf)
504{ 125{
@@ -596,7 +217,6 @@ static struct attribute_group attr_group = {
596 .attrs = g, 217 .attrs = g,
597}; 218};
598 219
599
600static int __init pm_init(void) 220static int __init pm_init(void)
601{ 221{
602 power_kobj = kobject_create_and_add("power", NULL); 222 power_kobj = kobject_create_and_add("power", NULL);
@@ -606,144 +226,3 @@ static int __init pm_init(void)
606} 226}
607 227
608core_initcall(pm_init); 228core_initcall(pm_init);
609
610
611#ifdef CONFIG_PM_TEST_SUSPEND
612
613#include <linux/rtc.h>
614
615/*
616 * To test system suspend, we need a hands-off mechanism to resume the
617 * system. RTCs wake alarms are a common self-contained mechanism.
618 */
619
620static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
621{
622 static char err_readtime[] __initdata =
623 KERN_ERR "PM: can't read %s time, err %d\n";
624 static char err_wakealarm [] __initdata =
625 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
626 static char err_suspend[] __initdata =
627 KERN_ERR "PM: suspend test failed, error %d\n";
628 static char info_test[] __initdata =
629 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
630
631 unsigned long now;
632 struct rtc_wkalrm alm;
633 int status;
634
635 /* this may fail if the RTC hasn't been initialized */
636 status = rtc_read_time(rtc, &alm.time);
637 if (status < 0) {
638 printk(err_readtime, dev_name(&rtc->dev), status);
639 return;
640 }
641 rtc_tm_to_time(&alm.time, &now);
642
643 memset(&alm, 0, sizeof alm);
644 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
645 alm.enabled = true;
646
647 status = rtc_set_alarm(rtc, &alm);
648 if (status < 0) {
649 printk(err_wakealarm, dev_name(&rtc->dev), status);
650 return;
651 }
652
653 if (state == PM_SUSPEND_MEM) {
654 printk(info_test, pm_states[state]);
655 status = pm_suspend(state);
656 if (status == -ENODEV)
657 state = PM_SUSPEND_STANDBY;
658 }
659 if (state == PM_SUSPEND_STANDBY) {
660 printk(info_test, pm_states[state]);
661 status = pm_suspend(state);
662 }
663 if (status < 0)
664 printk(err_suspend, status);
665
666 /* Some platforms can't detect that the alarm triggered the
667 * wakeup, or (accordingly) disable it after it afterwards.
668 * It's supposed to give oneshot behavior; cope.
669 */
670 alm.enabled = false;
671 rtc_set_alarm(rtc, &alm);
672}
673
674static int __init has_wakealarm(struct device *dev, void *name_ptr)
675{
676 struct rtc_device *candidate = to_rtc_device(dev);
677
678 if (!candidate->ops->set_alarm)
679 return 0;
680 if (!device_may_wakeup(candidate->dev.parent))
681 return 0;
682
683 *(const char **)name_ptr = dev_name(dev);
684 return 1;
685}
686
687/*
688 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
689 * at startup time. They're normally disabled, for faster boot and because
690 * we can't know which states really work on this particular system.
691 */
692static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
693
694static char warn_bad_state[] __initdata =
695 KERN_WARNING "PM: can't test '%s' suspend state\n";
696
697static int __init setup_test_suspend(char *value)
698{
699 unsigned i;
700
701 /* "=mem" ==> "mem" */
702 value++;
703 for (i = 0; i < PM_SUSPEND_MAX; i++) {
704 if (!pm_states[i])
705 continue;
706 if (strcmp(pm_states[i], value) != 0)
707 continue;
708 test_state = (__force suspend_state_t) i;
709 return 0;
710 }
711 printk(warn_bad_state, value);
712 return 0;
713}
714__setup("test_suspend", setup_test_suspend);
715
716static int __init test_suspend(void)
717{
718 static char warn_no_rtc[] __initdata =
719 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
720
721 char *pony = NULL;
722 struct rtc_device *rtc = NULL;
723
724 /* PM is initialized by now; is that state testable? */
725 if (test_state == PM_SUSPEND_ON)
726 goto done;
727 if (!valid_state(test_state)) {
728 printk(warn_bad_state, pm_states[test_state]);
729 goto done;
730 }
731
732 /* RTCs have initialized by now too ... can we use one? */
733 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
734 if (pony)
735 rtc = rtc_class_open(pony);
736 if (!rtc) {
737 printk(warn_no_rtc);
738 goto done;
739 }
740
741 /* go for it */
742 test_wakealarm(rtc, test_state);
743 rtc_class_close(rtc);
744done:
745 return 0;
746}
747late_initcall(test_suspend);
748
749#endif /* CONFIG_PM_TEST_SUSPEND */
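
The boot-time suspend self-test removed from main.c here reappears below as the new kernel/power/suspend_test.c. At run time, the same suspend machinery is normally reached from userspace through the sysfs "state" attribute created under the power kobject set up in pm_init(). A minimal, hedged userspace sketch (not part of this series; it assumes the conventional sysfs layout):

	/* Userspace sketch: list the supported sleep states, then request
	 * suspend-to-RAM.  Assumes the usual /sys/power/state attribute.
	 */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char states[64] = "";
		FILE *f = fopen("/sys/power/state", "r");

		if (f) {
			if (fgets(states, sizeof(states), f))
				printf("supported states: %s", states);
			fclose(f);
		}

		if (!strstr(states, "mem")) {
			fprintf(stderr, "suspend-to-RAM not supported here\n");
			return 1;
		}

		f = fopen("/sys/power/state", "w");
		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("mem\n", f);
		fclose(f);	/* the flushed write blocks until resume */
		return 0;
	}
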
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3afb..26d5a26f82e3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
45 */ 45 */
46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
47 47
48/* kernel/power/disk.c */ 48/* kernel/power/hibernate.c */
49extern int hibernation_snapshot(int platform_mode); 49extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 50extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 51extern int hibernation_platform_enter(void);
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern unsigned int count_data_pages(void); 77extern int swsusp_shrink_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
@@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);
147 */ 147 */
148#define SF_PLATFORM_MODE 1 148#define SF_PLATFORM_MODE 1
149 149
150/* kernel/power/disk.c */ 150/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 151extern int swsusp_check(void);
152extern int swsusp_shrink_memory(void);
153extern void swsusp_free(void); 152extern void swsusp_free(void);
154extern int swsusp_read(unsigned int *flags_p); 153extern int swsusp_read(unsigned int *flags_p);
155extern int swsusp_write(unsigned int flags); 154extern int swsusp_write(unsigned int flags);
@@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
161 unsigned int, char *); 160 unsigned int, char *);
162 161
163#ifdef CONFIG_SUSPEND 162#ifdef CONFIG_SUSPEND
164/* kernel/power/main.c */ 163/* kernel/power/suspend.c */
164extern const char *const pm_states[];
165
166extern bool valid_state(suspend_state_t state);
165extern int suspend_devices_and_enter(suspend_state_t state); 167extern int suspend_devices_and_enter(suspend_state_t state);
168extern int enter_state(suspend_state_t state);
166#else /* !CONFIG_SUSPEND */ 169#else /* !CONFIG_SUSPEND */
167static inline int suspend_devices_and_enter(suspend_state_t state) 170static inline int suspend_devices_and_enter(suspend_state_t state)
168{ 171{
169 return -ENOSYS; 172 return -ENOSYS;
170} 173}
174static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
175static inline bool valid_state(suspend_state_t state) { return false; }
171#endif /* !CONFIG_SUSPEND */ 176#endif /* !CONFIG_SUSPEND */
172 177
178#ifdef CONFIG_PM_TEST_SUSPEND
179/* kernel/power/suspend_test.c */
180extern void suspend_test_start(void);
181extern void suspend_test_finish(const char *label);
182#else /* !CONFIG_PM_TEST_SUSPEND */
183static inline void suspend_test_start(void) {}
184static inline void suspend_test_finish(const char *label) {}
185#endif /* !CONFIG_PM_TEST_SUSPEND */
186
173#ifdef CONFIG_PM_SLEEP 187#ifdef CONFIG_PM_SLEEP
174/* kernel/power/main.c */ 188/* kernel/power/main.c */
175extern int pm_notifier_call_chain(unsigned long val); 189extern int pm_notifier_call_chain(unsigned long val);
176#endif 190#endif
177 191
178#ifdef CONFIG_HIGHMEM 192#ifdef CONFIG_HIGHMEM
179unsigned int count_highmem_pages(void);
180int restore_highmem(void); 193int restore_highmem(void);
181#else 194#else
182static inline unsigned int count_highmem_pages(void) { return 0; } 195static inline unsigned int count_highmem_pages(void) { return 0; }
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b5..e8b337006276 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "powerOff",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a819f9..523a451b45d3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
39static void swsusp_set_page_forbidden(struct page *); 39static void swsusp_set_page_forbidden(struct page *);
40static void swsusp_unset_page_forbidden(struct page *); 40static void swsusp_unset_page_forbidden(struct page *);
41 41
42/*
43 * Preferred image size in bytes (tunable via /sys/power/image_size).
44 * When it is set to N, swsusp will do its best to ensure the image
45 * size will not exceed N bytes, but if that is impossible, it will
46 * try to create the smallest image possible.
47 */
48unsigned long image_size = 500 * 1024 * 1024;
49
42/* List of PBEs needed for restoring the pages that were allocated before 50/* List of PBEs needed for restoring the pages that were allocated before
43 * the suspend and included in the suspend image, but have also been 51 * the suspend and included in the suspend image, but have also been
44 * allocated by the "resume" kernel, so their contents cannot be written 52 * allocated by the "resume" kernel, so their contents cannot be written
@@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
840 * pages. 848 * pages.
841 */ 849 */
842 850
843unsigned int count_highmem_pages(void) 851static unsigned int count_highmem_pages(void)
844{ 852{
845 struct zone *zone; 853 struct zone *zone;
846 unsigned int n = 0; 854 unsigned int n = 0;
@@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
902 * pages. 910 * pages.
903 */ 911 */
904 912
905unsigned int count_data_pages(void) 913static unsigned int count_data_pages(void)
906{ 914{
907 struct zone *zone; 915 struct zone *zone;
908 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
@@ -1058,6 +1066,74 @@ void swsusp_free(void)
1058 buffer = NULL; 1066 buffer = NULL;
1059} 1067}
1060 1068
1069/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed
1071 *
1072 * ... but do not OOM-kill anyone
1073 *
1074 * Notice: all userland should be stopped before it is called, or
1075 * livelock is possible.
1076 */
1077
1078#define SHRINK_BITE 10000
1079static inline unsigned long __shrink_memory(long tmp)
1080{
1081 if (tmp > SHRINK_BITE)
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084}
1085
1086int swsusp_shrink_memory(void)
1087{
1088 long tmp;
1089 struct zone *zone;
1090 unsigned long pages = 0;
1091 unsigned int i = 0;
1092 char *p = "-\\|/";
1093 struct timeval start, stop;
1094
1095 printk(KERN_INFO "PM: Shrinking memory... ");
1096 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114
1115 if (highmem_size < 0)
1116 highmem_size = 0;
1117
1118 tmp += highmem_size;
1119 if (tmp > 0) {
1120 tmp = __shrink_memory(tmp);
1121 if (!tmp)
1122 return -ENOMEM;
1123 pages += tmp;
1124 } else if (size > image_size / PAGE_SIZE) {
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
1126 pages += tmp;
1127 }
1128 printk("\b%c", p[i++%4]);
1129 } while (tmp > 0);
1130 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed");
1133
1134 return 0;
1135}
1136
1061#ifdef CONFIG_HIGHMEM 1137#ifdef CONFIG_HIGHMEM
1062/** 1138/**
1063 * count_pages_for_highmem - compute the number of non-highmem pages 1139 * count_pages_for_highmem - compute the number of non-highmem pages
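
swsusp_shrink_memory() moves into snapshot.c together with the image_size tunable it honours; as the comment above notes, the target image size is exposed as /sys/power/image_size. A hedged userspace sketch of tuning it before hibernation (the 256 MB figure is arbitrary):

	/* Userspace sketch: cap the hibernation image at roughly 256 MB
	 * by writing to /sys/power/image_size (value is in bytes).
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/power/image_size", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "%lu\n", 256UL * 1024 * 1024);
		fclose(f);
		return 0;
	}
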
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 000000000000..6f10dfc2d3e9
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
1/*
2 * kernel/power/suspend.c - Suspend to RAM and standby functionality.
3 *
4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/string.h>
12#include <linux/delay.h>
13#include <linux/errno.h>
14#include <linux/init.h>
15#include <linux/console.h>
16#include <linux/cpu.h>
17#include <linux/syscalls.h>
18
19#include "power.h"
20
21const char *const pm_states[PM_SUSPEND_MAX] = {
22 [PM_SUSPEND_STANDBY] = "standby",
23 [PM_SUSPEND_MEM] = "mem",
24};
25
26static struct platform_suspend_ops *suspend_ops;
27
28/**
29 * suspend_set_ops - Set the global suspend method table.
30 * @ops: Pointer to ops structure.
31 */
32void suspend_set_ops(struct platform_suspend_ops *ops)
33{
34 mutex_lock(&pm_mutex);
35 suspend_ops = ops;
36 mutex_unlock(&pm_mutex);
37}
38
39bool valid_state(suspend_state_t state)
40{
41 /*
42 * All states need lowlevel support and need to be valid to the lowlevel
43 * implementation, no valid callback implies that none are valid.
44 */
45 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
46}
47
48/**
49 * suspend_valid_only_mem - generic memory-only valid callback
50 *
51 * Platform drivers that implement mem suspend only and only need
52 * to check for that in their .valid callback can use this instead
53 * of rolling their own .valid callback.
54 */
55int suspend_valid_only_mem(suspend_state_t state)
56{
57 return state == PM_SUSPEND_MEM;
58}
59
60static int suspend_test(int level)
61{
62#ifdef CONFIG_PM_DEBUG
63 if (pm_test_level == level) {
64 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
65 mdelay(5000);
66 return 1;
67 }
68#endif /* !CONFIG_PM_DEBUG */
69 return 0;
70}
71
72/**
73 * suspend_prepare - Do prep work before entering low-power state.
74 *
75 * This is common code that is called for each state that we're entering.
76 * Run suspend notifiers, allocate a console and stop all processes.
77 */
78static int suspend_prepare(void)
79{
80 int error;
81
82 if (!suspend_ops || !suspend_ops->enter)
83 return -EPERM;
84
85 pm_prepare_console();
86
87 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
88 if (error)
89 goto Finish;
90
91 error = usermodehelper_disable();
92 if (error)
93 goto Finish;
94
95 error = suspend_freeze_processes();
96 if (!error)
97 return 0;
98
99 suspend_thaw_processes();
100 usermodehelper_enable();
101 Finish:
102 pm_notifier_call_chain(PM_POST_SUSPEND);
103 pm_restore_console();
104 return error;
105}
106
107/* default implementation */
108void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
109{
110 local_irq_disable();
111}
112
113/* default implementation */
114void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
115{
116 local_irq_enable();
117}
118
119/**
120 * suspend_enter - enter the desired system sleep state.
121 * @state: state to enter
122 *
123 * This function should be called after devices have been suspended.
124 */
125static int suspend_enter(suspend_state_t state)
126{
127 int error;
128
129 if (suspend_ops->prepare) {
130 error = suspend_ops->prepare();
131 if (error)
132 return error;
133 }
134
135 error = dpm_suspend_noirq(PMSG_SUSPEND);
136 if (error) {
137 printk(KERN_ERR "PM: Some devices failed to power down\n");
138 goto Platfrom_finish;
139 }
140
141 if (suspend_ops->prepare_late) {
142 error = suspend_ops->prepare_late();
143 if (error)
144 goto Power_up_devices;
145 }
146
147 if (suspend_test(TEST_PLATFORM))
148 goto Platform_wake;
149
150 error = disable_nonboot_cpus();
151 if (error || suspend_test(TEST_CPUS))
152 goto Enable_cpus;
153
154 arch_suspend_disable_irqs();
155 BUG_ON(!irqs_disabled());
156
157 error = sysdev_suspend(PMSG_SUSPEND);
158 if (!error) {
159 if (!suspend_test(TEST_CORE))
160 error = suspend_ops->enter(state);
161 sysdev_resume();
162 }
163
164 arch_suspend_enable_irqs();
165 BUG_ON(irqs_disabled());
166
167 Enable_cpus:
168 enable_nonboot_cpus();
169
170 Platform_wake:
171 if (suspend_ops->wake)
172 suspend_ops->wake();
173
174 Power_up_devices:
175 dpm_resume_noirq(PMSG_RESUME);
176
177 Platfrom_finish:
178 if (suspend_ops->finish)
179 suspend_ops->finish();
180
181 return error;
182}
183
184/**
185 * suspend_devices_and_enter - suspend devices and enter the desired system
186 * sleep state.
187 * @state: state to enter
188 */
189int suspend_devices_and_enter(suspend_state_t state)
190{
191 int error;
192
193 if (!suspend_ops)
194 return -ENOSYS;
195
196 if (suspend_ops->begin) {
197 error = suspend_ops->begin(state);
198 if (error)
199 goto Close;
200 }
201 suspend_console();
202 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) {
205 printk(KERN_ERR "PM: Some devices failed to suspend\n");
206 goto Recover_platform;
207 }
208 suspend_test_finish("suspend devices");
209 if (suspend_test(TEST_DEVICES))
210 goto Recover_platform;
211
212 suspend_enter(state);
213
214 Resume_devices:
215 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices");
218 resume_console();
219 Close:
220 if (suspend_ops->end)
221 suspend_ops->end();
222 return error;
223
224 Recover_platform:
225 if (suspend_ops->recover)
226 suspend_ops->recover();
227 goto Resume_devices;
228}
229
230/**
231 * suspend_finish - Do final work before exiting suspend sequence.
232 *
233 * Call platform code to clean up, restart processes, and free the
234 * console that we've allocated. This is not called for suspend-to-disk.
235 */
236static void suspend_finish(void)
237{
238 suspend_thaw_processes();
239 usermodehelper_enable();
240 pm_notifier_call_chain(PM_POST_SUSPEND);
241 pm_restore_console();
242}
243
244/**
245 * enter_state - Do common work of entering low-power state.
246 * @state: pm_state structure for state we're entering.
247 *
248 * Make sure we're the only ones trying to enter a sleep state. Fail
249 * if someone has beat us to it, since we don't want anything weird to
250 * happen when we wake up.
 251 * Then, do the setup for suspend, enter the state, and clean up (after
252 * we've woken up).
253 */
254int enter_state(suspend_state_t state)
255{
256 int error;
257
258 if (!valid_state(state))
259 return -ENODEV;
260
261 if (!mutex_trylock(&pm_mutex))
262 return -EBUSY;
263
264 printk(KERN_INFO "PM: Syncing filesystems ... ");
265 sys_sync();
266 printk("done.\n");
267
268 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
269 error = suspend_prepare();
270 if (error)
271 goto Unlock;
272
273 if (suspend_test(TEST_FREEZER))
274 goto Finish;
275
276 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
277 error = suspend_devices_and_enter(state);
278
279 Finish:
280 pr_debug("PM: Finishing wakeup.\n");
281 suspend_finish();
282 Unlock:
283 mutex_unlock(&pm_mutex);
284 return error;
285}
286
287/**
288 * pm_suspend - Externally visible function for suspending system.
289 * @state: Enumerated value of state to enter.
290 *
291 * Determine whether or not value is within range, get state
292 * structure, and enter (above).
293 */
294int pm_suspend(suspend_state_t state)
295{
296 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
297 return enter_state(state);
298 return -EINVAL;
299}
300EXPORT_SYMBOL(pm_suspend);
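
The new suspend.c only does anything once a platform has handed it a struct platform_suspend_ops via suspend_set_ops(): valid_state() defers to the .valid callback, and suspend_prepare()/suspend_enter() require .enter. A hedged kernel-side sketch of a minimal registration (the myplat_* names are made up; suspend_valid_only_mem is the helper defined above):

	/* Sketch of a platform hooking into kernel/power/suspend.c. */
	#include <linux/suspend.h>
	#include <linux/init.h>

	static int myplat_enter(suspend_state_t state)
	{
		/* Program the hardware to sleep; returns on wakeup. */
		return 0;
	}

	static struct platform_suspend_ops myplat_suspend_ops = {
		.valid	= suspend_valid_only_mem,	/* only "mem" supported */
		.enter	= myplat_enter,
	};

	static int __init myplat_pm_init(void)
	{
		suspend_set_ops(&myplat_suspend_ops);
		return 0;
	}
	late_initcall(myplat_pm_init);

The optional callbacks (.begin, .prepare, .prepare_late, .wake, .finish, .end, .recover) slot into the corresponding points of suspend_devices_and_enter() and suspend_enter() shown above.
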
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 000000000000..17d8bb1acf9c
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
1/*
2 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
3 *
4 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/init.h>
10#include <linux/rtc.h>
11
12#include "power.h"
13
14/*
15 * We test the system suspend code by setting an RTC wakealarm a short
16 * time in the future, then suspending. Suspending the devices won't
17 * normally take long ... some systems only need a few milliseconds.
18 *
19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time.
21 */
22#define TEST_SUSPEND_SECONDS 5
23
24static unsigned long suspend_test_start_time;
25
26void suspend_test_start(void)
27{
28 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
29 * What we want is a hardware counter that will work correctly even
30 * during the irqs-are-off stages of the suspend/resume cycle...
31 */
32 suspend_test_start_time = jiffies;
33}
34
35void suspend_test_finish(const char *label)
36{
37 long nj = jiffies - suspend_test_start_time;
38 unsigned msec;
39
40 msec = jiffies_to_msecs(abs(nj));
41 pr_info("PM: %s took %d.%03d seconds\n", label,
42 msec / 1000, msec % 1000);
43
44 /* Warning on suspend means the RTC alarm period needs to be
45 * larger -- the system was sooo slooowwww to suspend that the
46 * alarm (should have) fired before the system went to sleep!
47 *
48 * Warning on either suspend or resume also means the system
49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk...
51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
53}
54
55/*
56 * To test system suspend, we need a hands-off mechanism to resume the
57 * system. RTCs wake alarms are a common self-contained mechanism.
58 */
59
60static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
61{
62 static char err_readtime[] __initdata =
63 KERN_ERR "PM: can't read %s time, err %d\n";
64 static char err_wakealarm [] __initdata =
65 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
66 static char err_suspend[] __initdata =
67 KERN_ERR "PM: suspend test failed, error %d\n";
68 static char info_test[] __initdata =
69 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
70
71 unsigned long now;
72 struct rtc_wkalrm alm;
73 int status;
74
75 /* this may fail if the RTC hasn't been initialized */
76 status = rtc_read_time(rtc, &alm.time);
77 if (status < 0) {
78 printk(err_readtime, dev_name(&rtc->dev), status);
79 return;
80 }
81 rtc_tm_to_time(&alm.time, &now);
82
83 memset(&alm, 0, sizeof alm);
84 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
85 alm.enabled = true;
86
87 status = rtc_set_alarm(rtc, &alm);
88 if (status < 0) {
89 printk(err_wakealarm, dev_name(&rtc->dev), status);
90 return;
91 }
92
93 if (state == PM_SUSPEND_MEM) {
94 printk(info_test, pm_states[state]);
95 status = pm_suspend(state);
96 if (status == -ENODEV)
97 state = PM_SUSPEND_STANDBY;
98 }
99 if (state == PM_SUSPEND_STANDBY) {
100 printk(info_test, pm_states[state]);
101 status = pm_suspend(state);
102 }
103 if (status < 0)
104 printk(err_suspend, status);
105
106 /* Some platforms can't detect that the alarm triggered the
 107 * wakeup, or (accordingly) disable it afterwards.
108 * It's supposed to give oneshot behavior; cope.
109 */
110 alm.enabled = false;
111 rtc_set_alarm(rtc, &alm);
112}
113
114static int __init has_wakealarm(struct device *dev, void *name_ptr)
115{
116 struct rtc_device *candidate = to_rtc_device(dev);
117
118 if (!candidate->ops->set_alarm)
119 return 0;
120 if (!device_may_wakeup(candidate->dev.parent))
121 return 0;
122
123 *(const char **)name_ptr = dev_name(dev);
124 return 1;
125}
126
127/*
128 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
129 * at startup time. They're normally disabled, for faster boot and because
130 * we can't know which states really work on this particular system.
131 */
132static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
133
134static char warn_bad_state[] __initdata =
135 KERN_WARNING "PM: can't test '%s' suspend state\n";
136
137static int __init setup_test_suspend(char *value)
138{
139 unsigned i;
140
141 /* "=mem" ==> "mem" */
142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) {
144 if (!pm_states[i])
145 continue;
146 if (strcmp(pm_states[i], value) != 0)
147 continue;
148 test_state = (__force suspend_state_t) i;
149 return 0;
150 }
151 printk(warn_bad_state, value);
152 return 0;
153}
154__setup("test_suspend", setup_test_suspend);
155
156static int __init test_suspend(void)
157{
158 static char warn_no_rtc[] __initdata =
159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
160
161 char *pony = NULL;
162 struct rtc_device *rtc = NULL;
163
164 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON)
166 goto done;
167 if (!valid_state(test_state)) {
168 printk(warn_bad_state, pm_states[test_state]);
169 goto done;
170 }
171
172 /* RTCs have initialized by now too ... can we use one? */
173 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
174 if (pony)
175 rtc = rtc_class_open(pony);
176 if (!rtc) {
177 printk(warn_no_rtc);
178 goto done;
179 }
180
181 /* go for it */
182 test_wakealarm(rtc, test_state);
183 rtc_class_close(rtc);
184done:
185 return 0;
186}
187late_initcall(test_suspend);
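
The test above runs unattended at boot, which is why it arms an RTC wakealarm before suspending. The same experiment can be repeated after boot from userspace via the RTC class's sysfs wakealarm attribute, which corresponds to the .set_alarm capability that has_wakealarm() checks for. A hedged sketch (rtc0 and the absolute-seconds alarm format are assumptions about the running system):

	/* Userspace analogue of test_wakealarm(): arm rtc0 to fire in 20 s,
	 * then suspend to RAM and rely on the alarm to resume the box.
	 */
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		FILE *alarm = fopen("/sys/class/rtc/rtc0/wakealarm", "w");
		FILE *state = fopen("/sys/power/state", "w");

		if (!alarm || !state) {
			perror("fopen");
			return 1;
		}
		/* Absolute UNIX time, as expected by the wakealarm attribute. */
		fprintf(alarm, "%ld\n", (long)time(NULL) + 20);
		fclose(alarm);

		fprintf(state, "mem\n");
		fclose(state);		/* blocks until the alarm wakes us */
		return 0;
	}
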
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 78c35047586d..6a07f4dbf2f8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -55,14 +55,6 @@
55 55
56#include "power.h" 56#include "power.h"
57 57
58/*
59 * Preferred image size in bytes (tunable via /sys/power/image_size).
60 * When it is set to N, swsusp will do its best to ensure the image
61 * size will not exceed N bytes, but if that is impossible, it will
62 * try to create the smallest image possible.
63 */
64unsigned long image_size = 500 * 1024 * 1024;
65
66int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
67 59
68/** 60/**
@@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
194 centisecs / 100, centisecs % 100, 186 centisecs / 100, centisecs % 100,
195 kps / 1000, (kps % 1000) / 10); 187 kps / 1000, (kps % 1000) / 10);
196} 188}
197
198/**
199 * swsusp_shrink_memory - Try to free as much memory as needed
200 *
201 * ... but do not OOM-kill anyone
202 *
203 * Notice: all userland should be stopped before it is called, or
204 * livelock is possible.
205 */
206
207#define SHRINK_BITE 10000
208static inline unsigned long __shrink_memory(long tmp)
209{
210 if (tmp > SHRINK_BITE)
211 tmp = SHRINK_BITE;
212 return shrink_all_memory(tmp);
213}
214
215int swsusp_shrink_memory(void)
216{
217 long tmp;
218 struct zone *zone;
219 unsigned long pages = 0;
220 unsigned int i = 0;
221 char *p = "-\\|/";
222 struct timeval start, stop;
223
224 printk(KERN_INFO "PM: Shrinking memory... ");
225 do_gettimeofday(&start);
226 do {
227 long size, highmem_size;
228
229 highmem_size = count_highmem_pages();
230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
231 tmp = size;
232 size += highmem_size;
233 for_each_populated_zone(zone) {
234 tmp += snapshot_additional_pages(zone);
235 if (is_highmem(zone)) {
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES);
238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 }
243
244 if (highmem_size < 0)
245 highmem_size = 0;
246
247 tmp += highmem_size;
248 if (tmp > 0) {
249 tmp = __shrink_memory(tmp);
250 if (!tmp)
251 return -ENOMEM;
252 pages += tmp;
253 } else if (size > image_size / PAGE_SIZE) {
254 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
255 pages += tmp;
256 }
257 printk("\b%c", p[i++%4]);
258 } while (tmp > 0);
259 do_gettimeofday(&stop);
260 printk("\bdone (%lu pages freed)\n", pages);
261 swsusp_show_speed(&start, &stop, pages, "Freed");
262
263 return 0;
264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
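
The hibernate_nvs_*() helpers deleted from swsusp.c above implement, per their comments, a save/restore cycle for platform NVS memory: register the region early, allocate backing pages before the snapshot, copy the region out just before the image is made, and copy it back with interrupts off during resume (hence no iounmap there). A hedged sketch of that call sequence from platform code, assuming the declarations stay reachable via <linux/suspend.h>; the physical range is a placeholder:

	/* Illustrative only: how a platform might drive the NVS helpers. */
	#include <linux/suspend.h>

	static int example_nvs_setup(void)
	{
		/* Boot time: describe the region that must survive hibernation. */
		return hibernate_nvs_register(0xdead0000UL, 0x2000UL);
	}

	static int example_pre_snapshot(void)
	{
		int error = hibernate_nvs_alloc();	/* grab backing pages */

		if (error)
			return error;
		hibernate_nvs_save();			/* copy NVS into RAM pages */
		return 0;
	}

	static void example_early_resume(void)
	{
		hibernate_nvs_restore();	/* irqs off, so no iounmap here */
		hibernate_nvs_free();		/* later, with irqs back on */
	}
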
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c67..b4d97b54c1ec 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
687 sizeof(printk_buf) - printed_len, fmt, args); 687 sizeof(printk_buf) - printed_len, fmt, args);
688 688
689 689
690 p = printk_buf;
691
692 /* Do we have a loglevel in the string? */
693 if (p[0] == '<') {
694 unsigned char c = p[1];
695 if (c && p[2] == '>') {
696 switch (c) {
697 case '0' ... '7': /* loglevel */
698 current_log_level = c - '0';
699 /* Fallthrough - make sure we're on a new line */
700 case 'd': /* KERN_DEFAULT */
701 if (!new_text_line) {
702 emit_log_char('\n');
703 new_text_line = 1;
704 }
705 /* Fallthrough - skip the loglevel */
706 case 'c': /* KERN_CONT */
707 p += 3;
708 break;
709 }
710 }
711 }
712
690 /* 713 /*
691 * Copy the output into log_buf. If the caller didn't provide 714 * Copy the output into log_buf. If the caller didn't provide
692 * appropriate log level tags, we insert them here 715 * appropriate log level tags, we insert them here
693 */ 716 */
694 for (p = printk_buf; *p; p++) { 717 for ( ; *p; p++) {
695 if (new_text_line) { 718 if (new_text_line) {
696 /* If a token, set current_log_level and skip over */
697 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
698 p[2] == '>') {
699 current_log_level = p[1] - '0';
700 p += 3;
701 printed_len -= 3;
702 }
703
704 /* Always output the token */ 719 /* Always output the token */
705 emit_log_char('<'); 720 emit_log_char('<');
706 emit_log_char(current_log_level + '0'); 721 emit_log_char(current_log_level + '0');
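
The rewritten block in vprintk() parses the optional "<x>" prefix once, before the copy loop: '0'..'7' set the log level and force a new line, 'd' (KERN_DEFAULT) only forces a new line, and 'c' (KERN_CONT) is skipped so continuations are appended to the current record. In caller terms these are the usual KERN_* string prefixes; a short hedged sketch of the three cases the parser distinguishes:

	/* Kernel-side sketch of the prefixes the new parser handles. */
	#include <linux/kernel.h>

	static void printk_prefix_examples(void)
	{
		/* "<4>" -- explicit loglevel, starts a new record at WARNING. */
		printk(KERN_WARNING "disk %s reported errors\n", "sda");

		/* "<c>" -- continuation, appended to the previous record. */
		printk(KERN_INFO "frobnicating");
		printk(KERN_CONT " ... done\n");

		/* "<d>" -- default loglevel, but still forces a fresh line. */
		printk(KERN_DEFAULT "back at the default console level\n");
	}
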
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,23 +111,18 @@ int __ref profile_init(void)
111 /* only text is profiled */ 111 /* only text is profiled */
112 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
118 return 0;
119 }
120 114
121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 115 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
122 return -ENOMEM; 116 return -ENOMEM;
123 117
124 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
125 119
126 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
127 if (prof_buffer) 121 if (prof_buffer)
128 return 0; 122 return 0;
129 123
130 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
131 if (prof_buffer) 126 if (prof_buffer)
132 return 0; 127 return 0;
133 128
@@ -371,7 +366,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
371 node = cpu_to_node(cpu); 366 node = cpu_to_node(cpu);
372 per_cpu(cpu_profile_flip, cpu) = 0; 367 per_cpu(cpu_profile_flip, cpu) = 0;
373 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 368 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
374 page = alloc_pages_node(node, 369 page = alloc_pages_exact_node(node,
375 GFP_KERNEL | __GFP_ZERO, 370 GFP_KERNEL | __GFP_ZERO,
376 0); 371 0);
377 if (!page) 372 if (!page)
@@ -379,7 +374,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
379 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 374 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
380 } 375 }
381 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 376 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
382 page = alloc_pages_node(node, 377 page = alloc_pages_exact_node(node,
383 GFP_KERNEL | __GFP_ZERO, 378 GFP_KERNEL | __GFP_ZERO,
384 0); 379 0);
385 if (!page) 380 if (!page)
@@ -570,14 +565,14 @@ static int create_hash_tables(void)
570 int node = cpu_to_node(cpu); 565 int node = cpu_to_node(cpu);
571 struct page *page; 566 struct page *page;
572 567
573 page = alloc_pages_node(node, 568 page = alloc_pages_exact_node(node,
574 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 569 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
575 0); 570 0);
576 if (!page) 571 if (!page)
577 goto out_cleanup; 572 goto out_cleanup;
578 per_cpu(cpu_profile_hits, cpu)[1] 573 per_cpu(cpu_profile_hits, cpu)[1]
579 = (struct profile_hit *)page_address(page); 574 = (struct profile_hit *)page_address(page);
580 page = alloc_pages_node(node, 575 page = alloc_pages_exact_node(node,
581 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 576 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
582 0); 577 0);
583 if (!page) 578 if (!page)
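
The profile_init() hunk above is an instance of a common allocation pattern: try the cheap allocator first with __GFP_NOWARN so an expected failure stays quiet, then fall back to a coarser allocator. A hedged generic sketch of the same idea (names and sizes are illustrative); a real caller must also remember which allocator succeeded so it can pair kfree() or free_pages_exact() on teardown:

	/* Generic sketch of the quiet-fallback allocation pattern. */
	#include <linux/slab.h>
	#include <linux/gfp.h>

	static void *alloc_big_buffer(size_t bytes)
	{
		void *buf;

		/* Let the slab try first, without a failure splat. */
		buf = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN);
		if (buf)
			return buf;

		/* Fall back to exact page allocation for large buffers. */
		return alloc_pages_exact(bytes,
					 GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
	}
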
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42c317874cfa..082c320e4dbf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,16 +25,6 @@
25 25
26 26
27/* 27/*
28 * Initialize a new task whose father had been ptraced.
29 *
30 * Called from copy_process().
31 */
32void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
33{
34 arch_ptrace_fork(child, clone_flags);
35}
36
37/*
38 * ptrace a task: make the debugger its new parent and 28 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 29 * move it to the ptrace list.
40 * 30 *
@@ -177,66 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
177int ptrace_attach(struct task_struct *task) 167int ptrace_attach(struct task_struct *task)
178{ 168{
179 int retval; 169 int retval;
180 unsigned long flags;
181 170
182 audit_ptrace(task); 171 audit_ptrace(task);
183 172
184 retval = -EPERM; 173 retval = -EPERM;
174 if (unlikely(task->flags & PF_KTHREAD))
175 goto out;
185 if (same_thread_group(task, current)) 176 if (same_thread_group(task, current))
186 goto out; 177 goto out;
187 178
188 /* Protect exec's credential calculations against our interference; 179 /*
189 * SUID, SGID and LSM creds get determined differently under ptrace. 180 * Protect exec's credential calculations against our interference;
 181 * SUID, SGID and LSM creds get determined differently
182 * under ptrace.
190 */ 183 */
191 retval = mutex_lock_interruptible(&task->cred_exec_mutex); 184 retval = -ERESTARTNOINTR;
192 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
193 goto out; 186 goto out;
194 187
195 retval = -EPERM;
196repeat:
197 /*
198 * Nasty, nasty.
199 *
200 * We want to hold both the task-lock and the
201 * tasklist_lock for writing at the same time.
202 * But that's against the rules (tasklist_lock
203 * is taken for reading by interrupts on other
204 * cpu's that may have task_lock).
205 */
206 task_lock(task); 188 task_lock(task);
207 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
208 task_unlock(task);
209 do {
210 cpu_relax();
211 } while (!write_can_lock(&tasklist_lock));
212 goto repeat;
213 }
214
215 if (!task->mm)
216 goto bad;
217 /* the same process cannot be attached many times */
218 if (task->ptrace & PT_PTRACED)
219 goto bad;
220 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); 189 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
190 task_unlock(task);
221 if (retval) 191 if (retval)
222 goto bad; 192 goto unlock_creds;
193
194 write_lock_irq(&tasklist_lock);
195 retval = -EPERM;
196 if (unlikely(task->exit_state))
197 goto unlock_tasklist;
198 if (task->ptrace)
199 goto unlock_tasklist;
223 200
224 /* Go */ 201 task->ptrace = PT_PTRACED;
225 task->ptrace |= PT_PTRACED;
226 if (capable(CAP_SYS_PTRACE)) 202 if (capable(CAP_SYS_PTRACE))
227 task->ptrace |= PT_PTRACE_CAP; 203 task->ptrace |= PT_PTRACE_CAP;
228 204
229 __ptrace_link(task, current); 205 __ptrace_link(task, current);
230
231 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 206 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
232bad: 207
233 write_unlock_irqrestore(&tasklist_lock, flags); 208 retval = 0;
234 task_unlock(task); 209unlock_tasklist:
235 mutex_unlock(&task->cred_exec_mutex); 210 write_unlock_irq(&tasklist_lock);
211unlock_creds:
212 mutex_unlock(&task->cred_guard_mutex);
236out: 213out:
237 return retval; 214 return retval;
238} 215}
239 216
217/**
218 * ptrace_traceme -- helper for PTRACE_TRACEME
219 *
220 * Performs checks and sets PT_PTRACED.
221 * Should be used by all ptrace implementations for PTRACE_TRACEME.
222 */
223int ptrace_traceme(void)
224{
225 int ret = -EPERM;
226
227 write_lock_irq(&tasklist_lock);
228 /* Are we already being traced? */
229 if (!current->ptrace) {
230 ret = security_ptrace_traceme(current->parent);
231 /*
232 * Check PF_EXITING to ensure ->real_parent has not passed
233 * exit_ptrace(). Otherwise we don't report the error but
234 * pretend ->real_parent untraces us right after return.
235 */
236 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
237 current->ptrace = PT_PTRACED;
238 __ptrace_link(current, current->real_parent);
239 }
240 }
241 write_unlock_irq(&tasklist_lock);
242
243 return ret;
244}
245
240/* 246/*
241 * Called with irqs disabled, returns true if childs should reap themselves. 247 * Called with irqs disabled, returns true if childs should reap themselves.
242 */ 248 */
@@ -418,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
418 424
419static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 425static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
420{ 426{
427 unsigned long flags;
421 int error = -ESRCH; 428 int error = -ESRCH;
422 429
423 read_lock(&tasklist_lock); 430 if (lock_task_sighand(child, &flags)) {
424 if (likely(child->sighand != NULL)) {
425 error = -EINVAL; 431 error = -EINVAL;
426 spin_lock_irq(&child->sighand->siglock);
427 if (likely(child->last_siginfo != NULL)) { 432 if (likely(child->last_siginfo != NULL)) {
428 *info = *child->last_siginfo; 433 *info = *child->last_siginfo;
429 error = 0; 434 error = 0;
430 } 435 }
431 spin_unlock_irq(&child->sighand->siglock); 436 unlock_task_sighand(child, &flags);
432 } 437 }
433 read_unlock(&tasklist_lock);
434 return error; 438 return error;
435} 439}
436 440
437static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) 441static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
438{ 442{
443 unsigned long flags;
439 int error = -ESRCH; 444 int error = -ESRCH;
440 445
441 read_lock(&tasklist_lock); 446 if (lock_task_sighand(child, &flags)) {
442 if (likely(child->sighand != NULL)) {
443 error = -EINVAL; 447 error = -EINVAL;
444 spin_lock_irq(&child->sighand->siglock);
445 if (likely(child->last_siginfo != NULL)) { 448 if (likely(child->last_siginfo != NULL)) {
446 *child->last_siginfo = *info; 449 *child->last_siginfo = *info;
447 error = 0; 450 error = 0;
448 } 451 }
449 spin_unlock_irq(&child->sighand->siglock); 452 unlock_task_sighand(child, &flags);
450 } 453 }
451 read_unlock(&tasklist_lock);
452 return error; 454 return error;
453} 455}
454 456
@@ -575,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
575 return ret; 577 return ret;
576} 578}
577 579
578/** 580static struct task_struct *ptrace_get_task_struct(pid_t pid)
579 * ptrace_traceme -- helper for PTRACE_TRACEME
580 *
581 * Performs checks and sets PT_PTRACED.
582 * Should be used by all ptrace implementations for PTRACE_TRACEME.
583 */
584int ptrace_traceme(void)
585{
586 int ret = -EPERM;
587
588 /*
589 * Are we already being traced?
590 */
591repeat:
592 task_lock(current);
593 if (!(current->ptrace & PT_PTRACED)) {
594 /*
595 * See ptrace_attach() comments about the locking here.
596 */
597 unsigned long flags;
598 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
599 task_unlock(current);
600 do {
601 cpu_relax();
602 } while (!write_can_lock(&tasklist_lock));
603 goto repeat;
604 }
605
606 ret = security_ptrace_traceme(current->parent);
607
608 /*
609 * Check PF_EXITING to ensure ->real_parent has not passed
610 * exit_ptrace(). Otherwise we don't report the error but
611 * pretend ->real_parent untraces us right after return.
612 */
613 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
614 current->ptrace |= PT_PTRACED;
615 __ptrace_link(current, current->real_parent);
616 }
617
618 write_unlock_irqrestore(&tasklist_lock, flags);
619 }
620 task_unlock(current);
621 return ret;
622}
623
624/**
625 * ptrace_get_task_struct -- grab a task struct reference for ptrace
626 * @pid: process id to grab a task_struct reference of
627 *
628 * This function is a helper for ptrace implementations. It checks
629 * permissions and then grabs a task struct for use of the actual
630 * ptrace implementation.
631 *
632 * Returns the task_struct for @pid or an ERR_PTR() on failure.
633 */
634struct task_struct *ptrace_get_task_struct(pid_t pid)
635{ 581{
636 struct task_struct *child; 582 struct task_struct *child;
637 583
638 read_lock(&tasklist_lock); 584 rcu_read_lock();
639 child = find_task_by_vpid(pid); 585 child = find_task_by_vpid(pid);
640 if (child) 586 if (child)
641 get_task_struct(child); 587 get_task_struct(child);
588 rcu_read_unlock();
642 589
643 read_unlock(&tasklist_lock);
644 if (!child) 590 if (!child)
645 return ERR_PTR(-ESRCH); 591 return ERR_PTR(-ESRCH);
646 return child; 592 return child;
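
The two kernel entry points reshuffled above map directly onto the PTRACE_TRACEME and PTRACE_ATTACH requests of ptrace(2): ptrace_traceme() links a willing child to its real parent, while ptrace_attach() now serializes against exec via cred_guard_mutex and takes tasklist_lock only for the final linking. A hedged userspace sketch of the two paths:

	/* Userspace view of the two attach paths touched above. */
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t child = fork();

		if (child == 0) {
			/* Child: PTRACE_TRACEME -> ptrace_traceme() in the
			 * kernel, which links us to our real parent. */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			execlp("true", "true", (char *)NULL);
			_exit(127);
		}

		waitpid(child, NULL, 0);		/* stops after execve() */
		ptrace(PTRACE_CONT, child, NULL, 0);	/* let it run */
		waitpid(child, NULL, 0);

		/* Attaching to an already-running task instead would be
		 * ptrace(PTRACE_ATTACH, pid, NULL, NULL), i.e. ptrace_attach(). */
		return 0;
	}
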
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d3..beb0e659adcc 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
1356 1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; 1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); 1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; 1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq, 1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, 1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret); 1362 ret);
1363 1363
1364 /*
1365 * Signals would prevent us from sleeping, and we cannot
1366 * do much with them in any case. So flush them.
1367 */
1368 if (ret)
1369 flush_signals(current);
1370 couldsleepnext = 0; 1364 couldsleepnext = 0;
1371 1365
1372 } while (!kthread_should_stop()); 1366 } while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9b..7717b95c2027 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1259 check_cpu_stall(rsp, rdp); 1259 check_cpu_stall(rsp, rdp);
1260 1260
1261 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1261 /* Is the RCU core waiting for a quiescent state from this CPU? */
1262 if (rdp->qs_pending) 1262 if (rdp->qs_pending) {
1263 rdp->n_rp_qs_pending++;
1263 return 1; 1264 return 1;
1265 }
1264 1266
1265 /* Does this CPU have callbacks ready to invoke? */ 1267 /* Does this CPU have callbacks ready to invoke? */
1266 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1268 if (cpu_has_callbacks_ready_to_invoke(rdp)) {
1269 rdp->n_rp_cb_ready++;
1267 return 1; 1270 return 1;
1271 }
1268 1272
1269 /* Has RCU gone idle with this CPU needing another grace period? */ 1273 /* Has RCU gone idle with this CPU needing another grace period? */
1270 if (cpu_needs_another_gp(rsp, rdp)) 1274 if (cpu_needs_another_gp(rsp, rdp)) {
1275 rdp->n_rp_cpu_needs_gp++;
1271 return 1; 1276 return 1;
1277 }
1272 1278
1273 /* Has another RCU grace period completed? */ 1279 /* Has another RCU grace period completed? */
1274 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ 1280 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
1281 rdp->n_rp_gp_completed++;
1275 return 1; 1282 return 1;
1283 }
1276 1284
1277 /* Has a new RCU grace period started? */ 1285 /* Has a new RCU grace period started? */
1278 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ 1286 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
1287 rdp->n_rp_gp_started++;
1279 return 1; 1288 return 1;
1289 }
1280 1290
1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1291 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1292 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) 1293 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1294 rdp->n_rp_need_fqs++;
1284 return 1; 1295 return 1;
1296 }
1285 1297
1286 /* nothing to do */ 1298 /* nothing to do */
1299 rdp->n_rp_need_nothing++;
1287 return 0; 1300 return 0;
1288} 1301}
1289 1302
@@ -1520,7 +1533,7 @@ void __init __rcu_init(void)
1520 int j; 1533 int j;
1521 struct rcu_node *rnp; 1534 struct rcu_node *rnp;
1522 1535
1523 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1536 printk(KERN_INFO "Hierarchical RCU implementation.\n");
1524#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1525 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1526#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1533,7 +1546,6 @@ void __init __rcu_init(void)
1533 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); 1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1534 /* Register notifier for non-boot CPUs */ 1547 /* Register notifier for non-boot CPUs */
1535 register_cpu_notifier(&rcu_nb); 1548 register_cpu_notifier(&rcu_nb);
1536 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1537} 1549}
1538 1550
1539module_param(blimit, int, 0); 1551module_param(blimit, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba9404..fe1dcdbf1ca3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
213 .release = single_release, 213 .release = single_release,
214}; 214};
215 215
216static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; 216static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
217{
218 seq_printf(m, "%3d%cnp=%ld "
219 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
220 rdp->cpu,
221 cpu_is_offline(rdp->cpu) ? '!' : ' ',
222 rdp->n_rcu_pending,
223 rdp->n_rp_qs_pending,
224 rdp->n_rp_cb_ready,
225 rdp->n_rp_cpu_needs_gp,
226 rdp->n_rp_gp_completed,
227 rdp->n_rp_gp_started,
228 rdp->n_rp_need_fqs,
229 rdp->n_rp_need_nothing);
230}
231
232static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
233{
234 int cpu;
235 struct rcu_data *rdp;
236
237 for_each_possible_cpu(cpu) {
238 rdp = rsp->rda[cpu];
239 if (rdp->beenonline)
240 print_one_rcu_pending(m, rdp);
241 }
242}
243
244static int show_rcu_pending(struct seq_file *m, void *unused)
245{
246 seq_puts(m, "rcu:\n");
247 print_rcu_pendings(m, &rcu_state);
248 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state);
250 return 0;
251}
252
253static int rcu_pending_open(struct inode *inode, struct file *file)
254{
255 return single_open(file, show_rcu_pending, NULL);
256}
257
258static struct file_operations rcu_pending_fops = {
259 .owner = THIS_MODULE,
260 .open = rcu_pending_open,
261 .read = seq_read,
262 .llseek = seq_lseek,
263 .release = single_release,
264};
265
266static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272
217static int __init rcuclassic_trace_init(void) 273static int __init rcuclassic_trace_init(void)
218{ 274{
219 rcudir = debugfs_create_dir("rcu", NULL); 275 rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
238 NULL, &rcuhier_fops); 294 NULL, &rcuhier_fops);
239 if (!hierdir) 295 if (!hierdir)
240 goto free_out; 296 goto free_out;
297
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir)
301 goto free_out;
241 return 0; 302 return 0;
242free_out: 303free_out:
243 if (datadir) 304 if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
257 debugfs_remove(datadir_csv); 318 debugfs_remove(datadir_csv);
258 debugfs_remove(gpdir); 319 debugfs_remove(gpdir);
259 debugfs_remove(hierdir); 320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
260 debugfs_remove(rcudir); 322 debugfs_remove(rcudir);
261} 323}
262 324
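
The new rcu_pending file joins the other trace files in the "rcu" debugfs directory created above, printing one line per CPU with the counters added to __rcu_pending() earlier in this diff. With debugfs mounted at the customary /sys/kernel/debug (an assumption of the sketch, not something this patch sets up), it reads like any other seq_file:

	/* Dump the new per-CPU "why was RCU pending" counters. */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/kernel/debug/rcu/rcu_pending", "r");

		if (!f) {
			perror("fopen");  /* debugfs not mounted or tracing off */
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}
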
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c803..e1338f074314 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
18void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
133 unsigned long long *res) 133 unsigned long long *res)
134{ 134{
135 char *end; 135 char *end;
136
137 /* return RESOURCE_MAX(unlimited) if "-1" is specified */
138 if (*buf == '-') {
139 *res = simple_strtoull(buf + 1, &end, 10);
140 if (*res != 1 || *end != '\0')
141 return -EINVAL;
142 *res = RESOURCE_MAX;
143 return 0;
144 }
145
136 /* FIXME - make memparse() take const char* args */ 146 /* FIXME - make memparse() take const char* args */
137 *res = memparse((char *)buf, &end); 147 *res = memparse((char *)buf, &end);
138 if (*end != '\0') 148 if (*end != '\0')
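
The new branch in res_counter_memparse_write_strategy() lets userspace write the literal string "-1" to mean RESOURCE_MAX, i.e. unlimited, instead of spelling out a huge byte count. With the memory controller as one likely consumer, that looks roughly like the hedged sketch below (the cgroup mount point and group name are placeholders):

	/* Lift a memcg limit by writing "-1", which the kernel now maps
	 * to RESOURCE_MAX in res_counter_memparse_write_strategy().
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/cgroup/mygroup/memory.limit_in_bytes", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("-1\n", f);
		fclose(f);
		return 0;
	}
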
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
300 * assigned pending owner [which might not have taken the 300 * assigned pending owner [which might not have taken the
301 * lock yet]: 301 * lock yet]:
302 */ 302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock) 303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
304{ 305{
305 struct task_struct *pendowner = rt_mutex_owner(lock); 306 struct task_struct *pendowner = rt_mutex_owner(lock);
306 struct rt_mutex_waiter *next; 307 struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
309 if (!rt_mutex_owner_pending(lock)) 310 if (!rt_mutex_owner_pending(lock))
310 return 0; 311 return 0;
311 312
312 if (pendowner == current) 313 if (pendowner == task)
313 return 1; 314 return 1;
314 315
315 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 spin_lock_irqsave(&pendowner->pi_lock, flags);
316 if (current->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
317 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
318 return 0; 319 return 0;
319 } 320 }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
338 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
339 * enqueued on the pending owners pi_waiters queue. So 340 * enqueued on the pending owners pi_waiters queue. So
340 * we have to enqueue this waiter into 341 * we have to enqueue this waiter into
341 * current->pi_waiters list. This covers the case, 342 * task->pi_waiters list. This covers the case,
342 * where current is boosted because it holds another 343 * where task is boosted because it holds another
343 * lock and gets unboosted because the booster is 344 * lock and gets unboosted because the booster is
344 * interrupted, so we would delay a waiter with higher 345 * interrupted, so we would delay a waiter with higher
345 * priority as current->normal_prio. 346 * priority as task->normal_prio.
346 * 347 *
347 * Note: in the rare case of a SCHED_OTHER task changing 348 * Note: in the rare case of a SCHED_OTHER task changing
348 * its priority and thus stealing the lock, next->task 349 * its priority and thus stealing the lock, next->task
349 * might be current: 350 * might be task:
350 */ 351 */
351 if (likely(next->task != current)) { 352 if (likely(next->task != task)) {
352 spin_lock_irqsave(&current->pi_lock, flags); 353 spin_lock_irqsave(&task->pi_lock, flags);
353 plist_add(&next->pi_list_entry, &current->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
354 __rt_mutex_adjust_prio(current); 355 __rt_mutex_adjust_prio(task);
355 spin_unlock_irqrestore(&current->pi_lock, flags); 356 spin_unlock_irqrestore(&task->pi_lock, flags);
356 } 357 }
357 return 1; 358 return 1;
358} 359}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
389 */ 390 */
390 mark_rt_mutex_waiters(lock); 391 mark_rt_mutex_waiters(lock);
391 392
392 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) 393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
393 return 0; 394 return 0;
394 395
395 /* We got the lock. */ 396 /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
411 */ 412 */
412static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 413static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
413 struct rt_mutex_waiter *waiter, 414 struct rt_mutex_waiter *waiter,
415 struct task_struct *task,
414 int detect_deadlock) 416 int detect_deadlock)
415{ 417{
416 struct task_struct *owner = rt_mutex_owner(lock); 418 struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
418 unsigned long flags; 420 unsigned long flags;
419 int chain_walk = 0, res; 421 int chain_walk = 0, res;
420 422
421 spin_lock_irqsave(&current->pi_lock, flags); 423 spin_lock_irqsave(&task->pi_lock, flags);
422 __rt_mutex_adjust_prio(current); 424 __rt_mutex_adjust_prio(task);
423 waiter->task = current; 425 waiter->task = task;
424 waiter->lock = lock; 426 waiter->lock = lock;
425 plist_node_init(&waiter->list_entry, current->prio); 427 plist_node_init(&waiter->list_entry, task->prio);
426 plist_node_init(&waiter->pi_list_entry, current->prio); 428 plist_node_init(&waiter->pi_list_entry, task->prio);
427 429
428 /* Get the top priority waiter on the lock */ 430 /* Get the top priority waiter on the lock */
429 if (rt_mutex_has_waiters(lock)) 431 if (rt_mutex_has_waiters(lock))
430 top_waiter = rt_mutex_top_waiter(lock); 432 top_waiter = rt_mutex_top_waiter(lock);
431 plist_add(&waiter->list_entry, &lock->wait_list); 433 plist_add(&waiter->list_entry, &lock->wait_list);
432 434
433 current->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
434 436
435 spin_unlock_irqrestore(&current->pi_lock, flags); 437 spin_unlock_irqrestore(&task->pi_lock, flags);
436 438
437 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
438 spin_lock_irqsave(&owner->pi_lock, flags); 440 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
460 spin_unlock(&lock->wait_lock); 462 spin_unlock(&lock->wait_lock);
461 463
462 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
463 current); 465 task);
464 466
465 spin_lock(&lock->wait_lock); 467 spin_lock(&lock->wait_lock);
466 468
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
605 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 607 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
606} 608}
607 609
608/* 610/**
609 * Slow path lock function: 611 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
612 * @lock: the rt_mutex to take
613 * @state: the state the task should block in (TASK_INTERRUPTIBLE
614 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 *
619 * lock->wait_lock must be held by the caller.
610 */ 620 */
611static int __sched 621static int __sched
612rt_mutex_slowlock(struct rt_mutex *lock, int state, 622__rt_mutex_slowlock(struct rt_mutex *lock, int state,
613 struct hrtimer_sleeper *timeout, 623 struct hrtimer_sleeper *timeout,
614 int detect_deadlock) 624 struct rt_mutex_waiter *waiter,
625 int detect_deadlock)
615{ 626{
616 struct rt_mutex_waiter waiter;
617 int ret = 0; 627 int ret = 0;
618 628
619 debug_rt_mutex_init_waiter(&waiter);
620 waiter.task = NULL;
621
622 spin_lock(&lock->wait_lock);
623
624 /* Try to acquire the lock again: */
625 if (try_to_take_rt_mutex(lock)) {
626 spin_unlock(&lock->wait_lock);
627 return 0;
628 }
629
630 set_current_state(state);
631
632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) {
634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 if (!hrtimer_active(&timeout->timer))
636 timeout->task = NULL;
637 }
638
639 for (;;) { 629 for (;;) {
640 /* Try to acquire the lock: */ 630 /* Try to acquire the lock: */
641 if (try_to_take_rt_mutex(lock)) 631 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
656 } 646 }
657 647
658 /* 648 /*
659 * waiter.task is NULL the first time we come here and 649 * waiter->task is NULL the first time we come here and
660 * when we have been woken up by the previous owner 650 * when we have been woken up by the previous owner
661 * but the lock got stolen by a higher prio task. 651 * but the lock got stolen by a higher prio task.
662 */ 652 */
663 if (!waiter.task) { 653 if (!waiter->task) {
664 ret = task_blocks_on_rt_mutex(lock, &waiter, 654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
665 detect_deadlock); 655 detect_deadlock);
666 /* 656 /*
667 * If we got woken up by the owner then start loop 657 * If we got woken up by the owner then start loop
668 * all over without going into schedule to try 658 * all over without going into schedule to try
669 * to get the lock now: 659 * to get the lock now:
670 */ 660 */
671 if (unlikely(!waiter.task)) { 661 if (unlikely(!waiter->task)) {
672 /* 662 /*
673 * Reset the return value. We might 663 * Reset the return value. We might
674 * have returned with -EDEADLK and the 664 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
684 674
685 spin_unlock(&lock->wait_lock); 675 spin_unlock(&lock->wait_lock);
686 676
687 debug_rt_mutex_print_deadlock(&waiter); 677 debug_rt_mutex_print_deadlock(waiter);
688 678
689 if (waiter.task) 679 if (waiter->task)
690 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
691 681
692 spin_lock(&lock->wait_lock); 682 spin_lock(&lock->wait_lock);
693 set_current_state(state); 683 set_current_state(state);
694 } 684 }
695 685
686 return ret;
687}
688
689/*
690 * Slow path lock function:
691 */
692static int __sched
693rt_mutex_slowlock(struct rt_mutex *lock, int state,
694 struct hrtimer_sleeper *timeout,
695 int detect_deadlock)
696{
697 struct rt_mutex_waiter waiter;
698 int ret = 0;
699
700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702
703 spin_lock(&lock->wait_lock);
704
705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock);
708 return 0;
709 }
710
711 set_current_state(state);
712
713 /* Setup the timer, when timeout != NULL */
714 if (unlikely(timeout)) {
715 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
716 if (!hrtimer_active(&timeout->timer))
717 timeout->task = NULL;
718 }
719
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
721 detect_deadlock);
722
696 set_current_state(TASK_RUNNING); 723 set_current_state(TASK_RUNNING);
697 724
698 if (unlikely(waiter.task)) 725 if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 891EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 892
866/** 893/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 894 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 895 * the timeout structure is provided
869 * by the caller 896 * by the caller
870 * 897 *
871 * @lock: the rt_mutex to be locked 898 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 899 * @timeout: timeout structure or NULL (no timeout)
@@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
875 * Returns: 902 * Returns:
876 * 0 on success 903 * 0 on success
877 * -EINTR when interrupted by a signal 904 * -EINTR when interrupted by a signal
878 * -ETIMEOUT when the timeout expired 905 * -ETIMEDOUT when the timeout expired
879 * -EDEADLK when the lock would deadlock (when deadlock detection is on) 906 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
880 */ 907 */
881int 908int
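The return values listed above are easiest to read next to a caller. Below is a minimal, hypothetical sketch; lock_with_deadline() and its arguments are invented rather than taken from this patch, and the hrtimer_sleeper setup simply follows the common pattern of this era. Note that the slow path shown earlier starts the timer itself, so the caller only has to set the expiry:

#include <linux/hrtimer.h>
#include <linux/rtmutex.h>
#include <linux/sched.h>

/* Hypothetical caller: take 'lock' or give up at the absolute time 'deadline'. */
static int lock_with_deadline(struct rt_mutex *lock, ktime_t deadline)
{
        struct hrtimer_sleeper to;
        int ret;

        hrtimer_init_on_stack(&to.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        hrtimer_init_sleeper(&to, current);
        hrtimer_set_expires(&to.timer, deadline);

        /* 0, -EINTR, -ETIMEDOUT or -EDEADLK, as documented above. */
        ret = rt_mutex_timed_lock(lock, &to, 0);

        destroy_hrtimer_on_stack(&to.timer);
        return ret;
}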
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 940}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 941EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 942
916/*** 943/**
917 * rt_mutex_destroy - mark a mutex unusable 944 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 945 * @lock: the mutex to be destroyed
919 * 946 *
@@ -986,6 +1013,57 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
986} 1013}
987 1014
988/** 1015/**
1016 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1017 * @lock: the rt_mutex to take
1018 * @waiter: the pre-initialized rt_mutex_waiter
1019 * @task: the task to prepare
1020 * @detect_deadlock: perform deadlock detection (1) or not (0)
1021 *
1022 * Returns:
1023 * 0 - task blocked on lock
1024 * 1 - acquired the lock for task, caller should wake it up
1025 * <0 - error
1026 *
1027 * Special API call for FUTEX_REQUEUE_PI support.
1028 */
1029int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1030 struct rt_mutex_waiter *waiter,
1031 struct task_struct *task, int detect_deadlock)
1032{
1033 int ret;
1034
1035 spin_lock(&lock->wait_lock);
1036
1037 mark_rt_mutex_waiters(lock);
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1;
1046 }
1047
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049
1050 if (ret && !waiter->task) {
1051 /*
1052 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner
1054 * released the lock while we were walking the
1055 * pi chain. Let the waiter sort it out.
1056 */
1057 ret = 0;
1058 }
1059 spin_unlock(&lock->wait_lock);
1060
1061 debug_rt_mutex_print_deadlock(waiter);
1062
1063 return ret;
1064}
1065
1066/**
989 * rt_mutex_next_owner - return the next owner of the lock 1067 * rt_mutex_next_owner - return the next owner of the lock
990 * 1068 *
991 * @lock: the rt lock query 1069 * @lock: the rt lock query
@@ -1004,3 +1082,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1004 1082
1005 return rt_mutex_top_waiter(lock)->task; 1083 return rt_mutex_top_waiter(lock)->task;
1006} 1084}
1085
1086/**
1087 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1088 * @lock: the rt_mutex we were woken on
1089 * @to: the timeout, null if none. hrtimer should already have
1090 * been started.
1091 * @waiter: the pre-initialized rt_mutex_waiter
1092 * @detect_deadlock: perform deadlock detection (1) or not (0)
1093 *
1094 * Complete the lock acquisition started on our behalf by another thread.

1095 *
1096 * Returns:
1097 * 0 - success
1098 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
1099 *
1100 * Special API call for PI-futex requeue support
1101 */
1102int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1103 struct hrtimer_sleeper *to,
1104 struct rt_mutex_waiter *waiter,
1105 int detect_deadlock)
1106{
1107 int ret;
1108
1109 spin_lock(&lock->wait_lock);
1110
1111 set_current_state(TASK_INTERRUPTIBLE);
1112
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
1114 detect_deadlock);
1115
1116 set_current_state(TASK_RUNNING);
1117
1118 if (unlikely(waiter->task))
1119 remove_waiter(lock, waiter);
1120
1121 /*
1122 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1123 * have to fix that up.
1124 */
1125 fixup_rt_mutex_waiters(lock);
1126
1127 spin_unlock(&lock->wait_lock);
1128
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret;
1138}
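To see how the two proxy-lock halves are meant to fit together, here is a hedged sketch of a requeue-style caller. It is not the FUTEX_REQUEUE_PI code itself (that is the intended in-tree user, per the "Special API call" comments above); the helper names start_acquire_for() and finish_acquire() are invented, error handling is trimmed, and it assumes the kernel-internal rtmutex_common.h is in scope:

#include <linux/sched.h>
#include "rtmutex_common.h"     /* kernel-internal header, as kernel/futex.c uses */

/* Setup side: runs in the task doing the requeue, not in 'task' itself. */
static int start_acquire_for(struct rt_mutex *lock,
                             struct rt_mutex_waiter *waiter,
                             struct task_struct *task)
{
        int ret;

        debug_rt_mutex_init_waiter(waiter);
        waiter->task = NULL;

        ret = rt_mutex_start_proxy_lock(lock, waiter, task, 0);
        if (ret == 1) {
                /* Lock was taken on task's behalf; the caller wakes it up. */
                wake_up_process(task);
                return 0;
        }
        return ret;             /* 0: task is now a waiter, <0: error */
}

/* Acquirer side: runs in 'task' after it has been woken up. */
static int finish_acquire(struct rt_mutex *lock,
                          struct rt_mutex_waiter *waiter,
                          struct hrtimer_sleeper *to)
{
        /*
         * Sleeps interruptibly until the lock is acquired, the timeout
         * expires or a signal arrives; removes the waiter and drops any
         * leftover PI boost on failure.
         */
        return rt_mutex_finish_proxy_lock(lock, to, waiter, 0);
}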
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
124 struct rt_mutex_waiter *waiter,
125 struct task_struct *task,
126 int detect_deadlock);
127extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
128 struct hrtimer_sleeper *to,
129 struct rt_mutex_waiter *waiter,
130 int detect_deadlock);
123 131
124#ifdef CONFIG_DEBUG_RT_MUTEXES 132#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h" 133# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -68,17 +69,18 @@
68#include <linux/pagemap.h> 69#include <linux/pagemap.h>
69#include <linux/hrtimer.h> 70#include <linux/hrtimer.h>
70#include <linux/tick.h> 71#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h> 72#include <linux/debugfs.h>
73#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
76 75
77#include <asm/tlb.h> 76#include <asm/tlb.h>
78#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
79 78
80#include "sched_cpupri.h" 79#include "sched_cpupri.h"
81 80
81#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h>
83
82/* 84/*
83 * Convert user-nice values [ -20 ... 0 ... 19 ] 85 * Convert user-nice values [ -20 ... 0 ... 19 ]
84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 86 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
118 */ 120 */
119#define RUNTIME_INF ((u64)~0ULL) 121#define RUNTIME_INF ((u64)~0ULL)
120 122
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
127#ifdef CONFIG_SMP 123#ifdef CONFIG_SMP
128 124
129static void double_rq_lock(struct rq *rq1, struct rq *rq2); 125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 240 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft)); 241 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0); 243 HRTIMER_MODE_ABS_PINNED, 0);
248 } 244 }
249 spin_unlock(&rt_b->rt_runtime_lock); 245 spin_unlock(&rt_b->rt_runtime_lock);
250} 246}
@@ -497,6 +493,7 @@ struct rt_rq {
497#endif 493#endif
498#ifdef CONFIG_SMP 494#ifdef CONFIG_SMP
499 unsigned long rt_nr_migratory; 495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
500 int overloaded; 497 int overloaded;
501 struct plist_head pushable_tasks; 498 struct plist_head pushable_tasks;
502#endif 499#endif
@@ -584,6 +581,7 @@ struct rq {
584 struct load_weight load; 581 struct load_weight load;
585 unsigned long nr_load_updates; 582 unsigned long nr_load_updates;
586 u64 nr_switches; 583 u64 nr_switches;
584 u64 nr_migrations_in;
587 585
588 struct cfs_rq cfs; 586 struct cfs_rq cfs;
589 struct rt_rq rt; 587 struct rt_rq rt;
@@ -630,6 +628,10 @@ struct rq {
630 struct list_head migration_queue; 628 struct list_head migration_queue;
631#endif 629#endif
632 630
631 /* calc_load related fields */
632 unsigned long calc_load_update;
633 long calc_load_active;
634
633#ifdef CONFIG_SCHED_HRTICK 635#ifdef CONFIG_SCHED_HRTICK
634#ifdef CONFIG_SMP 636#ifdef CONFIG_SMP
635 int hrtick_csd_pending; 637 int hrtick_csd_pending;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 694#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 695#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 696
695static inline void update_rq_clock(struct rq *rq) 697inline void update_rq_clock(struct rq *rq)
696{ 698{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 699 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 700}
@@ -1154,7 +1156,7 @@ static __init void init_hrtick(void)
1154static void hrtick_start(struct rq *rq, u64 delay) 1156static void hrtick_start(struct rq *rq, u64 delay)
1155{ 1157{
1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1158 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0); 1159 HRTIMER_MODE_REL_PINNED, 0);
1158} 1160}
1159 1161
1160static inline void init_hrtick(void) 1162static inline void init_hrtick(void)
@@ -1728,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1728} 1730}
1729#endif 1731#endif
1730 1732
1733static void calc_load_account_active(struct rq *this_rq);
1734
1731#include "sched_stats.h" 1735#include "sched_stats.h"
1732#include "sched_idletask.c" 1736#include "sched_idletask.c"
1733#include "sched_fair.c" 1737#include "sched_fair.c"
@@ -1958,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1958 1962
1959 clock_offset = old_rq->clock - new_rq->clock; 1963 clock_offset = old_rq->clock - new_rq->clock;
1960 1964
1961 trace_sched_migrate_task(p, task_cpu(p), new_cpu); 1965 trace_sched_migrate_task(p, new_cpu);
1962 1966
1963#ifdef CONFIG_SCHEDSTATS 1967#ifdef CONFIG_SCHEDSTATS
1964 if (p->se.wait_start) 1968 if (p->se.wait_start)
@@ -1967,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1971 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1972 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1973 p->se.block_start -= clock_offset;
1974#endif
1970 if (old_cpu != new_cpu) { 1975 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1976 p->se.nr_migrations++;
1977 new_rq->nr_migrations_in++;
1978#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1979 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1980 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1981#endif
1982 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1983 1, 1, NULL, 0);
1984 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1985 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1986 new_cfsrq->min_vruntime;
1978 1987
@@ -2015,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2015} 2024}
2016 2025
2017/* 2026/*
2027 * wait_task_context_switch - wait for a thread to complete at least one
2028 * context switch.
2029 *
2030 * @p must not be current.
2031 */
2032void wait_task_context_switch(struct task_struct *p)
2033{
2034 unsigned long nvcsw, nivcsw, flags;
2035 int running;
2036 struct rq *rq;
2037
2038 nvcsw = p->nvcsw;
2039 nivcsw = p->nivcsw;
2040 for (;;) {
2041 /*
2042 * The runqueue is assigned before the actual context
2043 * switch. We need to take the runqueue lock.
2044 *
2045 * We could check initially without the lock but it is
2046 * very likely that we need to take the lock in every
2047 * iteration.
2048 */
2049 rq = task_rq_lock(p, &flags);
2050 running = task_running(rq, p);
2051 task_rq_unlock(rq, &flags);
2052
2053 if (likely(!running))
2054 break;
2055 /*
2056 * The switch count is incremented before the actual
2057 * context switch. We thus wait for two switches to be
2058 * sure at least one completed.
2059 */
2060 if ((p->nvcsw - nvcsw) > 1)
2061 break;
2062 if ((p->nivcsw - nivcsw) > 1)
2063 break;
2064
2065 cpu_relax();
2066 }
2067}
2068
2069/*
2018 * wait_task_inactive - wait for a thread to unschedule. 2070 * wait_task_inactive - wait for a thread to unschedule.
2019 * 2071 *
2020 * If @match_state is nonzero, it's the @p->state value just checked and 2072 * If @match_state is nonzero, it's the @p->state value just checked and
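A brief, hypothetical illustration of the intended use of wait_task_context_switch(); only that function comes from this patch, the helper below is made up:

/*
 * Suppose some per-task state is only sampled by the scheduler when the
 * task is switched in.  After changing it from the outside, wait until
 * the change is guaranteed to have taken effect.
 */
static void update_and_sync(struct task_struct *p)
{
        mark_task_state_dirty(p);               /* made-up helper */

        if (p != current)                       /* @p must not be current */
                wait_task_context_switch(p);
}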
@@ -2142,6 +2194,7 @@ void kick_process(struct task_struct *p)
2142 smp_send_reschedule(cpu); 2194 smp_send_reschedule(cpu);
2143 preempt_enable(); 2195 preempt_enable();
2144} 2196}
2197EXPORT_SYMBOL_GPL(kick_process);
2145 2198
2146/* 2199/*
2147 * Return a low guess at the load of a migration-source cpu weighted 2200 * Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2377
2325#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2326 2379
2380/**
2381 * task_oncpu_function_call - call a function on the cpu on which a task runs
2382 * @p: the task to evaluate
2383 * @func: the function to be called
2384 * @info: the function call argument
2385 *
2386 * Calls the function @func when the task is currently running. If that
2387 * happens to be the current CPU, the function is simply called directly.
2388 */
2389void task_oncpu_function_call(struct task_struct *p,
2390 void (*func) (void *info), void *info)
2391{
2392 int cpu;
2393
2394 preempt_disable();
2395 cpu = task_cpu(p);
2396 if (task_curr(p))
2397 smp_call_function_single(cpu, func, info, 1);
2398 preempt_enable();
2399}
2400
2327/*** 2401/***
2328 * try_to_wake_up - wake up a thread 2402 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2403 * @p: the to-be-woken-up thread
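As a concrete but hypothetical use of task_oncpu_function_call(), the sketch below samples which CPU a task is currently running on; struct remote_probe and both function names are invented. Note the caveat implied above: if the task is not on a CPU at that moment, the callback is not invoked at all:

#include <linux/sched.h>
#include <linux/smp.h>

struct remote_probe {
        struct task_struct *task;
        int cpu;                        /* filled in by the callback */
};

static void probe_on_task_cpu(void *info)
{
        struct remote_probe *rp = info;

        /* Runs synchronously on the CPU where rp->task is running. */
        rp->cpu = smp_processor_id();
}

static void probe_task(struct task_struct *p)
{
        struct remote_probe rp = { .task = p, .cpu = -1 };

        task_oncpu_function_call(p, probe_on_task_cpu, &rp);
        /* rp.cpu stays -1 if p was not running when we looked. */
}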
@@ -2458,6 +2532,17 @@ out:
2458 return success; 2532 return success;
2459} 2533}
2460 2534
2535/**
2536 * wake_up_process - Wake up a specific process
2537 * @p: The process to be woken up.
2538 *
2539 * Attempt to wake up the nominated process and move it to the set of runnable
2540 * processes. Returns 1 if the process was woken up, 0 if it was already
2541 * running.
2542 *
2543 * It may be assumed that this function implies a write memory barrier before
2544 * changing the task state if and only if any tasks are woken up.
2545 */
2461int wake_up_process(struct task_struct *p) 2546int wake_up_process(struct task_struct *p)
2462{ 2547{
2463 return try_to_wake_up(p, TASK_ALL, 0); 2548 return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2565 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2566 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2567 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2569 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2570 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2571 p->se.start_runtime = 0;
2486 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2487 2573
2488#ifdef CONFIG_SCHEDSTATS 2574#ifdef CONFIG_SCHEDSTATS
2489 p->se.wait_start = 0; 2575 p->se.wait_start = 0;
2490 p->se.sum_sleep_runtime = 0; 2576 p->se.wait_max = 0;
2491 p->se.sleep_start = 0; 2577 p->se.wait_count = 0;
2492 p->se.block_start = 0; 2578 p->se.wait_sum = 0;
2493 p->se.sleep_max = 0; 2579
2494 p->se.block_max = 0; 2580 p->se.sleep_start = 0;
2495 p->se.exec_max = 0; 2581 p->se.sleep_max = 0;
2496 p->se.slice_max = 0; 2582 p->se.sum_sleep_runtime = 0;
2497 p->se.wait_max = 0; 2583
2584 p->se.block_start = 0;
2585 p->se.block_max = 0;
2586 p->se.exec_max = 0;
2587 p->se.slice_max = 0;
2588
2589 p->se.nr_migrations_cold = 0;
2590 p->se.nr_failed_migrations_affine = 0;
2591 p->se.nr_failed_migrations_running = 0;
2592 p->se.nr_failed_migrations_hot = 0;
2593 p->se.nr_forced_migrations = 0;
2594 p->se.nr_forced2_migrations = 0;
2595
2596 p->se.nr_wakeups = 0;
2597 p->se.nr_wakeups_sync = 0;
2598 p->se.nr_wakeups_migrate = 0;
2599 p->se.nr_wakeups_local = 0;
2600 p->se.nr_wakeups_remote = 0;
2601 p->se.nr_wakeups_affine = 0;
2602 p->se.nr_wakeups_affine_attempts = 0;
2603 p->se.nr_wakeups_passive = 0;
2604 p->se.nr_wakeups_idle = 0;
2605
2498#endif 2606#endif
2499 2607
2500 INIT_LIST_HEAD(&p->rt.run_list); 2608 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2710,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2818 */
2711 prev_state = prev->state; 2819 prev_state = prev->state;
2712 finish_arch_switch(prev); 2820 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2822 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2715 if (post_schedule) 2824 if (post_schedule)
@@ -2766,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2766 * combine the page table reload and the switch backend into 2875 * combine the page table reload and the switch backend into
2767 * one hypercall. 2876 * one hypercall.
2768 */ 2877 */
2769 arch_enter_lazy_cpu_mode(); 2878 arch_start_context_switch(prev);
2770 2879
2771 if (unlikely(!mm)) { 2880 if (unlikely(!mm)) {
2772 next->active_mm = oldmm; 2881 next->active_mm = oldmm;
@@ -2856,19 +2965,81 @@ unsigned long nr_iowait(void)
2856 return sum; 2965 return sum;
2857} 2966}
2858 2967
2859unsigned long nr_active(void) 2968/* Variables and functions for calc_load */
2969static atomic_long_t calc_load_tasks;
2970static unsigned long calc_load_update;
2971unsigned long avenrun[3];
2972EXPORT_SYMBOL(avenrun);
2973
2974/**
2975 * get_avenrun - get the load average array
2976 * @loads: pointer to dest load array
2977 * @offset: offset to add
2978 * @shift: shift count to shift the result left
2979 *
2980 * These values are estimates at best, so no need for locking.
2981 */
2982void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2860{ 2983{
2861 unsigned long i, running = 0, uninterruptible = 0; 2984 loads[0] = (avenrun[0] + offset) << shift;
2985 loads[1] = (avenrun[1] + offset) << shift;
2986 loads[2] = (avenrun[2] + offset) << shift;
2987}
2862 2988
2863 for_each_online_cpu(i) { 2989static unsigned long
2864 running += cpu_rq(i)->nr_running; 2990calc_load(unsigned long load, unsigned long exp, unsigned long active)
2865 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2991{
2866 } 2992 load *= exp;
2993 load += active * (FIXED_1 - exp);
2994 return load >> FSHIFT;
2995}
2996
2997/*
2998 * calc_global_load - update the avenrun load estimates 10 ticks after the
2999 * CPUs have updated calc_load_tasks.
3000 */
3001void calc_global_load(void)
3002{
3003 unsigned long upd = calc_load_update + 10;
3004 long active;
3005
3006 if (time_before(jiffies, upd))
3007 return;
3008
3009 active = atomic_long_read(&calc_load_tasks);
3010 active = active > 0 ? active * FIXED_1 : 0;
2867 3011
2868 if (unlikely((long)uninterruptible < 0)) 3012 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2869 uninterruptible = 0; 3013 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3014 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2870 3015
2871 return running + uninterruptible; 3016 calc_load_update += LOAD_FREQ;
3017}
3018
3019/*
3020 * Either called from update_cpu_load() or from a cpu going idle
3021 */
3022static void calc_load_account_active(struct rq *this_rq)
3023{
3024 long nr_active, delta;
3025
3026 nr_active = this_rq->nr_running;
3027 nr_active += (long) this_rq->nr_uninterruptible;
3028
3029 if (nr_active != this_rq->calc_load_active) {
3030 delta = nr_active - this_rq->calc_load_active;
3031 this_rq->calc_load_active = nr_active;
3032 atomic_long_add(delta, &calc_load_tasks);
3033 }
3034}
3035
3036/*
3037 * Externally visible per-cpu scheduler statistics:
3038 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3039 */
3040u64 cpu_nr_migrations(int cpu)
3041{
3042 return cpu_rq(cpu)->nr_migrations_in;
2872} 3043}
2873 3044
2874/* 3045/*
@@ -2899,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq)
2899 new_load += scale-1; 3070 new_load += scale-1;
2900 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3071 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2901 } 3072 }
3073
3074 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3075 this_rq->calc_load_update += LOAD_FREQ;
3076 calc_load_account_active(this_rq);
3077 }
2902} 3078}
2903 3079
2904#ifdef CONFIG_SMP 3080#ifdef CONFIG_SMP
@@ -4240,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4240static struct { 4416static struct {
4241 atomic_t load_balancer; 4417 atomic_t load_balancer;
4242 cpumask_var_t cpu_mask; 4418 cpumask_var_t cpu_mask;
4419 cpumask_var_t ilb_grp_nohz_mask;
4243} nohz ____cacheline_aligned = { 4420} nohz ____cacheline_aligned = {
4244 .load_balancer = ATOMIC_INIT(-1), 4421 .load_balancer = ATOMIC_INIT(-1),
4245}; 4422};
4246 4423
4424int get_nohz_load_balancer(void)
4425{
4426 return atomic_read(&nohz.load_balancer);
4427}
4428
4429#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4430/**
4431 * lowest_flag_domain - Return lowest sched_domain containing flag.
4432 * @cpu: The cpu whose lowest level of sched domain is to
4433 * be returned.
4434 * @flag: The flag to check for the lowest sched_domain
4435 * for the given cpu.
4436 *
4437 * Returns the lowest sched_domain of a cpu which contains the given flag.
4438 */
4439static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4440{
4441 struct sched_domain *sd;
4442
4443 for_each_domain(cpu, sd)
4444 if (sd && (sd->flags & flag))
4445 break;
4446
4447 return sd;
4448}
4449
4450/**
4451 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4452 * @cpu: The cpu whose domains we're iterating over.
4453 * @sd: variable holding the value of the power_savings_sd
4454 * for cpu.
4455 * @flag: The flag to filter the sched_domains to be iterated.
4456 *
4457 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4458 * set, starting from the lowest sched_domain to the highest.
4459 */
4460#define for_each_flag_domain(cpu, sd, flag) \
4461 for (sd = lowest_flag_domain(cpu, flag); \
4462 (sd && (sd->flags & flag)); sd = sd->parent)
4463
4464/**
4465 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4466 * @ilb_group: group to be checked for semi-idleness
4467 *
4468 * Returns: 1 if the group is semi-idle. 0 otherwise.
4469 *
4470 * We define a sched_group to be semi-idle if it has at least one idle CPU
4471 * and at least one non-idle CPU. This helper function checks if the given
4472 * sched_group is semi-idle or not.
4473 */
4474static inline int is_semi_idle_group(struct sched_group *ilb_group)
4475{
4476 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4477 sched_group_cpus(ilb_group));
4478
4479 /*
4480 * A sched_group is semi-idle when it has at least one busy cpu
4481 * and at least one idle cpu.
4482 */
4483 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4484 return 0;
4485
4486 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4487 return 0;
4488
4489 return 1;
4490}
4491/**
4492 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4493 * @cpu: The cpu which is nominating a new idle_load_balancer.
4494 *
4495 * Returns: the id of the idle load balancer if it exists,
4496 * else >= nr_cpu_ids.
4497 *
4498 * This algorithm picks the idle load balancer such that it belongs to a
4499 * semi-idle powersavings sched_domain. The idea is to try and avoid
4500 * completely idle packages/cores just for the purpose of idle load balancing
4501 * when there are other idle CPUs which are better suited for that job.
4502 */
4503static int find_new_ilb(int cpu)
4504{
4505 struct sched_domain *sd;
4506 struct sched_group *ilb_group;
4507
4508 /*
4509 * Have idle load balancer selection from semi-idle packages only
4510 * when power-aware load balancing is enabled
4511 */
4512 if (!(sched_smt_power_savings || sched_mc_power_savings))
4513 goto out_done;
4514
4515 /*
4516 * Optimize for the case when we have no idle CPUs or only one
4517 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4518 */
4519 if (cpumask_weight(nohz.cpu_mask) < 2)
4520 goto out_done;
4521
4522 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4523 ilb_group = sd->groups;
4524
4525 do {
4526 if (is_semi_idle_group(ilb_group))
4527 return cpumask_first(nohz.ilb_grp_nohz_mask);
4528
4529 ilb_group = ilb_group->next;
4530
4531 } while (ilb_group != sd->groups);
4532 }
4533
4534out_done:
4535 return cpumask_first(nohz.cpu_mask);
4536}
4537#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4538static inline int find_new_ilb(int call_cpu)
4539{
4540 return cpumask_first(nohz.cpu_mask);
4541}
4542#endif
4543
4247/* 4544/*
4248 * This routine will try to nominate the ilb (idle load balancing) 4545 * This routine will try to nominate the ilb (idle load balancing)
4249 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick)
4298 /* make me the ilb owner */ 4595 /* make me the ilb owner */
4299 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4596 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4300 return 1; 4597 return 1;
4301 } else if (atomic_read(&nohz.load_balancer) == cpu) 4598 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4599 int new_ilb;
4600
4601 if (!(sched_smt_power_savings ||
4602 sched_mc_power_savings))
4603 return 1;
4604 /*
4605 * Check to see if there is a more power-efficient
4606 * ilb.
4607 */
4608 new_ilb = find_new_ilb(cpu);
4609 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4610 atomic_set(&nohz.load_balancer, -1);
4611 resched_cpu(new_ilb);
4612 return 0;
4613 }
4302 return 1; 4614 return 1;
4615 }
4303 } else { 4616 } else {
4304 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4617 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4305 return 0; 4618 return 0;
@@ -4468,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4468 } 4781 }
4469 4782
4470 if (atomic_read(&nohz.load_balancer) == -1) { 4783 if (atomic_read(&nohz.load_balancer) == -1) {
4471 /* 4784 int ilb = find_new_ilb(cpu);
4472 * simple selection for now: Nominate the
4473 * first cpu in the nohz list to be the next
4474 * ilb owner.
4475 *
4476 * TBD: Traverse the sched domains and nominate
4477 * the nearest cpu in the nohz.cpu_mask.
4478 */
4479 int ilb = cpumask_first(nohz.cpu_mask);
4480 4785
4481 if (ilb < nr_cpu_ids) 4786 if (ilb < nr_cpu_ids)
4482 resched_cpu(ilb); 4787 resched_cpu(ilb);
@@ -4840,6 +5145,8 @@ void scheduler_tick(void)
4840 curr->sched_class->task_tick(rq, curr, 0); 5145 curr->sched_class->task_tick(rq, curr, 0);
4841 spin_unlock(&rq->lock); 5146 spin_unlock(&rq->lock);
4842 5147
5148 perf_counter_task_tick(curr, cpu);
5149
4843#ifdef CONFIG_SMP 5150#ifdef CONFIG_SMP
4844 rq->idle_at_tick = idle_cpu(cpu); 5151 rq->idle_at_tick = idle_cpu(cpu);
4845 trigger_load_balance(rq, cpu); 5152 trigger_load_balance(rq, cpu);
@@ -5007,13 +5314,15 @@ pick_next_task(struct rq *rq)
5007/* 5314/*
5008 * schedule() is the main scheduler function. 5315 * schedule() is the main scheduler function.
5009 */ 5316 */
5010asmlinkage void __sched __schedule(void) 5317asmlinkage void __sched schedule(void)
5011{ 5318{
5012 struct task_struct *prev, *next; 5319 struct task_struct *prev, *next;
5013 unsigned long *switch_count; 5320 unsigned long *switch_count;
5014 struct rq *rq; 5321 struct rq *rq;
5015 int cpu; 5322 int cpu;
5016 5323
5324need_resched:
5325 preempt_disable();
5017 cpu = smp_processor_id(); 5326 cpu = smp_processor_id();
5018 rq = cpu_rq(cpu); 5327 rq = cpu_rq(cpu);
5019 rcu_qsctr_inc(cpu); 5328 rcu_qsctr_inc(cpu);
@@ -5053,6 +5362,7 @@ need_resched_nonpreemptible:
5053 5362
5054 if (likely(prev != next)) { 5363 if (likely(prev != next)) {
5055 sched_info_switch(prev, next); 5364 sched_info_switch(prev, next);
5365 perf_counter_task_sched_out(prev, next, cpu);
5056 5366
5057 rq->nr_switches++; 5367 rq->nr_switches++;
5058 rq->curr = next; 5368 rq->curr = next;
@@ -5070,15 +5380,9 @@ need_resched_nonpreemptible:
5070 5380
5071 if (unlikely(reacquire_kernel_lock(current) < 0)) 5381 if (unlikely(reacquire_kernel_lock(current) < 0))
5072 goto need_resched_nonpreemptible; 5382 goto need_resched_nonpreemptible;
5073}
5074 5383
5075asmlinkage void __sched schedule(void)
5076{
5077need_resched:
5078 preempt_disable();
5079 __schedule();
5080 preempt_enable_no_resched(); 5384 preempt_enable_no_resched();
5081 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5385 if (need_resched())
5082 goto need_resched; 5386 goto need_resched;
5083} 5387}
5084EXPORT_SYMBOL(schedule); 5388EXPORT_SYMBOL(schedule);
@@ -5221,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function);
5221 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5525 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5222 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5526 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5223 */ 5527 */
5224void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5225 int nr_exclusive, int sync, void *key) 5529 int nr_exclusive, int sync, void *key)
5226{ 5530{
5227 wait_queue_t *curr, *next; 5531 wait_queue_t *curr, *next;
@@ -5241,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5241 * @mode: which threads 5545 * @mode: which threads
5242 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5546 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5243 * @key: is directly passed to the wakeup function 5547 * @key: is directly passed to the wakeup function
5548 *
5549 * It may be assumed that this function implies a write memory barrier before
5550 * changing the task state if and only if any tasks are woken up.
5244 */ 5551 */
5245void __wake_up(wait_queue_head_t *q, unsigned int mode, 5552void __wake_up(wait_queue_head_t *q, unsigned int mode,
5246 int nr_exclusive, void *key) 5553 int nr_exclusive, void *key)
@@ -5279,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5279 * with each other. This can prevent needless bouncing between CPUs. 5586 * with each other. This can prevent needless bouncing between CPUs.
5280 * 5587 *
5281 * On UP it can prevent extra preemption. 5588 * On UP it can prevent extra preemption.
5589 *
5590 * It may be assumed that this function implies a write memory barrier before
5591 * changing the task state if and only if any tasks are woken up.
5282 */ 5592 */
5283void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5593void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5284 int nr_exclusive, void *key) 5594 int nr_exclusive, void *key)
@@ -5315,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5315 * awakened in the same order in which they were queued. 5625 * awakened in the same order in which they were queued.
5316 * 5626 *
5317 * See also complete_all(), wait_for_completion() and related routines. 5627 * See also complete_all(), wait_for_completion() and related routines.
5628 *
5629 * It may be assumed that this function implies a write memory barrier before
5630 * changing the task state if and only if any tasks are woken up.
5318 */ 5631 */
5319void complete(struct completion *x) 5632void complete(struct completion *x)
5320{ 5633{
@@ -5332,6 +5645,9 @@ EXPORT_SYMBOL(complete);
5332 * @x: holds the state of this particular completion 5645 * @x: holds the state of this particular completion
5333 * 5646 *
5334 * This will wake up all threads waiting on this particular completion event. 5647 * This will wake up all threads waiting on this particular completion event.
5648 *
5649 * It may be assumed that this function implies a write memory barrier before
5650 * changing the task state if and only if any tasks are woken up.
5335 */ 5651 */
5336void complete_all(struct completion *x) 5652void complete_all(struct completion *x)
5337{ 5653{
@@ -6248,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
6248 return 0; 6564 return 0;
6249} 6565}
6250 6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
6251static void __cond_resched(void) 6572static void __cond_resched(void)
6252{ 6573{
6253#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6267,8 +6588,7 @@ static void __cond_resched(void)
6267 6588
6268int __sched _cond_resched(void) 6589int __sched _cond_resched(void)
6269{ 6590{
6270 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6591 if (should_resched()) {
6271 system_state == SYSTEM_RUNNING) {
6272 __cond_resched(); 6592 __cond_resched();
6273 return 1; 6593 return 1;
6274 } 6594 }
@@ -6286,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
6286 */ 6606 */
6287int cond_resched_lock(spinlock_t *lock) 6607int cond_resched_lock(spinlock_t *lock)
6288{ 6608{
6289 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6609 int resched = should_resched();
6290 int ret = 0; 6610 int ret = 0;
6291 6611
6292 if (spin_needbreak(lock) || resched) { 6612 if (spin_needbreak(lock) || resched) {
6293 spin_unlock(lock); 6613 spin_unlock(lock);
6294 if (resched && need_resched()) 6614 if (resched)
6295 __cond_resched(); 6615 __cond_resched();
6296 else 6616 else
6297 cpu_relax(); 6617 cpu_relax();
@@ -6306,7 +6626,7 @@ int __sched cond_resched_softirq(void)
6306{ 6626{
6307 BUG_ON(!in_softirq()); 6627 BUG_ON(!in_softirq());
6308 6628
6309 if (need_resched() && system_state == SYSTEM_RUNNING) { 6629 if (should_resched()) {
6310 local_bh_enable(); 6630 local_bh_enable();
6311 __cond_resched(); 6631 __cond_resched();
6312 local_bh_disable(); 6632 local_bh_disable();
@@ -6490,8 +6810,9 @@ void sched_show_task(struct task_struct *p)
6490#ifdef CONFIG_DEBUG_STACK_USAGE 6810#ifdef CONFIG_DEBUG_STACK_USAGE
6491 free = stack_not_used(p); 6811 free = stack_not_used(p);
6492#endif 6812#endif
6493 printk(KERN_CONT "%5lu %5d %6d\n", free, 6813 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6494 task_pid_nr(p), task_pid_nr(p->real_parent)); 6814 task_pid_nr(p), task_pid_nr(p->real_parent),
6815 (unsigned long)task_thread_info(p)->flags);
6495 6816
6496 show_stack(p, NULL); 6817 show_stack(p, NULL);
6497} 6818}
@@ -6752,7 +7073,7 @@ static int migration_thread(void *data)
6752 7073
6753 if (cpu_is_offline(cpu)) { 7074 if (cpu_is_offline(cpu)) {
6754 spin_unlock_irq(&rq->lock); 7075 spin_unlock_irq(&rq->lock);
6755 goto wait_to_die; 7076 break;
6756 } 7077 }
6757 7078
6758 if (rq->active_balance) { 7079 if (rq->active_balance) {
@@ -6778,16 +7099,7 @@ static int migration_thread(void *data)
6778 complete(&req->done); 7099 complete(&req->done);
6779 } 7100 }
6780 __set_current_state(TASK_RUNNING); 7101 __set_current_state(TASK_RUNNING);
6781 return 0;
6782 7102
6783wait_to_die:
6784 /* Wait for kthread_stop */
6785 set_current_state(TASK_INTERRUPTIBLE);
6786 while (!kthread_should_stop()) {
6787 schedule();
6788 set_current_state(TASK_INTERRUPTIBLE);
6789 }
6790 __set_current_state(TASK_RUNNING);
6791 return 0; 7103 return 0;
6792} 7104}
6793 7105
@@ -6970,6 +7282,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6970 7282
6971 } 7283 }
6972} 7284}
7285
7286/*
7287 * remove the tasks which were accounted by rq from calc_load_tasks.
7288 */
7289static void calc_global_load_remove(struct rq *rq)
7290{
7291 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7292 rq->calc_load_active = 0;
7293}
6973#endif /* CONFIG_HOTPLUG_CPU */ 7294#endif /* CONFIG_HOTPLUG_CPU */
6974 7295
6975#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7296#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7193,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7193 rq = task_rq_lock(p, &flags); 7514 rq = task_rq_lock(p, &flags);
7194 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7515 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7195 task_rq_unlock(rq, &flags); 7516 task_rq_unlock(rq, &flags);
7517 get_task_struct(p);
7196 cpu_rq(cpu)->migration_thread = p; 7518 cpu_rq(cpu)->migration_thread = p;
7519 rq->calc_load_update = calc_load_update;
7197 break; 7520 break;
7198 7521
7199 case CPU_ONLINE: 7522 case CPU_ONLINE:
@@ -7221,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7221 kthread_bind(cpu_rq(cpu)->migration_thread, 7544 kthread_bind(cpu_rq(cpu)->migration_thread,
7222 cpumask_any(cpu_online_mask)); 7545 cpumask_any(cpu_online_mask));
7223 kthread_stop(cpu_rq(cpu)->migration_thread); 7546 kthread_stop(cpu_rq(cpu)->migration_thread);
7547 put_task_struct(cpu_rq(cpu)->migration_thread);
7224 cpu_rq(cpu)->migration_thread = NULL; 7548 cpu_rq(cpu)->migration_thread = NULL;
7225 break; 7549 break;
7226 7550
@@ -7230,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7230 migrate_live_tasks(cpu); 7554 migrate_live_tasks(cpu);
7231 rq = cpu_rq(cpu); 7555 rq = cpu_rq(cpu);
7232 kthread_stop(rq->migration_thread); 7556 kthread_stop(rq->migration_thread);
7557 put_task_struct(rq->migration_thread);
7233 rq->migration_thread = NULL; 7558 rq->migration_thread = NULL;
7234 /* Idle task back to normal (off runqueue, low prio) */ 7559 /* Idle task back to normal (off runqueue, low prio) */
7235 spin_lock_irq(&rq->lock); 7560 spin_lock_irq(&rq->lock);
@@ -7243,7 +7568,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7243 cpuset_unlock(); 7568 cpuset_unlock();
7244 migrate_nr_uninterruptible(rq); 7569 migrate_nr_uninterruptible(rq);
7245 BUG_ON(rq->nr_running != 0); 7570 BUG_ON(rq->nr_running != 0);
7246 7571 calc_global_load_remove(rq);
7247 /* 7572 /*
7248 * No need to migrate the tasks: it was best-effort if 7573 * No need to migrate the tasks: it was best-effort if
7249 * they didn't take sched_hotcpu_mutex. Just wake up 7574 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7604,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7279 return NOTIFY_OK; 7604 return NOTIFY_OK;
7280} 7605}
7281 7606
7282/* Register at highest priority so that task migration (migrate_all_tasks) 7607/*
7283 * happens before everything else. 7608 * Register at high priority so that task migration (migrate_all_tasks)
7609 * happens before everything else. This has to be lower priority than
7610 * the notifier in the perf_counter subsystem, though.
7284 */ 7611 */
7285static struct notifier_block __cpuinitdata migration_notifier = { 7612static struct notifier_block __cpuinitdata migration_notifier = {
7286 .notifier_call = migration_call, 7613 .notifier_call = migration_call,
@@ -7523,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7523 free_rootdomain(old_rd); 7850 free_rootdomain(old_rd);
7524} 7851}
7525 7852
7526static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7853static int init_rootdomain(struct root_domain *rd, bool bootmem)
7527{ 7854{
7855 gfp_t gfp = GFP_KERNEL;
7856
7528 memset(rd, 0, sizeof(*rd)); 7857 memset(rd, 0, sizeof(*rd));
7529 7858
7530 if (bootmem) { 7859 if (bootmem)
7531 alloc_bootmem_cpumask_var(&def_root_domain.span); 7860 gfp = GFP_NOWAIT;
7532 alloc_bootmem_cpumask_var(&def_root_domain.online);
7533 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7534 cpupri_init(&rd->cpupri, true);
7535 return 0;
7536 }
7537 7861
7538 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 7862 if (!alloc_cpumask_var(&rd->span, gfp))
7539 goto out; 7863 goto out;
7540 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 7864 if (!alloc_cpumask_var(&rd->online, gfp))
7541 goto free_span; 7865 goto free_span;
7542 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 7866 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7543 goto free_online; 7867 goto free_online;
7544 7868
7545 if (cpupri_init(&rd->cpupri, false) != 0) 7869 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7546 goto free_rto_mask; 7870 goto free_rto_mask;
7547 return 0; 7871 return 0;
7548 7872
@@ -7753,8 +8077,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7753 8077
7754/* 8078/*
7755 * The cpus mask in sched_group and sched_domain hangs off the end. 8079 * The cpus mask in sched_group and sched_domain hangs off the end.
7756 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space 8080 *
7757 * for nr_cpu_ids < CONFIG_NR_CPUS. 8081 * ( See the the comments in include/linux/sched.h:struct sched_group
8082 * and struct sched_domain. )
7758 */ 8083 */
7759struct static_sched_group { 8084struct static_sched_group {
7760 struct sched_group sg; 8085 struct sched_group sg;
@@ -7875,7 +8200,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7875 struct sched_domain *sd; 8200 struct sched_domain *sd;
7876 8201
7877 sd = &per_cpu(phys_domains, j).sd; 8202 sd = &per_cpu(phys_domains, j).sd;
7878 if (j != cpumask_first(sched_group_cpus(sd->groups))) { 8203 if (j != group_first_cpu(sd->groups)) {
7879 /* 8204 /*
7880 * Only add "power" once for each 8205 * Only add "power" once for each
7881 * physical package. 8206 * physical package.
@@ -7953,7 +8278,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7953 8278
7954 WARN_ON(!sd || !sd->groups); 8279 WARN_ON(!sd || !sd->groups);
7955 8280
7956 if (cpu != cpumask_first(sched_group_cpus(sd->groups))) 8281 if (cpu != group_first_cpu(sd->groups))
7957 return; 8282 return;
7958 8283
7959 child = sd->child; 8284 child = sd->child;
@@ -8731,6 +9056,8 @@ void __init sched_init_smp(void)
8731} 9056}
8732#endif /* CONFIG_SMP */ 9057#endif /* CONFIG_SMP */
8733 9058
9059const_debug unsigned int sysctl_timer_migration = 1;
9060
8734int in_sched_functions(unsigned long addr) 9061int in_sched_functions(unsigned long addr)
8735{ 9062{
8736 return in_lock_functions(addr) || 9063 return in_lock_functions(addr) ||
@@ -8770,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8770#ifdef CONFIG_SMP 9097#ifdef CONFIG_SMP
8771 rt_rq->rt_nr_migratory = 0; 9098 rt_rq->rt_nr_migratory = 0;
8772 rt_rq->overloaded = 0; 9099 rt_rq->overloaded = 0;
8773 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
8774#endif 9101#endif
8775 9102
8776 rt_rq->rt_time = 0; 9103 rt_rq->rt_time = 0;
@@ -8865,7 +9192,7 @@ void __init sched_init(void)
8865 * we use alloc_bootmem(). 9192 * we use alloc_bootmem().
8866 */ 9193 */
8867 if (alloc_size) { 9194 if (alloc_size) {
8868 ptr = (unsigned long)alloc_bootmem(alloc_size); 9195 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8869 9196
8870#ifdef CONFIG_FAIR_GROUP_SCHED 9197#ifdef CONFIG_FAIR_GROUP_SCHED
8871 init_task_group.se = (struct sched_entity **)ptr; 9198 init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9265,8 @@ void __init sched_init(void)
8938 rq = cpu_rq(i); 9265 rq = cpu_rq(i);
8939 spin_lock_init(&rq->lock); 9266 spin_lock_init(&rq->lock);
8940 rq->nr_running = 0; 9267 rq->nr_running = 0;
9268 rq->calc_load_active = 0;
9269 rq->calc_load_update = jiffies + LOAD_FREQ;
8941 init_cfs_rq(&rq->cfs, rq); 9270 init_cfs_rq(&rq->cfs, rq);
8942 init_rt_rq(&rq->rt, rq); 9271 init_rt_rq(&rq->rt, rq);
8943#ifdef CONFIG_FAIR_GROUP_SCHED 9272#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9287,7 @@ void __init sched_init(void)
8958 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9287 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8959 * then A0's share of the cpu resource is: 9288 * then A0's share of the cpu resource is:
8960 * 9289 *
8961 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9290 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8962 * 9291 *
8963 * We achieve this by letting init_task_group's tasks sit 9292 * We achieve this by letting init_task_group's tasks sit
8964 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9293 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9374,26 @@ void __init sched_init(void)
9045 * when this runqueue becomes "idle". 9374 * when this runqueue becomes "idle".
9046 */ 9375 */
9047 init_idle(current, smp_processor_id()); 9376 init_idle(current, smp_processor_id());
9377
9378 calc_load_update = jiffies + LOAD_FREQ;
9379
9048 /* 9380 /*
9049 * During early bootup we pretend to be a normal task: 9381 * During early bootup we pretend to be a normal task:
9050 */ 9382 */
9051 current->sched_class = &fair_sched_class; 9383 current->sched_class = &fair_sched_class;
9052 9384
9053 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9385 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9054 alloc_bootmem_cpumask_var(&nohz_cpu_mask); 9386 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9055#ifdef CONFIG_SMP 9387#ifdef CONFIG_SMP
9056#ifdef CONFIG_NO_HZ 9388#ifdef CONFIG_NO_HZ
9057 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9389 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9390 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9058#endif 9391#endif
9059 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9392 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9060#endif /* SMP */ 9393#endif /* SMP */
9061 9394
9395 perf_counter_init();
9396
9062 scheduler_running = 1; 9397 scheduler_running = 1;
9063} 9398}
9064 9399
@@ -9800,6 +10135,13 @@ static int sched_rt_global_constraints(void)
9800 if (sysctl_sched_rt_period <= 0) 10135 if (sysctl_sched_rt_period <= 0)
9801 return -EINVAL; 10136 return -EINVAL;
9802 10137
10138 /*
10139 * There are always some RT tasks in the root group
10140 * -- migration, kstopmachine etc.
10141 */
10142 if (sysctl_sched_rt_runtime == 0)
10143 return -EBUSY;
10144
9803 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10145 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9804 for_each_possible_cpu(i) { 10146 for_each_possible_cpu(i) {
9805 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10147 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a5e3ed..d014efbf947a 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
@@ -152,10 +165,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 165 *
153 * Returns: -ENOMEM if memory fails. 166 * Returns: -ENOMEM if memory fails.
154 */ 167 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 168int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 169{
170 gfp_t gfp = GFP_KERNEL;
157 int i; 171 int i;
158 172
173 if (bootmem)
174 gfp = GFP_NOWAIT;
175
159 memset(cp, 0, sizeof(*cp)); 176 memset(cp, 0, sizeof(*cp));
160 177
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 178 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +180,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
163 180
164 spin_lock_init(&vec->lock); 181 spin_lock_init(&vec->lock);
165 vec->count = 0; 182 vec->count = 0;
166 if (bootmem) 183 if (!zalloc_cpumask_var(&vec->mask, gfp))
167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 184 goto cleanup;
170 } 185 }
171 186
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..652e8bdef9aa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 266 return min_vruntime;
267} 267}
268 268
269static inline int entity_before(struct sched_entity *a,
270 struct sched_entity *b)
271{
272 return (s64)(a->vruntime - b->vruntime) < 0;
273}
274
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 275static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 276{
271 return se->vruntime - cfs_rq->min_vruntime; 277 return se->vruntime - cfs_rq->min_vruntime;
@@ -430,12 +436,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 436
431 for_each_sched_entity(se) { 437 for_each_sched_entity(se) {
432 struct load_weight *load; 438 struct load_weight *load;
439 struct load_weight lw;
433 440
434 cfs_rq = cfs_rq_of(se); 441 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 442 load = &cfs_rq->load;
436 443
437 if (unlikely(!se->on_rq)) { 444 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 445 lw = cfs_rq->load;
439 446
440 update_load_add(&lw, se->load.weight); 447 update_load_add(&lw, se->load.weight);
441 load = &lw; 448 load = &lw;
@@ -604,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
604static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 611static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
605{ 612{
606#ifdef CONFIG_SCHEDSTATS 613#ifdef CONFIG_SCHEDSTATS
614 struct task_struct *tsk = NULL;
615
616 if (entity_is_task(se))
617 tsk = task_of(se);
618
607 if (se->sleep_start) { 619 if (se->sleep_start) {
608 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
609 struct task_struct *tsk = task_of(se);
610 621
611 if ((s64)delta < 0) 622 if ((s64)delta < 0)
612 delta = 0; 623 delta = 0;
@@ -617,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
617 se->sleep_start = 0; 628 se->sleep_start = 0;
618 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
619 630
620 account_scheduler_latency(tsk, delta >> 10, 1); 631 if (tsk)
632 account_scheduler_latency(tsk, delta >> 10, 1);
621 } 633 }
622 if (se->block_start) { 634 if (se->block_start) {
623 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
624 struct task_struct *tsk = task_of(se);
625 636
626 if ((s64)delta < 0) 637 if ((s64)delta < 0)
627 delta = 0; 638 delta = 0;
@@ -632,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
632 se->block_start = 0; 643 se->block_start = 0;
633 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
634 645
635 /* 646 if (tsk) {
636 * Blocking time is in units of nanosecs, so shift by 20 to 647 /*
637 * get a milliseconds-range estimation of the amount of 648 * Blocking time is in units of nanosecs, so shift by
638 * time that the task spent sleeping: 649 * 20 to get a milliseconds-range estimation of the
639 */ 650 * amount of time that the task spent sleeping:
640 if (unlikely(prof_on == SLEEP_PROFILING)) { 651 */
641 652 if (unlikely(prof_on == SLEEP_PROFILING)) {
642 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 653 profile_hits(SLEEP_PROFILING,
643 delta >> 20); 654 (void *)get_wchan(tsk),
655 delta >> 20);
656 }
657 account_scheduler_latency(tsk, delta >> 10, 0);
644 } 658 }
645 account_scheduler_latency(tsk, delta >> 10, 0);
646 } 659 }
647#endif 660#endif
648} 661}
@@ -686,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
686 * all of which have the same weight. 699 * all of which have the same weight.
687 */ 700 */
688 if (sched_feat(NORMALIZED_SLEEPER) && 701 if (sched_feat(NORMALIZED_SLEEPER) &&
689 task_of(se)->policy != SCHED_IDLE) 702 (!entity_is_task(se) ||
703 task_of(se)->policy != SCHED_IDLE))
690 thresh = calc_delta_fair(thresh, se); 704 thresh = calc_delta_fair(thresh, se);
691 705
692 vruntime -= thresh; 706 vruntime -= thresh;
@@ -1015,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
1015 /* 1029 /*
1016 * Already in the rightmost position? 1030 * Already in the rightmost position?
1017 */ 1031 */
1018 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1032 if (unlikely(!rightmost || entity_before(rightmost, se)))
1019 return; 1033 return;
1020 1034
1021 /* 1035 /*
@@ -1487,17 +1501,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1487 1501
1488 find_matching_se(&se, &pse); 1502 find_matching_se(&se, &pse);
1489 1503
1490 while (se) { 1504 BUG_ON(!pse);
1491 BUG_ON(!pse);
1492
1493 if (wakeup_preempt_entity(se, pse) == 1) {
1494 resched_task(curr);
1495 break;
1496 }
1497 1505
1498 se = parent_entity(se); 1506 if (wakeup_preempt_entity(se, pse) == 1)
1499 pse = parent_entity(pse); 1507 resched_task(curr);
1500 }
1501} 1508}
1502 1509
1503static struct task_struct *pick_next_task_fair(struct rq *rq) 1510static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1718,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1718 1725
1719 /* 'curr' will be NULL if the child belongs to a different group */ 1726 /* 'curr' will be NULL if the child belongs to a different group */
1720 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1727 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1721 curr && curr->vruntime < se->vruntime) { 1728 curr && entity_before(curr, se)) {
1722 /* 1729 /*
1723 * Upon rescheduling, sched_class::put_prev_task() will place 1730 * Upon rescheduling, sched_class::put_prev_task() will place
1724 * 'current' within the tree based on its new key value. 1731 * 'current' within the tree based on its new key value.
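The new entity_before() helper orders two vruntimes by the sign of their unsigned difference, which stays correct even if the 64-bit counters wrap around; the plain '<' comparisons it replaces at the yield and fork sites do not. A minimal standalone C sketch of the idiom (ordinary userspace code, not from the tree):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a is before b", mirroring entity_before(). */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t pre_wrap  = UINT64_MAX - 5;	/* value just before the counter wraps */
	uint64_t post_wrap = 10;		/* value just after the wrap */

	printf("naive  <      : %d\n", pre_wrap < post_wrap);        /* 0: misses the ordering */
	printf("signed (a - b): %d\n", before(pre_wrap, post_wrap)); /* 1: pre_wrap correctly earlier */
	return 0;
}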
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
22static struct task_struct *pick_next_task_idle(struct rq *rq) 22static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 23{
24 schedstat_inc(rq, sched_goidle); 24 schedstat_inc(rq, sched_goidle);
25 25 /* adjust the active tasks as we might go into a long sleep */
26 calc_load_account_active(rq);
26 return rq->idle; 27 return rq->idle;
27} 28}
28 29
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..3918e01994e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
10 10
11#ifdef CONFIG_RT_GROUP_SCHED 11#ifdef CONFIG_RT_GROUP_SCHED
12 12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 16{
15 return rt_rq->rq; 17 return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 24
23#else /* CONFIG_RT_GROUP_SCHED */ 25#else /* CONFIG_RT_GROUP_SCHED */
24 26
27#define rt_entity_is_task(rt_se) (1)
28
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 30{
27 return container_of(rt_rq, struct rq, rt); 31 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 77
74static void update_rt_migration(struct rt_rq *rt_rq) 78static void update_rt_migration(struct rt_rq *rt_rq)
75{ 79{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 80 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 81 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 82 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 83 rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 90
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 91static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 92{
93 if (!rt_entity_is_task(rt_se))
94 return;
95
96 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
97
98 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 99 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 100 rt_rq->rt_nr_migratory++;
91 101
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 104
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 105static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 106{
107 if (!rt_entity_is_task(rt_se))
108 return;
109
110 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
111
112 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 113 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 114 rt_rq->rt_nr_migratory--;
99 115
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4c..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h> 30#include <trace/events/sched.h>
31 31
32#include <asm/param.h> 32#include <asm/param.h>
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -41,8 +41,6 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
46static void __user *sig_handler(struct task_struct *t, int sig) 44static void __user *sig_handler(struct task_struct *t, int sig)
47{ 45{
48 return t->sighand->action[sig - 1].sa.sa_handler; 46 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
249/* 247/*
250 * Flush all pending signals for a task. 248 * Flush all pending signals for a task.
251 */ 249 */
250void __flush_signals(struct task_struct *t)
251{
252 clear_tsk_thread_flag(t, TIF_SIGPENDING);
253 flush_sigqueue(&t->pending);
254 flush_sigqueue(&t->signal->shared_pending);
255}
256
252void flush_signals(struct task_struct *t) 257void flush_signals(struct task_struct *t)
253{ 258{
254 unsigned long flags; 259 unsigned long flags;
255 260
256 spin_lock_irqsave(&t->sighand->siglock, flags); 261 spin_lock_irqsave(&t->sighand->siglock, flags);
257 clear_tsk_thread_flag(t, TIF_SIGPENDING); 262 __flush_signals(t);
258 flush_sigqueue(&t->pending);
259 flush_sigqueue(&t->signal->shared_pending);
260 spin_unlock_irqrestore(&t->sighand->siglock, flags); 263 spin_unlock_irqrestore(&t->sighand->siglock, flags);
261} 264}
262 265
@@ -829,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
829{ 832{
830 struct sigpending *pending; 833 struct sigpending *pending;
831 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
832 836
833 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
834 838
@@ -860,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
860 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
861 pass on the info struct. */ 865 pass on the info struct. */
862 866
863 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
864 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
865 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
866 if (q) { 874 if (q) {
867 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
868 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
@@ -1402,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1402 /* do_notify_parent_cldstop should have been called instead. */ 1410 /* do_notify_parent_cldstop should have been called instead. */
1403 BUG_ON(task_is_stopped_or_traced(tsk)); 1411 BUG_ON(task_is_stopped_or_traced(tsk));
1404 1412
1405 BUG_ON(!tsk->ptrace && 1413 BUG_ON(!task_ptrace(tsk) &&
1406 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1414 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1407 1415
1408 info.si_signo = sig; 1416 info.si_signo = sig;
@@ -1441,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1441 1449
1442 psig = tsk->parent->sighand; 1450 psig = tsk->parent->sighand;
1443 spin_lock_irqsave(&psig->siglock, flags); 1451 spin_lock_irqsave(&psig->siglock, flags);
1444 if (!tsk->ptrace && sig == SIGCHLD && 1452 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1445 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1446 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1447 /* 1455 /*
@@ -1478,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1478 struct task_struct *parent; 1486 struct task_struct *parent;
1479 struct sighand_struct *sighand; 1487 struct sighand_struct *sighand;
1480 1488
1481 if (tsk->ptrace & PT_PTRACED) 1489 if (task_ptrace(tsk))
1482 parent = tsk->parent; 1490 parent = tsk->parent;
1483 else { 1491 else {
1484 tsk = tsk->group_leader; 1492 tsk = tsk->group_leader;
@@ -1491,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1491 * see comment in do_notify_parent() about the following 3 lines 1499
1492 */ 1500 */
1493 rcu_read_lock(); 1501 rcu_read_lock();
1494 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1502 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1495 info.si_uid = __task_cred(tsk)->uid; 1503 info.si_uid = __task_cred(tsk)->uid;
1496 rcu_read_unlock(); 1504 rcu_read_unlock();
1497 1505
@@ -1527,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1527 1535
1528static inline int may_ptrace_stop(void) 1536static inline int may_ptrace_stop(void)
1529{ 1537{
1530 if (!likely(current->ptrace & PT_PTRACED)) 1538 if (!likely(task_ptrace(current)))
1531 return 0; 1539 return 0;
1532 /* 1540 /*
1533 * Are we in the middle of do_coredump? 1541 * Are we in the middle of do_coredump?
@@ -1745,7 +1753,7 @@ static int do_signal_stop(int signr)
1745static int ptrace_signal(int signr, siginfo_t *info, 1753static int ptrace_signal(int signr, siginfo_t *info,
1746 struct pt_regs *regs, void *cookie) 1754 struct pt_regs *regs, void *cookie)
1747{ 1755{
1748 if (!(current->ptrace & PT_PTRACED)) 1756 if (!task_ptrace(current))
1749 return signr; 1757 return signr;
1750 1758
1751 ptrace_signal_deliver(regs, cookie); 1759 ptrace_signal_deliver(regs, cookie);
@@ -2278,24 +2286,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2278 return kill_something_info(sig, &info, pid); 2286 return kill_something_info(sig, &info, pid);
2279} 2287}
2280 2288
2281static int do_tkill(pid_t tgid, pid_t pid, int sig) 2289static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2282{ 2291{
2283 int error;
2284 struct siginfo info;
2285 struct task_struct *p; 2292 struct task_struct *p;
2286 unsigned long flags; 2293 unsigned long flags;
2287 2294 int error = -ESRCH;
2288 error = -ESRCH;
2289 info.si_signo = sig;
2290 info.si_errno = 0;
2291 info.si_code = SI_TKILL;
2292 info.si_pid = task_tgid_vnr(current);
2293 info.si_uid = current_uid();
2294 2295
2295 rcu_read_lock(); 2296 rcu_read_lock();
2296 p = find_task_by_vpid(pid); 2297 p = find_task_by_vpid(pid);
2297 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2298 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2298 error = check_kill_permission(sig, &info, p); 2299 error = check_kill_permission(sig, info, p);
2299 /* 2300 /*
2300 * The null signal is a permissions and process existence 2301 * The null signal is a permissions and process existence
2301 * probe. No signal is actually delivered. 2302 * probe. No signal is actually delivered.
@@ -2305,7 +2306,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2305 * signal is private anyway. 2306 * signal is private anyway.
2306 */ 2307 */
2307 if (!error && sig && lock_task_sighand(p, &flags)) { 2308 if (!error && sig && lock_task_sighand(p, &flags)) {
2308 error = specific_send_sig_info(sig, &info, p); 2309 error = specific_send_sig_info(sig, info, p);
2309 unlock_task_sighand(p, &flags); 2310 unlock_task_sighand(p, &flags);
2310 } 2311 }
2311 } 2312 }
@@ -2314,6 +2315,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2314 return error; 2315 return error;
2315} 2316}
2316 2317
2318static int do_tkill(pid_t tgid, pid_t pid, int sig)
2319{
2320 struct siginfo info;
2321
2322 info.si_signo = sig;
2323 info.si_errno = 0;
2324 info.si_code = SI_TKILL;
2325 info.si_pid = task_tgid_vnr(current);
2326 info.si_uid = current_uid();
2327
2328 return do_send_specific(tgid, pid, sig, &info);
2329}
2330
2317/** 2331/**
2318 * sys_tgkill - send signal to one specific thread 2332 * sys_tgkill - send signal to one specific thread
2319 * @tgid: the thread group ID of the thread 2333 * @tgid: the thread group ID of the thread
@@ -2363,6 +2377,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2363 return kill_proc_info(sig, &info, pid); 2377 return kill_proc_info(sig, &info, pid);
2364} 2378}
2365 2379
2380long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2381{
2382 /* This is only valid for single tasks */
2383 if (pid <= 0 || tgid <= 0)
2384 return -EINVAL;
2385
2386 /* Not even root can pretend to send signals from the kernel.
2387 Nor can they impersonate a kill(), which adds source info. */
2388 if (info->si_code >= 0)
2389 return -EPERM;
2390 info->si_signo = sig;
2391
2392 return do_send_specific(tgid, pid, sig, info);
2393}
2394
2395SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2396 siginfo_t __user *, uinfo)
2397{
2398 siginfo_t info;
2399
2400 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2401 return -EFAULT;
2402
2403 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2404}
2405
2366int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2406int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2367{ 2407{
2368 struct task_struct *t = current; 2408 struct task_struct *t = current;
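The new rt_tgsigqueueinfo() syscall queues a caller-supplied siginfo to one specific thread (tgid plus tid) and rejects si_code >= 0, so userspace cannot impersonate kernel-generated signals or a plain kill(). A hedged userspace sketch that goes through syscall(), since a libc wrapper may not exist yet; __NR_rt_tgsigqueueinfo and __NR_gettid are assumed to be provided by the installed kernel headers:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_code = SI_QUEUE;            /* must be negative; SI_USER and friends are refused */
	info.si_value.sival_int = 42;       /* payload travels with the signal */

	signal(SIGUSR1, SIG_IGN);           /* keep the demo from terminating itself */

	if (syscall(__NR_rt_tgsigqueueinfo, (pid_t)getpid(),
		    (pid_t)syscall(__NR_gettid), SIGUSR1, &info) < 0)
		perror("rt_tgsigqueueinfo");
	return 0;
}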
@@ -2414,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2414 stack_t oss; 2454 stack_t oss;
2415 int error; 2455 int error;
2416 2456
2417 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2418 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2419 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2420 oss.ss_flags = sas_ss_flags(sp);
2421 }
2422 2460
2423 if (uss) { 2461 if (uss) {
2424 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2426,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2426 int ss_flags; 2464 int ss_flags;
2427 2465
2428 error = -EFAULT; 2466 error = -EFAULT;
2429 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2430 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2431 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2432 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2433 goto out; 2473 goto out;
2434 2474
2435 error = -EPERM; 2475 error = -EPERM;
@@ -2461,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2461 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2462 } 2502 }
2463 2503
2504 error = 0;
2464 if (uoss) { 2505 if (uoss) {
2465 error = -EFAULT; 2506 error = -EFAULT;
2466 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2467 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2468 } 2512 }
2469 2513
2470 error = 0;
2471out: 2514out:
2472 return error; 2515 return error;
2473} 2516}
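do_sigaltstack() now snapshots the old settings unconditionally and copies them out field by field with access_ok()/__put_user() instead of a single copy_to_user(); the userspace contract of sigaltstack(2) is unchanged. A small standalone sketch of that contract, installing a new stack and reading the previous one back in the same call:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	stack_t uss, uoss;

	uss.ss_sp = malloc(SIGSTKSZ);
	uss.ss_size = SIGSTKSZ;
	uss.ss_flags = 0;
	if (!uss.ss_sp) {
		perror("malloc");
		return 1;
	}

	/* Either pointer may be NULL if only one direction is wanted. */
	if (sigaltstack(&uss, &uoss) < 0) {
		perror("sigaltstack");
		return 1;
	}

	printf("previous stack: flags=%d size=%zu\n",
	       uoss.ss_flags, uoss.ss_size);
	return 0;
}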
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f43..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -372,8 +380,8 @@ static int slow_work_thread(void *_data)
372 vsmax *= atomic_read(&slow_work_thread_count); 380 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100; 381 vsmax /= 100;
374 382
375 prepare_to_wait(&slow_work_thread_wq, &wait, 383 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE); 384 TASK_INTERRUPTIBLE);
377 if (!freezing(current) && 385 if (!freezing(current) &&
378 !slow_work_threads_should_exit && 386 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) && 387 !slow_work_available(vsmax) &&
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
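The cull and OOM timers are now armed through round_jiffies(), so deadlines requested by different callers land on the same whole-second boundary and coalesce into fewer wakeups. A rough standalone sketch of the rounding idea only; it is not the kernel's round_jiffies(), which additionally applies a small per-CPU offset and refuses to round a deadline into the past. HZ=100 is assumed purely for illustration:

#include <stdio.h>

#define HZ 100	/* assumed tick rate, illustration only */

/* Round a tick-count deadline to the nearest whole second so that
 * nearby timeouts expire together. */
static unsigned long round_to_second(unsigned long j)
{
	unsigned long rem = j % HZ;

	return rem < HZ / 2 ? j - rem : j + (HZ - rem);
}

int main(void)
{
	unsigned long now = 12345;
	unsigned long deadline = now + 5 * HZ + 37;	/* about 5.4s from now */

	printf("raw deadline:     %lu\n", deadline);
	printf("rounded deadline: %lu\n", round_to_second(deadline));
	return 0;
}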
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..94188b8ecc33 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511..eb5e131a0485 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,9 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h> 27
28#define CREATE_TRACE_POINTS
29#include <trace/events/irq.h>
28 30
29#include <asm/irq.h> 31#include <asm/irq.h>
30/* 32/*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
186 */ 188 */
187#define MAX_SOFTIRQ_RESTART 10 189#define MAX_SOFTIRQ_RESTART 10
188 190
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
192asmlinkage void __do_softirq(void) 191asmlinkage void __do_softirq(void)
193{ 192{
194 struct softirq_action *h; 193 struct softirq_action *h;
@@ -214,6 +213,7 @@ restart:
214 do { 213 do {
215 if (pending & 1) { 214 if (pending & 1) {
216 int prev_count = preempt_count(); 215 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 217
218 trace_softirq_entry(h, softirq_vec); 218 trace_softirq_entry(h, softirq_vec);
219 h->action(h); 219 h->action(h);
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -383,6 +385,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
383 385
384EXPORT_SYMBOL(__tasklet_hi_schedule); 386EXPORT_SYMBOL(__tasklet_hi_schedule);
385 387
388void __tasklet_hi_schedule_first(struct tasklet_struct *t)
389{
390 BUG_ON(!irqs_disabled());
391
392 t->next = __get_cpu_var(tasklet_hi_vec).head;
393 __get_cpu_var(tasklet_hi_vec).head = t;
394 __raise_softirq_irqoff(HI_SOFTIRQ);
395}
396
397EXPORT_SYMBOL(__tasklet_hi_schedule_first);
398
386static void tasklet_action(struct softirq_action *a) 399static void tasklet_action(struct softirq_action *a)
387{ 400{
388 struct tasklet_struct *list; 401 struct tasklet_struct *list;
@@ -482,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
482 495
483EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
484 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
 538 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
485DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
486EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
487 560
@@ -828,7 +901,7 @@ int __init __weak arch_early_irq_init(void)
828 return 0; 901 return 0;
829} 902}
830 903
831int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) 904int __weak arch_init_chip_data(struct irq_desc *desc, int node)
832{ 905{
833 return 0; 906 return 0;
834} 907}
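The tasklet_hrtimer combo lets an hrtimer callback that expects softirq context be driven safely by high resolution timers: in hres mode the trampoline defers to a HI tasklet, otherwise the function runs directly. An illustrative module-style sketch of a user of the API, not taken from the tree; tasklet_hrtimer_start() and tasklet_hrtimer_cancel() are assumed to be the inline helpers added alongside this code in <linux/interrupt.h>:

#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct tasklet_hrtimer poll_timer;

/* Runs in tasklet (softirq) context even when the hrtimer fires from
 * hard interrupt context, courtesy of the trampolines above. */
static enum hrtimer_restart poll_timer_fn(struct hrtimer *timer)
{
	pr_info("tasklet_hrtimer fired\n");
	return HRTIMER_NORESTART;	/* one-shot for this sketch */
}

static int __init poll_init(void)
{
	tasklet_hrtimer_init(&poll_timer, poll_timer_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	/* Arm the embedded hrtimer 10ms from now (assumed companion helper). */
	tasklet_hrtimer_start(&poll_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
			      HRTIMER_MODE_REL);
	return 0;
}

static void __exit poll_exit(void)
{
	tasklet_hrtimer_cancel(&poll_timer);	/* assumed companion helper */
}

module_init(poll_init);
module_exit(poll_exit);
MODULE_LICENSE("GPL");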
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1112,289 +1113,6 @@ out:
1112 return err; 1113 return err;
1113} 1114}
1114 1115
1115/*
1116 * Supplementary group IDs
1117 */
1118
1119/* init to 2 - one for init_task, one to ensure it is never freed */
1120struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1121
1122struct group_info *groups_alloc(int gidsetsize)
1123{
1124 struct group_info *group_info;
1125 int nblocks;
1126 int i;
1127
1128 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1129 /* Make sure we always allocate at least one indirect block pointer */
1130 nblocks = nblocks ? : 1;
1131 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1132 if (!group_info)
1133 return NULL;
1134 group_info->ngroups = gidsetsize;
1135 group_info->nblocks = nblocks;
1136 atomic_set(&group_info->usage, 1);
1137
1138 if (gidsetsize <= NGROUPS_SMALL)
1139 group_info->blocks[0] = group_info->small_block;
1140 else {
1141 for (i = 0; i < nblocks; i++) {
1142 gid_t *b;
1143 b = (void *)__get_free_page(GFP_USER);
1144 if (!b)
1145 goto out_undo_partial_alloc;
1146 group_info->blocks[i] = b;
1147 }
1148 }
1149 return group_info;
1150
1151out_undo_partial_alloc:
1152 while (--i >= 0) {
1153 free_page((unsigned long)group_info->blocks[i]);
1154 }
1155 kfree(group_info);
1156 return NULL;
1157}
1158
1159EXPORT_SYMBOL(groups_alloc);
1160
1161void groups_free(struct group_info *group_info)
1162{
1163 if (group_info->blocks[0] != group_info->small_block) {
1164 int i;
1165 for (i = 0; i < group_info->nblocks; i++)
1166 free_page((unsigned long)group_info->blocks[i]);
1167 }
1168 kfree(group_info);
1169}
1170
1171EXPORT_SYMBOL(groups_free);
1172
1173/* export the group_info to a user-space array */
1174static int groups_to_user(gid_t __user *grouplist,
1175 const struct group_info *group_info)
1176{
1177 int i;
1178 unsigned int count = group_info->ngroups;
1179
1180 for (i = 0; i < group_info->nblocks; i++) {
1181 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1182 unsigned int len = cp_count * sizeof(*grouplist);
1183
1184 if (copy_to_user(grouplist, group_info->blocks[i], len))
1185 return -EFAULT;
1186
1187 grouplist += NGROUPS_PER_BLOCK;
1188 count -= cp_count;
1189 }
1190 return 0;
1191}
1192
1193/* fill a group_info from a user-space array - it must be allocated already */
1194static int groups_from_user(struct group_info *group_info,
1195 gid_t __user *grouplist)
1196{
1197 int i;
1198 unsigned int count = group_info->ngroups;
1199
1200 for (i = 0; i < group_info->nblocks; i++) {
1201 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1202 unsigned int len = cp_count * sizeof(*grouplist);
1203
1204 if (copy_from_user(group_info->blocks[i], grouplist, len))
1205 return -EFAULT;
1206
1207 grouplist += NGROUPS_PER_BLOCK;
1208 count -= cp_count;
1209 }
1210 return 0;
1211}
1212
1213/* a simple Shell sort */
1214static void groups_sort(struct group_info *group_info)
1215{
1216 int base, max, stride;
1217 int gidsetsize = group_info->ngroups;
1218
1219 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1220 ; /* nothing */
1221 stride /= 3;
1222
1223 while (stride) {
1224 max = gidsetsize - stride;
1225 for (base = 0; base < max; base++) {
1226 int left = base;
1227 int right = left + stride;
1228 gid_t tmp = GROUP_AT(group_info, right);
1229
1230 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1231 GROUP_AT(group_info, right) =
1232 GROUP_AT(group_info, left);
1233 right = left;
1234 left -= stride;
1235 }
1236 GROUP_AT(group_info, right) = tmp;
1237 }
1238 stride /= 3;
1239 }
1240}
1241
1242/* a simple bsearch */
1243int groups_search(const struct group_info *group_info, gid_t grp)
1244{
1245 unsigned int left, right;
1246
1247 if (!group_info)
1248 return 0;
1249
1250 left = 0;
1251 right = group_info->ngroups;
1252 while (left < right) {
1253 unsigned int mid = (left+right)/2;
1254 int cmp = grp - GROUP_AT(group_info, mid);
1255 if (cmp > 0)
1256 left = mid + 1;
1257 else if (cmp < 0)
1258 right = mid;
1259 else
1260 return 1;
1261 }
1262 return 0;
1263}
1264
1265/**
1266 * set_groups - Change a group subscription in a set of credentials
1267 * @new: The newly prepared set of credentials to alter
1268 * @group_info: The group list to install
1269 *
1270 * Validate a group subscription and, if valid, insert it into a set
1271 * of credentials.
1272 */
1273int set_groups(struct cred *new, struct group_info *group_info)
1274{
1275 int retval;
1276
1277 retval = security_task_setgroups(group_info);
1278 if (retval)
1279 return retval;
1280
1281 put_group_info(new->group_info);
1282 groups_sort(group_info);
1283 get_group_info(group_info);
1284 new->group_info = group_info;
1285 return 0;
1286}
1287
1288EXPORT_SYMBOL(set_groups);
1289
1290/**
1291 * set_current_groups - Change current's group subscription
1292 * @group_info: The group list to impose
1293 *
1294 * Validate a group subscription and, if valid, impose it upon current's task
1295 * security record.
1296 */
1297int set_current_groups(struct group_info *group_info)
1298{
1299 struct cred *new;
1300 int ret;
1301
1302 new = prepare_creds();
1303 if (!new)
1304 return -ENOMEM;
1305
1306 ret = set_groups(new, group_info);
1307 if (ret < 0) {
1308 abort_creds(new);
1309 return ret;
1310 }
1311
1312 return commit_creds(new);
1313}
1314
1315EXPORT_SYMBOL(set_current_groups);
1316
1317SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1318{
1319 const struct cred *cred = current_cred();
1320 int i;
1321
1322 if (gidsetsize < 0)
1323 return -EINVAL;
1324
1325 /* no need to grab task_lock here; it cannot change */
1326 i = cred->group_info->ngroups;
1327 if (gidsetsize) {
1328 if (i > gidsetsize) {
1329 i = -EINVAL;
1330 goto out;
1331 }
1332 if (groups_to_user(grouplist, cred->group_info)) {
1333 i = -EFAULT;
1334 goto out;
1335 }
1336 }
1337out:
1338 return i;
1339}
1340
1341/*
1342 * SMP: Our groups are copy-on-write. We can set them safely
1343 * without another task interfering.
1344 */
1345
1346SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1347{
1348 struct group_info *group_info;
1349 int retval;
1350
1351 if (!capable(CAP_SETGID))
1352 return -EPERM;
1353 if ((unsigned)gidsetsize > NGROUPS_MAX)
1354 return -EINVAL;
1355
1356 group_info = groups_alloc(gidsetsize);
1357 if (!group_info)
1358 return -ENOMEM;
1359 retval = groups_from_user(group_info, grouplist);
1360 if (retval) {
1361 put_group_info(group_info);
1362 return retval;
1363 }
1364
1365 retval = set_current_groups(group_info);
1366 put_group_info(group_info);
1367
1368 return retval;
1369}
1370
1371/*
1372 * Check whether we're fsgid/egid or in the supplemental group..
1373 */
1374int in_group_p(gid_t grp)
1375{
1376 const struct cred *cred = current_cred();
1377 int retval = 1;
1378
1379 if (grp != cred->fsgid)
1380 retval = groups_search(cred->group_info, grp);
1381 return retval;
1382}
1383
1384EXPORT_SYMBOL(in_group_p);
1385
1386int in_egroup_p(gid_t grp)
1387{
1388 const struct cred *cred = current_cred();
1389 int retval = 1;
1390
1391 if (grp != cred->egid)
1392 retval = groups_search(cred->group_info, grp);
1393 return retval;
1394}
1395
1396EXPORT_SYMBOL(in_egroup_p);
1397
1398DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1399 1117
1400SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
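The supplementary-group implementation is dropped from sys.c here, but the getgroups()/setgroups() syscalls defined in the removed block keep their userspace contract: a zero gidsetsize only reports the count, and a second call fills the caller's array. A minimal standalone sketch:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	int n, i;
	gid_t *list;

	n = getgroups(0, NULL);		/* size query: count only */
	if (n < 0) {
		perror("getgroups");
		return 1;
	}

	list = calloc(n ? n : 1, sizeof(*list));
	if (!list)
		return 1;

	if (getgroups(n, list) < 0) {
		perror("getgroups");
		return 1;
	}

	for (i = 0; i < n; i++)
		printf("supplementary gid: %u\n", (unsigned)list[i]);
	free(list);
	return 0;
}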
@@ -1793,6 +1511,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1511 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1512 error = SET_TSC_CTL(arg2);
1795 break; 1513 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE:
1515 error = perf_counter_task_disable();
1516 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE:
1518 error = perf_counter_task_enable();
1519 break;
1796 case PR_GET_TIMERSLACK: 1520 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1521 error = current->timer_slack_ns;
1798 break; 1522 break;
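The new prctl options let a task suspend and resume counting on all of its own perf counters around a region it does not want profiled. A hedged userspace sketch; the PR_TASK_PERF_COUNTERS_* constants are assumed to come from this kernel's exported <linux/prctl.h> via <sys/prctl.h>:

#include <stdio.h>
#include <sys/prctl.h>	/* assumed to pull in the new PR_TASK_PERF_COUNTERS_* values */

int main(void)
{
	if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0) < 0)
		perror("prctl(PR_TASK_PERF_COUNTERS_DISABLE)");

	/* ... code that should not be counted ... */

	if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0) < 0)
		perror("prctl(PR_TASK_PERF_COUNTERS_ENABLE)");

	return 0;
}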
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d56fb76..58be76017fd0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -48,7 +49,9 @@
48#include <linux/acpi.h> 49#include <linux/acpi.h>
49#include <linux/reboot.h> 50#include <linux/reboot.h>
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
51#include <linux/slow-work.h> 53#include <linux/slow-work.h>
54#include <linux/perf_counter.h>
52 55
53#include <asm/uaccess.h> 56#include <asm/uaccess.h>
54#include <asm/processor.h> 57#include <asm/processor.h>
@@ -114,6 +117,7 @@ static int ngroups_max = NGROUPS_MAX;
114 117
115#ifdef CONFIG_MODULES 118#ifdef CONFIG_MODULES
116extern char modprobe_path[]; 119extern char modprobe_path[];
120extern int modules_disabled;
117#endif 121#endif
118#ifdef CONFIG_CHR_DEV_SG 122#ifdef CONFIG_CHR_DEV_SG
119extern int sg_big_buff; 123extern int sg_big_buff;
@@ -326,6 +330,17 @@ static struct ctl_table kern_table[] = {
326 .mode = 0644, 330 .mode = 0644,
327 .proc_handler = &proc_dointvec, 331 .proc_handler = &proc_dointvec,
328 }, 332 },
333 {
334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "timer_migration",
336 .data = &sysctl_timer_migration,
337 .maxlen = sizeof(unsigned int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec_minmax,
340 .strategy = &sysctl_intvec,
341 .extra1 = &zero,
342 .extra2 = &one,
343 },
329#endif 344#endif
330 { 345 {
331 .ctl_name = CTL_UNNUMBERED, 346 .ctl_name = CTL_UNNUMBERED,
@@ -534,6 +549,17 @@ static struct ctl_table kern_table[] = {
534 .proc_handler = &proc_dostring, 549 .proc_handler = &proc_dostring,
535 .strategy = &sysctl_string, 550 .strategy = &sysctl_string,
536 }, 551 },
552 {
553 .ctl_name = CTL_UNNUMBERED,
554 .procname = "modules_disabled",
555 .data = &modules_disabled,
556 .maxlen = sizeof(int),
557 .mode = 0644,
558 /* only handle a transition from default "0" to "1" */
559 .proc_handler = &proc_dointvec_minmax,
560 .extra1 = &one,
561 .extra2 = &one,
562 },
537#endif 563#endif
538#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 564#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
539 { 565 {
@@ -722,6 +748,14 @@ static struct ctl_table kern_table[] = {
722 .proc_handler = &proc_dointvec, 748 .proc_handler = &proc_dointvec,
723 }, 749 },
724 { 750 {
751 .ctl_name = CTL_UNNUMBERED,
752 .procname = "panic_on_io_nmi",
753 .data = &panic_on_io_nmi,
754 .maxlen = sizeof(int),
755 .mode = 0644,
756 .proc_handler = &proc_dointvec,
757 },
758 {
725 .ctl_name = KERN_BOOTLOADER_TYPE, 759 .ctl_name = KERN_BOOTLOADER_TYPE,
726 .procname = "bootloader_type", 760 .procname = "bootloader_type",
727 .data = &bootloader_type, 761 .data = &bootloader_type,
@@ -731,6 +765,14 @@ static struct ctl_table kern_table[] = {
731 }, 765 },
732 { 766 {
733 .ctl_name = CTL_UNNUMBERED, 767 .ctl_name = CTL_UNNUMBERED,
768 .procname = "bootloader_version",
769 .data = &bootloader_version,
770 .maxlen = sizeof (int),
771 .mode = 0444,
772 .proc_handler = &proc_dointvec,
773 },
774 {
775 .ctl_name = CTL_UNNUMBERED,
734 .procname = "kstack_depth_to_print", 776 .procname = "kstack_depth_to_print",
735 .data = &kstack_depth_to_print, 777 .data = &kstack_depth_to_print,
736 .maxlen = sizeof(int), 778 .maxlen = sizeof(int),
@@ -912,6 +954,43 @@ static struct ctl_table kern_table[] = {
912 .child = slow_work_sysctls, 954 .child = slow_work_sysctls,
913 }, 955 },
914#endif 956#endif
957#ifdef CONFIG_PERF_COUNTERS
958 {
959 .ctl_name = CTL_UNNUMBERED,
960 .procname = "perf_counter_paranoid",
961 .data = &sysctl_perf_counter_paranoid,
962 .maxlen = sizeof(sysctl_perf_counter_paranoid),
963 .mode = 0644,
964 .proc_handler = &proc_dointvec,
965 },
966 {
967 .ctl_name = CTL_UNNUMBERED,
968 .procname = "perf_counter_mlock_kb",
969 .data = &sysctl_perf_counter_mlock,
970 .maxlen = sizeof(sysctl_perf_counter_mlock),
971 .mode = 0644,
972 .proc_handler = &proc_dointvec,
973 },
974 {
975 .ctl_name = CTL_UNNUMBERED,
976 .procname = "perf_counter_max_sample_rate",
977 .data = &sysctl_perf_counter_sample_rate,
978 .maxlen = sizeof(sysctl_perf_counter_sample_rate),
979 .mode = 0644,
980 .proc_handler = &proc_dointvec,
981 },
982#endif
983#ifdef CONFIG_KMEMCHECK
984 {
985 .ctl_name = CTL_UNNUMBERED,
986 .procname = "kmemcheck",
987 .data = &kmemcheck_enabled,
988 .maxlen = sizeof(int),
989 .mode = 0644,
990 .proc_handler = &proc_dointvec,
991 },
992#endif
993
915/* 994/*
916 * NOTE: do not add new entries to this table unless you have read 995 * NOTE: do not add new entries to this table unless you have read
917 * Documentation/sysctl/ctl_unnumbered.txt 996 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1225,16 +1304,14 @@ static struct ctl_table vm_table[] = {
1225 .strategy = &sysctl_jiffies, 1304 .strategy = &sysctl_jiffies,
1226 }, 1305 },
1227#endif 1306#endif
1228#ifdef CONFIG_SECURITY
1229 { 1307 {
1230 .ctl_name = CTL_UNNUMBERED, 1308 .ctl_name = CTL_UNNUMBERED,
1231 .procname = "mmap_min_addr", 1309 .procname = "mmap_min_addr",
1232 .data = &mmap_min_addr, 1310 .data = &dac_mmap_min_addr,
1233 .maxlen = sizeof(unsigned long), 1311 .maxlen = sizeof(unsigned long),
1234 .mode = 0644, 1312 .mode = 0644,
1235 .proc_handler = &proc_doulongvec_minmax, 1313 .proc_handler = &mmap_min_addr_handler,
1236 }, 1314 },
1237#endif
1238#ifdef CONFIG_NUMA 1315#ifdef CONFIG_NUMA
1239 { 1316 {
1240 .ctl_name = CTL_UNNUMBERED, 1317 .ctl_name = CTL_UNNUMBERED,
@@ -1272,7 +1349,6 @@ static struct ctl_table vm_table[] = {
1272 .extra2 = &one, 1349 .extra2 = &one,
1273 }, 1350 },
1274#endif 1351#endif
1275#ifdef CONFIG_UNEVICTABLE_LRU
1276 { 1352 {
1277 .ctl_name = CTL_UNNUMBERED, 1353 .ctl_name = CTL_UNNUMBERED,
1278 .procname = "scan_unevictable_pages", 1354 .procname = "scan_unevictable_pages",
@@ -1281,7 +1357,6 @@ static struct ctl_table vm_table[] = {
1281 .mode = 0644, 1357 .mode = 0644,
1282 .proc_handler = &scan_unevictable_handler, 1358 .proc_handler = &scan_unevictable_handler,
1283 }, 1359 },
1284#endif
1285/* 1360/*
1286 * NOTE: do not add new entries to this table unless you have read 1361 * NOTE: do not add new entries to this table unless you have read
1287 * Documentation/sysctl/ctl_unnumbered.txt 1362 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2220,7 +2295,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2220 void *data) 2295 void *data)
2221{ 2296{
2222#define TMPBUFLEN 21 2297#define TMPBUFLEN 21
2223 int *i, vleft, first=1, neg, val; 2298 int *i, vleft, first = 1, neg;
2224 unsigned long lval; 2299 unsigned long lval;
2225 size_t left, len; 2300 size_t left, len;
2226 2301
@@ -2273,8 +2348,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2273 len = p-buf; 2348 len = p-buf;
2274 if ((len < left) && *p && !isspace(*p)) 2349 if ((len < left) && *p && !isspace(*p))
2275 break; 2350 break;
2276 if (neg)
2277 val = -val;
2278 s += len; 2351 s += len;
2279 left -= len; 2352 left -= len;
2280 2353
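The added kern_table entries are plain integer knobs registered with CTL_UNNUMBERED, so they are reachable only through /proc/sys and not the binary sysctl(2) interface. A minimal sketch reading one of them, kernel.timer_migration:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/timer_migration", "r");
	int val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("timer_migration = %d\n", val);
	fclose(f);
	return 0;
}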
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a790..620b58abdc32 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
21 22
22/* The registered clock event devices */ 23/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
54 55
55 return (unsigned long) clc; 56 return (unsigned long) clc;
56} 57}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns);
57 59
58/** 60/**
59 * clockevents_set_mode - set the operating mode of a clock event device 61 * clockevents_set_mode - set the operating mode of a clock event device
@@ -135,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
135 */ 137 */
136int clockevents_register_notifier(struct notifier_block *nb) 138int clockevents_register_notifier(struct notifier_block *nb)
137{ 139{
140 unsigned long flags;
138 int ret; 141 int ret;
139 142
140 spin_lock(&clockevents_lock); 143 spin_lock_irqsave(&clockevents_lock, flags);
141 ret = raw_notifier_chain_register(&clockevents_chain, nb); 144 ret = raw_notifier_chain_register(&clockevents_chain, nb);
142 spin_unlock(&clockevents_lock); 145 spin_unlock_irqrestore(&clockevents_lock, flags);
143 146
144 return ret; 147 return ret;
145} 148}
@@ -176,17 +179,20 @@ static void clockevents_notify_released(void)
176 */ 179 */
177void clockevents_register_device(struct clock_event_device *dev) 180void clockevents_register_device(struct clock_event_device *dev)
178{ 181{
182 unsigned long flags;
183
179 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
180 BUG_ON(!dev->cpumask); 185 BUG_ON(!dev->cpumask);
181 186
182 spin_lock(&clockevents_lock); 187 spin_lock_irqsave(&clockevents_lock, flags);
183 188
184 list_add(&dev->list, &clockevent_devices); 189 list_add(&dev->list, &clockevent_devices);
185 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
186 clockevents_notify_released(); 191 clockevents_notify_released();
187 192
188 spin_unlock(&clockevents_lock); 193 spin_unlock_irqrestore(&clockevents_lock, flags);
189} 194}
195EXPORT_SYMBOL_GPL(clockevents_register_device);
190 196
191/* 197/*
192 * Noop handler when we shut down an event device 198 * Noop handler when we shut down an event device
@@ -232,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
232void clockevents_notify(unsigned long reason, void *arg) 238void clockevents_notify(unsigned long reason, void *arg)
233{ 239{
234 struct list_head *node, *tmp; 240 struct list_head *node, *tmp;
241 unsigned long flags;
235 242
236 spin_lock(&clockevents_lock); 243 spin_lock_irqsave(&clockevents_lock, flags);
237 clockevents_do_notify(reason, arg); 244 clockevents_do_notify(reason, arg);
238 245
239 switch (reason) { 246 switch (reason) {
@@ -248,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
248 default: 255 default:
249 break; 256 break;
250 } 257 }
251 spin_unlock(&clockevents_lock); 258 spin_unlock_irqrestore(&clockevents_lock, flags);
252} 259}
253EXPORT_SYMBOL_GPL(clockevents_notify); 260EXPORT_SYMBOL_GPL(clockevents_notify);
254#endif 261#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e0..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
402 unsigned long flags; 402 unsigned long flags;
403 int ret; 403 int ret;
404 404
405 /* save mult_orig on registration */
406 c->mult_orig = c->mult;
407
408 spin_lock_irqsave(&clocksource_lock, flags); 405 spin_lock_irqsave(&clocksource_lock, flags);
409 ret = clocksource_enqueue(c); 406 ret = clocksource_enqueue(c);
410 if (!ret) 407 if (!ret)
@@ -512,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
512 } 509 }
513 } 510 }
514 511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
515 /* Reselect, when the override name has changed */ 524 /* Reselect, when the override name has changed */
516 if (ovr != clocksource_override) { 525 if (ovr != clocksource_override) {
517 clocksource_override = ovr; 526 clocksource_override = ovr;
@@ -540,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
540 549
541 spin_lock_irq(&clocksource_lock); 550 spin_lock_irq(&clocksource_lock);
542 list_for_each_entry(src, &clocksource_list, list) { 551 list_for_each_entry(src, &clocksource_list, list) {
543 count += snprintf(buf + count, 552 /*
553 * Don't show non-HRES clocksource if the tick code is
554 * in one shot mode (highres=on or nohz=on)
555 */
556 if (!tick_oneshot_mode_active() ||
557 (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
558 count += snprintf(buf + count,
544 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
545 "%s ", src->name); 560 "%s ", src->name);
546 } 561 }
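sysfs_override_clocksource() and sysfs_show_available_clocksources() back the clocksource sysfs attributes; with these checks a clocksource that is not highres-capable is refused and hidden while the tick layer is in oneshot mode. A small sketch listing what userspace can still select, assuming the conventional sysfs location for the clocksource sysdev:

#include <stdio.h>

int main(void)
{
	/* Path assumed: /sys/devices/system/clocksource/clocksource0/ */
	FILE *f = fopen("/sys/devices/system/clocksource/clocksource0/"
			"available_clocksource", "r");
	char buf[256];

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("selectable clocksources: %s", buf);
	fclose(f);
	return 0;
}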
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9a..c2ec25087a35 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
27 * timer stops in C3 state. 27 * timer stops in C3 state.
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
205 * Powerstate information: The system enters/leaves a state, where 205 * Powerstate information: The system enters/leaves a state, where
206 * affected devices might stop 206 * affected devices might stop
207 */ 207 */
208static void tick_do_broadcast_on_off(void *why) 208static void tick_do_broadcast_on_off(unsigned long *reason)
209{ 209{
210 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
211 struct tick_device *td; 211 struct tick_device *td;
212 unsigned long flags, *reason = why; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 tick_do_broadcast_on_off(&reason);
280 &reason, 1);
281} 280}
282 281
283/* 282/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e767..a96c0e2b89cf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
128 return 0; 128 return 0;
129} 129}
130 130
131/**
 132 * tick_oneshot_mode_active - check whether the system is in oneshot mode
 133 *
 134 * Returns 1 when either nohz or highres is enabled, otherwise 0.
135 */
136int tick_oneshot_mode_active(void)
137{
138 unsigned long flags;
139 int ret;
140
141 local_irq_save(flags);
142 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
143 local_irq_restore(flags);
144
145 return ret;
146}
147
131#ifdef CONFIG_HIGH_RES_TIMERS 148#ifdef CONFIG_HIGH_RES_TIMERS
132/** 149/**
133 * tick_init_highres - switch to high resolution mode 150 * tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cbe..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
@@ -349,7 +355,7 @@ void tick_nohz_stop_sched_tick(int inidle)
349 355
350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 356 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
351 hrtimer_start(&ts->sched_timer, expires, 357 hrtimer_start(&ts->sched_timer, expires,
352 HRTIMER_MODE_ABS); 358 HRTIMER_MODE_ABS_PINNED);
353 /* Check, if the timer was already in the past */ 359 /* Check, if the timer was already in the past */
354 if (hrtimer_active(&ts->sched_timer)) 360 if (hrtimer_active(&ts->sched_timer))
355 goto out; 361 goto out;
@@ -395,7 +401,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
395 401
396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 402 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
397 hrtimer_start_expires(&ts->sched_timer, 403 hrtimer_start_expires(&ts->sched_timer,
398 HRTIMER_MODE_ABS); 404 HRTIMER_MODE_ABS_PINNED);
399 /* Check, if the timer was already in the past */ 405 /* Check, if the timer was already in the past */
400 if (hrtimer_active(&ts->sched_timer)) 406 if (hrtimer_active(&ts->sched_timer))
401 break; 407 break;
@@ -698,7 +704,8 @@ void tick_setup_sched_timer(void)
698 704
699 for (;;) { 705 for (;;) {
700 hrtimer_forward(&ts->sched_timer, now, tick_period); 706 hrtimer_forward(&ts->sched_timer, now, tick_period);
701 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); 707 hrtimer_start_expires(&ts->sched_timer,
708 HRTIMER_MODE_ABS_PINNED);
702 /* Check, if the timer was already in the past */ 709 /* Check, if the timer was already in the past */
703 if (hrtimer_active(&ts->sched_timer)) 710 if (hrtimer_active(&ts->sched_timer))
704 break; 711 break;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..e8c77d9c633a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
22 22
23/* 23/*
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime.
26 */ 26 */
27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28 28
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
77 clock->cycle_last = cycle_now; 77 clock->cycle_last = cycle_now;
78 78
79 nsec = cyc2ns(clock, cycle_delta); 79 nsec = cyc2ns(clock, cycle_delta);
80
81 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset();
83
80 timespec_add_ns(&xtime, nsec); 84 timespec_add_ns(&xtime, nsec);
81 85
82 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
111 /* convert to nanoseconds: */ 115 /* convert to nanoseconds: */
112 nsecs = cyc2ns(clock, cycle_delta); 116 nsecs = cyc2ns(clock, cycle_delta);
113 117
118 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset();
120
114 } while (read_seqretry(&xtime_lock, seq)); 121 } while (read_seqretry(&xtime_lock, seq));
115 122
116 timespec_add_ns(ts, nsecs); 123 timespec_add_ns(ts, nsecs);
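The two timekeeping hunks above add an arch_gettimeoffset() contribution to both clocksource_forward_now() and getnstimeofday(). A minimal sketch of the contract this assumes, reconstructed only from how the hook is used here; the u32 return type and the no-op default are assumptions, not the actual header.

/* Assumed contract: return the nanoseconds that have elapsed since the
 * last timer interrupt on architectures that still keep time that way;
 * everyone else provides a do-nothing version so the code above adds 0. */
static inline u32 arch_gettimeoffset(void)
{
	return 0;
}
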
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a1277..fddd69d16e03 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
286{ 286{
287 struct proc_dir_entry *pe; 287 struct proc_dir_entry *pe;
288 288
289 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); 289 pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
290 if (!pe) 290 if (!pe)
291 return -ENOMEM; 291 return -ENOMEM;
292 return 0; 292 return 0;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
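Renaming the file-local 'active' flag to the global timer_stats_active lets other timer code test it directly before doing any work. A rough sketch of the fast-path check this enables; the hook name and body are hypothetical, only the flag comes from the hunk above.

#include <linux/kernel.h>

extern int timer_stats_active;	/* made global and renamed above */

static inline void example_timer_stats_hook(void *timer, void *start_site)
{
	/* cheap early-out: no locks, no hash lookup while collection is off */
	if (likely(!timer_stats_active))
		return;

	/* ... fall through to the slow path, as timer_stats_update_stats() does ... */
}
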
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a7f07d5a6241 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,8 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
41#include <linux/sched.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42#include <asm/unistd.h> 44#include <asm/unistd.h>
@@ -378,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
378{ 380{
379 unsigned int flag = 0; 381 unsigned int flag = 0;
380 382
383 if (likely(!timer->start_site))
384 return;
381 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
382 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
383 387
@@ -604,13 +608,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
604} 608}
605 609
606static inline int 610static inline int
607__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) 611__mod_timer(struct timer_list *timer, unsigned long expires,
612 bool pending_only, int pinned)
608{ 613{
609 struct tvec_base *base, *new_base; 614 struct tvec_base *base, *new_base;
610 unsigned long flags; 615 unsigned long flags;
 611          int ret;                                         616          int ret = 0, cpu;
612
613 ret = 0;
614 617
615 timer_stats_timer_set_start_info(timer); 618 timer_stats_timer_set_start_info(timer);
616 BUG_ON(!timer->function); 619 BUG_ON(!timer->function);
@@ -629,6 +632,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
629 632
630 new_base = __get_cpu_var(tvec_bases); 633 new_base = __get_cpu_var(tvec_bases);
631 634
635 cpu = smp_processor_id();
636
637#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
638 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
639 int preferred_cpu = get_nohz_load_balancer();
640
641 if (preferred_cpu >= 0)
642 cpu = preferred_cpu;
643 }
644#endif
645 new_base = per_cpu(tvec_bases, cpu);
646
632 if (base != new_base) { 647 if (base != new_base) {
633 /* 648 /*
634 * We are trying to schedule the timer on the local CPU. 649 * We are trying to schedule the timer on the local CPU.
@@ -668,7 +683,7 @@ out_unlock:
668 */ 683 */
669int mod_timer_pending(struct timer_list *timer, unsigned long expires) 684int mod_timer_pending(struct timer_list *timer, unsigned long expires)
670{ 685{
671 return __mod_timer(timer, expires, true); 686 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
672} 687}
673EXPORT_SYMBOL(mod_timer_pending); 688EXPORT_SYMBOL(mod_timer_pending);
674 689
@@ -699,14 +714,36 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
699 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
700 * to be the same thing then just return: 715 * to be the same thing then just return:
701 */ 716 */
702 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
703 return 1; 718 return 1;
704 719
705 return __mod_timer(timer, expires, false); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
706} 721}
707EXPORT_SYMBOL(mod_timer); 722EXPORT_SYMBOL(mod_timer);
708 723
709/** 724/**
725 * mod_timer_pinned - modify a timer's timeout
726 * @timer: the timer to be modified
727 * @expires: new timeout in jiffies
728 *
729 * mod_timer_pinned() is a way to update the expire field of an
730 * active timer (if the timer is inactive it will be activated)
731 * and not allow the timer to be migrated to a different CPU.
732 *
733 * mod_timer_pinned(timer, expires) is equivalent to:
734 *
735 * del_timer(timer); timer->expires = expires; add_timer(timer);
736 */
737int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
738{
739 if (timer->expires == expires && timer_pending(timer))
740 return 1;
741
742 return __mod_timer(timer, expires, false, TIMER_PINNED);
743}
744EXPORT_SYMBOL(mod_timer_pinned);
745
746/**
710 * add_timer - start a timer 747 * add_timer - start a timer
711 * @timer: the timer to be added 748 * @timer: the timer to be added
712 * 749 *
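A short usage sketch for the mod_timer_pinned() helper introduced above; the timer, callback and one-second timeout are illustrative only.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list example_timer;

static void example_timeout(unsigned long data)
{
	/* work that must stay on the CPU that armed the timer */
}

static void example_arm(void)
{
	setup_timer(&example_timer, example_timeout, 0);
	/* TIMER_PINNED: __mod_timer() above skips the migration logic */
	mod_timer_pinned(&example_timer, jiffies + HZ);
}
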
@@ -756,6 +793,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
756 wake_up_idle_cpu(cpu); 793 wake_up_idle_cpu(cpu);
757 spin_unlock_irqrestore(&base->lock, flags); 794 spin_unlock_irqrestore(&base->lock, flags);
758} 795}
796EXPORT_SYMBOL_GPL(add_timer_on);
759 797
760/** 798/**
761 * del_timer - deactive a timer. 799 * del_timer - deactive a timer.
@@ -1015,6 +1053,9 @@ cascade:
1015 index = slot = timer_jiffies & TVN_MASK; 1053 index = slot = timer_jiffies & TVN_MASK;
1016 do { 1054 do {
1017 list_for_each_entry(nte, varp->vec + slot, entry) { 1055 list_for_each_entry(nte, varp->vec + slot, entry) {
1056 if (tbase_get_deferrable(nte->base))
1057 continue;
1058
1018 found = 1; 1059 found = 1;
1019 if (time_before(nte->expires, expires)) 1060 if (time_before(nte->expires, expires))
1020 expires = nte->expires; 1061 expires = nte->expires;
@@ -1123,53 +1164,14 @@ void update_process_times(int user_tick)
1123} 1164}
1124 1165
1125/* 1166/*
1126 * Nr of active tasks - counted in fixed-point numbers
1127 */
1128static unsigned long count_active_tasks(void)
1129{
1130 return nr_active() * FIXED_1;
1131}
1132
1133/*
1134 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
1135 * imply that avenrun[] is the standard name for this kind of thing.
1136 * Nothing else seems to be standardized: the fractional size etc
1137 * all seem to differ on different machines.
1138 *
1139 * Requires xtime_lock to access.
1140 */
1141unsigned long avenrun[3];
1142
1143EXPORT_SYMBOL(avenrun);
1144
1145/*
1146 * calc_load - given tick count, update the avenrun load estimates.
1147 * This is called while holding a write_lock on xtime_lock.
1148 */
1149static inline void calc_load(unsigned long ticks)
1150{
1151 unsigned long active_tasks; /* fixed-point */
1152 static int count = LOAD_FREQ;
1153
1154 count -= ticks;
1155 if (unlikely(count < 0)) {
1156 active_tasks = count_active_tasks();
1157 do {
1158 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1159 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1160 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1161 count += LOAD_FREQ;
1162 } while (count < 0);
1163 }
1164}
1165
1166/*
1167 * This function runs timers and the timer-tq in bottom half context. 1167 * This function runs timers and the timer-tq in bottom half context.
1168 */ 1168 */
1169static void run_timer_softirq(struct softirq_action *h) 1169static void run_timer_softirq(struct softirq_action *h)
1170{ 1170{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1171 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1172
1173 perf_counter_do_pending();
1174
1173 hrtimer_run_pending(); 1175 hrtimer_run_pending();
1174 1176
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1177 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1187,16 +1189,6 @@ void run_local_timers(void)
1187} 1189}
1188 1190
1189/* 1191/*
1190 * Called by the timer interrupt. xtime_lock must already be taken
1191 * by the timer IRQ!
1192 */
1193static inline void update_times(unsigned long ticks)
1194{
1195 update_wall_time();
1196 calc_load(ticks);
1197}
1198
1199/*
1200 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1192 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1201 * without sampling the sequence number in xtime_lock. 1193 * without sampling the sequence number in xtime_lock.
1202 * jiffies is defined in the linker script... 1194 * jiffies is defined in the linker script...
@@ -1205,7 +1197,8 @@ static inline void update_times(unsigned long ticks)
1205void do_timer(unsigned long ticks) 1197void do_timer(unsigned long ticks)
1206{ 1198{
1207 jiffies_64 += ticks; 1199 jiffies_64 += ticks;
1208 update_times(ticks); 1200 update_wall_time();
1201 calc_global_load();
1209} 1202}
1210 1203
1211#ifdef __ARCH_WANT_SYS_ALARM 1204#ifdef __ARCH_WANT_SYS_ALARM
@@ -1353,7 +1346,7 @@ signed long __sched schedule_timeout(signed long timeout)
1353 expire = timeout + jiffies; 1346 expire = timeout + jiffies;
1354 1347
1355 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1348 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1356 __mod_timer(&timer, expire, false); 1349 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1357 schedule(); 1350 schedule();
1358 del_singleshot_timer_sync(&timer); 1351 del_singleshot_timer_sync(&timer);
1359 1352
@@ -1406,37 +1399,17 @@ int do_sysinfo(struct sysinfo *info)
1406{ 1399{
1407 unsigned long mem_total, sav_total; 1400 unsigned long mem_total, sav_total;
1408 unsigned int mem_unit, bitcount; 1401 unsigned int mem_unit, bitcount;
1409 unsigned long seq; 1402 struct timespec tp;
1410 1403
1411 memset(info, 0, sizeof(struct sysinfo)); 1404 memset(info, 0, sizeof(struct sysinfo));
1412 1405
1413 do { 1406 ktime_get_ts(&tp);
1414 struct timespec tp; 1407 monotonic_to_bootbased(&tp);
1415 seq = read_seqbegin(&xtime_lock); 1408 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1416
1417 /*
1418 * This is annoying. The below is the same thing
1419 * posix_get_clock_monotonic() does, but it wants to
1420 * take the lock which we want to cover the loads stuff
1421 * too.
1422 */
1423
1424 getnstimeofday(&tp);
1425 tp.tv_sec += wall_to_monotonic.tv_sec;
1426 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1427 monotonic_to_bootbased(&tp);
1428 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1429 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1430 tp.tv_sec++;
1431 }
1432 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1433 1409
1434 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1410 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1435 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1436 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1437 1411
1438 info->procs = nr_threads; 1412 info->procs = nr_threads;
1439 } while (read_seqretry(&xtime_lock, seq));
1440 1413
1441 si_meminfo(info); 1414 si_meminfo(info);
1442 si_swapinfo(info); 1415 si_swapinfo(info);
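With calc_load() gone from this file, do_sysinfo() reads the load averages through get_avenrun() and do_timer() drives the global computation via calc_global_load(). A minimal sketch of another get_avenrun() consumer using the same fixed-point values as the hunk above; the printing helper itself is hypothetical.

#include <linux/kernel.h>
#include <linux/sched.h>

static void example_print_loadavg(void)
{
	unsigned long loads[3];

	/* offset 0, shift 0: raw FIXED_1-scaled values
	 * (do_sysinfo() above shifts by SI_LOAD_SHIFT - FSHIFT instead) */
	get_avenrun(loads, 0, 0);

	printk(KERN_INFO "loadavg (fixed-point): %lu %lu %lu\n",
	       loads[0], loads[1], loads[2]);
}
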
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 417d1985e299..019f380fd764 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
@@ -48,6 +55,21 @@ config FTRACE_NMI_ENTER
48 depends on HAVE_FTRACE_NMI_ENTER 55 depends on HAVE_FTRACE_NMI_ENTER
49 default y 56 default y
50 57
58config EVENT_TRACING
59 select CONTEXT_SWITCH_TRACER
60 bool
61
62config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool
65
66# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the
 69# options do not appear when something else selects them. We need the two options
 70# GENERIC_TRACER and TRACING to avoid circular dependencies while accomplishing the
 71# hiding of the automatic options.
72
51config TRACING 73config TRACING
52 bool 74 bool
53 select DEBUG_FS 75 select DEBUG_FS
@@ -56,6 +78,11 @@ config TRACING
56 select TRACEPOINTS 78 select TRACEPOINTS
57 select NOP_TRACER 79 select NOP_TRACER
58 select BINARY_PRINTF 80 select BINARY_PRINTF
81 select EVENT_TRACING
82
83config GENERIC_TRACER
84 bool
85 select TRACING
59 86
60# 87#
61# Minimum requirements an architecture has to meet for us to 88# Minimum requirements an architecture has to meet for us to
@@ -73,14 +100,20 @@ config TRACING_SUPPORT
73 100
74if TRACING_SUPPORT 101if TRACING_SUPPORT
75 102
76menu "Tracers" 103menuconfig FTRACE
104 bool "Tracers"
105 default y if DEBUG_KERNEL
106 help
107 Enable the kernel tracing infrastructure.
108
109if FTRACE
77 110
78config FUNCTION_TRACER 111config FUNCTION_TRACER
79 bool "Kernel Function Tracer" 112 bool "Kernel Function Tracer"
80 depends on HAVE_FUNCTION_TRACER 113 depends on HAVE_FUNCTION_TRACER
81 select FRAME_POINTER 114 select FRAME_POINTER
82 select KALLSYMS 115 select KALLSYMS
83 select TRACING 116 select GENERIC_TRACER
84 select CONTEXT_SWITCH_TRACER 117 select CONTEXT_SWITCH_TRACER
85 help 118 help
86 Enable the kernel to trace every kernel function. This is done 119 Enable the kernel to trace every kernel function. This is done
@@ -95,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
95 bool "Kernel Function Graph Tracer" 128 bool "Kernel Function Graph Tracer"
96 depends on HAVE_FUNCTION_GRAPH_TRACER 129 depends on HAVE_FUNCTION_GRAPH_TRACER
97 depends on FUNCTION_TRACER 130 depends on FUNCTION_TRACER
131 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
98 default y 132 default y
99 help 133 help
100 Enable the kernel to trace a function at both its return 134 Enable the kernel to trace a function at both its return
@@ -104,13 +138,14 @@ config FUNCTION_GRAPH_TRACER
104 the return value. This is done by setting the current return 138 the return value. This is done by setting the current return
105 address on the current task structure into a stack of calls. 139 address on the current task structure into a stack of calls.
106 140
141
107config IRQSOFF_TRACER 142config IRQSOFF_TRACER
108 bool "Interrupts-off Latency Tracer" 143 bool "Interrupts-off Latency Tracer"
109 default n 144 default n
110 depends on TRACE_IRQFLAGS_SUPPORT 145 depends on TRACE_IRQFLAGS_SUPPORT
111 depends on GENERIC_TIME 146 depends on GENERIC_TIME
112 select TRACE_IRQFLAGS 147 select TRACE_IRQFLAGS
113 select TRACING 148 select GENERIC_TRACER
114 select TRACER_MAX_TRACE 149 select TRACER_MAX_TRACE
115 help 150 help
116 This option measures the time spent in irqs-off critical 151 This option measures the time spent in irqs-off critical
@@ -120,7 +155,7 @@ config IRQSOFF_TRACER
120 disabled by default and can be runtime (re-)started 155 disabled by default and can be runtime (re-)started
121 via: 156 via:
122 157
123 echo 0 > /debugfs/tracing/tracing_max_latency 158 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
124 159
125 (Note that kernel size and overhead increases with this option 160 (Note that kernel size and overhead increases with this option
126 enabled. This option and the preempt-off timing option can be 161 enabled. This option and the preempt-off timing option can be
@@ -131,7 +166,7 @@ config PREEMPT_TRACER
131 default n 166 default n
132 depends on GENERIC_TIME 167 depends on GENERIC_TIME
133 depends on PREEMPT 168 depends on PREEMPT
134 select TRACING 169 select GENERIC_TRACER
135 select TRACER_MAX_TRACE 170 select TRACER_MAX_TRACE
136 help 171 help
137 This option measures the time spent in preemption off critical 172 This option measures the time spent in preemption off critical
@@ -141,7 +176,7 @@ config PREEMPT_TRACER
141 disabled by default and can be runtime (re-)started 176 disabled by default and can be runtime (re-)started
142 via: 177 via:
143 178
144 echo 0 > /debugfs/tracing/tracing_max_latency 179 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
145 180
146 (Note that kernel size and overhead increases with this option 181 (Note that kernel size and overhead increases with this option
147 enabled. This option and the irqs-off timing option can be 182 enabled. This option and the irqs-off timing option can be
@@ -150,7 +185,7 @@ config PREEMPT_TRACER
150config SYSPROF_TRACER 185config SYSPROF_TRACER
151 bool "Sysprof Tracer" 186 bool "Sysprof Tracer"
152 depends on X86 187 depends on X86
153 select TRACING 188 select GENERIC_TRACER
154 select CONTEXT_SWITCH_TRACER 189 select CONTEXT_SWITCH_TRACER
155 help 190 help
156 This tracer provides the trace needed by the 'Sysprof' userspace 191 This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,83 +193,103 @@ config SYSPROF_TRACER
158 193
159config SCHED_TRACER 194config SCHED_TRACER
160 bool "Scheduling Latency Tracer" 195 bool "Scheduling Latency Tracer"
161 select TRACING 196 select GENERIC_TRACER
162 select CONTEXT_SWITCH_TRACER 197 select CONTEXT_SWITCH_TRACER
163 select TRACER_MAX_TRACE 198 select TRACER_MAX_TRACE
164 help 199 help
165 This tracer tracks the latency of the highest priority task 200 This tracer tracks the latency of the highest priority task
166 to be scheduled in, starting from the point it has woken up. 201 to be scheduled in, starting from the point it has woken up.
167 202
168config CONTEXT_SWITCH_TRACER 203config ENABLE_DEFAULT_TRACERS
169 bool "Trace process context switches" 204 bool "Trace process context switches and events"
170 select TRACING 205 depends on !GENERIC_TRACER
171 select MARKERS
172 help
173 This tracer gets called from the context switch and records
174 all switching of tasks.
175
176config EVENT_TRACER
177 bool "Trace various events in the kernel"
178 select TRACING 206 select TRACING
179 help 207 help
180 This tracer hooks to various trace points in the kernel 208 This tracer hooks to various trace points in the kernel
181 allowing the user to pick and choose which trace point they 209 allowing the user to pick and choose which trace point they
182 want to trace. 210 want to trace. It also includes the sched_switch tracer plugin.
183 211
184config FTRACE_SYSCALLS 212config FTRACE_SYSCALLS
185 bool "Trace syscalls" 213 bool "Trace syscalls"
186 depends on HAVE_FTRACE_SYSCALLS 214 depends on HAVE_FTRACE_SYSCALLS
187 select TRACING 215 select GENERIC_TRACER
188 select KALLSYMS 216 select KALLSYMS
189 help 217 help
190 Basic tracer to catch the syscall entry and exit events. 218 Basic tracer to catch the syscall entry and exit events.
191 219
192config BOOT_TRACER 220config BOOT_TRACER
193 bool "Trace boot initcalls" 221 bool "Trace boot initcalls"
194 select TRACING 222 select GENERIC_TRACER
195 select CONTEXT_SWITCH_TRACER 223 select CONTEXT_SWITCH_TRACER
196 help 224 help
197 This tracer helps developers to optimize boot times: it records 225 This tracer helps developers to optimize boot times: it records
198 the timings of the initcalls and traces key events and the identity 226 the timings of the initcalls and traces key events and the identity
199 of tasks that can cause boot delays, such as context-switches. 227 of tasks that can cause boot delays, such as context-switches.
200 228
201 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 229 Its aim is to be parsed by the scripts/bootgraph.pl tool to
202 produce pretty graphics about boot inefficiencies, giving a visual 230 produce pretty graphics about boot inefficiencies, giving a visual
203 representation of the delays during initcalls - but the raw 231 representation of the delays during initcalls - but the raw
204 /debug/tracing/trace text output is readable too. 232 /debug/tracing/trace text output is readable too.
205 233
206 You must pass in ftrace=initcall to the kernel command line 234 You must pass in initcall_debug and ftrace=initcall to the kernel
207 to enable this on bootup. 235 command line to enable this on bootup.
208 236
209config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool
239 select GENERIC_TRACER
240
241choice
242 prompt "Branch Profiling"
243 default BRANCH_PROFILE_NONE
244 help
 245          Branch profiling is a software profiler. It will add hooks
246 into the C conditionals to test which path a branch takes.
247
248 The likely/unlikely profiler only looks at the conditions that
249 are annotated with a likely or unlikely macro.
250
251 The "all branch" profiler will profile every if statement in the
252 kernel. This profiler will also enable the likely/unlikely
253 profiler as well.
254
 255          Either of the above profilers adds a bit of overhead to the system.
 256          If unsure, choose "No branch profiling".
257
258config BRANCH_PROFILE_NONE
259 bool "No branch profiling"
260 help
261 No branch profiling. Branch profiling adds a bit of overhead.
262 Only enable it if you want to analyse the branching behavior.
263 Otherwise keep it disabled.
264
265config PROFILE_ANNOTATED_BRANCHES
210 bool "Trace likely/unlikely profiler" 266 bool "Trace likely/unlikely profiler"
211 select TRACING 267 select TRACE_BRANCH_PROFILING
212 help 268 help
 213          This tracer profiles all the likely and unlikely macros   269          This tracer profiles all the likely and unlikely macros
214 in the kernel. It will display the results in: 270 in the kernel. It will display the results in:
215 271
216 /debugfs/tracing/profile_annotated_branch 272 /sys/kernel/debug/tracing/profile_annotated_branch
217 273
218 Note: this will add a significant overhead, only turn this 274 Note: this will add a significant overhead, only turn this
219 on if you need to profile the system's use of these macros. 275 on if you need to profile the system's use of these macros.
220 276
221 Say N if unsure.
222
223config PROFILE_ALL_BRANCHES 277config PROFILE_ALL_BRANCHES
224 bool "Profile all if conditionals" 278 bool "Profile all if conditionals"
225 depends on TRACE_BRANCH_PROFILING 279 select TRACE_BRANCH_PROFILING
226 help 280 help
227 This tracer profiles all branch conditions. Every if () 281 This tracer profiles all branch conditions. Every if ()
228 taken in the kernel is recorded whether it hit or miss. 282 taken in the kernel is recorded whether it hit or miss.
229 The results will be displayed in: 283 The results will be displayed in:
230 284
231 /debugfs/tracing/profile_branch 285 /sys/kernel/debug/tracing/profile_branch
286
287 This option also enables the likely/unlikely profiler.
232 288
233 This configuration, when enabled, will impose a great overhead 289 This configuration, when enabled, will impose a great overhead
234 on the system. This should only be enabled when the system 290 on the system. This should only be enabled when the system
235 is to be analyzed 291 is to be analyzed
236 292endchoice
237 Say N if unsure.
238 293
239config TRACING_BRANCHES 294config TRACING_BRANCHES
240 bool 295 bool
@@ -261,7 +316,7 @@ config BRANCH_TRACER
261config POWER_TRACER 316config POWER_TRACER
262 bool "Trace power consumption behavior" 317 bool "Trace power consumption behavior"
263 depends on X86 318 depends on X86
264 select TRACING 319 select GENERIC_TRACER
265 help 320 help
266 This tracer helps developers to analyze and optimize the kernels 321 This tracer helps developers to analyze and optimize the kernels
267 power management decisions, specifically the C-state and P-state 322 power management decisions, specifically the C-state and P-state
@@ -276,7 +331,7 @@ config STACK_TRACER
276 select KALLSYMS 331 select KALLSYMS
277 help 332 help
278 This special tracer records the maximum stack footprint of the 333 This special tracer records the maximum stack footprint of the
279 kernel and displays it in debugfs/tracing/stack_trace. 334 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
280 335
281 This tracer works by hooking into every function call that the 336 This tracer works by hooking into every function call that the
282 kernel executes, and keeping a maximum stack depth value and 337 kernel executes, and keeping a maximum stack depth value and
@@ -295,14 +350,14 @@ config STACK_TRACER
295config HW_BRANCH_TRACER 350config HW_BRANCH_TRACER
296 depends on HAVE_HW_BRANCH_TRACER 351 depends on HAVE_HW_BRANCH_TRACER
297 bool "Trace hw branches" 352 bool "Trace hw branches"
298 select TRACING 353 select GENERIC_TRACER
299 help 354 help
300 This tracer records all branches on the system in a circular 355 This tracer records all branches on the system in a circular
301 buffer giving access to the last N branches for each cpu. 356 buffer giving access to the last N branches for each cpu.
302 357
303config KMEMTRACE 358config KMEMTRACE
304 bool "Trace SLAB allocations" 359 bool "Trace SLAB allocations"
305 select TRACING 360 select GENERIC_TRACER
306 help 361 help
307 kmemtrace provides tracing for slab allocator functions, such as 362 kmemtrace provides tracing for slab allocator functions, such as
308 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 363 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +377,7 @@ config KMEMTRACE
322 377
323config WORKQUEUE_TRACER 378config WORKQUEUE_TRACER
324 bool "Trace workqueues" 379 bool "Trace workqueues"
325 select TRACING 380 select GENERIC_TRACER
326 help 381 help
327 The workqueue tracer provides some statistical informations 382 The workqueue tracer provides some statistical informations
328 about each cpu workqueue thread such as the number of the 383 about each cpu workqueue thread such as the number of the
@@ -338,7 +393,7 @@ config BLK_DEV_IO_TRACE
338 select RELAY 393 select RELAY
339 select DEBUG_FS 394 select DEBUG_FS
340 select TRACEPOINTS 395 select TRACEPOINTS
341 select TRACING 396 select GENERIC_TRACER
342 select STACKTRACE 397 select STACKTRACE
343 help 398 help
344 Say Y here if you want to be able to trace the block layer actions 399 Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +430,20 @@ config DYNAMIC_FTRACE
375 were made. If so, it runs stop_machine (stops all CPUS) 430 were made. If so, it runs stop_machine (stops all CPUS)
376 and modifies the code to jump over the call to ftrace. 431 and modifies the code to jump over the call to ftrace.
377 432
433config FUNCTION_PROFILER
434 bool "Kernel function profiler"
435 depends on FUNCTION_TRACER
436 default n
437 help
438 This option enables the kernel function profiler. A file is created
439 in debugfs called function_profile_enabled which defaults to zero.
 440          When a 1 is echoed into this file, profiling begins, and when a
 441          zero is entered, profiling stops. A file in the trace_stats
 442          directory called functions shows the list of functions that
 443          have been hit and their counters.
444
445 If in doubt, say N
446
378config FTRACE_MCOUNT_RECORD 447config FTRACE_MCOUNT_RECORD
379 def_bool y 448 def_bool y
380 depends on DYNAMIC_FTRACE 449 depends on DYNAMIC_FTRACE
@@ -385,7 +454,7 @@ config FTRACE_SELFTEST
385 454
386config FTRACE_STARTUP_TEST 455config FTRACE_STARTUP_TEST
387 bool "Perform a startup test on ftrace" 456 bool "Perform a startup test on ftrace"
388 depends on TRACING 457 depends on GENERIC_TRACER
389 select FTRACE_SELFTEST 458 select FTRACE_SELFTEST
390 help 459 help
391 This option performs a series of startup tests on ftrace. On bootup 460 This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +465,7 @@ config FTRACE_STARTUP_TEST
396config MMIOTRACE 465config MMIOTRACE
397 bool "Memory mapped IO tracing" 466 bool "Memory mapped IO tracing"
398 depends on HAVE_MMIOTRACE_SUPPORT && PCI 467 depends on HAVE_MMIOTRACE_SUPPORT && PCI
399 select TRACING 468 select GENERIC_TRACER
400 help 469 help
401 Mmiotrace traces Memory Mapped I/O access and is meant for 470 Mmiotrace traces Memory Mapped I/O access and is meant for
402 debugging and reverse engineering. It is called from the ioremap 471 debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +485,23 @@ config MMIOTRACE_TEST
416 485
417 Say N, unless you absolutely know what you are doing. 486 Say N, unless you absolutely know what you are doing.
418 487
419endmenu 488config RING_BUFFER_BENCHMARK
489 tristate "Ring buffer benchmark stress tester"
490 depends on RING_BUFFER
491 help
 492          This option creates a test to stress the ring buffer and benchmark it.
 493          It creates its own ring buffer such that it will not interfere with
494 any other users of the ring buffer (such as ftrace). It then creates
495 a producer and consumer that will run for 10 seconds and sleep for
496 10 seconds. Each interval it will print out the number of events
497 it recorded and give a rough estimate of how long each iteration took.
498
499 It does not disable interrupts or raise its priority, so it may be
500 affected by processes that are running.
501
502 If unsure, say N
503
504endif # FTRACE
420 505
421endif # TRACING_SUPPORT 506endif # TRACING_SUPPORT
422 507
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec1..844164dca90a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18#
19# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example:
21#
22obj-y += trace_clock.o
23
18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
26obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
20 27
21obj-$(CONFIG_TRACING) += trace.o 28obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o 29obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
39obj-$(CONFIG_POWER_TRACER) += trace_power.o 45obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o 49ifeq ($(CONFIG_BLOCK),y)
44obj-$(CONFIG_EVENT_TRACER) += events.o 50obj-$(CONFIG_EVENT_TRACING) += blktrace.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events.o
53obj-$(CONFIG_EVENT_TRACING) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
49 57
50libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0ba..7a34cb563fec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,11 +22,16 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28
29#include <trace/events/block.h>
30
28#include "trace_output.h" 31#include "trace_output.h"
29 32
33#ifdef CONFIG_BLK_DEV_IO_TRACE
34
30static unsigned int blktrace_seq __read_mostly = 1; 35static unsigned int blktrace_seq __read_mostly = 1;
31 36
32static struct trace_array *blk_tr; 37static struct trace_array *blk_tr;
@@ -147,7 +152,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
147{ 152{
148 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) 153 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 return 1; 154 return 1;
150 if (sector < bt->start_lba || sector > bt->end_lba) 155 if (sector && (sector < bt->start_lba || sector > bt->end_lba))
151 return 1; 156 return 1;
152 if (bt->pid && pid != bt->pid) 157 if (bt->pid && pid != bt->pid)
153 return 1; 158 return 1;
@@ -192,7 +197,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
192 what |= MASK_TC_BIT(rw, DISCARD); 197 what |= MASK_TC_BIT(rw, DISCARD);
193 198
194 pid = tsk->pid; 199 pid = tsk->pid;
195 if (unlikely(act_log_check(bt, what, sector, pid))) 200 if (act_log_check(bt, what, sector, pid))
196 return; 201 return;
197 cpu = raw_smp_processor_id(); 202 cpu = raw_smp_processor_id();
198 203
@@ -263,6 +268,7 @@ static void blk_trace_free(struct blk_trace *bt)
263 debugfs_remove(bt->msg_file); 268 debugfs_remove(bt->msg_file);
264 debugfs_remove(bt->dropped_file); 269 debugfs_remove(bt->dropped_file);
265 relay_close(bt->rchan); 270 relay_close(bt->rchan);
271 debugfs_remove(bt->dir);
266 free_percpu(bt->sequence); 272 free_percpu(bt->sequence);
267 free_percpu(bt->msg_data); 273 free_percpu(bt->msg_data);
268 kfree(bt); 274 kfree(bt);
@@ -372,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
372 378
373static int blk_remove_buf_file_callback(struct dentry *dentry) 379static int blk_remove_buf_file_callback(struct dentry *dentry)
374{ 380{
375 struct dentry *parent = dentry->d_parent;
376 debugfs_remove(dentry); 381 debugfs_remove(dentry);
377 382
378 /*
379 * this will fail for all but the last file, but that is ok. what we
380 * care about is the top level buts->name directory going away, when
381 * the last trace file is gone. Then we don't have to rmdir() that
382 * manually on trace stop, so it nicely solves the issue with
383 * force killing of running traces.
384 */
385
386 debugfs_remove(parent);
387 return 0; 383 return 0;
388} 384}
389 385
@@ -403,11 +399,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
403 .remove_buf_file = blk_remove_buf_file_callback, 399 .remove_buf_file = blk_remove_buf_file_callback,
404}; 400};
405 401
402static void blk_trace_setup_lba(struct blk_trace *bt,
403 struct block_device *bdev)
404{
405 struct hd_struct *part = NULL;
406
407 if (bdev)
408 part = bdev->bd_part;
409
410 if (part) {
411 bt->start_lba = part->start_sect;
412 bt->end_lba = part->start_sect + part->nr_sects;
413 } else {
414 bt->start_lba = 0;
415 bt->end_lba = -1ULL;
416 }
417}
418
406/* 419/*
407 * Setup everything required to start tracing 420 * Setup everything required to start tracing
408 */ 421 */
409int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 422int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 struct blk_user_trace_setup *buts) 423 struct block_device *bdev,
424 struct blk_user_trace_setup *buts)
411{ 425{
412 struct blk_trace *old_bt, *bt = NULL; 426 struct blk_trace *old_bt, *bt = NULL;
413 struct dentry *dir = NULL; 427 struct dentry *dir = NULL;
@@ -480,10 +494,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
480 if (!bt->act_mask) 494 if (!bt->act_mask)
481 bt->act_mask = (u16) -1; 495 bt->act_mask = (u16) -1;
482 496
483 bt->start_lba = buts->start_lba; 497 blk_trace_setup_lba(bt, bdev);
484 bt->end_lba = buts->end_lba; 498
485 if (!bt->end_lba) 499 /* overwrite with user settings */
486 bt->end_lba = -1ULL; 500 if (buts->start_lba)
501 bt->start_lba = buts->start_lba;
502 if (buts->end_lba)
503 bt->end_lba = buts->end_lba;
487 504
488 bt->pid = buts->pid; 505 bt->pid = buts->pid;
489 bt->trace_state = Blktrace_setup; 506 bt->trace_state = Blktrace_setup;
@@ -505,6 +522,7 @@ err:
505} 522}
506 523
507int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 524int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
525 struct block_device *bdev,
508 char __user *arg) 526 char __user *arg)
509{ 527{
510 struct blk_user_trace_setup buts; 528 struct blk_user_trace_setup buts;
@@ -514,7 +532,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
514 if (ret) 532 if (ret)
515 return -EFAULT; 533 return -EFAULT;
516 534
517 ret = do_blk_trace_setup(q, name, dev, &buts); 535 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
518 if (ret) 536 if (ret)
519 return ret; 537 return ret;
520 538
@@ -582,7 +600,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
582 switch (cmd) { 600 switch (cmd) {
583 case BLKTRACESETUP: 601 case BLKTRACESETUP:
584 bdevname(bdev, b); 602 bdevname(bdev, b);
585 ret = blk_trace_setup(q, b, bdev->bd_dev, arg); 603 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
586 break; 604 break;
587 case BLKTRACESTART: 605 case BLKTRACESTART:
588 start = 1; 606 start = 1;
@@ -642,12 +660,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
642 660
643 if (blk_pc_request(rq)) { 661 if (blk_pc_request(rq)) {
644 what |= BLK_TC_ACT(BLK_TC_PC); 662 what |= BLK_TC_ACT(BLK_TC_PC);
645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 663 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
646 rq->cmd_len, rq->cmd); 664 what, rq->errors, rq->cmd_len, rq->cmd);
647 } else { 665 } else {
648 what |= BLK_TC_ACT(BLK_TC_FS); 666 what |= BLK_TC_ACT(BLK_TC_FS);
649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 667 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
650 rw, what, rq->errors, 0, NULL); 668 what, rq->errors, 0, NULL);
651 } 669 }
652} 670}
653 671
@@ -809,7 +827,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
809 * @bio: the source bio 827 * @bio: the source bio
810 * @dev: target device 828 * @dev: target device
811 * @from: source sector 829 * @from: source sector
812 * @to: target sector
813 * 830 *
814 * Description: 831 * Description:
815 * Device mapper or raid target sometimes need to split a bio because 832 * Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +834,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
817 * 834 *
818 **/ 835 **/
819static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 836static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 dev_t dev, sector_t from, sector_t to) 837 dev_t dev, sector_t from)
821{ 838{
822 struct blk_trace *bt = q->blk_trace; 839 struct blk_trace *bt = q->blk_trace;
823 struct blk_io_trace_remap r; 840 struct blk_io_trace_remap r;
@@ -825,12 +842,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
825 if (likely(!bt)) 842 if (likely(!bt))
826 return; 843 return;
827 844
828 r.device = cpu_to_be32(dev); 845 r.device_from = cpu_to_be32(dev);
829 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); 846 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
830 r.sector = cpu_to_be64(to); 847 r.sector_from = cpu_to_be64(from);
831 848
832 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, 849 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
833 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 850 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
851 sizeof(r), &r);
834} 852}
835 853
836/** 854/**
@@ -854,11 +872,11 @@ void blk_add_driver_data(struct request_queue *q,
854 return; 872 return;
855 873
856 if (blk_pc_request(rq)) 874 if (blk_pc_request(rq))
857 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, 875 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
858 rq->errors, len, data); 876 BLK_TA_DRV_DATA, rq->errors, len, data);
859 else 877 else
860 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 878 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
861 0, BLK_TA_DRV_DATA, rq->errors, len, data); 879 BLK_TA_DRV_DATA, rq->errors, len, data);
862} 880}
863EXPORT_SYMBOL_GPL(blk_add_driver_data); 881EXPORT_SYMBOL_GPL(blk_add_driver_data);
864 882
@@ -971,6 +989,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
971 return te_blk_io_trace(ent) + 1; 989 return te_blk_io_trace(ent) + 1;
972} 990}
973 991
992static inline u32 t_action(const struct trace_entry *ent)
993{
994 return te_blk_io_trace(ent)->action;
995}
996
997static inline u32 t_bytes(const struct trace_entry *ent)
998{
999 return te_blk_io_trace(ent)->bytes;
1000}
1001
974static inline u32 t_sec(const struct trace_entry *ent) 1002static inline u32 t_sec(const struct trace_entry *ent)
975{ 1003{
976 return te_blk_io_trace(ent)->bytes >> 9; 1004 return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1024,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
996 struct blk_io_trace_remap *r) 1024 struct blk_io_trace_remap *r)
997{ 1025{
998 const struct blk_io_trace_remap *__r = pdu_start(ent); 1026 const struct blk_io_trace_remap *__r = pdu_start(ent);
999 __u64 sector = __r->sector; 1027 __u64 sector_from = __r->sector_from;
1000 1028
1001 r->device = be32_to_cpu(__r->device);
1002 r->device_from = be32_to_cpu(__r->device_from); 1029 r->device_from = be32_to_cpu(__r->device_from);
1003 r->sector = be64_to_cpu(sector); 1030 r->device_to = be32_to_cpu(__r->device_to);
1031 r->sector_from = be64_to_cpu(sector_from);
1004} 1032}
1005 1033
1006typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1034typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1059,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
1031 MAJOR(t->device), MINOR(t->device), act, rwbs); 1059 MAJOR(t->device), MINOR(t->device), act, rwbs);
1032} 1060}
1033 1061
1062static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1063{
1064 const unsigned char *pdu_buf;
1065 int pdu_len;
1066 int i, end, ret;
1067
1068 pdu_buf = pdu_start(ent);
1069 pdu_len = te_blk_io_trace(ent)->pdu_len;
1070
1071 if (!pdu_len)
1072 return 1;
1073
1074 /* find the last zero that needs to be printed */
1075 for (end = pdu_len - 1; end >= 0; end--)
1076 if (pdu_buf[end])
1077 break;
1078 end++;
1079
1080 if (!trace_seq_putc(s, '('))
1081 return 0;
1082
1083 for (i = 0; i < pdu_len; i++) {
1084
1085 ret = trace_seq_printf(s, "%s%02x",
1086 i == 0 ? "" : " ", pdu_buf[i]);
1087 if (!ret)
1088 return ret;
1089
1090 /*
1091 * stop when the rest is just zeroes and indicate so
1092 * with a ".." appended
1093 */
1094 if (i == end && end != pdu_len - 1)
1095 return trace_seq_puts(s, " ..) ");
1096 }
1097
1098 return trace_seq_puts(s, ") ");
1099}
1100
1034static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1101static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035{ 1102{
1036 char cmd[TASK_COMM_LEN]; 1103 char cmd[TASK_COMM_LEN];
1037 1104
1038 trace_find_cmdline(ent->pid, cmd); 1105 trace_find_cmdline(ent->pid, cmd);
1039 1106
1040 if (t_sec(ent)) 1107 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1041 return trace_seq_printf(s, "%llu + %u [%s]\n", 1108 int ret;
1042 t_sector(ent), t_sec(ent), cmd); 1109
1043 return trace_seq_printf(s, "[%s]\n", cmd); 1110 ret = trace_seq_printf(s, "%u ", t_bytes(ent));
1111 if (!ret)
1112 return 0;
1113 ret = blk_log_dump_pdu(s, ent);
1114 if (!ret)
1115 return 0;
1116 return trace_seq_printf(s, "[%s]\n", cmd);
1117 } else {
1118 if (t_sec(ent))
1119 return trace_seq_printf(s, "%llu + %u [%s]\n",
1120 t_sector(ent), t_sec(ent), cmd);
1121 return trace_seq_printf(s, "[%s]\n", cmd);
1122 }
1044} 1123}
1045 1124
1046static int blk_log_with_error(struct trace_seq *s, 1125static int blk_log_with_error(struct trace_seq *s,
1047 const struct trace_entry *ent) 1126 const struct trace_entry *ent)
1048{ 1127{
1049 if (t_sec(ent)) 1128 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1050 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), 1129 int ret;
1051 t_sec(ent), t_error(ent)); 1130
1052 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); 1131 ret = blk_log_dump_pdu(s, ent);
1132 if (ret)
1133 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1134 return 0;
1135 } else {
1136 if (t_sec(ent))
1137 return trace_seq_printf(s, "%llu + %u [%d]\n",
1138 t_sector(ent),
1139 t_sec(ent), t_error(ent));
1140 return trace_seq_printf(s, "%llu [%d]\n",
1141 t_sector(ent), t_error(ent));
1142 }
1053} 1143}
1054 1144
1055static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1145static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056{ 1146{
1057 struct blk_io_trace_remap r = { .device = 0, }; 1147 struct blk_io_trace_remap r = { .device_from = 0, };
1058 1148
1059 get_pdu_remap(ent, &r); 1149 get_pdu_remap(ent, &r);
1060 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1150 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 t_sector(ent), 1151 t_sector(ent), t_sec(ent),
1062 t_sec(ent), MAJOR(r.device), MINOR(r.device), 1152 MAJOR(r.device_from), MINOR(r.device_from),
1063 (unsigned long long)r.sector); 1153 (unsigned long long)r.sector_from);
1064} 1154}
1065 1155
1066static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1156static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1207,6 @@ static void blk_tracer_print_header(struct seq_file *m)
1117static void blk_tracer_start(struct trace_array *tr) 1207static void blk_tracer_start(struct trace_array *tr)
1118{ 1208{
1119 blk_tracer_enabled = true; 1209 blk_tracer_enabled = true;
1120 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121} 1210}
1122 1211
1123static int blk_tracer_init(struct trace_array *tr) 1212static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1219,6 @@ static int blk_tracer_init(struct trace_array *tr)
1130static void blk_tracer_stop(struct trace_array *tr) 1219static void blk_tracer_stop(struct trace_array *tr)
1131{ 1220{
1132 blk_tracer_enabled = false; 1221 blk_tracer_enabled = false;
1133 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134} 1222}
1135 1223
1136static void blk_tracer_reset(struct trace_array *tr) 1224static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1270,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1182 } 1270 }
1183 1271
1184 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1272 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 ret = trace_seq_printf(s, "Bad pc action %x\n", what); 1273 ret = trace_seq_printf(s, "Unknown action %x\n", what);
1186 else { 1274 else {
1187 ret = log_action(iter, what2act[what].act[long_act]); 1275 ret = log_action(iter, what2act[what].act[long_act]);
1188 if (ret) 1276 if (ret)
@@ -1195,9 +1283,6 @@ out:
1195static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1283static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 int flags) 1284 int flags)
1197{ 1285{
1198 if (!trace_print_context(iter))
1199 return TRACE_TYPE_PARTIAL_LINE;
1200
1201 return print_one_line(iter, false); 1286 return print_one_line(iter, false);
1202} 1287}
1203 1288
@@ -1232,6 +1317,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1232 return print_one_line(iter, true); 1317 return print_one_line(iter, true);
1233} 1318}
1234 1319
1320static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
1321{
1322 /* don't output context-info for blk_classic output */
1323 if (bit == TRACE_BLK_OPT_CLASSIC) {
1324 if (set)
1325 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1326 else
1327 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1328 }
1329 return 0;
1330}
1331
1235static struct tracer blk_tracer __read_mostly = { 1332static struct tracer blk_tracer __read_mostly = {
1236 .name = "blk", 1333 .name = "blk",
1237 .init = blk_tracer_init, 1334 .init = blk_tracer_init,
@@ -1241,6 +1338,7 @@ static struct tracer blk_tracer __read_mostly = {
1241 .print_header = blk_tracer_print_header, 1338 .print_header = blk_tracer_print_header,
1242 .print_line = blk_tracer_print_line, 1339 .print_line = blk_tracer_print_line,
1243 .flags = &blk_tracer_flags, 1340 .flags = &blk_tracer_flags,
1341 .set_flag = blk_tracer_set_flag,
1244}; 1342};
1245 1343
1246static struct trace_event trace_blk_event = { 1344static struct trace_event trace_blk_event = {
@@ -1285,7 +1383,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
1285/* 1383/*
1286 * Setup everything required to start tracing 1384 * Setup everything required to start tracing
1287 */ 1385 */
1288static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) 1386static int blk_trace_setup_queue(struct request_queue *q,
1387 struct block_device *bdev)
1289{ 1388{
1290 struct blk_trace *old_bt, *bt = NULL; 1389 struct blk_trace *old_bt, *bt = NULL;
1291 int ret = -ENOMEM; 1390 int ret = -ENOMEM;
@@ -1298,9 +1397,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1298 if (!bt->msg_data) 1397 if (!bt->msg_data)
1299 goto free_bt; 1398 goto free_bt;
1300 1399
1301 bt->dev = dev; 1400 bt->dev = bdev->bd_dev;
1302 bt->act_mask = (u16)-1; 1401 bt->act_mask = (u16)-1;
1303 bt->end_lba = -1ULL; 1402
1403 blk_trace_setup_lba(bt, bdev);
1304 1404
1305 old_bt = xchg(&q->blk_trace, bt); 1405 old_bt = xchg(&q->blk_trace, bt);
1306 if (old_bt != NULL) { 1406 if (old_bt != NULL) {
@@ -1517,7 +1617,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1517 1617
1518 if (attr == &dev_attr_enable) { 1618 if (attr == &dev_attr_enable) {
1519 if (value) 1619 if (value)
1520 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1620 ret = blk_trace_setup_queue(q, bdev);
1521 else 1621 else
1522 ret = blk_trace_remove_queue(q); 1622 ret = blk_trace_remove_queue(q);
1523 goto out_unlock_bdev; 1623 goto out_unlock_bdev;
@@ -1525,7 +1625,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1525 1625
1526 ret = 0; 1626 ret = 0;
1527 if (q->blk_trace == NULL) 1627 if (q->blk_trace == NULL)
1528 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1628 ret = blk_trace_setup_queue(q, bdev);
1529 1629
1530 if (ret == 0) { 1630 if (ret == 0) {
1531 if (attr == &dev_attr_act_mask) 1631 if (attr == &dev_attr_act_mask)
@@ -1548,3 +1648,77 @@ out:
1548 return ret ? ret : count; 1648 return ret ? ret : count;
1549} 1649}
1550 1650
1651int blk_trace_init_sysfs(struct device *dev)
1652{
1653 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1654}
1655
1656#endif /* CONFIG_BLK_DEV_IO_TRACE */
1657
1658#ifdef CONFIG_EVENT_TRACING
1659
1660void blk_dump_cmd(char *buf, struct request *rq)
1661{
1662 int i, end;
1663 int len = rq->cmd_len;
1664 unsigned char *cmd = rq->cmd;
1665
1666 if (!blk_pc_request(rq)) {
1667 buf[0] = '\0';
1668 return;
1669 }
1670
1671 for (end = len - 1; end >= 0; end--)
1672 if (cmd[end])
1673 break;
1674 end++;
1675
1676 for (i = 0; i < len; i++) {
1677 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
1678 if (i == end && end != len - 1) {
1679 sprintf(buf, " ..");
1680 break;
1681 }
1682 }
1683}
1684
1685void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1686{
1687 int i = 0;
1688
1689 if (rw & WRITE)
1690 rwbs[i++] = 'W';
1691 else if (rw & 1 << BIO_RW_DISCARD)
1692 rwbs[i++] = 'D';
1693 else if (bytes)
1694 rwbs[i++] = 'R';
1695 else
1696 rwbs[i++] = 'N';
1697
1698 if (rw & 1 << BIO_RW_AHEAD)
1699 rwbs[i++] = 'A';
1700 if (rw & 1 << BIO_RW_BARRIER)
1701 rwbs[i++] = 'B';
1702 if (rw & 1 << BIO_RW_SYNCIO)
1703 rwbs[i++] = 'S';
1704 if (rw & 1 << BIO_RW_META)
1705 rwbs[i++] = 'M';
1706
1707 rwbs[i] = '\0';
1708}
1709
1710void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1711{
1712 int rw = rq->cmd_flags & 0x03;
1713 int bytes;
1714
1715 if (blk_discard_rq(rq))
1716 rw |= (1 << BIO_RW_DISCARD);
1717
1718 bytes = blk_rq_bytes(rq);
1719
1720 blk_fill_rwbs(rwbs, rw, bytes);
1721}
1722
1723#endif /* CONFIG_EVENT_TRACING */
1724
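The blk_fill_rwbs()/blk_fill_rwbs_rq() helpers added above render the direction/flag string used by the block trace events. A minimal sketch of a caller; the surrounding function is illustrative, and the six-byte buffer is sized from the worst case the helper above can emit (direction, A, B, S, M, NUL).

#include <linux/bio.h>
#include <linux/kernel.h>

static void example_show_bio_flags(struct bio *bio)
{
	char rwbs[6];

	blk_fill_rwbs(rwbs, bio->bi_rw, bio->bi_size);
	pr_debug("bio at sector %llu: %s\n",
		 (unsigned long long)bio->bi_sector, rwbs);
}
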
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 246f2aa6dc46..000000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,14 +0,0 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c3..25edd5cc5935 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,11 +29,13 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31 31
32#include <trace/sched.h> 32#include <trace/events/sched.h>
33 33
34#include <asm/ftrace.h> 34#include <asm/ftrace.h>
35#include <asm/setup.h>
35 36
36#include "trace.h" 37#include "trace_output.h"
38#include "trace_stat.h"
37 39
38#define FTRACE_WARN_ON(cond) \ 40#define FTRACE_WARN_ON(cond) \
39 do { \ 41 do { \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
68 70
69static struct ftrace_ops ftrace_list_end __read_mostly = 71static struct ftrace_ops ftrace_list_end __read_mostly =
70{ 72{
71 .func = ftrace_stub, 73 .func = ftrace_stub,
72}; 74};
73 75
74static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 76static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,581 @@ static void ftrace_update_pid_func(void)
240#endif 242#endif
241} 243}
242 244
245#ifdef CONFIG_FUNCTION_PROFILER
246struct ftrace_profile {
247 struct hlist_node node;
248 unsigned long ip;
249 unsigned long counter;
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251 unsigned long long time;
252#endif
253};
254
255struct ftrace_profile_page {
256 struct ftrace_profile_page *next;
257 unsigned long index;
258 struct ftrace_profile records[];
259};
260
261struct ftrace_profile_stat {
262 atomic_t disabled;
263 struct hlist_head *hash;
264 struct ftrace_profile_page *pages;
265 struct ftrace_profile_page *start;
266 struct tracer_stat stat;
267};
268
269#define PROFILE_RECORDS_SIZE \
270 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
271
272#define PROFILES_PER_PAGE \
273 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
274
275static int ftrace_profile_bits __read_mostly;
276static int ftrace_profile_enabled __read_mostly;
277
278/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
279static DEFINE_MUTEX(ftrace_profile_lock);
280
281static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
282
283#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
284
285static void *
286function_stat_next(void *v, int idx)
287{
288 struct ftrace_profile *rec = v;
289 struct ftrace_profile_page *pg;
290
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292
293 again:
294 if (idx != 0)
295 rec++;
296
297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
298 pg = pg->next;
299 if (!pg)
300 return NULL;
301 rec = &pg->records[0];
302 if (!rec->counter)
303 goto again;
304 }
305
306 return rec;
307}
308
309static void *function_stat_start(struct tracer_stat *trace)
310{
311 struct ftrace_profile_stat *stat =
312 container_of(trace, struct ftrace_profile_stat, stat);
313
314 if (!stat || !stat->start)
315 return NULL;
316
317 return function_stat_next(&stat->start->records[0], 0);
318}
319
320#ifdef CONFIG_FUNCTION_GRAPH_TRACER
321/* function graph compares on total time */
322static int function_stat_cmp(void *p1, void *p2)
323{
324 struct ftrace_profile *a = p1;
325 struct ftrace_profile *b = p2;
326
327 if (a->time < b->time)
328 return -1;
329 if (a->time > b->time)
330 return 1;
331 else
332 return 0;
333}
334#else
 335/* without function graph, compare against hit counts */
336static int function_stat_cmp(void *p1, void *p2)
337{
338 struct ftrace_profile *a = p1;
339 struct ftrace_profile *b = p2;
340
341 if (a->counter < b->counter)
342 return -1;
343 if (a->counter > b->counter)
344 return 1;
345 else
346 return 0;
347}
348#endif
349
350static int function_stat_headers(struct seq_file *m)
351{
352#ifdef CONFIG_FUNCTION_GRAPH_TRACER
353 seq_printf(m, " Function "
354 "Hit Time Avg\n"
355 " -------- "
356 "--- ---- ---\n");
357#else
358 seq_printf(m, " Function Hit\n"
359 " -------- ---\n");
360#endif
361 return 0;
362}
363
364static int function_stat_show(struct seq_file *m, void *v)
365{
366 struct ftrace_profile *rec = v;
367 char str[KSYM_SYMBOL_LEN];
368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
369 static DEFINE_MUTEX(mutex);
370 static struct trace_seq s;
371 unsigned long long avg;
372#endif
373
374 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
375 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
376
377#ifdef CONFIG_FUNCTION_GRAPH_TRACER
378 seq_printf(m, " ");
379 avg = rec->time;
380 do_div(avg, rec->counter);
381
382 mutex_lock(&mutex);
383 trace_seq_init(&s);
384 trace_print_graph_duration(rec->time, &s);
385 trace_seq_puts(&s, " ");
386 trace_print_graph_duration(avg, &s);
387 trace_print_seq(m, &s);
388 mutex_unlock(&mutex);
389#endif
390 seq_putc(m, '\n');
391
392 return 0;
393}
394
395static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
396{
397 struct ftrace_profile_page *pg;
398
399 pg = stat->pages = stat->start;
400
401 while (pg) {
402 memset(pg->records, 0, PROFILE_RECORDS_SIZE);
403 pg->index = 0;
404 pg = pg->next;
405 }
406
407 memset(stat->hash, 0,
408 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
409}
410
411int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
412{
413 struct ftrace_profile_page *pg;
414 int functions;
415 int pages;
416 int i;
417
418 /* If we already allocated, do nothing */
419 if (stat->pages)
420 return 0;
421
422 stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
423 if (!stat->pages)
424 return -ENOMEM;
425
426#ifdef CONFIG_DYNAMIC_FTRACE
427 functions = ftrace_update_tot_cnt;
428#else
429 /*
430 * We do not know the number of functions that exist because
 431 * dynamic tracing is what counts them. From past experience
432 * we have around 20K functions. That should be more than enough.
433 * It is highly unlikely we will execute every function in
434 * the kernel.
435 */
436 functions = 20000;
437#endif
438
439 pg = stat->start = stat->pages;
440
441 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
442
443 for (i = 0; i < pages; i++) {
444 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
445 if (!pg->next)
446 goto out_free;
447 pg = pg->next;
448 }
449
450 return 0;
451
452 out_free:
453 pg = stat->start;
454 while (pg) {
455 unsigned long tmp = (unsigned long)pg;
456
457 pg = pg->next;
458 free_page(tmp);
459 }
460
461 free_page((unsigned long)stat->pages);
462 stat->pages = NULL;
463 stat->start = NULL;
464
465 return -ENOMEM;
466}
467
468static int ftrace_profile_init_cpu(int cpu)
469{
470 struct ftrace_profile_stat *stat;
471 int size;
472
473 stat = &per_cpu(ftrace_profile_stats, cpu);
474
475 if (stat->hash) {
476 /* If the profile is already created, simply reset it */
477 ftrace_profile_reset(stat);
478 return 0;
479 }
480
481 /*
482 * We are profiling all functions, but usually only a few thousand
483 * functions are hit. We'll make a hash of 1024 items.
484 */
485 size = FTRACE_PROFILE_HASH_SIZE;
486
487 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
488
489 if (!stat->hash)
490 return -ENOMEM;
491
492 if (!ftrace_profile_bits) {
493 size--;
494
495 for (; size; size >>= 1)
496 ftrace_profile_bits++;
497 }
498
499 /* Preallocate the function profiling pages */
500 if (ftrace_profile_pages_init(stat) < 0) {
501 kfree(stat->hash);
502 stat->hash = NULL;
503 return -ENOMEM;
504 }
505
506 return 0;
507}
508
509static int ftrace_profile_init(void)
510{
511 int cpu;
512 int ret = 0;
513
514 for_each_online_cpu(cpu) {
515 ret = ftrace_profile_init_cpu(cpu);
516 if (ret)
517 break;
518 }
519
520 return ret;
521}
522
523/* interrupts must be disabled */
524static struct ftrace_profile *
525ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
526{
527 struct ftrace_profile *rec;
528 struct hlist_head *hhd;
529 struct hlist_node *n;
530 unsigned long key;
531
532 key = hash_long(ip, ftrace_profile_bits);
533 hhd = &stat->hash[key];
534
535 if (hlist_empty(hhd))
536 return NULL;
537
538 hlist_for_each_entry_rcu(rec, n, hhd, node) {
539 if (rec->ip == ip)
540 return rec;
541 }
542
543 return NULL;
544}
545
546static void ftrace_add_profile(struct ftrace_profile_stat *stat,
547 struct ftrace_profile *rec)
548{
549 unsigned long key;
550
551 key = hash_long(rec->ip, ftrace_profile_bits);
552 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
553}
554
555/*
 556 * The memory is already allocated; this simply finds a new record to use.
557 */
558static struct ftrace_profile *
559ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
560{
561 struct ftrace_profile *rec = NULL;
562
563 /* prevent recursion (from NMIs) */
564 if (atomic_inc_return(&stat->disabled) != 1)
565 goto out;
566
567 /*
568 * Try to find the function again since an NMI
569 * could have added it
570 */
571 rec = ftrace_find_profiled_func(stat, ip);
572 if (rec)
573 goto out;
574
575 if (stat->pages->index == PROFILES_PER_PAGE) {
576 if (!stat->pages->next)
577 goto out;
578 stat->pages = stat->pages->next;
579 }
580
581 rec = &stat->pages->records[stat->pages->index++];
582 rec->ip = ip;
583 ftrace_add_profile(stat, rec);
584
585 out:
586 atomic_dec(&stat->disabled);
587
588 return rec;
589}
590
591static void
592function_profile_call(unsigned long ip, unsigned long parent_ip)
593{
594 struct ftrace_profile_stat *stat;
595 struct ftrace_profile *rec;
596 unsigned long flags;
597
598 if (!ftrace_profile_enabled)
599 return;
600
601 local_irq_save(flags);
602
603 stat = &__get_cpu_var(ftrace_profile_stats);
604 if (!stat->hash || !ftrace_profile_enabled)
605 goto out;
606
607 rec = ftrace_find_profiled_func(stat, ip);
608 if (!rec) {
609 rec = ftrace_profile_alloc(stat, ip);
610 if (!rec)
611 goto out;
612 }
613
614 rec->counter++;
615 out:
616 local_irq_restore(flags);
617}
618
619#ifdef CONFIG_FUNCTION_GRAPH_TRACER
620static int profile_graph_entry(struct ftrace_graph_ent *trace)
621{
622 function_profile_call(trace->func, 0);
623 return 1;
624}
625
626static void profile_graph_return(struct ftrace_graph_ret *trace)
627{
628 struct ftrace_profile_stat *stat;
629 unsigned long long calltime;
630 struct ftrace_profile *rec;
631 unsigned long flags;
632
633 local_irq_save(flags);
634 stat = &__get_cpu_var(ftrace_profile_stats);
635 if (!stat->hash || !ftrace_profile_enabled)
636 goto out;
637
638 calltime = trace->rettime - trace->calltime;
639
640 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
641 int index;
642
643 index = trace->depth;
644
645 /* Append this call time to the parent time to subtract */
646 if (index)
647 current->ret_stack[index - 1].subtime += calltime;
648
649 if (current->ret_stack[index].subtime < calltime)
650 calltime -= current->ret_stack[index].subtime;
651 else
652 calltime = 0;
653 }
654
655 rec = ftrace_find_profiled_func(stat, trace->func);
656 if (rec)
657 rec->time += calltime;
658
659 out:
660 local_irq_restore(flags);
661}
662
663static int register_ftrace_profiler(void)
664{
665 return register_ftrace_graph(&profile_graph_return,
666 &profile_graph_entry);
667}
668
669static void unregister_ftrace_profiler(void)
670{
671 unregister_ftrace_graph();
672}
673#else
674static struct ftrace_ops ftrace_profile_ops __read_mostly =
675{
676 .func = function_profile_call,
677};
678
679static int register_ftrace_profiler(void)
680{
681 return register_ftrace_function(&ftrace_profile_ops);
682}
683
684static void unregister_ftrace_profiler(void)
685{
686 unregister_ftrace_function(&ftrace_profile_ops);
687}
688#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
689
690static ssize_t
691ftrace_profile_write(struct file *filp, const char __user *ubuf,
692 size_t cnt, loff_t *ppos)
693{
694 unsigned long val;
695 char buf[64]; /* big enough to hold a number */
696 int ret;
697
698 if (cnt >= sizeof(buf))
699 return -EINVAL;
700
701 if (copy_from_user(&buf, ubuf, cnt))
702 return -EFAULT;
703
704 buf[cnt] = 0;
705
706 ret = strict_strtoul(buf, 10, &val);
707 if (ret < 0)
708 return ret;
709
710 val = !!val;
711
712 mutex_lock(&ftrace_profile_lock);
713 if (ftrace_profile_enabled ^ val) {
714 if (val) {
715 ret = ftrace_profile_init();
716 if (ret < 0) {
717 cnt = ret;
718 goto out;
719 }
720
721 ret = register_ftrace_profiler();
722 if (ret < 0) {
723 cnt = ret;
724 goto out;
725 }
726 ftrace_profile_enabled = 1;
727 } else {
728 ftrace_profile_enabled = 0;
729 /*
730 * unregister_ftrace_profiler calls stop_machine
 731 * so this acts like a synchronize_sched.
732 */
733 unregister_ftrace_profiler();
734 }
735 }
736 out:
737 mutex_unlock(&ftrace_profile_lock);
738
739 filp->f_pos += cnt;
740
741 return cnt;
742}
743
744static ssize_t
745ftrace_profile_read(struct file *filp, char __user *ubuf,
746 size_t cnt, loff_t *ppos)
747{
748 char buf[64]; /* big enough to hold a number */
749 int r;
750
751 r = sprintf(buf, "%u\n", ftrace_profile_enabled);
752 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
753}
754
755static const struct file_operations ftrace_profile_fops = {
756 .open = tracing_open_generic,
757 .read = ftrace_profile_read,
758 .write = ftrace_profile_write,
759};
760
761/* used to initialize the real stat files */
762static struct tracer_stat function_stats __initdata = {
763 .name = "functions",
764 .stat_start = function_stat_start,
765 .stat_next = function_stat_next,
766 .stat_cmp = function_stat_cmp,
767 .stat_headers = function_stat_headers,
768 .stat_show = function_stat_show
769};
770
771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
772{
773 struct ftrace_profile_stat *stat;
774 struct dentry *entry;
775 char *name;
776 int ret;
777 int cpu;
778
779 for_each_possible_cpu(cpu) {
780 stat = &per_cpu(ftrace_profile_stats, cpu);
781
782 /* allocate enough for function name + cpu number */
783 name = kmalloc(32, GFP_KERNEL);
784 if (!name) {
785 /*
 786 * The files created are permanent; if something goes wrong
 787 * we still do not free the memory.
788 */
789 WARN(1,
790 "Could not allocate stat file for cpu %d\n",
791 cpu);
792 return;
793 }
794 stat->stat = function_stats;
795 snprintf(name, 32, "function%d", cpu);
796 stat->stat.name = name;
797 ret = register_stat_tracer(&stat->stat);
798 if (ret) {
799 WARN(1,
800 "Could not register function stat for cpu %d\n",
801 cpu);
802 kfree(name);
803 return;
804 }
805 }
806
807 entry = debugfs_create_file("function_profile_enabled", 0644,
808 d_tracer, NULL, &ftrace_profile_fops);
809 if (!entry)
810 pr_warning("Could not create debugfs "
811 "'function_profile_enabled' entry\n");
812}
813
814#else /* CONFIG_FUNCTION_PROFILER */
815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
816{
817}
818#endif /* CONFIG_FUNCTION_PROFILER */
819
243/* set when tracing only a pid */ 820/* set when tracing only a pid */
244struct pid *ftrace_pid_trace; 821struct pid *ftrace_pid_trace;
245static struct pid * const ftrace_swapper_pid = &init_struct_pid; 822static struct pid * const ftrace_swapper_pid = &init_struct_pid;
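The profiler introduced above sizes its storage with two small computations: PROFILES_PER_PAGE is how many struct ftrace_profile records fit in one page after the page header, ftrace_profile_pages_init() preallocates DIV_ROUND_UP(functions, PROFILES_PER_PAGE) pages for the assumed 20000 functions, and ftrace_profile_init_cpu() derives ftrace_profile_bits as log2 of the 1024-entry hash. The standalone sketch below is not part of this patch and replays that arithmetic; the 4096-byte page size and the record layout are assumptions for illustration, since the real sizes depend on the architecture and on CONFIG_FUNCTION_GRAPH_TRACER.

/* profiler_sizing.c - replay of the sizing math in ftrace_profile_pages_init()
 * and ftrace_profile_init_cpu(). PAGE_SIZE and the record layout below are
 * assumed values for illustration only. */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL		/* assumption: common 4 KiB page size */

struct ftrace_profile {			/* approximate layout of the record */
	void *node_prev, *node_next;	/* stand-in for struct hlist_node */
	unsigned long ip;
	unsigned long counter;
	unsigned long long time;	/* present with FUNCTION_GRAPH_TRACER */
};

struct ftrace_profile_page {
	struct ftrace_profile_page *next;
	unsigned long index;
	struct ftrace_profile records[];
};

#define PROFILE_RECORDS_SIZE \
	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
#define PROFILES_PER_PAGE \
	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long functions = 20000;	/* the non-dynamic-ftrace guess */
	unsigned long pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
	int size = 1024, bits = 0;		/* FTRACE_PROFILE_HASH_SIZE */

	/* same loop as ftrace_profile_init_cpu(): bits ends up as log2(size) */
	for (size--; size; size >>= 1)
		bits++;

	printf("%lu records per page, %lu pages for %lu functions, %d hash bits\n",
	       (unsigned long)PROFILES_PER_PAGE, pages, functions, bits);
	return 0;
}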
@@ -261,7 +838,6 @@ struct ftrace_func_probe {
261 struct rcu_head rcu; 838 struct rcu_head rcu;
262}; 839};
263 840
264
265enum { 841enum {
266 FTRACE_ENABLE_CALLS = (1 << 0), 842 FTRACE_ENABLE_CALLS = (1 << 0),
267 FTRACE_DISABLE_CALLS = (1 << 1), 843 FTRACE_DISABLE_CALLS = (1 << 1),
@@ -346,30 +922,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
346 rec->flags |= FTRACE_FL_FREE; 922 rec->flags |= FTRACE_FL_FREE;
347} 923}
348 924
349void ftrace_release(void *start, unsigned long size)
350{
351 struct dyn_ftrace *rec;
352 struct ftrace_page *pg;
353 unsigned long s = (unsigned long)start;
354 unsigned long e = s + size;
355
356 if (ftrace_disabled || !start)
357 return;
358
359 mutex_lock(&ftrace_lock);
360 do_for_each_ftrace_rec(pg, rec) {
361 if ((rec->ip >= s) && (rec->ip < e)) {
362 /*
363 * rec->ip is changed in ftrace_free_rec()
364 * It should not between s and e if record was freed.
365 */
366 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
367 ftrace_free_rec(rec);
368 }
369 } while_for_each_ftrace_rec();
370 mutex_unlock(&ftrace_lock);
371}
372
373static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 925static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
374{ 926{
375 struct dyn_ftrace *rec; 927 struct dyn_ftrace *rec;
@@ -673,6 +1225,13 @@ static void ftrace_shutdown(int command)
673 return; 1225 return;
674 1226
675 ftrace_start_up--; 1227 ftrace_start_up--;
1228 /*
 1229 * Just warn in case of an imbalance; no need to kill ftrace, it's not
 1230 * critical, but the ftrace_call callers may never be nopped again after
 1231 * further ftrace uses.
1232 */
1233 WARN_ON_ONCE(ftrace_start_up < 0);
1234
676 if (!ftrace_start_up) 1235 if (!ftrace_start_up)
677 command |= FTRACE_DISABLE_CALLS; 1236 command |= FTRACE_DISABLE_CALLS;
678 1237
@@ -859,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
859{ 1418{
860 struct ftrace_iterator *iter = m->private; 1419 struct ftrace_iterator *iter = m->private;
861 void *p = NULL; 1420 void *p = NULL;
1421 loff_t l;
1422
1423 if (!(iter->flags & FTRACE_ITER_HASH))
1424 *pos = 0;
862 1425
863 iter->flags |= FTRACE_ITER_HASH; 1426 iter->flags |= FTRACE_ITER_HASH;
864 1427
865 return t_hash_next(m, p, pos); 1428 iter->hidx = 0;
1429 for (l = 0; l <= *pos; ) {
1430 p = t_hash_next(m, p, &l);
1431 if (!p)
1432 break;
1433 }
1434 return p;
866} 1435}
867 1436
868static int t_hash_show(struct seq_file *m, void *v) 1437static int t_hash_show(struct seq_file *m, void *v)
@@ -909,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
909 iter->pg = iter->pg->next; 1478 iter->pg = iter->pg->next;
910 iter->idx = 0; 1479 iter->idx = 0;
911 goto retry; 1480 goto retry;
912 } else {
913 iter->idx = -1;
914 } 1481 }
915 } else { 1482 } else {
916 rec = &iter->pg->records[iter->idx++]; 1483 rec = &iter->pg->records[iter->idx++];
@@ -939,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
939{ 1506{
940 struct ftrace_iterator *iter = m->private; 1507 struct ftrace_iterator *iter = m->private;
941 void *p = NULL; 1508 void *p = NULL;
1509 loff_t l;
942 1510
943 mutex_lock(&ftrace_lock); 1511 mutex_lock(&ftrace_lock);
944 /* 1512 /*
@@ -950,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
950 if (*pos > 0) 1518 if (*pos > 0)
951 return t_hash_start(m, pos); 1519 return t_hash_start(m, pos);
952 iter->flags |= FTRACE_ITER_PRINTALL; 1520 iter->flags |= FTRACE_ITER_PRINTALL;
953 (*pos)++;
954 return iter; 1521 return iter;
955 } 1522 }
956 1523
957 if (iter->flags & FTRACE_ITER_HASH) 1524 if (iter->flags & FTRACE_ITER_HASH)
958 return t_hash_start(m, pos); 1525 return t_hash_start(m, pos);
959 1526
960 if (*pos > 0) { 1527 iter->pg = ftrace_pages_start;
961 if (iter->idx < 0) 1528 iter->idx = 0;
962 return p; 1529 for (l = 0; l <= *pos; ) {
963 (*pos)--; 1530 p = t_next(m, p, &l);
964 iter->idx--; 1531 if (!p)
1532 break;
965 } 1533 }
966 1534
967 p = t_next(m, p, pos); 1535 if (!p && iter->flags & FTRACE_ITER_FILTER)
968
969 if (!p)
970 return t_hash_start(m, pos); 1536 return t_hash_start(m, pos);
971 1537
972 return p; 1538 return p;
@@ -1096,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1096 1662
1097 mutex_lock(&ftrace_regex_lock); 1663 mutex_lock(&ftrace_regex_lock);
1098 if ((file->f_mode & FMODE_WRITE) && 1664 if ((file->f_mode & FMODE_WRITE) &&
1099 !(file->f_flags & O_APPEND)) 1665 (file->f_flags & O_TRUNC))
1100 ftrace_filter_reset(enable); 1666 ftrace_filter_reset(enable);
1101 1667
1102 if (file->f_mode & FMODE_READ) { 1668 if (file->f_mode & FMODE_READ) {
@@ -1408,7 +1974,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1408 1974
1409static struct ftrace_ops trace_probe_ops __read_mostly = 1975static struct ftrace_ops trace_probe_ops __read_mostly =
1410{ 1976{
1411 .func = function_trace_probe_call, 1977 .func = function_trace_probe_call,
1412}; 1978};
1413 1979
1414static int ftrace_probe_registered; 1980static int ftrace_probe_registered;
@@ -1712,7 +2278,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1712 read++; 2278 read++;
1713 cnt--; 2279 cnt--;
1714 2280
1715 if (!(iter->flags & ~FTRACE_ITER_CONT)) { 2281 /*
 2282 * If the parser hasn't finished with the last write,
2283 * continue reading the user input without skipping spaces.
2284 */
2285 if (!(iter->flags & FTRACE_ITER_CONT)) {
1716 /* skip white space */ 2286 /* skip white space */
1717 while (cnt && isspace(ch)) { 2287 while (cnt && isspace(ch)) {
1718 ret = get_user(ch, ubuf++); 2288 ret = get_user(ch, ubuf++);
@@ -1722,8 +2292,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1722 cnt--; 2292 cnt--;
1723 } 2293 }
1724 2294
2295 /* only spaces were written */
1725 if (isspace(ch)) { 2296 if (isspace(ch)) {
1726 file->f_pos += read; 2297 *ppos += read;
1727 ret = read; 2298 ret = read;
1728 goto out; 2299 goto out;
1729 } 2300 }
@@ -1753,12 +2324,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1753 if (ret) 2324 if (ret)
1754 goto out; 2325 goto out;
1755 iter->buffer_idx = 0; 2326 iter->buffer_idx = 0;
1756 } else 2327 } else {
1757 iter->flags |= FTRACE_ITER_CONT; 2328 iter->flags |= FTRACE_ITER_CONT;
2329 iter->buffer[iter->buffer_idx++] = ch;
2330 }
1758 2331
1759 2332 *ppos += read;
1760 file->f_pos += read;
1761
1762 ret = read; 2333 ret = read;
1763 out: 2334 out:
1764 mutex_unlock(&ftrace_regex_lock); 2335 mutex_unlock(&ftrace_regex_lock);
@@ -1823,6 +2394,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
1823 ftrace_set_regex(buf, len, reset, 0); 2394 ftrace_set_regex(buf, len, reset, 0);
1824} 2395}
1825 2396
2397/*
2398 * command line interface to allow users to set filters on boot up.
2399 */
2400#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2401static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2402static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2403
2404static int __init set_ftrace_notrace(char *str)
2405{
2406 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
2407 return 1;
2408}
2409__setup("ftrace_notrace=", set_ftrace_notrace);
2410
2411static int __init set_ftrace_filter(char *str)
2412{
2413 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
2414 return 1;
2415}
2416__setup("ftrace_filter=", set_ftrace_filter);
2417
2418static void __init set_ftrace_early_filter(char *buf, int enable)
2419{
2420 char *func;
2421
2422 while (buf) {
2423 func = strsep(&buf, ",");
2424 ftrace_set_regex(func, strlen(func), 0, enable);
2425 }
2426}
2427
2428static void __init set_ftrace_early_filters(void)
2429{
2430 if (ftrace_filter_buf[0])
2431 set_ftrace_early_filter(ftrace_filter_buf, 1);
2432 if (ftrace_notrace_buf[0])
2433 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2434}
2435
1826static int 2436static int
1827ftrace_regex_release(struct inode *inode, struct file *file, int enable) 2437ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1828{ 2438{
@@ -1903,32 +2513,31 @@ int ftrace_graph_count;
1903unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2513unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1904 2514
1905static void * 2515static void *
1906g_next(struct seq_file *m, void *v, loff_t *pos) 2516__g_next(struct seq_file *m, loff_t *pos)
1907{ 2517{
1908 unsigned long *array = m->private; 2518 unsigned long *array = m->private;
1909 int index = *pos;
1910
1911 (*pos)++;
1912 2519
1913 if (index >= ftrace_graph_count) 2520 if (*pos >= ftrace_graph_count)
1914 return NULL; 2521 return NULL;
2522 return &array[*pos];
2523}
1915 2524
1916 return &array[index]; 2525static void *
2526g_next(struct seq_file *m, void *v, loff_t *pos)
2527{
2528 (*pos)++;
2529 return __g_next(m, pos);
1917} 2530}
1918 2531
1919static void *g_start(struct seq_file *m, loff_t *pos) 2532static void *g_start(struct seq_file *m, loff_t *pos)
1920{ 2533{
1921 void *p = NULL;
1922
1923 mutex_lock(&graph_lock); 2534 mutex_lock(&graph_lock);
1924 2535
1925 /* Nothing, tell g_show to print all functions are enabled */ 2536 /* Nothing, tell g_show to print all functions are enabled */
1926 if (!ftrace_graph_count && !*pos) 2537 if (!ftrace_graph_count && !*pos)
1927 return (void *)1; 2538 return (void *)1;
1928 2539
1929 p = g_next(m, p, pos); 2540 return __g_next(m, pos);
1930
1931 return p;
1932} 2541}
1933 2542
1934static void g_stop(struct seq_file *m, void *p) 2543static void g_stop(struct seq_file *m, void *p)
@@ -1973,7 +2582,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1973 2582
1974 mutex_lock(&graph_lock); 2583 mutex_lock(&graph_lock);
1975 if ((file->f_mode & FMODE_WRITE) && 2584 if ((file->f_mode & FMODE_WRITE) &&
1976 !(file->f_flags & O_APPEND)) { 2585 (file->f_flags & O_TRUNC)) {
1977 ftrace_graph_count = 0; 2586 ftrace_graph_count = 0;
1978 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2587 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1979 } 2588 }
@@ -1992,6 +2601,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1992} 2601}
1993 2602
1994static int 2603static int
2604ftrace_graph_release(struct inode *inode, struct file *file)
2605{
2606 if (file->f_mode & FMODE_READ)
2607 seq_release(inode, file);
2608 return 0;
2609}
2610
2611static int
1995ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2612ftrace_set_func(unsigned long *array, int *idx, char *buffer)
1996{ 2613{
1997 struct dyn_ftrace *rec; 2614 struct dyn_ftrace *rec;
@@ -2120,46 +2737,32 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2120} 2737}
2121 2738
2122static const struct file_operations ftrace_graph_fops = { 2739static const struct file_operations ftrace_graph_fops = {
2123 .open = ftrace_graph_open, 2740 .open = ftrace_graph_open,
2124 .read = seq_read, 2741 .read = seq_read,
2125 .write = ftrace_graph_write, 2742 .write = ftrace_graph_write,
2743 .release = ftrace_graph_release,
2126}; 2744};
2127#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2745#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2128 2746
2129static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 2747static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2130{ 2748{
2131 struct dentry *entry;
2132 2749
2133 entry = debugfs_create_file("available_filter_functions", 0444, 2750 trace_create_file("available_filter_functions", 0444,
2134 d_tracer, NULL, &ftrace_avail_fops); 2751 d_tracer, NULL, &ftrace_avail_fops);
2135 if (!entry)
2136 pr_warning("Could not create debugfs "
2137 "'available_filter_functions' entry\n");
2138 2752
2139 entry = debugfs_create_file("failures", 0444, 2753 trace_create_file("failures", 0444,
2140 d_tracer, NULL, &ftrace_failures_fops); 2754 d_tracer, NULL, &ftrace_failures_fops);
2141 if (!entry)
2142 pr_warning("Could not create debugfs 'failures' entry\n");
2143 2755
2144 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, 2756 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2145 NULL, &ftrace_filter_fops); 2757 NULL, &ftrace_filter_fops);
2146 if (!entry)
2147 pr_warning("Could not create debugfs "
2148 "'set_ftrace_filter' entry\n");
2149 2758
2150 entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, 2759 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
2151 NULL, &ftrace_notrace_fops); 2760 NULL, &ftrace_notrace_fops);
2152 if (!entry)
2153 pr_warning("Could not create debugfs "
2154 "'set_ftrace_notrace' entry\n");
2155 2761
2156#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2762#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2157 entry = debugfs_create_file("set_graph_function", 0444, d_tracer, 2763 trace_create_file("set_graph_function", 0444, d_tracer,
2158 NULL, 2764 NULL,
2159 &ftrace_graph_fops); 2765 &ftrace_graph_fops);
2160 if (!entry)
2161 pr_warning("Could not create debugfs "
2162 "'set_graph_function' entry\n");
2163#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2766#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2164 2767
2165 return 0; 2768 return 0;
@@ -2197,14 +2800,72 @@ static int ftrace_convert_nops(struct module *mod,
2197 return 0; 2800 return 0;
2198} 2801}
2199 2802
2200void ftrace_init_module(struct module *mod, 2803#ifdef CONFIG_MODULES
2201 unsigned long *start, unsigned long *end) 2804void ftrace_release(void *start, void *end)
2805{
2806 struct dyn_ftrace *rec;
2807 struct ftrace_page *pg;
2808 unsigned long s = (unsigned long)start;
2809 unsigned long e = (unsigned long)end;
2810
2811 if (ftrace_disabled || !start || start == end)
2812 return;
2813
2814 mutex_lock(&ftrace_lock);
2815 do_for_each_ftrace_rec(pg, rec) {
2816 if ((rec->ip >= s) && (rec->ip < e)) {
2817 /*
2818 * rec->ip is changed in ftrace_free_rec()
 2819 * It should not be between s and e if the record was freed.
2820 */
2821 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
2822 ftrace_free_rec(rec);
2823 }
2824 } while_for_each_ftrace_rec();
2825 mutex_unlock(&ftrace_lock);
2826}
2827
2828static void ftrace_init_module(struct module *mod,
2829 unsigned long *start, unsigned long *end)
2202{ 2830{
2203 if (ftrace_disabled || start == end) 2831 if (ftrace_disabled || start == end)
2204 return; 2832 return;
2205 ftrace_convert_nops(mod, start, end); 2833 ftrace_convert_nops(mod, start, end);
2206} 2834}
2207 2835
2836static int ftrace_module_notify(struct notifier_block *self,
2837 unsigned long val, void *data)
2838{
2839 struct module *mod = data;
2840
2841 switch (val) {
2842 case MODULE_STATE_COMING:
2843 ftrace_init_module(mod, mod->ftrace_callsites,
2844 mod->ftrace_callsites +
2845 mod->num_ftrace_callsites);
2846 break;
2847 case MODULE_STATE_GOING:
2848 ftrace_release(mod->ftrace_callsites,
2849 mod->ftrace_callsites +
2850 mod->num_ftrace_callsites);
2851 break;
2852 }
2853
2854 return 0;
2855}
2856#else
2857static int ftrace_module_notify(struct notifier_block *self,
2858 unsigned long val, void *data)
2859{
2860 return 0;
2861}
2862#endif /* CONFIG_MODULES */
2863
2864struct notifier_block ftrace_module_nb = {
2865 .notifier_call = ftrace_module_notify,
2866 .priority = 0,
2867};
2868
2208extern unsigned long __start_mcount_loc[]; 2869extern unsigned long __start_mcount_loc[];
2209extern unsigned long __stop_mcount_loc[]; 2870extern unsigned long __stop_mcount_loc[];
2210 2871
@@ -2236,6 +2897,12 @@ void __init ftrace_init(void)
2236 __start_mcount_loc, 2897 __start_mcount_loc,
2237 __stop_mcount_loc); 2898 __stop_mcount_loc);
2238 2899
2900 ret = register_module_notifier(&ftrace_module_nb);
2901 if (ret)
2902 pr_warning("Failed to register trace ftrace module notifier\n");
2903
2904 set_ftrace_early_filters();
2905
2239 return; 2906 return;
2240 failed: 2907 failed:
2241 ftrace_disabled = 1; 2908 ftrace_disabled = 1;
@@ -2417,7 +3084,6 @@ static const struct file_operations ftrace_pid_fops = {
2417static __init int ftrace_init_debugfs(void) 3084static __init int ftrace_init_debugfs(void)
2418{ 3085{
2419 struct dentry *d_tracer; 3086 struct dentry *d_tracer;
2420 struct dentry *entry;
2421 3087
2422 d_tracer = tracing_init_dentry(); 3088 d_tracer = tracing_init_dentry();
2423 if (!d_tracer) 3089 if (!d_tracer)
@@ -2425,11 +3091,11 @@ static __init int ftrace_init_debugfs(void)
2425 3091
2426 ftrace_init_dyn_debugfs(d_tracer); 3092 ftrace_init_dyn_debugfs(d_tracer);
2427 3093
2428 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, 3094 trace_create_file("set_ftrace_pid", 0644, d_tracer,
2429 NULL, &ftrace_pid_fops); 3095 NULL, &ftrace_pid_fops);
2430 if (!entry) 3096
2431 pr_warning("Could not create debugfs " 3097 ftrace_profile_debugfs(d_tracer);
2432 "'set_ftrace_pid' entry\n"); 3098
2433 return 0; 3099 return 0;
2434} 3100}
2435fs_initcall(ftrace_init_debugfs); 3101fs_initcall(ftrace_init_debugfs);
@@ -2507,10 +3173,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
2507 3173
2508 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3174 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
2509 3175
2510 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3176 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
2511 goto out; 3177 goto out;
2512 3178
2513 last_ftrace_enabled = ftrace_enabled; 3179 last_ftrace_enabled = !!ftrace_enabled;
2514 3180
2515 if (ftrace_enabled) { 3181 if (ftrace_enabled) {
2516 3182
@@ -2538,7 +3204,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
2538 3204
2539#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3205#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2540 3206
2541static atomic_t ftrace_graph_active; 3207static int ftrace_graph_active;
2542static struct notifier_block ftrace_suspend_notifier; 3208static struct notifier_block ftrace_suspend_notifier;
2543 3209
2544int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 3210int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3246,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
2580 } 3246 }
2581 3247
2582 if (t->ret_stack == NULL) { 3248 if (t->ret_stack == NULL) {
2583 t->curr_ret_stack = -1;
2584 /* Make sure IRQs see the -1 first: */
2585 barrier();
2586 t->ret_stack = ret_stack_list[start++];
2587 atomic_set(&t->tracing_graph_pause, 0); 3249 atomic_set(&t->tracing_graph_pause, 0);
2588 atomic_set(&t->trace_overrun, 0); 3250 atomic_set(&t->trace_overrun, 0);
3251 t->curr_ret_stack = -1;
3252 /* Make sure the tasks see the -1 first: */
3253 smp_wmb();
3254 t->ret_stack = ret_stack_list[start++];
2589 } 3255 }
2590 } while_each_thread(g, t); 3256 } while_each_thread(g, t);
2591 3257
@@ -2643,8 +3309,10 @@ static int start_graph_tracing(void)
2643 return -ENOMEM; 3309 return -ENOMEM;
2644 3310
2645 /* The cpu_boot init_task->ret_stack will never be freed */ 3311 /* The cpu_boot init_task->ret_stack will never be freed */
2646 for_each_online_cpu(cpu) 3312 for_each_online_cpu(cpu) {
2647 ftrace_graph_init_task(idle_task(cpu)); 3313 if (!idle_task(cpu)->ret_stack)
3314 ftrace_graph_init_task(idle_task(cpu));
3315 }
2648 3316
2649 do { 3317 do {
2650 ret = alloc_retstack_tasklist(ret_stack_list); 3318 ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3358,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2690 mutex_lock(&ftrace_lock); 3358 mutex_lock(&ftrace_lock);
2691 3359
2692 /* we currently allow only one tracer registered at a time */ 3360 /* we currently allow only one tracer registered at a time */
2693 if (atomic_read(&ftrace_graph_active)) { 3361 if (ftrace_graph_active) {
2694 ret = -EBUSY; 3362 ret = -EBUSY;
2695 goto out; 3363 goto out;
2696 } 3364 }
@@ -2698,10 +3366,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2698 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 3366 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2699 register_pm_notifier(&ftrace_suspend_notifier); 3367 register_pm_notifier(&ftrace_suspend_notifier);
2700 3368
2701 atomic_inc(&ftrace_graph_active); 3369 ftrace_graph_active++;
2702 ret = start_graph_tracing(); 3370 ret = start_graph_tracing();
2703 if (ret) { 3371 if (ret) {
2704 atomic_dec(&ftrace_graph_active); 3372 ftrace_graph_active--;
2705 goto out; 3373 goto out;
2706 } 3374 }
2707 3375
@@ -2719,10 +3387,10 @@ void unregister_ftrace_graph(void)
2719{ 3387{
2720 mutex_lock(&ftrace_lock); 3388 mutex_lock(&ftrace_lock);
2721 3389
2722 if (!unlikely(atomic_read(&ftrace_graph_active))) 3390 if (unlikely(!ftrace_graph_active))
2723 goto out; 3391 goto out;
2724 3392
2725 atomic_dec(&ftrace_graph_active); 3393 ftrace_graph_active--;
2726 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); 3394 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
2727 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3395 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2728 ftrace_graph_entry = ftrace_graph_entry_stub; 3396 ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3404,25 @@ void unregister_ftrace_graph(void)
2736/* Allocate a return stack for newly created task */ 3404/* Allocate a return stack for newly created task */
2737void ftrace_graph_init_task(struct task_struct *t) 3405void ftrace_graph_init_task(struct task_struct *t)
2738{ 3406{
2739 if (atomic_read(&ftrace_graph_active)) { 3407 /* Make sure we do not use the parent ret_stack */
2740 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 3408 t->ret_stack = NULL;
3409
3410 if (ftrace_graph_active) {
3411 struct ftrace_ret_stack *ret_stack;
3412
3413 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2741 * sizeof(struct ftrace_ret_stack), 3414 * sizeof(struct ftrace_ret_stack),
2742 GFP_KERNEL); 3415 GFP_KERNEL);
2743 if (!t->ret_stack) 3416 if (!ret_stack)
2744 return; 3417 return;
2745 t->curr_ret_stack = -1; 3418 t->curr_ret_stack = -1;
2746 atomic_set(&t->tracing_graph_pause, 0); 3419 atomic_set(&t->tracing_graph_pause, 0);
2747 atomic_set(&t->trace_overrun, 0); 3420 atomic_set(&t->trace_overrun, 0);
2748 t->ftrace_timestamp = 0; 3421 t->ftrace_timestamp = 0;
 2749 } else 3422 /* make curr_ret_stack visible before we add the ret_stack */
2750 t->ret_stack = NULL; 3423 smp_wmb();
3424 t->ret_stack = ret_stack;
3425 }
2751} 3426}
2752 3427
2753void ftrace_graph_exit_task(struct task_struct *t) 3428void ftrace_graph_exit_task(struct task_struct *t)
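The reworked ftrace_graph_init_task() and alloc_retstack_tasklist() above follow an initialize-then-publish pattern: curr_ret_stack and the counters are set first, smp_wmb() orders those stores, and only then is t->ret_stack made non-NULL, so any tracer that observes the pointer also observes the initialized index. The userspace sketch below is not part of this patch and uses C11 release/acquire atomics as an analogy for the same ordering requirement; the kernel itself relies on smp_wmb() and the read side's own ordering, not on C11 atomics.

/* publish_sketch.c - init-then-publish ordering, analogous to the
 * smp_wmb() before "t->ret_stack = ret_stack" in the patch above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ret_stack {
	int curr_ret_stack;	/* must be seen as -1 once the pointer is visible */
};

static _Atomic(struct ret_stack *) published;

static void *writer(void *arg)
{
	struct ret_stack *rs = malloc(sizeof(*rs));

	(void)arg;
	if (!rs)
		exit(1);
	rs->curr_ret_stack = -1;			/* initialize first */
	atomic_store_explicit(&published, rs,
			      memory_order_release);	/* then publish */
	return NULL;
}

static void *reader(void *arg)
{
	struct ret_stack *rs;

	(void)arg;
	/* spin until the pointer becomes visible */
	while (!(rs = atomic_load_explicit(&published, memory_order_acquire)))
		;
	/* acquire pairs with release: the -1 store is guaranteed to be seen */
	printf("curr_ret_stack = %d\n", rs->curr_ret_stack);
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}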
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e37..1edaa9516e81 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
12#include <linux/dcache.h> 12#include <linux/dcache.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14 14
15#include <trace/kmemtrace.h> 15#include <linux/kmemtrace.h>
16 16
17#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h" 18#include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
42 gfp_t gfp_flags, 42 gfp_t gfp_flags,
43 int node) 43 int node)
44{ 44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
45 struct trace_array *tr = kmemtrace_array; 46 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry; 47 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event; 48 struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
62 entry->gfp_flags = gfp_flags; 63 entry->gfp_flags = gfp_flags;
63 entry->node = node; 64 entry->node = node;
64 65
65 ring_buffer_unlock_commit(tr->buffer, event); 66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
66 68
67 trace_wake_up(); 69 trace_wake_up();
68} 70}
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site, 73 unsigned long call_site,
72 const void *ptr) 74 const void *ptr)
73{ 75{
76 struct ftrace_event_call *call = &event_kmem_free;
74 struct trace_array *tr = kmemtrace_array; 77 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry; 78 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event; 79 struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
86 entry->call_site = call_site; 89 entry->call_site = call_site;
87 entry->ptr = ptr; 90 entry->ptr = ptr;
88 91
89 ring_buffer_unlock_commit(tr->buffer, event); 92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
90 94
91 trace_wake_up(); 95 trace_wake_up();
92} 96}
@@ -182,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
182 int cpu; 186 int cpu;
183 kmemtrace_array = tr; 187 kmemtrace_array = tr;
184 188
185 for_each_cpu_mask(cpu, cpu_possible_map) 189 for_each_cpu(cpu, cpu_possible_mask)
186 tracing_reset(tr, cpu); 190 tracing_reset(tr, cpu);
187 191
188 kmemtrace_start_probes(); 192 kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c844..a330513d96ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -22,6 +23,28 @@
22#include "trace.h" 23#include "trace.h"
23 24
24/* 25/*
 26 * The ring buffer header is special. We must keep it up to date manually.
27 */
28int ring_buffer_print_entry_header(struct trace_seq *s)
29{
30 int ret;
31
32 ret = trace_seq_printf(s, "# compressed entry header\n");
33 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
34 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
35 ret = trace_seq_printf(s, "\tarray : 32 bits\n");
36 ret = trace_seq_printf(s, "\n");
37 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
38 RINGBUF_TYPE_PADDING);
39 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
40 RINGBUF_TYPE_TIME_EXTEND);
41 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
42 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
43
44 return ret;
45}
46
47/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is 48 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is 49 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read 50 * associated with the CPU it is currently executing on. A reader may read
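ring_buffer_print_entry_header() above documents the compressed event header layout: 5 bits of type_len, 27 bits of time_delta, and a 32-bit array word that carries the length (or the data) of larger events. The sketch below is not part of this patch; it lays that split out as a plain C bitfield purely for illustration, since the kernel's actual struct ring_buffer_event is declared in the ring buffer header file and its exact definition is not reproduced in this diff.

/* entry_header_sketch.c - illustration of the "compressed entry header"
 * described by ring_buffer_print_entry_header(): 5 bits of type_len,
 * 27 bits of time_delta, then 32-bit array words. Not the kernel's
 * actual struct ring_buffer_event. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct rb_event_sketch {
	uint32_t type_len   : 5;	/* padding/time types, or payload length / 4 */
	uint32_t time_delta : 27;	/* timestamp delta, 27 bits */
	uint32_t array[1];		/* full length (or data) for larger events */
};

int main(void)
{
	struct rb_event_sketch ev = { .type_len = 3, .time_delta = 100 };

	/* analogue of RB_EVNT_HDR_SIZE = offsetof(..., array): 4 bytes here */
	printf("header: %zu bytes, inline payload hint: %u bytes\n",
	       offsetof(struct rb_event_sketch, array),
	       (unsigned)ev.type_len * 4);	/* RB_ALIGNMENT is 4 */
	return 0;
}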
@@ -182,7 +205,11 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
182 205
183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
184#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
185#define RB_MAX_SMALL_DATA 28 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
210
211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
186 213
187enum { 214enum {
188 RB_LEN_TIME_EXTEND = 8, 215 RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +218,28 @@ enum {
191 218
192static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
193{ 220{
194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; 221 return event->type_len == RINGBUF_TYPE_PADDING
222 && event->time_delta == 0;
195} 223}
196 224
197static inline int rb_discarded_event(struct ring_buffer_event *event) 225static inline int rb_discarded_event(struct ring_buffer_event *event)
198{ 226{
199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta; 227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
200} 228}
201 229
202static void rb_event_set_padding(struct ring_buffer_event *event) 230static void rb_event_set_padding(struct ring_buffer_event *event)
203{ 231{
204 event->type = RINGBUF_TYPE_PADDING; 232 event->type_len = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0; 233 event->time_delta = 0;
206} 234}
207 235
208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
210 * @buffer: the ring buffer
211 * @event: the event to discard
212 *
213 * Sometimes a event that is in the ring buffer needs to be ignored.
214 * This function lets the user discard an event in the ring buffer
215 * and then that event will not be read later.
216 *
217 * Note, it is up to the user to be careful with this, and protect
218 * against races. If the user discards an event that has been consumed
219 * it is possible that it could corrupt the ring buffer.
220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
222{
223 event->type = RINGBUF_TYPE_PADDING;
224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
227}
228
229static unsigned 236static unsigned
230rb_event_data_length(struct ring_buffer_event *event) 237rb_event_data_length(struct ring_buffer_event *event)
231{ 238{
232 unsigned length; 239 unsigned length;
233 240
234 if (event->len) 241 if (event->type_len)
235 length = event->len * RB_ALIGNMENT; 242 length = event->type_len * RB_ALIGNMENT;
236 else 243 else
237 length = event->array[0]; 244 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE; 245 return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +249,12 @@ rb_event_data_length(struct ring_buffer_event *event)
242static unsigned 249static unsigned
243rb_event_length(struct ring_buffer_event *event) 250rb_event_length(struct ring_buffer_event *event)
244{ 251{
245 switch (event->type) { 252 switch (event->type_len) {
246 case RINGBUF_TYPE_PADDING: 253 case RINGBUF_TYPE_PADDING:
247 if (rb_null_event(event)) 254 if (rb_null_event(event))
248 /* undefined */ 255 /* undefined */
249 return -1; 256 return -1;
250 return rb_event_data_length(event); 257 return event->array[0] + RB_EVNT_HDR_SIZE;
251 258
252 case RINGBUF_TYPE_TIME_EXTEND: 259 case RINGBUF_TYPE_TIME_EXTEND:
253 return RB_LEN_TIME_EXTEND; 260 return RB_LEN_TIME_EXTEND;
@@ -271,7 +278,7 @@ rb_event_length(struct ring_buffer_event *event)
271unsigned ring_buffer_event_length(struct ring_buffer_event *event) 278unsigned ring_buffer_event_length(struct ring_buffer_event *event)
272{ 279{
273 unsigned length = rb_event_length(event); 280 unsigned length = rb_event_length(event);
274 if (event->type != RINGBUF_TYPE_DATA) 281 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
275 return length; 282 return length;
276 length -= RB_EVNT_HDR_SIZE; 283 length -= RB_EVNT_HDR_SIZE;
277 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) 284 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +291,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
284static void * 291static void *
285rb_event_data(struct ring_buffer_event *event) 292rb_event_data(struct ring_buffer_event *event)
286{ 293{
287 BUG_ON(event->type != RINGBUF_TYPE_DATA); 294 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
288 /* If length is in len field, then array[0] has the data */ 295 /* If length is in len field, then array[0] has the data */
289 if (event->len) 296 if (event->type_len)
290 return (void *)&event->array[0]; 297 return (void *)&event->array[0];
291 /* Otherwise length is in array[0] and array[1] has the data */ 298 /* Otherwise length is in array[0] and array[1] has the data */
292 return (void *)&event->array[1]; 299 return (void *)&event->array[1];
@@ -316,9 +323,10 @@ struct buffer_data_page {
316}; 323};
317 324
318struct buffer_page { 325struct buffer_page {
326 struct list_head list; /* list of buffer pages */
319 local_t write; /* index for next write */ 327 local_t write; /* index for next write */
320 unsigned read; /* index for next read */ 328 unsigned read; /* index for next read */
321 struct list_head list; /* list of free pages */ 329 local_t entries; /* entries on this page */
322 struct buffer_data_page *page; /* Actual data page */ 330 struct buffer_data_page *page; /* Actual data page */
323}; 331};
324 332
@@ -361,6 +369,34 @@ static inline int test_time_stamp(u64 delta)
361 369
362#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) 370#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
363 371
 372/* Max payload is BUF_PAGE_SIZE - header (8 bytes) */
373#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
374
375/* Max number of timestamps that can fit on a page */
376#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
377
378int ring_buffer_print_page_header(struct trace_seq *s)
379{
380 struct buffer_data_page field;
381 int ret;
382
383 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
384 "offset:0;\tsize:%u;\n",
385 (unsigned int)sizeof(field.time_stamp));
386
387 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
388 "offset:%u;\tsize:%u;\n",
389 (unsigned int)offsetof(typeof(field), commit),
390 (unsigned int)sizeof(field.commit));
391
392 ret = trace_seq_printf(s, "\tfield: char data;\t"
393 "offset:%u;\tsize:%u;\n",
394 (unsigned int)offsetof(typeof(field), data),
395 (unsigned int)BUF_PAGE_SIZE);
396
397 return ret;
398}
399
364/* 400/*
365 * head_page == tail_page && head == tail then buffer is empty. 401 * head_page == tail_page && head == tail then buffer is empty.
366 */ 402 */
@@ -375,8 +411,13 @@ struct ring_buffer_per_cpu {
375 struct buffer_page *tail_page; /* write to tail */ 411 struct buffer_page *tail_page; /* write to tail */
376 struct buffer_page *commit_page; /* committed pages */ 412 struct buffer_page *commit_page; /* committed pages */
377 struct buffer_page *reader_page; 413 struct buffer_page *reader_page;
414 unsigned long nmi_dropped;
415 unsigned long commit_overrun;
378 unsigned long overrun; 416 unsigned long overrun;
379 unsigned long entries; 417 unsigned long read;
418 local_t entries;
419 local_t committing;
420 local_t commits;
380 u64 write_stamp; 421 u64 write_stamp;
381 u64 read_stamp; 422 u64 read_stamp;
382 atomic_t record_disabled; 423 atomic_t record_disabled;
@@ -389,6 +430,8 @@ struct ring_buffer {
389 atomic_t record_disabled; 430 atomic_t record_disabled;
390 cpumask_var_t cpumask; 431 cpumask_var_t cpumask;
391 432
433 struct lock_class_key *reader_lock_key;
434
392 struct mutex mutex; 435 struct mutex mutex;
393 436
394 struct ring_buffer_per_cpu **buffers; 437 struct ring_buffer_per_cpu **buffers;
@@ -420,13 +463,18 @@ struct ring_buffer_iter {
420/* Up this if you want to test the TIME_EXTENTS and normalization */ 463/* Up this if you want to test the TIME_EXTENTS and normalization */
421#define DEBUG_SHIFT 0 464#define DEBUG_SHIFT 0
422 465
466static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
467{
468 /* shift to debug/test normalization and TIME_EXTENTS */
469 return buffer->clock() << DEBUG_SHIFT;
470}
471
423u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) 472u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
424{ 473{
425 u64 time; 474 u64 time;
426 475
427 preempt_disable_notrace(); 476 preempt_disable_notrace();
428 /* shift to debug/test normalization and TIME_EXTENTS */ 477 time = rb_time_stamp(buffer, cpu);
429 time = buffer->clock() << DEBUG_SHIFT;
430 preempt_enable_no_resched_notrace(); 478 preempt_enable_no_resched_notrace();
431 479
432 return time; 480 return time;
@@ -523,6 +571,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
523 cpu_buffer->cpu = cpu; 571 cpu_buffer->cpu = cpu;
524 cpu_buffer->buffer = buffer; 572 cpu_buffer->buffer = buffer;
525 spin_lock_init(&cpu_buffer->reader_lock); 573 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
526 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
527 INIT_LIST_HEAD(&cpu_buffer->pages); 576 INIT_LIST_HEAD(&cpu_buffer->pages);
528 577
@@ -572,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
572 kfree(cpu_buffer); 621 kfree(cpu_buffer);
573} 622}
574 623
575/*
576 * Causes compile errors if the struct buffer_page gets bigger
577 * than the struct page.
578 */
579extern int ring_buffer_page_too_big(void);
580
581#ifdef CONFIG_HOTPLUG_CPU 624#ifdef CONFIG_HOTPLUG_CPU
582static int rb_cpu_notify(struct notifier_block *self, 625static int rb_cpu_notify(struct notifier_block *self,
583 unsigned long action, void *hcpu); 626 unsigned long action, void *hcpu);
@@ -593,17 +636,13 @@ static int rb_cpu_notify(struct notifier_block *self,
593 * when the buffer wraps. If this flag is not set, the buffer will 636 * when the buffer wraps. If this flag is not set, the buffer will
594 * drop data when the tail hits the head. 637 * drop data when the tail hits the head.
595 */ 638 */
596struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) 639struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
640 struct lock_class_key *key)
597{ 641{
598 struct ring_buffer *buffer; 642 struct ring_buffer *buffer;
599 int bsize; 643 int bsize;
600 int cpu; 644 int cpu;
601 645
602 /* Paranoid! Optimizes out when all is well */
603 if (sizeof(struct buffer_page) > sizeof(struct page))
604 ring_buffer_page_too_big();
605
606
607 /* keep it in its own cache line */ 646 /* keep it in its own cache line */
608 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 647 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
609 GFP_KERNEL); 648 GFP_KERNEL);
@@ -616,10 +655,11 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
616 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 655 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
617 buffer->flags = flags; 656 buffer->flags = flags;
618 buffer->clock = trace_clock_local; 657 buffer->clock = trace_clock_local;
658 buffer->reader_lock_key = key;
619 659
620 /* need at least two pages */ 660 /* need at least two pages */
621 if (buffer->pages == 1) 661 if (buffer->pages < 2)
622 buffer->pages++; 662 buffer->pages = 2;
623 663
624 /* 664 /*
625 * In case of non-hotplug cpu, if the ring-buffer is allocated 665 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -673,7 +713,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
673 kfree(buffer); 713 kfree(buffer);
674 return NULL; 714 return NULL;
675} 715}
676EXPORT_SYMBOL_GPL(ring_buffer_alloc); 716EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
677 717
678/** 718/**
679 * ring_buffer_free - free a ring buffer. 719 * ring_buffer_free - free a ring buffer.
@@ -695,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
695 735
696 put_online_cpus(); 736 put_online_cpus();
697 737
738 kfree(buffer->buffers);
698 free_cpumask_var(buffer->cpumask); 739 free_cpumask_var(buffer->cpumask);
699 740
700 kfree(buffer); 741 kfree(buffer);
@@ -947,31 +988,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
947 return rb_page_commit(cpu_buffer->head_page); 988 return rb_page_commit(cpu_buffer->head_page);
948} 989}
949 990
950/*
951 * When the tail hits the head and the buffer is in overwrite mode,
952 * the head jumps to the next page and all content on the previous
953 * page is discarded. But before doing so, we update the overrun
954 * variable of the buffer.
955 */
956static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
957{
958 struct ring_buffer_event *event;
959 unsigned long head;
960
961 for (head = 0; head < rb_head_size(cpu_buffer);
962 head += rb_event_length(event)) {
963
964 event = __rb_page_index(cpu_buffer->head_page, head);
965 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
966 return;
967 /* Only count data entries */
968 if (event->type != RINGBUF_TYPE_DATA)
969 continue;
970 cpu_buffer->overrun++;
971 cpu_buffer->entries--;
972 }
973}
974
975static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
976 struct buffer_page **bpage) 992 struct buffer_page **bpage)
977{ 993{
@@ -988,12 +1004,12 @@ rb_event_index(struct ring_buffer_event *event)
988{ 1004{
989 unsigned long addr = (unsigned long)event; 1005 unsigned long addr = (unsigned long)event;
990 1006
991 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1007 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
992} 1008}
993 1009
994static int 1010static inline int
995rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1011rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
996 struct ring_buffer_event *event) 1012 struct ring_buffer_event *event)
997{ 1013{
998 unsigned long addr = (unsigned long)event; 1014 unsigned long addr = (unsigned long)event;
999 unsigned long index; 1015 unsigned long index;
@@ -1006,31 +1022,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1006} 1022}
1007 1023
1008static void 1024static void
1009rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1010 struct ring_buffer_event *event)
1011{
1012 unsigned long addr = (unsigned long)event;
1013 unsigned long index;
1014
1015 index = rb_event_index(event);
1016 addr &= PAGE_MASK;
1017
1018 while (cpu_buffer->commit_page->page != (void *)addr) {
1019 if (RB_WARN_ON(cpu_buffer,
1020 cpu_buffer->commit_page == cpu_buffer->tail_page))
1021 return;
1022 cpu_buffer->commit_page->page->commit =
1023 cpu_buffer->commit_page->write;
1024 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1025 cpu_buffer->write_stamp =
1026 cpu_buffer->commit_page->page->time_stamp;
1027 }
1028
1029 /* Now set the commit to the event's index */
1030 local_set(&cpu_buffer->commit_page->page->commit, index);
1031}
1032
1033static void
1034rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1035{ 1026{
1036 /* 1027 /*
@@ -1110,28 +1101,21 @@ static void
1110rb_update_event(struct ring_buffer_event *event, 1101rb_update_event(struct ring_buffer_event *event,
1111 unsigned type, unsigned length) 1102 unsigned type, unsigned length)
1112{ 1103{
1113 event->type = type; 1104 event->type_len = type;
1114 1105
1115 switch (type) { 1106 switch (type) {
1116 1107
1117 case RINGBUF_TYPE_PADDING: 1108 case RINGBUF_TYPE_PADDING:
1118 break;
1119
1120 case RINGBUF_TYPE_TIME_EXTEND: 1109 case RINGBUF_TYPE_TIME_EXTEND:
1121 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
1122 break;
1123
1124 case RINGBUF_TYPE_TIME_STAMP: 1110 case RINGBUF_TYPE_TIME_STAMP:
1125 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
1126 break; 1111 break;
1127 1112
1128 case RINGBUF_TYPE_DATA: 1113 case 0:
1129 length -= RB_EVNT_HDR_SIZE; 1114 length -= RB_EVNT_HDR_SIZE;
1130 if (length > RB_MAX_SMALL_DATA) { 1115 if (length > RB_MAX_SMALL_DATA)
1131 event->len = 0;
1132 event->array[0] = length; 1116 event->array[0] = length;
1133 } else 1117 else
1134 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1118 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1135 break; 1119 break;
1136 default: 1120 default:
1137 BUG(); 1121 BUG();
@@ -1155,158 +1139,241 @@ static unsigned rb_calculate_event_length(unsigned length)
1155 return length; 1139 return length;
1156} 1140}
1157 1141
1158static struct ring_buffer_event * 1142static inline void
1159__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1143rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1160 unsigned type, unsigned long length, u64 *ts) 1144 struct buffer_page *tail_page,
1145 unsigned long tail, unsigned long length)
1161{ 1146{
1162 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
1163 unsigned long tail, write;
1164 struct ring_buffer *buffer = cpu_buffer->buffer;
1165 struct ring_buffer_event *event; 1147 struct ring_buffer_event *event;
1166 unsigned long flags;
1167 bool lock_taken = false;
1168 1148
1169 commit_page = cpu_buffer->commit_page; 1149 /*
1170 /* we just need to protect against interrupts */ 1150 * Only the event that crossed the page boundary
1171 barrier(); 1151 * must fill the old tail_page with padding.
1172 tail_page = cpu_buffer->tail_page; 1152 */
1173 write = local_add_return(length, &tail_page->write); 1153 if (tail >= BUF_PAGE_SIZE) {
1174 tail = write - length; 1154 local_sub(length, &tail_page->write);
1155 return;
1156 }
1175 1157
 1176	/* See if we shot past the end of this buffer page */	 1158	event = __rb_page_index(tail_page, tail);
1177 if (write > BUF_PAGE_SIZE) { 1159 kmemcheck_annotate_bitfield(event, bitfield);
1178 struct buffer_page *next_page = tail_page;
1179 1160
1180 local_irq_save(flags); 1161 /*
1181 /* 1162 * If this event is bigger than the minimum size, then
1182 * Since the write to the buffer is still not 1163 * we need to be careful that we don't subtract the
1183 * fully lockless, we must be careful with NMIs. 1164 * write counter enough to allow another writer to slip
1184 * The locks in the writers are taken when a write 1165 * in on this page.
1185 * crosses to a new page. The locks protect against 1166 * We put in a discarded commit instead, to make sure
1186 * races with the readers (this will soon be fixed 1167 * that this space is not used again.
1187 * with a lockless solution). 1168 *
1188 * 1169 * If we are less than the minimum size, we don't need to
1189 * Because we can not protect against NMIs, and we 1170 * worry about it.
1190 * want to keep traces reentrant, we need to manage 1171 */
1191 * what happens when we are in an NMI. 1172 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1192 * 1173 /* No room for any events */
1193 * NMIs can happen after we take the lock.
1194 * If we are in an NMI, only take the lock
1195 * if it is not already taken. Otherwise
1196 * simply fail.
1197 */
1198 if (unlikely(in_nmi())) {
1199 if (!__raw_spin_trylock(&cpu_buffer->lock))
1200 goto out_reset;
1201 } else
1202 __raw_spin_lock(&cpu_buffer->lock);
1203 1174
1204 lock_taken = true; 1175 /* Mark the rest of the page with padding */
1176 rb_event_set_padding(event);
1205 1177
1206 rb_inc_page(cpu_buffer, &next_page); 1178 /* Set the write back to the previous setting */
1179 local_sub(length, &tail_page->write);
1180 return;
1181 }
1207 1182
1208 head_page = cpu_buffer->head_page; 1183 /* Put in a discarded event */
1209 reader_page = cpu_buffer->reader_page; 1184 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1185 event->type_len = RINGBUF_TYPE_PADDING;
1186 /* time delta must be non zero */
1187 event->time_delta = 1;
1188 /* Account for this as an entry */
1189 local_inc(&tail_page->entries);
1190 local_inc(&cpu_buffer->entries);
1210 1191
1211 /* we grabbed the lock before incrementing */ 1192 /* Set write to end of buffer */
1212 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1193 length = (tail + length) - BUF_PAGE_SIZE;
1213 goto out_reset; 1194 local_sub(length, &tail_page->write);
1195}
1214 1196
1215 /* 1197static struct ring_buffer_event *
1216 * If for some reason, we had an interrupt storm that made 1198rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1217 * it all the way around the buffer, bail, and warn 1199 unsigned long length, unsigned long tail,
1218 * about it. 1200 struct buffer_page *commit_page,
1219 */ 1201 struct buffer_page *tail_page, u64 *ts)
1220 if (unlikely(next_page == commit_page)) { 1202{
1221 WARN_ON_ONCE(1); 1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false;
1206 unsigned long flags;
1207
1208 next_page = tail_page;
1209
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1222 goto out_reset; 1231 goto out_reset;
1223 } 1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1224 1235
1225 if (next_page == head_page) { 1236 lock_taken = true;
1226 if (!(buffer->flags & RB_FL_OVERWRITE))
1227 goto out_reset;
1228 1237
1229 /* tail_page has not moved yet? */ 1238 rb_inc_page(cpu_buffer, &next_page);
1230 if (tail_page == cpu_buffer->tail_page) {
1231 /* count overflows */
1232 rb_update_overflow(cpu_buffer);
1233 1239
1234 rb_inc_page(cpu_buffer, &head_page); 1240 head_page = cpu_buffer->head_page;
1235 cpu_buffer->head_page = head_page; 1241 reader_page = cpu_buffer->reader_page;
1236 cpu_buffer->head_page->read = 0;
1237 }
1238 }
1239 1242
1240 /* 1243 /* we grabbed the lock before incrementing */
1241 * If the tail page is still the same as what we think 1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1242 * it is, then it is up to us to update the tail 1245 goto out_reset;
1243 * pointer. 1246
1244 */ 1247 /*
1248 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn
1250 * about it.
1251 */
1252 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++;
1254 goto out_reset;
1255 }
1256
1257 if (next_page == head_page) {
1258 if (!(buffer->flags & RB_FL_OVERWRITE))
1259 goto out_reset;
1260
1261 /* tail_page has not moved yet? */
1245 if (tail_page == cpu_buffer->tail_page) { 1262 if (tail_page == cpu_buffer->tail_page) {
1246 local_set(&next_page->write, 0); 1263 /* count overflows */
1247 local_set(&next_page->page->commit, 0); 1264 cpu_buffer->overrun +=
1248 cpu_buffer->tail_page = next_page; 1265 local_read(&head_page->entries);
1249 1266
1250 /* reread the time stamp */ 1267 rb_inc_page(cpu_buffer, &head_page);
1251 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); 1268 cpu_buffer->head_page = head_page;
1252 cpu_buffer->tail_page->page->time_stamp = *ts; 1269 cpu_buffer->head_page->read = 0;
1253 } 1270 }
1271 }
1254 1272
1255 /* 1273 /*
1256 * The actual tail page has moved forward. 1274 * If the tail page is still the same as what we think
1257 */ 1275 * it is, then it is up to us to update the tail
1258 if (tail < BUF_PAGE_SIZE) { 1276 * pointer.
1259 /* Mark the rest of the page with padding */ 1277 */
1260 event = __rb_page_index(tail_page, tail); 1278 if (tail_page == cpu_buffer->tail_page) {
1261 rb_event_set_padding(event); 1279 local_set(&next_page->write, 0);
1262 } 1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts;
1287 }
1263 1288
1264 if (tail <= BUF_PAGE_SIZE) 1289 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1265 /* Set the write back to the previous setting */
1266 local_set(&tail_page->write, tail);
1267 1290
1268 /* 1291 __raw_spin_unlock(&cpu_buffer->lock);
1269 * If this was a commit entry that failed, 1292 local_irq_restore(flags);
1270 * increment that too 1293
1271 */ 1294 /* fail and let the caller try again */
1272 if (tail_page == cpu_buffer->commit_page && 1295 return ERR_PTR(-EAGAIN);
1273 tail == rb_commit_index(cpu_buffer)) { 1296
1274 rb_set_commit_to_write(cpu_buffer); 1297 out_reset:
1275 } 1298 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1276 1300
1301 if (likely(lock_taken))
1277 __raw_spin_unlock(&cpu_buffer->lock); 1302 __raw_spin_unlock(&cpu_buffer->lock);
1278 local_irq_restore(flags); 1303 local_irq_restore(flags);
1304 return NULL;
1305}
1279 1306
1280 /* fail and let the caller try again */ 1307static struct ring_buffer_event *
1281 return ERR_PTR(-EAGAIN); 1308__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1282 } 1309 unsigned type, unsigned long length, u64 *ts)
1310{
1311 struct buffer_page *tail_page, *commit_page;
1312 struct ring_buffer_event *event;
1313 unsigned long tail, write;
1283 1314
1284 /* We reserved something on the buffer */ 1315 commit_page = cpu_buffer->commit_page;
1316 /* we just need to protect against interrupts */
1317 barrier();
1318 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write);
1320 tail = write - length;
1285 1321
1286 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) 1322 /* See if we shot pass the end of this buffer page */
1287 return NULL; 1323 if (write > BUF_PAGE_SIZE)
1324 return rb_move_tail(cpu_buffer, length, tail,
1325 commit_page, tail_page, ts);
1326
1327 /* We reserved something on the buffer */
1288 1328
1289 event = __rb_page_index(tail_page, tail); 1329 event = __rb_page_index(tail_page, tail);
1330 kmemcheck_annotate_bitfield(event, bitfield);
1290 rb_update_event(event, type, length); 1331 rb_update_event(event, type, length);
1291 1332
1333 /* The passed in type is zero for DATA */
1334 if (likely(!type))
1335 local_inc(&tail_page->entries);
1336
1292 /* 1337 /*
1293 * If this is a commit and the tail is zero, then update 1338 * If this is the first commit on the page, then update
1294 * this page's time stamp. 1339 * its timestamp.
1295 */ 1340 */
1296 if (!tail && rb_is_commit(cpu_buffer, event)) 1341 if (!tail)
1297 cpu_buffer->commit_page->page->time_stamp = *ts; 1342 tail_page->page->time_stamp = *ts;
1298 1343
1299 return event; 1344 return event;
1345}
1300 1346
1301 out_reset: 1347static inline int
1302 /* reset write */ 1348rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1303 if (tail <= BUF_PAGE_SIZE) 1349 struct ring_buffer_event *event)
1304 local_set(&tail_page->write, tail); 1350{
1351 unsigned long new_index, old_index;
1352 struct buffer_page *bpage;
1353 unsigned long index;
1354 unsigned long addr;
1305 1355
1306 if (likely(lock_taken)) 1356 new_index = rb_event_index(event);
1307 __raw_spin_unlock(&cpu_buffer->lock); 1357 old_index = new_index + rb_event_length(event);
1308 local_irq_restore(flags); 1358 addr = (unsigned long)event;
1309 return NULL; 1359 addr &= PAGE_MASK;
1360
1361 bpage = cpu_buffer->tail_page;
1362
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1364 /*
1365 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page
1367 * and write to the next page. That is fine
1368 * because we just shorten what is on this page.
1369 */
1370 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index)
1372 return 1;
1373 }
1374
1375 /* could not discard */
1376 return 0;
1310} 1377}
1311 1378
1312static int 1379static int
@@ -1341,26 +1408,33 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1341 return -EAGAIN; 1408 return -EAGAIN;
1342 1409
 1343	/* Only a committed time event can update the write stamp */	 1410	/* Only a committed time event can update the write stamp */
1344 if (rb_is_commit(cpu_buffer, event)) { 1411 if (rb_event_is_commit(cpu_buffer, event)) {
1345 /* 1412 /*
1346 * If this is the first on the page, then we need to 1413 * If this is the first on the page, then it was
1347 * update the page itself, and just put in a zero. 1414 * updated with the page itself. Try to discard it
1415 * and if we can't just make it zero.
1348 */ 1416 */
1349 if (rb_event_index(event)) { 1417 if (rb_event_index(event)) {
1350 event->time_delta = *delta & TS_MASK; 1418 event->time_delta = *delta & TS_MASK;
1351 event->array[0] = *delta >> TS_SHIFT; 1419 event->array[0] = *delta >> TS_SHIFT;
1352 } else { 1420 } else {
1353 cpu_buffer->commit_page->page->time_stamp = *ts; 1421 /* try to discard, since we do not need this */
1354 event->time_delta = 0; 1422 if (!rb_try_to_discard(cpu_buffer, event)) {
1355 event->array[0] = 0; 1423 /* nope, just zero it */
1424 event->time_delta = 0;
1425 event->array[0] = 0;
1426 }
1356 } 1427 }
1357 cpu_buffer->write_stamp = *ts; 1428 cpu_buffer->write_stamp = *ts;
1358 /* let the caller know this was the commit */ 1429 /* let the caller know this was the commit */
1359 ret = 1; 1430 ret = 1;
1360 } else { 1431 } else {
1361 /* Darn, this is just wasted space */ 1432 /* Try to discard the event */
1362 event->time_delta = 0; 1433 if (!rb_try_to_discard(cpu_buffer, event)) {
1363 event->array[0] = 0; 1434 /* Darn, this is just wasted space */
1435 event->time_delta = 0;
1436 event->array[0] = 0;
1437 }
1364 ret = 0; 1438 ret = 0;
1365 } 1439 }
1366 1440
@@ -1369,15 +1443,56 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1369 return ret; 1443 return ret;
1370} 1444}
1371 1445
1446static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
1447{
1448 local_inc(&cpu_buffer->committing);
1449 local_inc(&cpu_buffer->commits);
1450}
1451
1452static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1453{
1454 unsigned long commits;
1455
1456 if (RB_WARN_ON(cpu_buffer,
1457 !local_read(&cpu_buffer->committing)))
1458 return;
1459
1460 again:
1461 commits = local_read(&cpu_buffer->commits);
1462 /* synchronize with interrupts */
1463 barrier();
1464 if (local_read(&cpu_buffer->committing) == 1)
1465 rb_set_commit_to_write(cpu_buffer);
1466
1467 local_dec(&cpu_buffer->committing);
1468
1469 /* synchronize with interrupts */
1470 barrier();
1471
1472 /*
1473 * Need to account for interrupts coming in between the
1474 * updating of the commit page and the clearing of the
1475 * committing counter.
1476 */
1477 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
1478 !local_read(&cpu_buffer->committing)) {
1479 local_inc(&cpu_buffer->committing);
1480 goto again;
1481 }
1482}
1483
1372static struct ring_buffer_event * 1484static struct ring_buffer_event *
1373rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1485rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1374 unsigned type, unsigned long length) 1486 unsigned long length)
1375{ 1487{
1376 struct ring_buffer_event *event; 1488 struct ring_buffer_event *event;
1377 u64 ts, delta; 1489 u64 ts, delta = 0;
1378 int commit = 0; 1490 int commit = 0;
1379 int nr_loops = 0; 1491 int nr_loops = 0;
1380 1492
1493 rb_start_commit(cpu_buffer);
1494
1495 length = rb_calculate_event_length(length);
1381 again: 1496 again:
1382 /* 1497 /*
1383 * We allow for interrupts to reenter here and do a trace. 1498 * We allow for interrupts to reenter here and do a trace.
@@ -1389,9 +1504,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1389 * Bail! 1504 * Bail!
1390 */ 1505 */
1391 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1506 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1392 return NULL; 1507 goto out_fail;
1393 1508
1394 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1509 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1395 1510
1396 /* 1511 /*
1397 * Only the first commit can update the timestamp. 1512 * Only the first commit can update the timestamp.
@@ -1401,63 +1516,93 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1401 * also be made. But only the entry that did the actual 1516 * also be made. But only the entry that did the actual
1402 * commit will be something other than zero. 1517 * commit will be something other than zero.
1403 */ 1518 */
1404 if (cpu_buffer->tail_page == cpu_buffer->commit_page && 1519 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
1405 rb_page_write(cpu_buffer->tail_page) == 1520 rb_page_write(cpu_buffer->tail_page) ==
1406 rb_commit_index(cpu_buffer)) { 1521 rb_commit_index(cpu_buffer))) {
1522 u64 diff;
1407 1523
1408 delta = ts - cpu_buffer->write_stamp; 1524 diff = ts - cpu_buffer->write_stamp;
1409 1525
1410 /* make sure this delta is calculated here */ 1526 /* make sure this diff is calculated here */
1411 barrier(); 1527 barrier();
1412 1528
1413 /* Did the write stamp get updated already? */ 1529 /* Did the write stamp get updated already? */
1414 if (unlikely(ts < cpu_buffer->write_stamp)) 1530 if (unlikely(ts < cpu_buffer->write_stamp))
1415 delta = 0; 1531 goto get_event;
1416 1532
1417 if (test_time_stamp(delta)) { 1533 delta = diff;
1534 if (unlikely(test_time_stamp(delta))) {
1418 1535
1419 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1536 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1420
1421 if (commit == -EBUSY) 1537 if (commit == -EBUSY)
1422 return NULL; 1538 goto out_fail;
1423 1539
1424 if (commit == -EAGAIN) 1540 if (commit == -EAGAIN)
1425 goto again; 1541 goto again;
1426 1542
1427 RB_WARN_ON(cpu_buffer, commit < 0); 1543 RB_WARN_ON(cpu_buffer, commit < 0);
1428 } 1544 }
1429 } else 1545 }
1430 /* Non commits have zero deltas */
1431 delta = 0;
1432 1546
1433 event = __rb_reserve_next(cpu_buffer, type, length, &ts); 1547 get_event:
1434 if (PTR_ERR(event) == -EAGAIN) 1548 event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
1549 if (unlikely(PTR_ERR(event) == -EAGAIN))
1435 goto again; 1550 goto again;
1436 1551
1437 if (!event) { 1552 if (!event)
1438 if (unlikely(commit)) 1553 goto out_fail;
1439 /*
1440 * Ouch! We needed a timestamp and it was commited. But
1441 * we didn't get our event reserved.
1442 */
1443 rb_set_commit_to_write(cpu_buffer);
1444 return NULL;
1445 }
1446 1554
1447 /* 1555 if (!rb_event_is_commit(cpu_buffer, event))
1448 * If the timestamp was commited, make the commit our entry
1449 * now so that we will update it when needed.
1450 */
1451 if (commit)
1452 rb_set_commit_event(cpu_buffer, event);
1453 else if (!rb_is_commit(cpu_buffer, event))
1454 delta = 0; 1556 delta = 0;
1455 1557
1456 event->time_delta = delta; 1558 event->time_delta = delta;
1457 1559
1458 return event; 1560 return event;
1561
1562 out_fail:
1563 rb_end_commit(cpu_buffer);
1564 return NULL;
1565}
1566
1567#ifdef CONFIG_TRACING
1568
1569#define TRACE_RECURSIVE_DEPTH 16
1570
1571static int trace_recursive_lock(void)
1572{
1573 current->trace_recursion++;
1574
1575 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
1576 return 0;
1577
1578 /* Disable all tracing before we do anything else */
1579 tracing_off_permanent();
1580
1581 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
1582 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
1583 current->trace_recursion,
1584 hardirq_count() >> HARDIRQ_SHIFT,
1585 softirq_count() >> SOFTIRQ_SHIFT,
1586 in_nmi());
1587
1588 WARN_ON_ONCE(1);
1589 return -1;
1590}
1591
1592static void trace_recursive_unlock(void)
1593{
1594 WARN_ON_ONCE(!current->trace_recursion);
1595
1596 current->trace_recursion--;
1459} 1597}
1460 1598
1599#else
1600
1601#define trace_recursive_lock() (0)
1602#define trace_recursive_unlock() do { } while (0)
1603
1604#endif
1605
1461static DEFINE_PER_CPU(int, rb_need_resched); 1606static DEFINE_PER_CPU(int, rb_need_resched);
1462 1607
1463/** 1608/**
@@ -1491,6 +1636,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1491 /* If we are tracing schedule, we don't want to recurse */ 1636 /* If we are tracing schedule, we don't want to recurse */
1492 resched = ftrace_preempt_disable(); 1637 resched = ftrace_preempt_disable();
1493 1638
1639 if (trace_recursive_lock())
1640 goto out_nocheck;
1641
1494 cpu = raw_smp_processor_id(); 1642 cpu = raw_smp_processor_id();
1495 1643
1496 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1644 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1649,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1501 if (atomic_read(&cpu_buffer->record_disabled)) 1649 if (atomic_read(&cpu_buffer->record_disabled))
1502 goto out; 1650 goto out;
1503 1651
1504 length = rb_calculate_event_length(length); 1652 if (length > BUF_MAX_DATA_SIZE)
1505 if (length > BUF_PAGE_SIZE)
1506 goto out; 1653 goto out;
1507 1654
1508 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); 1655 event = rb_reserve_next_event(cpu_buffer, length);
1509 if (!event) 1656 if (!event)
1510 goto out; 1657 goto out;
1511 1658
@@ -1520,6 +1667,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1520 return event; 1667 return event;
1521 1668
1522 out: 1669 out:
1670 trace_recursive_unlock();
1671
1672 out_nocheck:
1523 ftrace_preempt_enable(resched); 1673 ftrace_preempt_enable(resched);
1524 return NULL; 1674 return NULL;
1525} 1675}
@@ -1528,15 +1678,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1528static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1678static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1529 struct ring_buffer_event *event) 1679 struct ring_buffer_event *event)
1530{ 1680{
1531 cpu_buffer->entries++; 1681 local_inc(&cpu_buffer->entries);
1532 1682
1533 /* Only process further if we own the commit */ 1683 /*
1534 if (!rb_is_commit(cpu_buffer, event)) 1684 * The event first in the commit queue updates the
1535 return; 1685 * time stamp.
1536 1686 */
1537 cpu_buffer->write_stamp += event->time_delta; 1687 if (rb_event_is_commit(cpu_buffer, event))
1688 cpu_buffer->write_stamp += event->time_delta;
1538 1689
1539 rb_set_commit_to_write(cpu_buffer); 1690 rb_end_commit(cpu_buffer);
1540} 1691}
1541 1692
1542/** 1693/**
@@ -1558,6 +1709,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1558 1709
1559 rb_commit(cpu_buffer, event); 1710 rb_commit(cpu_buffer, event);
1560 1711
1712 trace_recursive_unlock();
1713
1561 /* 1714 /*
1562 * Only the last preempt count needs to restore preemption. 1715 * Only the last preempt count needs to restore preemption.
1563 */ 1716 */
@@ -1570,6 +1723,93 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1570} 1723}
1571EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); 1724EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
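/*
 * Illustrative sketch (not part of this patch): the reserve/commit
 * sequence a writer is expected to follow with the API above.  The
 * struct my_entry payload and the write_sample() wrapper are hypothetical;
 * the ring buffer calls are the ones defined in this file (compare
 * ring_buffer_producer() in ring_buffer_benchmark.c further down).
 */
#include <linux/ring_buffer.h>

struct my_entry {
	int		cpu;
	unsigned long	value;
};

static int write_sample(struct ring_buffer *buffer, unsigned long value)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;

	/* May return NULL if the buffer is full or recording is disabled */
	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return -EBUSY;

	entry = ring_buffer_event_data(event);
	entry->cpu = smp_processor_id();
	entry->value = value;

	/* Pairs with the reserve above; preemption is re-enabled here */
	return ring_buffer_unlock_commit(buffer, event);
}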
1572 1725
1726static inline void rb_event_discard(struct ring_buffer_event *event)
1727{
1728 /* array[0] holds the actual length for the discarded event */
1729 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
1730 event->type_len = RINGBUF_TYPE_PADDING;
1731 /* time delta must be non zero */
1732 if (!event->time_delta)
1733 event->time_delta = 1;
1734}
1735
1736/**
1737 * ring_buffer_event_discard - discard any event in the ring buffer
1738 * @event: the event to discard
1739 *
 1740	 * Sometimes an event that is in the ring buffer needs to be ignored.
1741 * This function lets the user discard an event in the ring buffer
1742 * and then that event will not be read later.
1743 *
1744 * Note, it is up to the user to be careful with this, and protect
 1745	 * against races. If the user discards an event that has already been consumed,
 1746	 * it is possible that it could corrupt the ring buffer.
1747 */
1748void ring_buffer_event_discard(struct ring_buffer_event *event)
1749{
1750 rb_event_discard(event);
1751}
1752EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
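/*
 * Illustrative sketch (not part of this patch): using
 * ring_buffer_event_discard() on an event we reserved but decided not to
 * report.  The event is turned into padding but is still committed, so the
 * reserve/commit pairing stays balanced.  It reuses the hypothetical
 * struct my_entry from the sketch above; filter_out() and fill_entry()
 * are hypothetical helpers as well.
 */
static void write_maybe_filtered(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	fill_entry(entry);

	if (filter_out(entry))
		ring_buffer_event_discard(event);	/* becomes padding */

	ring_buffer_unlock_commit(buffer, event);
}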
1753
1754/**
1755 * ring_buffer_commit_discard - discard an event that has not been committed
1756 * @buffer: the ring buffer
1757 * @event: non committed event to discard
1758 *
1759 * This is similar to ring_buffer_event_discard but must only be
1760 * performed on an event that has not been committed yet. The difference
1761 * is that this will also try to free the event from the ring buffer
1762 * if another event has not been added behind it.
1763 *
1764 * If another event has been added behind it, it will set the event
1765 * up as discarded, and perform the commit.
1766 *
1767 * If this function is called, do not call ring_buffer_unlock_commit on
1768 * the event.
1769 */
1770void ring_buffer_discard_commit(struct ring_buffer *buffer,
1771 struct ring_buffer_event *event)
1772{
1773 struct ring_buffer_per_cpu *cpu_buffer;
1774 int cpu;
1775
1776 /* The event is discarded regardless */
1777 rb_event_discard(event);
1778
1779 cpu = smp_processor_id();
1780 cpu_buffer = buffer->buffers[cpu];
1781
1782 /*
1783 * This must only be called if the event has not been
1784 * committed yet. Thus we can assume that preemption
1785 * is still disabled.
1786 */
1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1788
1789 if (rb_try_to_discard(cpu_buffer, event))
1790 goto out;
1791
1792 /*
1793 * The commit is still visible by the reader, so we
1794 * must increment entries.
1795 */
1796 local_inc(&cpu_buffer->entries);
1797 out:
1798 rb_end_commit(cpu_buffer);
1799
1800 trace_recursive_unlock();
1801
1802 /*
1803 * Only the last preempt count needs to restore preemption.
1804 */
1805 if (preempt_count() == 1)
1806 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1807 else
1808 preempt_enable_no_resched_notrace();
1809
1810}
1811EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
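/*
 * Illustrative sketch (not part of this patch): the same flow as the
 * previous sketch, but dropping the event with ring_buffer_discard_commit()
 * instead of committing it.  As the comment above notes,
 * ring_buffer_unlock_commit() must then not be called on that event.
 * filter_out() and fill_entry() are again hypothetical helpers.
 */
static void write_or_drop(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	fill_entry(entry);

	if (filter_out(entry))
		/* frees the space if nothing was written behind it */
		ring_buffer_discard_commit(buffer, event);
	else
		ring_buffer_unlock_commit(buffer, event);
}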
1812
1573/** 1813/**
1574 * ring_buffer_write - write data to the buffer without reserving 1814 * ring_buffer_write - write data to the buffer without reserving
1575 * @buffer: The ring buffer to write to. 1815 * @buffer: The ring buffer to write to.
@@ -1589,7 +1829,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
1589{ 1829{
1590 struct ring_buffer_per_cpu *cpu_buffer; 1830 struct ring_buffer_per_cpu *cpu_buffer;
1591 struct ring_buffer_event *event; 1831 struct ring_buffer_event *event;
1592 unsigned long event_length;
1593 void *body; 1832 void *body;
1594 int ret = -EBUSY; 1833 int ret = -EBUSY;
1595 int cpu, resched; 1834 int cpu, resched;
@@ -1612,9 +1851,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
1612 if (atomic_read(&cpu_buffer->record_disabled)) 1851 if (atomic_read(&cpu_buffer->record_disabled))
1613 goto out; 1852 goto out;
1614 1853
1615 event_length = rb_calculate_event_length(length); 1854 if (length > BUF_MAX_DATA_SIZE)
1616 event = rb_reserve_next_event(cpu_buffer, 1855 goto out;
1617 RINGBUF_TYPE_DATA, event_length); 1856
1857 event = rb_reserve_next_event(cpu_buffer, length);
1618 if (!event) 1858 if (!event)
1619 goto out; 1859 goto out;
1620 1860
@@ -1728,7 +1968,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1728 return 0; 1968 return 0;
1729 1969
1730 cpu_buffer = buffer->buffers[cpu]; 1970 cpu_buffer = buffer->buffers[cpu];
1731 ret = cpu_buffer->entries; 1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
1972 - cpu_buffer->read;
1732 1973
1733 return ret; 1974 return ret;
1734} 1975}
@@ -1755,6 +1996,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1755EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1756 1997
1757/** 1998/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
 2001 * @cpu: The per CPU buffer to get the number of dropped NMI writes from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer
 2021 * @cpu: The per CPU buffer to get the number of commit overruns from
2022 */
2023unsigned long
2024ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2025{
2026 struct ring_buffer_per_cpu *cpu_buffer;
2027 unsigned long ret;
2028
2029 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2030 return 0;
2031
2032 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun;
2034
2035 return ret;
2036}
2037EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
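/*
 * Illustrative sketch (not part of this patch): sampling the new per-cpu
 * counters exported above next to the existing entry/overrun counters.
 * report_cpu_stats() and the pr_info() format are hypothetical; the
 * accessors are the ones defined in this file.
 */
static void report_cpu_stats(struct ring_buffer *buffer, int cpu)
{
	pr_info("cpu%d: entries=%lu overrun=%lu commit_overrun=%lu nmi_dropped=%lu\n",
		cpu,
		ring_buffer_entries_cpu(buffer, cpu),
		ring_buffer_overrun_cpu(buffer, cpu),
		ring_buffer_commit_overrun_cpu(buffer, cpu),
		ring_buffer_nmi_dropped_cpu(buffer, cpu));
}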
2038
2039/**
1758 * ring_buffer_entries - get the number of entries in a buffer 2040 * ring_buffer_entries - get the number of entries in a buffer
1759 * @buffer: The ring buffer 2041 * @buffer: The ring buffer
1760 * 2042 *
@@ -1770,7 +2052,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1770 /* if you care about this being correct, lock the buffer */ 2052 /* if you care about this being correct, lock the buffer */
1771 for_each_buffer_cpu(buffer, cpu) { 2053 for_each_buffer_cpu(buffer, cpu) {
1772 cpu_buffer = buffer->buffers[cpu]; 2054 cpu_buffer = buffer->buffers[cpu];
1773 entries += cpu_buffer->entries; 2055 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read;
1774 } 2057 }
1775 2058
1776 return entries; 2059 return entries;
@@ -1862,7 +2145,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1862{ 2145{
1863 u64 delta; 2146 u64 delta;
1864 2147
1865 switch (event->type) { 2148 switch (event->type_len) {
1866 case RINGBUF_TYPE_PADDING: 2149 case RINGBUF_TYPE_PADDING:
1867 return; 2150 return;
1868 2151
@@ -1893,7 +2176,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1893{ 2176{
1894 u64 delta; 2177 u64 delta;
1895 2178
1896 switch (event->type) { 2179 switch (event->type_len) {
1897 case RINGBUF_TYPE_PADDING: 2180 case RINGBUF_TYPE_PADDING:
1898 return; 2181 return;
1899 2182
@@ -1966,6 +2249,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1966 cpu_buffer->reader_page->list.prev = reader->list.prev; 2249 cpu_buffer->reader_page->list.prev = reader->list.prev;
1967 2250
1968 local_set(&cpu_buffer->reader_page->write, 0); 2251 local_set(&cpu_buffer->reader_page->write, 0);
2252 local_set(&cpu_buffer->reader_page->entries, 0);
1969 local_set(&cpu_buffer->reader_page->page->commit, 0); 2253 local_set(&cpu_buffer->reader_page->page->commit, 0);
1970 2254
1971 /* Make the reader page now replace the head */ 2255 /* Make the reader page now replace the head */
@@ -2008,8 +2292,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2008 2292
2009 event = rb_reader_event(cpu_buffer); 2293 event = rb_reader_event(cpu_buffer);
2010 2294
2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) 2295 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
2012 cpu_buffer->entries--; 2296 || rb_discarded_event(event))
2297 cpu_buffer->read++;
2013 2298
2014 rb_update_read_stamp(cpu_buffer, event); 2299 rb_update_read_stamp(cpu_buffer, event);
2015 2300
@@ -2031,8 +2316,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2031 * Check if we are at the end of the buffer. 2316 * Check if we are at the end of the buffer.
2032 */ 2317 */
2033 if (iter->head >= rb_page_size(iter->head_page)) { 2318 if (iter->head >= rb_page_size(iter->head_page)) {
2034 if (RB_WARN_ON(buffer, 2319 /* discarded commits can make the page empty */
2035 iter->head_page == cpu_buffer->commit_page)) 2320 if (iter->head_page == cpu_buffer->commit_page)
2036 return; 2321 return;
2037 rb_inc_iter(iter); 2322 rb_inc_iter(iter);
2038 return; 2323 return;
@@ -2075,12 +2360,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2075 /* 2360 /*
2076 * We repeat when a timestamp is encountered. It is possible 2361 * We repeat when a timestamp is encountered. It is possible
2077 * to get multiple timestamps from an interrupt entering just 2362 * to get multiple timestamps from an interrupt entering just
2078 * as one timestamp is about to be written. The max times 2363 * as one timestamp is about to be written, or from discarded
2079 * that this can happen is the number of nested interrupts we 2364 * commits. The most that we can have is the number on a single page.
2080 * can have. Nesting 10 deep of interrupts is clearly
2081 * an anomaly.
2082 */ 2365 */
2083 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2366 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2084 return NULL; 2367 return NULL;
2085 2368
2086 reader = rb_get_reader_page(cpu_buffer); 2369 reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2372,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2089 2372
2090 event = rb_reader_event(cpu_buffer); 2373 event = rb_reader_event(cpu_buffer);
2091 2374
2092 switch (event->type) { 2375 switch (event->type_len) {
2093 case RINGBUF_TYPE_PADDING: 2376 case RINGBUF_TYPE_PADDING:
2094 if (rb_null_event(event)) 2377 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1); 2378 RB_WARN_ON(cpu_buffer, 1);
@@ -2101,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2101 * the box. Return the padding, and we will release 2384 * the box. Return the padding, and we will release
2102 * the current locks, and try again. 2385 * the current locks, and try again.
2103 */ 2386 */
2104 rb_advance_reader(cpu_buffer);
2105 return event; 2387 return event;
2106 2388
2107 case RINGBUF_TYPE_TIME_EXTEND: 2389 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2146,14 +2428,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2146 2428
2147 again: 2429 again:
2148 /* 2430 /*
2149 * We repeat when a timestamp is encountered. It is possible 2431 * We repeat when a timestamp is encountered.
2150 * to get multiple timestamps from an interrupt entering just 2432 * We can get multiple timestamps by nested interrupts or also
2151 * as one timestamp is about to be written. The max times 2433 * if filtering is on (discarding commits). Since discarding
2152 * that this can happen is the number of nested interrupts we 2434 * commits can be frequent we can get a lot of timestamps.
2153 * can have. Nesting 10 deep of interrupts is clearly 2435 * But we limit them by not adding timestamps if they begin
2154 * an anomaly. 2436 * at the start of a page.
2155 */ 2437 */
2156 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2438 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2157 return NULL; 2439 return NULL;
2158 2440
2159 if (rb_per_cpu_empty(cpu_buffer)) 2441 if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2443,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2161 2443
2162 event = rb_iter_head_event(iter); 2444 event = rb_iter_head_event(iter);
2163 2445
2164 switch (event->type) { 2446 switch (event->type_len) {
2165 case RINGBUF_TYPE_PADDING: 2447 case RINGBUF_TYPE_PADDING:
2166 if (rb_null_event(event)) { 2448 if (rb_null_event(event)) {
2167 rb_inc_iter(iter); 2449 rb_inc_iter(iter);
@@ -2196,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2196} 2478}
2197EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 2479EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2198 2480
2481static inline int rb_ok_to_lock(void)
2482{
2483 /*
2484 * If an NMI die dumps out the content of the ring buffer
2485 * do not grab locks. We also permanently disable the ring
 2486	 * buffer. A one-time deal is all you get from reading
2487 * the ring buffer from an NMI.
2488 */
2489 if (likely(!in_nmi()))
2490 return 1;
2491
2492 tracing_off_permanent();
2493 return 0;
2494}
2495
2199/** 2496/**
2200 * ring_buffer_peek - peek at the next event to be read 2497 * ring_buffer_peek - peek at the next event to be read
2201 * @buffer: The ring buffer to read 2498 * @buffer: The ring buffer to read
@@ -2211,16 +2508,24 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2211 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2508 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2212 struct ring_buffer_event *event; 2509 struct ring_buffer_event *event;
2213 unsigned long flags; 2510 unsigned long flags;
2511 int dolock;
2214 2512
2215 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2513 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2216 return NULL; 2514 return NULL;
2217 2515
2516 dolock = rb_ok_to_lock();
2218 again: 2517 again:
2219 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2518 local_irq_save(flags);
2519 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock);
2220 event = rb_buffer_peek(buffer, cpu, ts); 2521 event = rb_buffer_peek(buffer, cpu, ts);
2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2522 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer);
2524 if (dolock)
2525 spin_unlock(&cpu_buffer->reader_lock);
2526 local_irq_restore(flags);
2222 2527
2223 if (event && event->type == RINGBUF_TYPE_PADDING) { 2528 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2224 cpu_relax(); 2529 cpu_relax();
2225 goto again; 2530 goto again;
2226 } 2531 }
@@ -2248,7 +2553,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2248 event = rb_iter_peek(iter, ts); 2553 event = rb_iter_peek(iter, ts);
2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2554 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2250 2555
2251 if (event && event->type == RINGBUF_TYPE_PADDING) { 2556 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2252 cpu_relax(); 2557 cpu_relax();
2253 goto again; 2558 goto again;
2254 } 2559 }
@@ -2270,6 +2575,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2270 struct ring_buffer_per_cpu *cpu_buffer; 2575 struct ring_buffer_per_cpu *cpu_buffer;
2271 struct ring_buffer_event *event = NULL; 2576 struct ring_buffer_event *event = NULL;
2272 unsigned long flags; 2577 unsigned long flags;
2578 int dolock;
2579
2580 dolock = rb_ok_to_lock();
2273 2581
2274 again: 2582 again:
2275 /* might be called in atomic */ 2583 /* might be called in atomic */
@@ -2279,21 +2587,22 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2279 goto out; 2587 goto out;
2280 2588
2281 cpu_buffer = buffer->buffers[cpu]; 2589 cpu_buffer = buffer->buffers[cpu];
2282 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2590 local_irq_save(flags);
2591 if (dolock)
2592 spin_lock(&cpu_buffer->reader_lock);
2283 2593
2284 event = rb_buffer_peek(buffer, cpu, ts); 2594 event = rb_buffer_peek(buffer, cpu, ts);
2285 if (!event) 2595 if (event)
2286 goto out_unlock; 2596 rb_advance_reader(cpu_buffer);
2287
2288 rb_advance_reader(cpu_buffer);
2289 2597
2290 out_unlock: 2598 if (dolock)
2291 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2599 spin_unlock(&cpu_buffer->reader_lock);
2600 local_irq_restore(flags);
2292 2601
2293 out: 2602 out:
2294 preempt_enable(); 2603 preempt_enable();
2295 2604
2296 if (event && event->type == RINGBUF_TYPE_PADDING) { 2605 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2297 cpu_relax(); 2606 cpu_relax();
2298 goto again; 2607 goto again;
2299 } 2608 }
@@ -2386,7 +2695,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2386 out: 2695 out:
2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2696 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2388 2697
2389 if (event && event->type == RINGBUF_TYPE_PADDING) { 2698 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2390 cpu_relax(); 2699 cpu_relax();
2391 goto again; 2700 goto again;
2392 } 2701 }
@@ -2411,6 +2720,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2411 cpu_buffer->head_page 2720 cpu_buffer->head_page
2412 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
2413 local_set(&cpu_buffer->head_page->write, 0); 2722 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0);
2414 local_set(&cpu_buffer->head_page->page->commit, 0); 2724 local_set(&cpu_buffer->head_page->page->commit, 0);
2415 2725
2416 cpu_buffer->head_page->read = 0; 2726 cpu_buffer->head_page->read = 0;
@@ -2420,11 +2730,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2420 2730
2421 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2731 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
2422 local_set(&cpu_buffer->reader_page->write, 0); 2732 local_set(&cpu_buffer->reader_page->write, 0);
2733 local_set(&cpu_buffer->reader_page->entries, 0);
2423 local_set(&cpu_buffer->reader_page->page->commit, 0); 2734 local_set(&cpu_buffer->reader_page->page->commit, 0);
2424 cpu_buffer->reader_page->read = 0; 2735 cpu_buffer->reader_page->read = 0;
2425 2736
2737 cpu_buffer->nmi_dropped = 0;
2738 cpu_buffer->commit_overrun = 0;
2426 cpu_buffer->overrun = 0; 2739 cpu_buffer->overrun = 0;
2427 cpu_buffer->entries = 0; 2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0);
2428 2744
2429 cpu_buffer->write_stamp = 0; 2745 cpu_buffer->write_stamp = 0;
2430 cpu_buffer->read_stamp = 0; 2746 cpu_buffer->read_stamp = 0;
@@ -2443,6 +2759,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2443 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2759 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2444 return; 2760 return;
2445 2761
2762 atomic_inc(&cpu_buffer->record_disabled);
2763
2446 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2764 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2447 2765
2448 __raw_spin_lock(&cpu_buffer->lock); 2766 __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2770,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2452 __raw_spin_unlock(&cpu_buffer->lock); 2770 __raw_spin_unlock(&cpu_buffer->lock);
2453 2771
2454 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2772 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2773
2774 atomic_dec(&cpu_buffer->record_disabled);
2455} 2775}
2456EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 2776EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2457 2777
@@ -2475,12 +2795,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2475int ring_buffer_empty(struct ring_buffer *buffer) 2795int ring_buffer_empty(struct ring_buffer *buffer)
2476{ 2796{
2477 struct ring_buffer_per_cpu *cpu_buffer; 2797 struct ring_buffer_per_cpu *cpu_buffer;
2798 unsigned long flags;
2799 int dolock;
2478 int cpu; 2800 int cpu;
2801 int ret;
2802
2803 dolock = rb_ok_to_lock();
2479 2804
2480 /* yes this is racy, but if you don't like the race, lock the buffer */ 2805 /* yes this is racy, but if you don't like the race, lock the buffer */
2481 for_each_buffer_cpu(buffer, cpu) { 2806 for_each_buffer_cpu(buffer, cpu) {
2482 cpu_buffer = buffer->buffers[cpu]; 2807 cpu_buffer = buffer->buffers[cpu];
2483 if (!rb_per_cpu_empty(cpu_buffer)) 2808 local_irq_save(flags);
2809 if (dolock)
2810 spin_lock(&cpu_buffer->reader_lock);
2811 ret = rb_per_cpu_empty(cpu_buffer);
2812 if (dolock)
2813 spin_unlock(&cpu_buffer->reader_lock);
2814 local_irq_restore(flags);
2815
2816 if (!ret)
2484 return 0; 2817 return 0;
2485 } 2818 }
2486 2819
@@ -2496,14 +2829,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2496int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2829int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2497{ 2830{
2498 struct ring_buffer_per_cpu *cpu_buffer; 2831 struct ring_buffer_per_cpu *cpu_buffer;
2832 unsigned long flags;
2833 int dolock;
2499 int ret; 2834 int ret;
2500 2835
2501 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2836 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2502 return 1; 2837 return 1;
2503 2838
2839 dolock = rb_ok_to_lock();
2840
2504 cpu_buffer = buffer->buffers[cpu]; 2841 cpu_buffer = buffer->buffers[cpu];
2842 local_irq_save(flags);
2843 if (dolock)
2844 spin_lock(&cpu_buffer->reader_lock);
2505 ret = rb_per_cpu_empty(cpu_buffer); 2845 ret = rb_per_cpu_empty(cpu_buffer);
2506 2846 if (dolock)
2847 spin_unlock(&cpu_buffer->reader_lock);
2848 local_irq_restore(flags);
2507 2849
2508 return ret; 2850 return ret;
2509} 2851}
@@ -2578,28 +2920,6 @@ out:
2578} 2920}
2579EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2921EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2580 2922
2581static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2582 struct buffer_data_page *bpage,
2583 unsigned int offset)
2584{
2585 struct ring_buffer_event *event;
2586 unsigned long head;
2587
2588 __raw_spin_lock(&cpu_buffer->lock);
2589 for (head = offset; head < local_read(&bpage->commit);
2590 head += rb_event_length(event)) {
2591
2592 event = __rb_data_page_index(bpage, head);
2593 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2594 return;
2595 /* Only count data entries */
2596 if (event->type != RINGBUF_TYPE_DATA)
2597 continue;
2598 cpu_buffer->entries--;
2599 }
2600 __raw_spin_unlock(&cpu_buffer->lock);
2601}
2602
2603/** 2923/**
2604 * ring_buffer_alloc_read_page - allocate a page to read from buffer 2924 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2605 * @buffer: the buffer to allocate for. 2925 * @buffer: the buffer to allocate for.
@@ -2630,6 +2950,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2630 2950
2631 return bpage; 2951 return bpage;
2632} 2952}
2953EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
2633 2954
2634/** 2955/**
2635 * ring_buffer_free_read_page - free an allocated read page 2956 * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2963,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2642{ 2963{
2643 free_page((unsigned long)data); 2964 free_page((unsigned long)data);
2644} 2965}
2966EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
2645 2967
2646/** 2968/**
2647 * ring_buffer_read_page - extract a page from the ring buffer 2969 * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3090,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2768 /* we copied everything to the beginning */ 3090 /* we copied everything to the beginning */
2769 read = 0; 3091 read = 0;
2770 } else { 3092 } else {
3093 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries);
3095
2771 /* swap the pages */ 3096 /* swap the pages */
2772 rb_init_page(bpage); 3097 rb_init_page(bpage);
2773 bpage = reader->page; 3098 bpage = reader->page;
2774 reader->page = *data_page; 3099 reader->page = *data_page;
2775 local_set(&reader->write, 0); 3100 local_set(&reader->write, 0);
3101 local_set(&reader->entries, 0);
2776 reader->read = 0; 3102 reader->read = 0;
2777 *data_page = bpage; 3103 *data_page = bpage;
2778
2779 /* update the entry counter */
2780 rb_remove_entries(cpu_buffer, bpage, read);
2781 } 3104 }
2782 ret = read; 3105 ret = read;
2783 3106
@@ -2787,7 +3110,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2787 out: 3110 out:
2788 return ret; 3111 return ret;
2789} 3112}
3113EXPORT_SYMBOL_GPL(ring_buffer_read_page);
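/*
 * Illustrative sketch (not part of this patch): the allocate/read/free
 * cycle for pulling a whole page out of the buffer, mirroring read_page()
 * in ring_buffer_benchmark.c further down.  drain_one_page() is
 * hypothetical; walking the events inside the returned page is omitted.
 */
static int drain_one_page(struct ring_buffer *buffer, int cpu)
{
	void *bpage;
	int ret;

	bpage = ring_buffer_alloc_read_page(buffer);
	if (!bpage)
		return -ENOMEM;

	/* Same call pattern as the benchmark; a negative return means no data */
	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);

	ring_buffer_free_read_page(buffer, bpage);
	return ret;
}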
2790 3114
3115#ifdef CONFIG_TRACING
2791static ssize_t 3116static ssize_t
2792rb_simple_read(struct file *filp, char __user *ubuf, 3117rb_simple_read(struct file *filp, char __user *ubuf,
2793 size_t cnt, loff_t *ppos) 3118 size_t cnt, loff_t *ppos)
@@ -2845,19 +3170,17 @@ static const struct file_operations rb_simple_fops = {
2845static __init int rb_init_debugfs(void) 3170static __init int rb_init_debugfs(void)
2846{ 3171{
2847 struct dentry *d_tracer; 3172 struct dentry *d_tracer;
2848 struct dentry *entry;
2849 3173
2850 d_tracer = tracing_init_dentry(); 3174 d_tracer = tracing_init_dentry();
2851 3175
2852 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 3176 trace_create_file("tracing_on", 0644, d_tracer,
2853 &ring_buffer_flags, &rb_simple_fops); 3177 &ring_buffer_flags, &rb_simple_fops);
2854 if (!entry)
2855 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2856 3178
2857 return 0; 3179 return 0;
2858} 3180}
2859 3181
2860fs_initcall(rb_init_debugfs); 3182fs_initcall(rb_init_debugfs);
3183#endif
2861 3184
2862#ifdef CONFIG_HOTPLUG_CPU 3185#ifdef CONFIG_HOTPLUG_CPU
2863static int rb_cpu_notify(struct notifier_block *self, 3186static int rb_cpu_notify(struct notifier_block *self,
@@ -2870,7 +3193,7 @@ static int rb_cpu_notify(struct notifier_block *self,
2870 switch (action) { 3193 switch (action) {
2871 case CPU_UP_PREPARE: 3194 case CPU_UP_PREPARE:
2872 case CPU_UP_PREPARE_FROZEN: 3195 case CPU_UP_PREPARE_FROZEN:
2873 if (cpu_isset(cpu, *buffer->cpumask)) 3196 if (cpumask_test_cpu(cpu, buffer->cpumask))
2874 return NOTIFY_OK; 3197 return NOTIFY_OK;
2875 3198
2876 buffer->buffers[cpu] = 3199 buffer->buffers[cpu] =
@@ -2881,7 +3204,7 @@ static int rb_cpu_notify(struct notifier_block *self,
2881 return NOTIFY_OK; 3204 return NOTIFY_OK;
2882 } 3205 }
2883 smp_wmb(); 3206 smp_wmb();
2884 cpu_set(cpu, *buffer->cpumask); 3207 cpumask_set_cpu(cpu, buffer->cpumask);
2885 break; 3208 break;
2886 case CPU_DOWN_PREPARE: 3209 case CPU_DOWN_PREPARE:
2887 case CPU_DOWN_PREPARE_FROZEN: 3210 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 000000000000..573d3cc762c3
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,419 @@
1/*
2 * ring buffer tester and benchmark
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/completion.h>
8#include <linux/kthread.h>
9#include <linux/module.h>
10#include <linux/time.h>
11
12struct rb_page {
13 u64 ts;
14 local_t commit;
15 char data[4080];
16};
17
18/* run time and sleep time in seconds */
19#define RUN_TIME 10
20#define SLEEP_TIME 10
21
22/* number of events for writer to wake up the reader */
23static int wakeup_interval = 100;
24
25static int reader_finish;
26static struct completion read_start;
27static struct completion read_done;
28
29static struct ring_buffer *buffer;
30static struct task_struct *producer;
31static struct task_struct *consumer;
32static unsigned long read;
33
34static int disable_reader;
35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer");
37
38static int read_events;
39
40static int kill_test;
41
42#define KILL_TEST() \
43 do { \
44 if (!kill_test) { \
45 kill_test = 1; \
46 WARN_ON(1); \
47 } \
48 } while (0)
49
50enum event_status {
51 EVENT_FOUND,
52 EVENT_DROPPED,
53};
54
55static enum event_status read_event(int cpu)
56{
57 struct ring_buffer_event *event;
58 int *entry;
59 u64 ts;
60
61 event = ring_buffer_consume(buffer, cpu, &ts);
62 if (!event)
63 return EVENT_DROPPED;
64
65 entry = ring_buffer_event_data(event);
66 if (*entry != cpu) {
67 KILL_TEST();
68 return EVENT_DROPPED;
69 }
70
71 read++;
72 return EVENT_FOUND;
73}
74
75static enum event_status read_page(int cpu)
76{
77 struct ring_buffer_event *event;
78 struct rb_page *rpage;
79 unsigned long commit;
80 void *bpage;
81 int *entry;
82 int ret;
83 int inc;
84 int i;
85
86 bpage = ring_buffer_alloc_read_page(buffer);
87 if (!bpage)
88 return EVENT_DROPPED;
89
90 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
91 if (ret >= 0) {
92 rpage = bpage;
93 commit = local_read(&rpage->commit);
94 for (i = 0; i < commit && !kill_test; i += inc) {
95
96 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
97 KILL_TEST();
98 break;
99 }
100
101 inc = -1;
102 event = (void *)&rpage->data[i];
103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING:
105 /* failed writes may be discarded events */
106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
109 break;
110 case RINGBUF_TYPE_TIME_EXTEND:
111 inc = 8;
112 break;
113 case 0:
114 entry = ring_buffer_event_data(event);
115 if (*entry != cpu) {
116 KILL_TEST();
117 break;
118 }
119 read++;
120 if (!event->array[0]) {
121 KILL_TEST();
122 break;
123 }
124 inc = event->array[0] + 4;
125 break;
126 default:
127 entry = ring_buffer_event_data(event);
128 if (*entry != cpu) {
129 KILL_TEST();
130 break;
131 }
132 read++;
133 inc = ((event->type_len + 1) * 4);
134 }
135 if (kill_test)
136 break;
137
138 if (inc <= 0) {
139 KILL_TEST();
140 break;
141 }
142 }
143 }
144 ring_buffer_free_read_page(buffer, bpage);
145
146 if (ret < 0)
147 return EVENT_DROPPED;
148 return EVENT_FOUND;
149}
150
151static void ring_buffer_consumer(void)
152{
153 /* toggle between reading pages and events */
154 read_events ^= 1;
155
156 read = 0;
157 while (!reader_finish && !kill_test) {
158 int found;
159
160 do {
161 int cpu;
162
163 found = 0;
164 for_each_online_cpu(cpu) {
165 enum event_status stat;
166
167 if (read_events)
168 stat = read_event(cpu);
169 else
170 stat = read_page(cpu);
171
172 if (kill_test)
173 break;
174 if (stat == EVENT_FOUND)
175 found = 1;
176 }
177 } while (found && !kill_test);
178
179 set_current_state(TASK_INTERRUPTIBLE);
180 if (reader_finish)
181 break;
182
183 schedule();
184 __set_current_state(TASK_RUNNING);
185 }
186 reader_finish = 0;
187 complete(&read_done);
188}
189
190static void ring_buffer_producer(void)
191{
192 struct timeval start_tv;
193 struct timeval end_tv;
194 unsigned long long time;
195 unsigned long long entries;
196 unsigned long long overruns;
197 unsigned long missed = 0;
198 unsigned long hit = 0;
199 unsigned long avg;
200 int cnt = 0;
201
202 /*
203 * Hammer the buffer for 10 secs (this may
204 * make the system stall)
205 */
206 trace_printk("Starting ring buffer hammer\n");
207 do_gettimeofday(&start_tv);
208 do {
209 struct ring_buffer_event *event;
210 int *entry;
211
212 event = ring_buffer_lock_reserve(buffer, 10);
213 if (!event) {
214 missed++;
215 } else {
216 hit++;
217 entry = ring_buffer_event_data(event);
218 *entry = smp_processor_id();
219 ring_buffer_unlock_commit(buffer, event);
220 }
221 do_gettimeofday(&end_tv);
222
223 cnt++;
224 if (consumer && !(cnt % wakeup_interval))
225 wake_up_process(consumer);
226
227#ifndef CONFIG_PREEMPT
228 /*
 228	 * If we are a non-preempt kernel, the 10 second run will
230 * stop everything while it runs. Instead, we will call
231 * cond_resched and also add any time that was lost by a
 232	 * reschedule.
233 *
234 * Do a cond resched at the same frequency we would wake up
235 * the reader.
236 */
237 if (cnt % wakeup_interval)
238 cond_resched();
239#endif
240
241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
242 trace_printk("End ring buffer hammer\n");
243
244 if (consumer) {
245 /* Init both completions here to avoid races */
246 init_completion(&read_start);
247 init_completion(&read_done);
248 /* the completions must be visible before the finish var */
249 smp_wmb();
250 reader_finish = 1;
251 /* finish var visible before waking up the consumer */
252 smp_wmb();
253 wake_up_process(consumer);
254 wait_for_completion(&read_done);
255 }
256
257 time = end_tv.tv_sec - start_tv.tv_sec;
258 time *= USEC_PER_SEC;
259 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
260
261 entries = ring_buffer_entries(buffer);
262 overruns = ring_buffer_overruns(buffer);
263
264 if (kill_test)
265 trace_printk("ERROR!\n");
266 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader)
269 trace_printk("Read: (reader disabled)\n");
270 else
271 trace_printk("Read: %ld (by %s)\n", read,
272 read_events ? "events" : "pages");
273 trace_printk("Entries: %lld\n", entries);
274 trace_printk("Total: %lld\n", entries + overruns + read);
275 trace_printk("Missed: %ld\n", missed);
276 trace_printk("Hit: %ld\n", hit);
277
278 /* Convert time from usecs to millisecs */
279 do_div(time, USEC_PER_MSEC);
280 if (time)
281 hit /= (long)time;
282 else
283 trace_printk("TIME IS ZERO??\n");
284
285 trace_printk("Entries per millisec: %ld\n", hit);
286
287 if (hit) {
288 /* Calculate the average time in nanosecs */
289 avg = NSEC_PER_MSEC / hit;
290 trace_printk("%ld ns per entry\n", avg);
291 }
292
293 if (missed) {
294 if (time)
295 missed /= (long)time;
296
297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
299
300 /* it is possible that hit + missed will overflow and be zero */
301 if (!(hit + missed)) {
302 trace_printk("hit + missed overflowed and totalled zero!\n");
303 hit--; /* make it non zero */
304 }
305
306		/* Calculate the average time in nanosecs */
307 avg = NSEC_PER_MSEC / (hit + missed);
308 trace_printk("%ld ns per entry\n", avg);
309 }
310}
311
312static void wait_to_die(void)
313{
314 set_current_state(TASK_INTERRUPTIBLE);
315 while (!kthread_should_stop()) {
316 schedule();
317 set_current_state(TASK_INTERRUPTIBLE);
318 }
319 __set_current_state(TASK_RUNNING);
320}
321
322static int ring_buffer_consumer_thread(void *arg)
323{
324 while (!kthread_should_stop() && !kill_test) {
325 complete(&read_start);
326
327 ring_buffer_consumer();
328
329 set_current_state(TASK_INTERRUPTIBLE);
330 if (kthread_should_stop() || kill_test)
331 break;
332
333 schedule();
334 __set_current_state(TASK_RUNNING);
335 }
336 __set_current_state(TASK_RUNNING);
337
338 if (kill_test)
339 wait_to_die();
340
341 return 0;
342}
343
344static int ring_buffer_producer_thread(void *arg)
345{
346 init_completion(&read_start);
347
348 while (!kthread_should_stop() && !kill_test) {
349 ring_buffer_reset(buffer);
350
351 if (consumer) {
352 smp_wmb();
353 wake_up_process(consumer);
354 wait_for_completion(&read_start);
355 }
356
357 ring_buffer_producer();
358
359 trace_printk("Sleeping for 10 secs\n");
360 set_current_state(TASK_INTERRUPTIBLE);
361 schedule_timeout(HZ * SLEEP_TIME);
362 __set_current_state(TASK_RUNNING);
363 }
364
365 if (kill_test)
366 wait_to_die();
367
368 return 0;
369}
370
371static int __init ring_buffer_benchmark_init(void)
372{
373 int ret;
374
375	/* make a one meg buffer in overwrite mode */
376 buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
377 if (!buffer)
378 return -ENOMEM;
379
380 if (!disable_reader) {
381 consumer = kthread_create(ring_buffer_consumer_thread,
382 NULL, "rb_consumer");
383 ret = PTR_ERR(consumer);
384 if (IS_ERR(consumer))
385 goto out_fail;
386 }
387
388 producer = kthread_run(ring_buffer_producer_thread,
389 NULL, "rb_producer");
390 ret = PTR_ERR(producer);
391
392 if (IS_ERR(producer))
393 goto out_kill;
394
395 return 0;
396
397 out_kill:
398 if (consumer)
399 kthread_stop(consumer);
400
401 out_fail:
402 ring_buffer_free(buffer);
403 return ret;
404}
405
406static void __exit ring_buffer_benchmark_exit(void)
407{
408 kthread_stop(producer);
409 if (consumer)
410 kthread_stop(consumer);
411 ring_buffer_free(buffer);
412}
413
414module_init(ring_buffer_benchmark_init);
415module_exit(ring_buffer_benchmark_exit);
416
417MODULE_AUTHOR("Steven Rostedt");
418MODULE_DESCRIPTION("ring_buffer_benchmark");
419MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cda81ec58d9f..8c358395d338 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -171,6 +172,13 @@ static struct trace_array global_trace;
171 172
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 173static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 174
175int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
176 struct ring_buffer_event *event)
177{
178 return filter_check_discard(call, rec, global_trace.buffer, event);
179}
180EXPORT_SYMBOL_GPL(filter_current_check_discard);
181
174cycle_t ftrace_now(int cpu) 182cycle_t ftrace_now(int cpu)
175{ 183{
176 u64 ts; 184 u64 ts;
@@ -255,7 +263,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
255 263
256/* trace_flags holds trace_options default values */ 264/* trace_flags holds trace_options default values */
257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 265unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; 266 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
267 TRACE_ITER_GRAPH_TIME;
259 268
260/** 269/**
261 * trace_wake_up - wake up tasks waiting for trace input 270 * trace_wake_up - wake up tasks waiting for trace input
@@ -276,13 +285,12 @@ void trace_wake_up(void)
276static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
277{ 286{
278 unsigned long buf_size; 287 unsigned long buf_size;
279 int ret;
280 288
281 if (!str) 289 if (!str)
282 return 0; 290 return 0;
283 ret = strict_strtoul(str, 0, &buf_size); 291 buf_size = memparse(str, &str);
284 /* nr_entries can not be zero */ 292 /* nr_entries can not be zero */
285 if (ret < 0 || buf_size == 0) 293 if (buf_size == 0)
286 return 0; 294 return 0;
287 trace_buf_size = buf_size; 295 trace_buf_size = buf_size;
288 return 1; 296 return 1;
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
317 "latency-format", 325 "latency-format",
318 "global-clock", 326 "global-clock",
319 "sleep-time", 327 "sleep-time",
328 "graph-time",
320 NULL 329 NULL
321}; 330};
322 331
@@ -335,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
335/* 344/*
336 * Copy the new maximum trace into the separate maximum-trace 345 * Copy the new maximum trace into the separate maximum-trace
337 * structure. (this way the maximum trace is permanently saved, 346 * structure. (this way the maximum trace is permanently saved,
338 * for later retrieval via /debugfs/tracing/latency_trace) 347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
339 */ 348 */
340static void 349static void
341__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
402 return cnt; 411 return cnt;
403} 412}
404 413
405static void
406trace_print_seq(struct seq_file *m, struct trace_seq *s)
407{
408 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
409
410 s->buffer[len] = 0;
411 seq_puts(m, s->buffer);
412
413 trace_seq_init(s);
414}
415
416/** 414/**
417 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
418 * @tr: tracer 416 * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
641 tracing_reset(tr, cpu); 639 tracing_reset(tr, cpu);
642} 640}
643 641
642void tracing_reset_current(int cpu)
643{
644 tracing_reset(&global_trace, cpu);
645}
646
647void tracing_reset_current_online_cpus(void)
648{
649 tracing_reset_online_cpus(&global_trace);
650}
651
644#define SAVED_CMDLINES 128 652#define SAVED_CMDLINES 128
645#define NO_CMDLINE_MAP UINT_MAX 653#define NO_CMDLINE_MAP UINT_MAX
646static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 654static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
800 return; 808 return;
801 } 809 }
802 810
811 preempt_disable();
803 __raw_spin_lock(&trace_cmdline_lock); 812 __raw_spin_lock(&trace_cmdline_lock);
804 map = map_pid_to_cmdline[pid]; 813 map = map_pid_to_cmdline[pid];
805 if (map != NO_CMDLINE_MAP) 814 if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
808 strcpy(comm, "<...>"); 817 strcpy(comm, "<...>");
809 818
810 __raw_spin_unlock(&trace_cmdline_lock); 819 __raw_spin_unlock(&trace_cmdline_lock);
820 preempt_enable();
811} 821}
812 822
813void tracing_record_cmdline(struct task_struct *tsk) 823void tracing_record_cmdline(struct task_struct *tsk)
@@ -838,9 +848,10 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
838 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
839 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
840} 850}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
841 852
842struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
843 unsigned char type, 854 int type,
844 unsigned long len, 855 unsigned long len,
845 unsigned long flags, int pc) 856 unsigned long flags, int pc)
846{ 857{
@@ -883,30 +894,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
883} 894}
884 895
885struct ring_buffer_event * 896struct ring_buffer_event *
886trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, 897trace_current_buffer_lock_reserve(int type, unsigned long len,
887 unsigned long flags, int pc) 898 unsigned long flags, int pc)
888{ 899{
889 return trace_buffer_lock_reserve(&global_trace, 900 return trace_buffer_lock_reserve(&global_trace,
890 type, len, flags, pc); 901 type, len, flags, pc);
891} 902}
903EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
892 904
893void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 905void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
894 unsigned long flags, int pc) 906 unsigned long flags, int pc)
895{ 907{
896 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 908 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
897} 909}
910EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
898 911
899void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 912void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
900 unsigned long flags, int pc) 913 unsigned long flags, int pc)
901{ 914{
902 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 915 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
916}
917EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
918
919void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
920{
921 ring_buffer_discard_commit(global_trace.buffer, event);
903} 922}
923EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
904 924
905void 925void
906trace_function(struct trace_array *tr, 926trace_function(struct trace_array *tr,
907 unsigned long ip, unsigned long parent_ip, unsigned long flags, 927 unsigned long ip, unsigned long parent_ip, unsigned long flags,
908 int pc) 928 int pc)
909{ 929{
930 struct ftrace_event_call *call = &event_function;
910 struct ring_buffer_event *event; 931 struct ring_buffer_event *event;
911 struct ftrace_entry *entry; 932 struct ftrace_entry *entry;
912 933
@@ -921,7 +942,9 @@ trace_function(struct trace_array *tr,
921 entry = ring_buffer_event_data(event); 942 entry = ring_buffer_event_data(event);
922 entry->ip = ip; 943 entry->ip = ip;
923 entry->parent_ip = parent_ip; 944 entry->parent_ip = parent_ip;
924 ring_buffer_unlock_commit(tr->buffer, event); 945
946 if (!filter_check_discard(call, entry, tr->buffer, event))
947 ring_buffer_unlock_commit(tr->buffer, event);
925} 948}
926 949
927#ifdef CONFIG_FUNCTION_GRAPH_TRACER 950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +953,7 @@ static int __trace_graph_entry(struct trace_array *tr,
930 unsigned long flags, 953 unsigned long flags,
931 int pc) 954 int pc)
932{ 955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
933 struct ring_buffer_event *event; 957 struct ring_buffer_event *event;
934 struct ftrace_graph_ent_entry *entry; 958 struct ftrace_graph_ent_entry *entry;
935 959
@@ -942,7 +966,8 @@ static int __trace_graph_entry(struct trace_array *tr,
942 return 0; 966 return 0;
943 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
944 entry->graph_ent = *trace; 968 entry->graph_ent = *trace;
945 ring_buffer_unlock_commit(global_trace.buffer, event); 969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
946 971
947 return 1; 972 return 1;
948} 973}
@@ -952,6 +977,7 @@ static void __trace_graph_return(struct trace_array *tr,
952 unsigned long flags, 977 unsigned long flags,
953 int pc) 978 int pc)
954{ 979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
955 struct ring_buffer_event *event; 981 struct ring_buffer_event *event;
956 struct ftrace_graph_ret_entry *entry; 982 struct ftrace_graph_ret_entry *entry;
957 983
@@ -964,7 +990,8 @@ static void __trace_graph_return(struct trace_array *tr,
964 return; 990 return;
965 entry = ring_buffer_event_data(event); 991 entry = ring_buffer_event_data(event);
966 entry->ret = *trace; 992 entry->ret = *trace;
967 ring_buffer_unlock_commit(global_trace.buffer, event); 993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
968} 995}
969#endif 996#endif
970 997
@@ -982,6 +1009,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
982 int skip, int pc) 1009 int skip, int pc)
983{ 1010{
984#ifdef CONFIG_STACKTRACE 1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack;
985 struct ring_buffer_event *event; 1013 struct ring_buffer_event *event;
986 struct stack_entry *entry; 1014 struct stack_entry *entry;
987 struct stack_trace trace; 1015 struct stack_trace trace;
@@ -999,7 +1027,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
999 trace.entries = entry->caller; 1027 trace.entries = entry->caller;
1000 1028
1001 save_stack_trace(&trace); 1029 save_stack_trace(&trace);
1002 ring_buffer_unlock_commit(tr->buffer, event); 1030 if (!filter_check_discard(call, entry, tr->buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event);
1003#endif 1032#endif
1004} 1033}
1005 1034
@@ -1024,6 +1053,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1024 unsigned long flags, int pc) 1053 unsigned long flags, int pc)
1025{ 1054{
1026#ifdef CONFIG_STACKTRACE 1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack;
1027 struct ring_buffer_event *event; 1057 struct ring_buffer_event *event;
1028 struct userstack_entry *entry; 1058 struct userstack_entry *entry;
1029 struct stack_trace trace; 1059 struct stack_trace trace;
@@ -1045,7 +1075,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1045 trace.entries = entry->caller; 1075 trace.entries = entry->caller;
1046 1076
1047 save_stack_trace_user(&trace); 1077 save_stack_trace_user(&trace);
1048 ring_buffer_unlock_commit(tr->buffer, event); 1078 if (!filter_check_discard(call, entry, tr->buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event);
1049#endif 1080#endif
1050} 1081}
1051 1082
@@ -1089,6 +1120,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
1089 struct task_struct *next, 1120 struct task_struct *next,
1090 unsigned long flags, int pc) 1121 unsigned long flags, int pc)
1091{ 1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1092 struct ring_buffer_event *event; 1124 struct ring_buffer_event *event;
1093 struct ctx_switch_entry *entry; 1125 struct ctx_switch_entry *entry;
1094 1126
@@ -1104,7 +1136,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
1104 entry->next_prio = next->prio; 1136 entry->next_prio = next->prio;
1105 entry->next_state = next->state; 1137 entry->next_state = next->state;
1106 entry->next_cpu = task_cpu(next); 1138 entry->next_cpu = task_cpu(next);
1107 trace_buffer_unlock_commit(tr, event, flags, pc); 1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1108} 1142}
1109 1143
1110void 1144void
@@ -1113,6 +1147,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1113 struct task_struct *curr, 1147 struct task_struct *curr,
1114 unsigned long flags, int pc) 1148 unsigned long flags, int pc)
1115{ 1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1116 struct ring_buffer_event *event; 1151 struct ring_buffer_event *event;
1117 struct ctx_switch_entry *entry; 1152 struct ctx_switch_entry *entry;
1118 1153
@@ -1129,7 +1164,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1129 entry->next_state = wakee->state; 1164 entry->next_state = wakee->state;
1130 entry->next_cpu = task_cpu(wakee); 1165 entry->next_cpu = task_cpu(wakee);
1131 1166
1132 ring_buffer_unlock_commit(tr->buffer, event); 1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1133 ftrace_trace_stack(tr, flags, 6, pc); 1169 ftrace_trace_stack(tr, flags, 6, pc);
1134 ftrace_trace_userstack(tr, flags, pc); 1170 ftrace_trace_userstack(tr, flags, pc);
1135} 1171}
@@ -1230,11 +1266,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1230 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1266 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1231 static u32 trace_buf[TRACE_BUF_SIZE]; 1267 static u32 trace_buf[TRACE_BUF_SIZE];
1232 1268
1269 struct ftrace_event_call *call = &event_bprint;
1233 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1234 struct trace_array *tr = &global_trace; 1271 struct trace_array *tr = &global_trace;
1235 struct trace_array_cpu *data; 1272 struct trace_array_cpu *data;
1236 struct bprint_entry *entry; 1273 struct bprint_entry *entry;
1237 unsigned long flags; 1274 unsigned long flags;
1275 int disable;
1238 int resched; 1276 int resched;
1239 int cpu, len = 0, size, pc; 1277 int cpu, len = 0, size, pc;
1240 1278
@@ -1249,7 +1287,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1249 cpu = raw_smp_processor_id(); 1287 cpu = raw_smp_processor_id();
1250 data = tr->data[cpu]; 1288 data = tr->data[cpu];
1251 1289
1252 if (unlikely(atomic_read(&data->disabled))) 1290 disable = atomic_inc_return(&data->disabled);
1291 if (unlikely(disable != 1))
1253 goto out; 1292 goto out;
1254 1293
1255 /* Lockdep uses trace_printk for lock tracing */ 1294 /* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1308,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1269 entry->fmt = fmt; 1308 entry->fmt = fmt;
1270 1309
1271 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1310 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1272 ring_buffer_unlock_commit(tr->buffer, event); 1311 if (!filter_check_discard(call, entry, tr->buffer, event))
1312 ring_buffer_unlock_commit(tr->buffer, event);
1273 1313
1274out_unlock: 1314out_unlock:
1275 __raw_spin_unlock(&trace_buf_lock); 1315 __raw_spin_unlock(&trace_buf_lock);
1276 local_irq_restore(flags); 1316 local_irq_restore(flags);
1277 1317
1278out: 1318out:
1319 atomic_dec_return(&data->disabled);
1279 ftrace_preempt_enable(resched); 1320 ftrace_preempt_enable(resched);
1280 unpause_graph_tracing(); 1321 unpause_graph_tracing();
1281 1322
@@ -1288,12 +1329,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1288 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1329 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1289 static char trace_buf[TRACE_BUF_SIZE]; 1330 static char trace_buf[TRACE_BUF_SIZE];
1290 1331
1332 struct ftrace_event_call *call = &event_print;
1291 struct ring_buffer_event *event; 1333 struct ring_buffer_event *event;
1292 struct trace_array *tr = &global_trace; 1334 struct trace_array *tr = &global_trace;
1293 struct trace_array_cpu *data; 1335 struct trace_array_cpu *data;
1294 int cpu, len = 0, size, pc; 1336 int cpu, len = 0, size, pc;
1295 struct print_entry *entry; 1337 struct print_entry *entry;
1296 unsigned long irq_flags; 1338 unsigned long irq_flags;
1339 int disable;
1297 1340
1298 if (tracing_disabled || tracing_selftest_running) 1341 if (tracing_disabled || tracing_selftest_running)
1299 return 0; 1342 return 0;
@@ -1303,7 +1346,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1303 cpu = raw_smp_processor_id(); 1346 cpu = raw_smp_processor_id();
1304 data = tr->data[cpu]; 1347 data = tr->data[cpu];
1305 1348
1306 if (unlikely(atomic_read(&data->disabled))) 1349 disable = atomic_inc_return(&data->disabled);
1350 if (unlikely(disable != 1))
1307 goto out; 1351 goto out;
1308 1352
1309 pause_graph_tracing(); 1353 pause_graph_tracing();
@@ -1323,13 +1367,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1323 1367
1324 memcpy(&entry->buf, trace_buf, len); 1368 memcpy(&entry->buf, trace_buf, len);
1325 entry->buf[len] = 0; 1369 entry->buf[len] = 0;
1326 ring_buffer_unlock_commit(tr->buffer, event); 1370 if (!filter_check_discard(call, entry, tr->buffer, event))
1371 ring_buffer_unlock_commit(tr->buffer, event);
1327 1372
1328 out_unlock: 1373 out_unlock:
1329 __raw_spin_unlock(&trace_buf_lock); 1374 __raw_spin_unlock(&trace_buf_lock);
1330 raw_local_irq_restore(irq_flags); 1375 raw_local_irq_restore(irq_flags);
1331 unpause_graph_tracing(); 1376 unpause_graph_tracing();
1332 out: 1377 out:
1378 atomic_dec_return(&data->disabled);
1333 preempt_enable_notrace(); 1379 preempt_enable_notrace();
1334 1380
1335 return len; 1381 return len;
@@ -1526,12 +1572,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1526 p = s_next(m, p, &l); 1572 p = s_next(m, p, &l);
1527 } 1573 }
1528 1574
1575 trace_event_read_lock();
1529 return p; 1576 return p;
1530} 1577}
1531 1578
1532static void s_stop(struct seq_file *m, void *p) 1579static void s_stop(struct seq_file *m, void *p)
1533{ 1580{
1534 atomic_dec(&trace_record_cmdline_disabled); 1581 atomic_dec(&trace_record_cmdline_disabled);
1582 trace_event_read_unlock();
1535} 1583}
1536 1584
1537static void print_lat_help_header(struct seq_file *m) 1585static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1822,7 @@ static int trace_empty(struct trace_iterator *iter)
1774 return 1; 1822 return 1;
1775} 1823}
1776 1824
1825/* Called with trace_event_read_lock() held. */
1777static enum print_line_t print_trace_line(struct trace_iterator *iter) 1826static enum print_line_t print_trace_line(struct trace_iterator *iter)
1778{ 1827{
1779 enum print_line_t ret; 1828 enum print_line_t ret;
@@ -1983,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
1983 2032
1984 /* If this file was open for write, then erase contents */ 2033 /* If this file was open for write, then erase contents */
1985 if ((file->f_mode & FMODE_WRITE) && 2034 if ((file->f_mode & FMODE_WRITE) &&
1986 !(file->f_flags & O_APPEND)) { 2035 (file->f_flags & O_TRUNC)) {
1987 long cpu = (long) inode->i_private; 2036 long cpu = (long) inode->i_private;
1988 2037
1989 if (cpu == TRACE_PIPE_ALL_CPU) 2038 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2005,25 +2054,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2005static void * 2054static void *
2006t_next(struct seq_file *m, void *v, loff_t *pos) 2055t_next(struct seq_file *m, void *v, loff_t *pos)
2007{ 2056{
2008 struct tracer *t = m->private; 2057 struct tracer *t = v;
2009 2058
2010 (*pos)++; 2059 (*pos)++;
2011 2060
2012 if (t) 2061 if (t)
2013 t = t->next; 2062 t = t->next;
2014 2063
2015 m->private = t;
2016
2017 return t; 2064 return t;
2018} 2065}
2019 2066
2020static void *t_start(struct seq_file *m, loff_t *pos) 2067static void *t_start(struct seq_file *m, loff_t *pos)
2021{ 2068{
2022 struct tracer *t = m->private; 2069 struct tracer *t;
2023 loff_t l = 0; 2070 loff_t l = 0;
2024 2071
2025 mutex_lock(&trace_types_lock); 2072 mutex_lock(&trace_types_lock);
2026 for (; t && l < *pos; t = t_next(m, t, &l)) 2073 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2027 ; 2074 ;
2028 2075
2029 return t; 2076 return t;
@@ -2059,18 +2106,10 @@ static struct seq_operations show_traces_seq_ops = {
2059 2106
2060static int show_traces_open(struct inode *inode, struct file *file) 2107static int show_traces_open(struct inode *inode, struct file *file)
2061{ 2108{
2062 int ret;
2063
2064 if (tracing_disabled) 2109 if (tracing_disabled)
2065 return -ENODEV; 2110 return -ENODEV;
2066 2111
2067 ret = seq_open(file, &show_traces_seq_ops); 2112 return seq_open(file, &show_traces_seq_ops);
2068 if (!ret) {
2069 struct seq_file *m = file->private_data;
2070 m->private = trace_types;
2071 }
2072
2073 return ret;
2074} 2113}
2075 2114
2076static ssize_t 2115static ssize_t
@@ -2143,11 +2182,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2143 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2182 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2144 return -ENOMEM; 2183 return -ENOMEM;
2145 2184
2146 mutex_lock(&tracing_cpumask_update_lock);
2147 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2185 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2148 if (err) 2186 if (err)
2149 goto err_unlock; 2187 goto err_unlock;
2150 2188
2189 mutex_lock(&tracing_cpumask_update_lock);
2190
2151 local_irq_disable(); 2191 local_irq_disable();
2152 __raw_spin_lock(&ftrace_max_lock); 2192 __raw_spin_lock(&ftrace_max_lock);
2153 for_each_tracing_cpu(cpu) { 2193 for_each_tracing_cpu(cpu) {
@@ -2175,8 +2215,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2175 return count; 2215 return count;
2176 2216
2177err_unlock: 2217err_unlock:
2178 mutex_unlock(&tracing_cpumask_update_lock); 2218 free_cpumask_var(tracing_cpumask_new);
2179 free_cpumask_var(tracing_cpumask);
2180 2219
2181 return err; 2220 return err;
2182} 2221}
@@ -2366,21 +2405,20 @@ static const struct file_operations tracing_iter_fops = {
2366 2405
2367static const char readme_msg[] = 2406static const char readme_msg[] =
2368 "tracing mini-HOWTO:\n\n" 2407 "tracing mini-HOWTO:\n\n"
2369 "# mkdir /debug\n" 2408 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2370 "# mount -t debugfs nodev /debug\n\n" 2409 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2371 "# cat /debug/tracing/available_tracers\n"
2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2410 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2373 "# cat /debug/tracing/current_tracer\n" 2411 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2374 "nop\n" 2412 "nop\n"
2375 "# echo sched_switch > /debug/tracing/current_tracer\n" 2413 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2376 "# cat /debug/tracing/current_tracer\n" 2414 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2377 "sched_switch\n" 2415 "sched_switch\n"
2378 "# cat /debug/tracing/trace_options\n" 2416 "# cat /sys/kernel/debug/tracing/trace_options\n"
2379 "noprint-parent nosym-offset nosym-addr noverbose\n" 2417 "noprint-parent nosym-offset nosym-addr noverbose\n"
2380 "# echo print-parent > /debug/tracing/trace_options\n" 2418 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2381 "# echo 1 > /debug/tracing/tracing_enabled\n" 2419 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2382 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2420 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2383 "# echo 0 > /debug/tracing/tracing_enabled\n" 2421 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2384; 2422;
2385 2423
2386static ssize_t 2424static ssize_t
@@ -2397,6 +2435,56 @@ static const struct file_operations tracing_readme_fops = {
2397}; 2435};
2398 2436
2399static ssize_t 2437static ssize_t
2438tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2439 size_t cnt, loff_t *ppos)
2440{
2441 char *buf_comm;
2442 char *file_buf;
2443 char *buf;
2444 int len = 0;
2445 int pid;
2446 int i;
2447
2448 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
2449 if (!file_buf)
2450 return -ENOMEM;
2451
2452 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
2453 if (!buf_comm) {
2454 kfree(file_buf);
2455 return -ENOMEM;
2456 }
2457
2458 buf = file_buf;
2459
2460 for (i = 0; i < SAVED_CMDLINES; i++) {
2461 int r;
2462
2463 pid = map_cmdline_to_pid[i];
2464 if (pid == -1 || pid == NO_CMDLINE_MAP)
2465 continue;
2466
2467 trace_find_cmdline(pid, buf_comm);
2468 r = sprintf(buf, "%d %s\n", pid, buf_comm);
2469 buf += r;
2470 len += r;
2471 }
2472
2473 len = simple_read_from_buffer(ubuf, cnt, ppos,
2474 file_buf, len);
2475
2476 kfree(file_buf);
2477 kfree(buf_comm);
2478
2479 return len;
2480}
2481
2482static const struct file_operations tracing_saved_cmdlines_fops = {
2483 .open = tracing_open_generic,
2484 .read = tracing_saved_cmdlines_read,
2485};
2486
2487static ssize_t
2400tracing_ctrl_read(struct file *filp, char __user *ubuf, 2488tracing_ctrl_read(struct file *filp, char __user *ubuf,
2401 size_t cnt, loff_t *ppos) 2489 size_t cnt, loff_t *ppos)
2402{ 2490{
@@ -2728,6 +2816,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2728 /* trace pipe does not show start of buffer */ 2816 /* trace pipe does not show start of buffer */
2729 cpumask_setall(iter->started); 2817 cpumask_setall(iter->started);
2730 2818
2819 if (trace_flags & TRACE_ITER_LATENCY_FMT)
2820 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2821
2731 iter->cpu_file = cpu_file; 2822 iter->cpu_file = cpu_file;
2732 iter->tr = &global_trace; 2823 iter->tr = &global_trace;
2733 mutex_init(&iter->mutex); 2824 mutex_init(&iter->mutex);
@@ -2915,6 +3006,7 @@ waitagain:
2915 offsetof(struct trace_iterator, seq)); 3006 offsetof(struct trace_iterator, seq));
2916 iter->pos = -1; 3007 iter->pos = -1;
2917 3008
3009 trace_event_read_lock();
2918 while (find_next_entry_inc(iter) != NULL) { 3010 while (find_next_entry_inc(iter) != NULL) {
2919 enum print_line_t ret; 3011 enum print_line_t ret;
2920 int len = iter->seq.len; 3012 int len = iter->seq.len;
@@ -2931,6 +3023,7 @@ waitagain:
2931 if (iter->seq.len >= cnt) 3023 if (iter->seq.len >= cnt)
2932 break; 3024 break;
2933 } 3025 }
3026 trace_event_read_unlock();
2934 3027
2935 /* Now copy what we have to the user */ 3028 /* Now copy what we have to the user */
2936 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 3029 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -2993,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
2993 break; 3086 break;
2994 } 3087 }
2995 3088
2996 trace_consume(iter); 3089 if (ret != TRACE_TYPE_NO_CONSUME)
3090 trace_consume(iter);
2997 rem -= count; 3091 rem -= count;
2998 if (!find_next_entry_inc(iter)) { 3092 if (!find_next_entry_inc(iter)) {
2999 rem = 0; 3093 rem = 0;
@@ -3053,6 +3147,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3053 goto out_err; 3147 goto out_err;
3054 } 3148 }
3055 3149
3150 trace_event_read_lock();
3151
3056 /* Fill as many pages as possible. */ 3152 /* Fill as many pages as possible. */
3057 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3153 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3058 pages[i] = alloc_page(GFP_KERNEL); 3154 pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3171,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3075 trace_seq_init(&iter->seq); 3171 trace_seq_init(&iter->seq);
3076 } 3172 }
3077 3173
3174 trace_event_read_unlock();
3078 mutex_unlock(&iter->mutex); 3175 mutex_unlock(&iter->mutex);
3079 3176
3080 spd.nr_pages = i; 3177 spd.nr_pages = i;
@@ -3425,7 +3522,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3425 .spd_release = buffer_spd_release, 3522 .spd_release = buffer_spd_release,
3426 }; 3523 };
3427 struct buffer_ref *ref; 3524 struct buffer_ref *ref;
3428 int size, i; 3525 int entries, size, i;
3429 size_t ret; 3526 size_t ret;
3430 3527
3431 if (*ppos & (PAGE_SIZE - 1)) { 3528 if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3537,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3440 len &= PAGE_MASK; 3537 len &= PAGE_MASK;
3441 } 3538 }
3442 3539
3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { 3540 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3541
3542 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
3444 struct page *page; 3543 struct page *page;
3445 int r; 3544 int r;
3446 3545
@@ -3457,7 +3556,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3457 } 3556 }
3458 3557
3459 r = ring_buffer_read_page(ref->buffer, &ref->page, 3558 r = ring_buffer_read_page(ref->buffer, &ref->page,
3460 len, info->cpu, 0); 3559 len, info->cpu, 1);
3461 if (r < 0) { 3560 if (r < 0) {
3462 ring_buffer_free_read_page(ref->buffer, 3561 ring_buffer_free_read_page(ref->buffer,
3463 ref->page); 3562 ref->page);
@@ -3481,6 +3580,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3481 spd.partial[i].private = (unsigned long)ref; 3580 spd.partial[i].private = (unsigned long)ref;
3482 spd.nr_pages++; 3581 spd.nr_pages++;
3483 *ppos += PAGE_SIZE; 3582 *ppos += PAGE_SIZE;
3583
3584 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3484 } 3585 }
3485 3586
3486 spd.nr_pages = i; 3587 spd.nr_pages = i;
@@ -3508,6 +3609,45 @@ static const struct file_operations tracing_buffers_fops = {
3508 .llseek = no_llseek, 3609 .llseek = no_llseek,
3509}; 3610};
3510 3611
3612static ssize_t
3613tracing_stats_read(struct file *filp, char __user *ubuf,
3614 size_t count, loff_t *ppos)
3615{
3616 unsigned long cpu = (unsigned long)filp->private_data;
3617 struct trace_array *tr = &global_trace;
3618 struct trace_seq *s;
3619 unsigned long cnt;
3620
3621 s = kmalloc(sizeof(*s), GFP_KERNEL);
3622 if (!s)
3623		return -ENOMEM;
3624
3625 trace_seq_init(s);
3626
3627 cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
3628 trace_seq_printf(s, "entries: %ld\n", cnt);
3629
3630 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
3631 trace_seq_printf(s, "overrun: %ld\n", cnt);
3632
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640
3641 kfree(s);
3642
3643 return count;
3644}
3645
3646static const struct file_operations tracing_stats_fops = {
3647 .open = tracing_open_generic,
3648 .read = tracing_stats_read,
3649};
3650
3511#ifdef CONFIG_DYNAMIC_FTRACE 3651#ifdef CONFIG_DYNAMIC_FTRACE
3512 3652
3513int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3653int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3737,7 @@ struct dentry *tracing_dentry_percpu(void)
3597static void tracing_init_debugfs_percpu(long cpu) 3737static void tracing_init_debugfs_percpu(long cpu)
3598{ 3738{
3599 struct dentry *d_percpu = tracing_dentry_percpu(); 3739 struct dentry *d_percpu = tracing_dentry_percpu();
3600 struct dentry *entry, *d_cpu; 3740 struct dentry *d_cpu;
3601 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3741 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3602 char cpu_dir[7]; 3742 char cpu_dir[7];
3603 3743
@@ -3612,21 +3752,18 @@ static void tracing_init_debugfs_percpu(long cpu)
3612 } 3752 }
3613 3753
3614 /* per cpu trace_pipe */ 3754 /* per cpu trace_pipe */
3615 entry = debugfs_create_file("trace_pipe", 0444, d_cpu, 3755 trace_create_file("trace_pipe", 0444, d_cpu,
3616 (void *) cpu, &tracing_pipe_fops); 3756 (void *) cpu, &tracing_pipe_fops);
3617 if (!entry)
3618 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3619 3757
3620 /* per cpu trace */ 3758 /* per cpu trace */
3621 entry = debugfs_create_file("trace", 0644, d_cpu, 3759 trace_create_file("trace", 0644, d_cpu,
3622 (void *) cpu, &tracing_fops); 3760 (void *) cpu, &tracing_fops);
3623 if (!entry)
3624 pr_warning("Could not create debugfs 'trace' entry\n");
3625 3761
3626 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, 3762 trace_create_file("trace_pipe_raw", 0444, d_cpu,
3627 (void *) cpu, &tracing_buffers_fops); 3763 (void *) cpu, &tracing_buffers_fops);
3628 if (!entry) 3764
3629 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); 3765 trace_create_file("stats", 0444, d_cpu,
3766 (void *) cpu, &tracing_stats_fops);
3630} 3767}
3631 3768
3632#ifdef CONFIG_FTRACE_SELFTEST 3769#ifdef CONFIG_FTRACE_SELFTEST
@@ -3759,17 +3896,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3759 if (ret < 0) 3896 if (ret < 0)
3760 return ret; 3897 return ret;
3761 3898
3762 switch (val) { 3899 if (val != 0 && val != 1)
3763 case 0:
3764 trace_flags &= ~(1 << index);
3765 break;
3766 case 1:
3767 trace_flags |= 1 << index;
3768 break;
3769
3770 default:
3771 return -EINVAL; 3900 return -EINVAL;
3772 } 3901 set_tracer_flags(1 << index, val);
3773 3902
3774 *ppos += cnt; 3903 *ppos += cnt;
3775 3904
@@ -3782,6 +3911,22 @@ static const struct file_operations trace_options_core_fops = {
3782 .write = trace_options_core_write, 3911 .write = trace_options_core_write,
3783}; 3912};
3784 3913
3914struct dentry *trace_create_file(const char *name,
3915 mode_t mode,
3916 struct dentry *parent,
3917 void *data,
3918 const struct file_operations *fops)
3919{
3920 struct dentry *ret;
3921
3922 ret = debugfs_create_file(name, mode, parent, data, fops);
3923 if (!ret)
3924 pr_warning("Could not create debugfs '%s' entry\n", name);
3925
3926 return ret;
3927}
3928
3929
3785static struct dentry *trace_options_init_dentry(void) 3930static struct dentry *trace_options_init_dentry(void)
3786{ 3931{
3787 struct dentry *d_tracer; 3932 struct dentry *d_tracer;
@@ -3809,7 +3954,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
3809 struct tracer_opt *opt) 3954 struct tracer_opt *opt)
3810{ 3955{
3811 struct dentry *t_options; 3956 struct dentry *t_options;
3812 struct dentry *entry;
3813 3957
3814 t_options = trace_options_init_dentry(); 3958 t_options = trace_options_init_dentry();
3815 if (!t_options) 3959 if (!t_options)
@@ -3818,11 +3962,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
3818 topt->flags = flags; 3962 topt->flags = flags;
3819 topt->opt = opt; 3963 topt->opt = opt;
3820 3964
3821 entry = debugfs_create_file(opt->name, 0644, t_options, topt, 3965 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
3822 &trace_options_fops); 3966 &trace_options_fops);
3823 3967
3824 topt->entry = entry;
3825
3826} 3968}
3827 3969
3828static struct trace_option_dentry * 3970static struct trace_option_dentry *
@@ -3877,123 +4019,84 @@ static struct dentry *
3877create_trace_option_core_file(const char *option, long index) 4019create_trace_option_core_file(const char *option, long index)
3878{ 4020{
3879 struct dentry *t_options; 4021 struct dentry *t_options;
3880 struct dentry *entry;
3881 4022
3882 t_options = trace_options_init_dentry(); 4023 t_options = trace_options_init_dentry();
3883 if (!t_options) 4024 if (!t_options)
3884 return NULL; 4025 return NULL;
3885 4026
3886 entry = debugfs_create_file(option, 0644, t_options, (void *)index, 4027 return trace_create_file(option, 0644, t_options, (void *)index,
3887 &trace_options_core_fops); 4028 &trace_options_core_fops);
3888
3889 return entry;
3890} 4029}
3891 4030
3892static __init void create_trace_options_dir(void) 4031static __init void create_trace_options_dir(void)
3893{ 4032{
3894 struct dentry *t_options; 4033 struct dentry *t_options;
3895 struct dentry *entry;
3896 int i; 4034 int i;
3897 4035
3898 t_options = trace_options_init_dentry(); 4036 t_options = trace_options_init_dentry();
3899 if (!t_options) 4037 if (!t_options)
3900 return; 4038 return;
3901 4039
3902 for (i = 0; trace_options[i]; i++) { 4040 for (i = 0; trace_options[i]; i++)
3903 entry = create_trace_option_core_file(trace_options[i], i); 4041 create_trace_option_core_file(trace_options[i], i);
3904 if (!entry)
3905 pr_warning("Could not create debugfs %s entry\n",
3906 trace_options[i]);
3907 }
3908} 4042}
3909 4043
3910static __init int tracer_init_debugfs(void) 4044static __init int tracer_init_debugfs(void)
3911{ 4045{
3912 struct dentry *d_tracer; 4046 struct dentry *d_tracer;
3913 struct dentry *entry;
3914 int cpu; 4047 int cpu;
3915 4048
3916 d_tracer = tracing_init_dentry(); 4049 d_tracer = tracing_init_dentry();
3917 4050
3918 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, 4051 trace_create_file("tracing_enabled", 0644, d_tracer,
3919 &global_trace, &tracing_ctrl_fops); 4052 &global_trace, &tracing_ctrl_fops);
3920 if (!entry)
3921 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
3922 4053
3923 entry = debugfs_create_file("trace_options", 0644, d_tracer, 4054 trace_create_file("trace_options", 0644, d_tracer,
3924 NULL, &tracing_iter_fops); 4055 NULL, &tracing_iter_fops);
3925 if (!entry)
3926 pr_warning("Could not create debugfs 'trace_options' entry\n");
3927 4056
3928 create_trace_options_dir(); 4057 trace_create_file("tracing_cpumask", 0644, d_tracer,
4058 NULL, &tracing_cpumask_fops);
4059
4060 trace_create_file("trace", 0644, d_tracer,
4061 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
4062
4063 trace_create_file("available_tracers", 0444, d_tracer,
4064 &global_trace, &show_traces_fops);
4065
4066 trace_create_file("current_tracer", 0644, d_tracer,
4067 &global_trace, &set_tracer_fops);
3929 4068
3930 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 4069 trace_create_file("tracing_max_latency", 0644, d_tracer,
3931 NULL, &tracing_cpumask_fops); 4070 &tracing_max_latency, &tracing_max_lat_fops);
3932 if (!entry) 4071
3933 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 4072 trace_create_file("tracing_thresh", 0644, d_tracer,
3934 4073 &tracing_thresh, &tracing_max_lat_fops);
3935 entry = debugfs_create_file("trace", 0644, d_tracer, 4074
3936 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); 4075 trace_create_file("README", 0444, d_tracer,
3937 if (!entry) 4076 NULL, &tracing_readme_fops);
3938 pr_warning("Could not create debugfs 'trace' entry\n"); 4077
3939 4078 trace_create_file("trace_pipe", 0444, d_tracer,
3940 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
3941 &global_trace, &show_traces_fops);
3942 if (!entry)
3943 pr_warning("Could not create debugfs 'available_tracers' entry\n");
3944
3945 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
3946 &global_trace, &set_tracer_fops);
3947 if (!entry)
3948 pr_warning("Could not create debugfs 'current_tracer' entry\n");
3949
3950 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
3951 &tracing_max_latency,
3952 &tracing_max_lat_fops);
3953 if (!entry)
3954 pr_warning("Could not create debugfs "
3955 "'tracing_max_latency' entry\n");
3956
3957 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
3958 &tracing_thresh, &tracing_max_lat_fops);
3959 if (!entry)
3960 pr_warning("Could not create debugfs "
3961 "'tracing_thresh' entry\n");
3962 entry = debugfs_create_file("README", 0644, d_tracer,
3963 NULL, &tracing_readme_fops);
3964 if (!entry)
3965 pr_warning("Could not create debugfs 'README' entry\n");
3966
3967 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3968 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4079 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3969 if (!entry) 4080
3970 pr_warning("Could not create debugfs " 4081 trace_create_file("buffer_size_kb", 0644, d_tracer,
3971 "'trace_pipe' entry\n"); 4082 &global_trace, &tracing_entries_fops);
3972 4083
3973 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, 4084 trace_create_file("trace_marker", 0220, d_tracer,
3974 &global_trace, &tracing_entries_fops); 4085 NULL, &tracing_mark_fops);
3975 if (!entry) 4086
3976 pr_warning("Could not create debugfs " 4087 trace_create_file("saved_cmdlines", 0444, d_tracer,
3977 "'buffer_size_kb' entry\n"); 4088 NULL, &tracing_saved_cmdlines_fops);
3978
3979 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
3980 NULL, &tracing_mark_fops);
3981 if (!entry)
3982 pr_warning("Could not create debugfs "
3983 "'trace_marker' entry\n");
3984 4089
3985#ifdef CONFIG_DYNAMIC_FTRACE 4090#ifdef CONFIG_DYNAMIC_FTRACE
3986 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4091 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
3987 &ftrace_update_tot_cnt, 4092 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
3988 &tracing_dyn_info_fops);
3989 if (!entry)
3990 pr_warning("Could not create debugfs "
3991 "'dyn_ftrace_total_info' entry\n");
3992#endif 4093#endif
3993#ifdef CONFIG_SYSPROF_TRACER 4094#ifdef CONFIG_SYSPROF_TRACER
3994 init_tracer_sysprof_debugfs(d_tracer); 4095 init_tracer_sysprof_debugfs(d_tracer);
3995#endif 4096#endif
3996 4097
4098 create_trace_options_dir();
4099
3997 for_each_tracing_cpu(cpu) 4100 for_each_tracing_cpu(cpu)
3998 tracing_init_debugfs_percpu(cpu); 4101 tracing_init_debugfs_percpu(cpu);
3999 4102
@@ -4064,7 +4167,8 @@ trace_printk_seq(struct trace_seq *s)
4064 4167
4065static void __ftrace_dump(bool disable_tracing) 4168static void __ftrace_dump(bool disable_tracing)
4066{ 4169{
4067 static DEFINE_SPINLOCK(ftrace_dump_lock); 4170 static raw_spinlock_t ftrace_dump_lock =
4171 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
4068 /* use static because iter can be a bit big for the stack */ 4172 /* use static because iter can be a bit big for the stack */
4069 static struct trace_iterator iter; 4173 static struct trace_iterator iter;
4070 unsigned int old_userobj; 4174 unsigned int old_userobj;
@@ -4073,7 +4177,8 @@ static void __ftrace_dump(bool disable_tracing)
4073 int cnt = 0, cpu; 4177 int cnt = 0, cpu;
4074 4178
4075 /* only one dump */ 4179 /* only one dump */
4076 spin_lock_irqsave(&ftrace_dump_lock, flags); 4180 local_irq_save(flags);
4181 __raw_spin_lock(&ftrace_dump_lock);
4077 if (dump_ran) 4182 if (dump_ran)
4078 goto out; 4183 goto out;
4079 4184
@@ -4122,8 +4227,11 @@ static void __ftrace_dump(bool disable_tracing)
4122 iter.pos = -1; 4227 iter.pos = -1;
4123 4228
4124 if (find_next_entry_inc(&iter) != NULL) { 4229 if (find_next_entry_inc(&iter) != NULL) {
4125 print_trace_line(&iter); 4230 int ret;
4126 trace_consume(&iter); 4231
4232 ret = print_trace_line(&iter);
4233 if (ret != TRACE_TYPE_NO_CONSUME)
4234 trace_consume(&iter);
4127 } 4235 }
4128 4236
4129 trace_printk_seq(&iter.seq); 4237 trace_printk_seq(&iter.seq);
@@ -4145,7 +4253,8 @@ static void __ftrace_dump(bool disable_tracing)
4145 } 4253 }
4146 4254
4147 out: 4255 out:
4148 spin_unlock_irqrestore(&ftrace_dump_lock, flags); 4256 __raw_spin_unlock(&ftrace_dump_lock);
4257 local_irq_restore(flags);
4149} 4258}
4150 4259
4151/* By default: disable tracing after the dump */ 4260/* By default: disable tracing after the dump */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e685ac2b2ba1..8b9f4f6e9559 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,9 +9,12 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h> 12#include <linux/kmemtrace.h>
13#include <trace/power.h> 13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h>
17
15enum trace_type { 18enum trace_type {
16 __TRACE_FIRST_TYPE = 0, 19 __TRACE_FIRST_TYPE = 0,
17 20
@@ -42,20 +45,6 @@ enum trace_type {
42}; 45};
43 46
44/* 47/*
45 * The trace entry - the most basic unit of tracing. This is what
46 * is printed in the end as a single line in the trace output, such as:
47 *
48 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
49 */
50struct trace_entry {
51 unsigned char type;
52 unsigned char flags;
53 unsigned char preempt_count;
54 int pid;
55 int tgid;
56};
57
58/*
59 * Function trace entry - function address and parent function address: 48 * Function trace entry - function address and parent function address:
60 */ 49 */
61struct ftrace_entry { 50struct ftrace_entry {
@@ -263,8 +252,6 @@ struct trace_array_cpu {
263 char comm[TASK_COMM_LEN]; 252 char comm[TASK_COMM_LEN];
264}; 253};
265 254
266struct trace_iterator;
267
268/* 255/*
269 * The trace array - an array of per-CPU trace arrays. This is the 256 * The trace array - an array of per-CPU trace arrays. This is the
270 * highest level data structure that individual tracers deal with. 257 * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
339 __ftrace_bad_type(); \ 326 __ftrace_bad_type(); \
340 } while (0) 327 } while (0)
341 328
342/* Return values for print_line callback */
343enum print_line_t {
344 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
345 TRACE_TYPE_HANDLED = 1,
346 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
347 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
348};
349
350
351/* 329/*
352 * An option specific to a tracer. This is a boolean value. 330 * An option specific to a tracer. This is a boolean value.
353 * The bit is the bit index that sets its value on the 331 * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
423 struct tracer_stat *stats; 401 struct tracer_stat *stats;
424}; 402};
425 403
426struct trace_seq {
427 unsigned char buffer[PAGE_SIZE];
428 unsigned int len;
429 unsigned int readpos;
430};
431
432static inline void
433trace_seq_init(struct trace_seq *s)
434{
435 s->len = 0;
436 s->readpos = 0;
437}
438
439 404
440#define TRACE_PIPE_ALL_CPU -1 405#define TRACE_PIPE_ALL_CPU -1
441 406
442/*
443 * Trace iterator - used by printout routines who present trace
444 * results to users and which routines might sleep, etc:
445 */
446struct trace_iterator {
447 struct trace_array *tr;
448 struct tracer *trace;
449 void *private;
450 int cpu_file;
451 struct mutex mutex;
452 struct ring_buffer_iter *buffer_iter[NR_CPUS];
453
454 /* The below is zeroed out in pipe_read */
455 struct trace_seq seq;
456 struct trace_entry *ent;
457 int cpu;
458 u64 ts;
459
460 unsigned long iter_flags;
461 loff_t pos;
462 long idx;
463
464 cpumask_var_t started;
465};
466
467int tracer_init(struct tracer *t, struct trace_array *tr); 407int tracer_init(struct tracer *t, struct trace_array *tr);
468int tracing_is_enabled(void); 408int tracing_is_enabled(void);
469void trace_wake_up(void); 409void trace_wake_up(void);
470void tracing_reset(struct trace_array *tr, int cpu); 410void tracing_reset(struct trace_array *tr, int cpu);
471void tracing_reset_online_cpus(struct trace_array *tr); 411void tracing_reset_online_cpus(struct trace_array *tr);
412void tracing_reset_current(int cpu);
413void tracing_reset_current_online_cpus(void);
472int tracing_open_generic(struct inode *inode, struct file *filp); 414int tracing_open_generic(struct inode *inode, struct file *filp);
415struct dentry *trace_create_file(const char *name,
416 mode_t mode,
417 struct dentry *parent,
418 void *data,
419 const struct file_operations *fops);
420
473struct dentry *tracing_init_dentry(void); 421struct dentry *tracing_init_dentry(void);
474void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 422void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
475 423
476struct ring_buffer_event; 424struct ring_buffer_event;
477 425
478struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
479 unsigned char type, 427 int type,
480 unsigned long len, 428 unsigned long len,
481 unsigned long flags, 429 unsigned long flags,
482 int pc); 430 int pc);
@@ -484,24 +432,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
484 struct ring_buffer_event *event, 432 struct ring_buffer_event *event,
485 unsigned long flags, int pc); 433 unsigned long flags, int pc);
486 434
487struct ring_buffer_event *
488trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
489 unsigned long flags, int pc);
490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
494
495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 435struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
496 struct trace_array_cpu *data); 436 struct trace_array_cpu *data);
497 437
498struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
499 int *ent_cpu, u64 *ent_ts); 439 int *ent_cpu, u64 *ent_ts);
500 440
501void tracing_generic_entry_update(struct trace_entry *entry,
502 unsigned long flags,
503 int pc);
504
505void default_wait_pipe(struct trace_iterator *iter); 441void default_wait_pipe(struct trace_iterator *iter);
506void poll_wait_pipe(struct trace_iterator *iter); 442void poll_wait_pipe(struct trace_iterator *iter);
507 443
@@ -514,7 +450,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
514 struct task_struct *prev, 450 struct task_struct *prev,
515 struct task_struct *next, 451 struct task_struct *next,
516 unsigned long flags, int pc); 452 unsigned long flags, int pc);
517void tracing_record_cmdline(struct task_struct *tsk);
518 453
519void tracing_sched_wakeup_trace(struct trace_array *tr, 454void tracing_sched_wakeup_trace(struct trace_array *tr,
520 struct task_struct *wakee, 455 struct task_struct *wakee,
@@ -599,6 +534,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
599 struct trace_array *tr); 534 struct trace_array *tr);
600extern int trace_selftest_startup_branch(struct tracer *trace, 535extern int trace_selftest_startup_branch(struct tracer *trace,
601 struct trace_array *tr); 536 struct trace_array *tr);
537extern int trace_selftest_startup_hw_branches(struct tracer *trace,
538 struct trace_array *tr);
602#endif /* CONFIG_FTRACE_STARTUP_TEST */ 539#endif /* CONFIG_FTRACE_STARTUP_TEST */
603 540
604extern void *head_page(struct trace_array_cpu *data); 541extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +550,8 @@ extern unsigned long trace_flags;
613/* Standard output formatting function used for function return traces */ 550/* Standard output formatting function used for function return traces */
614#ifdef CONFIG_FUNCTION_GRAPH_TRACER 551#ifdef CONFIG_FUNCTION_GRAPH_TRACER
615extern enum print_line_t print_graph_function(struct trace_iterator *iter); 552extern enum print_line_t print_graph_function(struct trace_iterator *iter);
553extern enum print_line_t
554trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
616 555
617#ifdef CONFIG_DYNAMIC_FTRACE 556#ifdef CONFIG_DYNAMIC_FTRACE
618/* TODO: make this variable */ 557/* TODO: make this variable */
@@ -644,7 +583,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
644 return 1; 583 return 1;
645} 584}
646#endif /* CONFIG_DYNAMIC_FTRACE */ 585#endif /* CONFIG_DYNAMIC_FTRACE */
647
648#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 586#else /* CONFIG_FUNCTION_GRAPH_TRACER */
649static inline enum print_line_t 587static inline enum print_line_t
650print_graph_function(struct trace_iterator *iter) 588print_graph_function(struct trace_iterator *iter)
@@ -655,6 +593,7 @@ print_graph_function(struct trace_iterator *iter)
655 593
656extern struct pid *ftrace_pid_trace; 594extern struct pid *ftrace_pid_trace;
657 595
596#ifdef CONFIG_FUNCTION_TRACER
658static inline int ftrace_trace_task(struct task_struct *task) 597static inline int ftrace_trace_task(struct task_struct *task)
659{ 598{
660 if (!ftrace_pid_trace) 599 if (!ftrace_pid_trace)
@@ -662,6 +601,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
662 601
663 return test_tsk_trace_trace(task); 602 return test_tsk_trace_trace(task);
664} 603}
604#else
605static inline int ftrace_trace_task(struct task_struct *task)
606{
607 return 1;
608}
609#endif
665 610
666/* 611/*
667 * trace_iterator_flags is an enumeration that defines bit 612 * trace_iterator_flags is an enumeration that defines bit
@@ -692,6 +637,7 @@ enum trace_iterator_flags {
692 TRACE_ITER_LATENCY_FMT = 0x40000, 637 TRACE_ITER_LATENCY_FMT = 0x40000,
693 TRACE_ITER_GLOBAL_CLK = 0x80000, 638 TRACE_ITER_GLOBAL_CLK = 0x80000,
694 TRACE_ITER_SLEEP_TIME = 0x100000, 639 TRACE_ITER_SLEEP_TIME = 0x100000,
640 TRACE_ITER_GRAPH_TIME = 0x200000,
695}; 641};
696 642
697/* 643/*
@@ -790,103 +736,113 @@ struct ftrace_event_field {
790 char *type; 736 char *type;
791 int offset; 737 int offset;
792 int size; 738 int size;
739 int is_signed;
793}; 740};
794 741
795struct ftrace_event_call { 742struct event_filter {
796 char *name; 743 int n_preds;
797 char *system;
798 struct dentry *dir;
799 int enabled;
800 int (*regfunc)(void);
801 void (*unregfunc)(void);
802 int id;
803 int (*raw_init)(void);
804 int (*show_format)(struct trace_seq *s);
805 int (*define_fields)(void);
806 struct list_head fields;
807 struct filter_pred **preds; 744 struct filter_pred **preds;
808 745 char *filter_string;
809#ifdef CONFIG_EVENT_PROFILE
810 atomic_t profile_count;
811 int (*profile_enable)(struct ftrace_event_call *);
812 void (*profile_disable)(struct ftrace_event_call *);
813#endif
814}; 746};
815 747
816struct event_subsystem { 748struct event_subsystem {
817 struct list_head list; 749 struct list_head list;
818 const char *name; 750 const char *name;
819 struct dentry *entry; 751 struct dentry *entry;
820 struct filter_pred **preds; 752 void *filter;
821}; 753};
822 754
823#define events_for_each(event) \
824 for (event = __start_ftrace_events; \
825 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
826 event++)
827
828#define MAX_FILTER_PRED 8
829
830struct filter_pred; 755struct filter_pred;
831 756
832typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); 757typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
758 int val1, int val2);
833 759
834struct filter_pred { 760struct filter_pred {
835 filter_pred_fn_t fn; 761 filter_pred_fn_t fn;
836 u64 val; 762 u64 val;
837 char *str_val; 763 char str_val[MAX_FILTER_STR_VAL];
838 int str_len; 764 int str_len;
839 char *field_name; 765 char *field_name;
840 int offset; 766 int offset;
841 int not; 767 int not;
842 int or; 768 int op;
843 int compound; 769 int pop_n;
844 int clear;
845}; 770};
846 771
847int trace_define_field(struct ftrace_event_call *call, char *type, 772extern void print_event_filter(struct ftrace_event_call *call,
848 char *name, int offset, int size);
849extern void filter_free_pred(struct filter_pred *pred);
850extern void filter_print_preds(struct filter_pred **preds,
851 struct trace_seq *s); 773 struct trace_seq *s);
852extern int filter_parse(char **pbuf, struct filter_pred *pred); 774extern int apply_event_filter(struct ftrace_event_call *call,
853extern int filter_add_pred(struct ftrace_event_call *call, 775 char *filter_string);
854 struct filter_pred *pred); 776extern int apply_subsystem_event_filter(struct event_subsystem *system,
855extern void filter_free_preds(struct ftrace_event_call *call); 777 char *filter_string);
856extern int filter_match_preds(struct ftrace_event_call *call, void *rec); 778extern void print_subsystem_event_filter(struct event_subsystem *system,
857extern void filter_free_subsystem_preds(struct event_subsystem *system); 779 struct trace_seq *s);
858extern int filter_add_subsystem_pred(struct event_subsystem *system, 780
859 struct filter_pred *pred); 781static inline int
860 782filter_check_discard(struct ftrace_event_call *call, void *rec,
861void event_trace_printk(unsigned long ip, const char *fmt, ...); 783 struct ring_buffer *buffer,
862extern struct ftrace_event_call __start_ftrace_events[]; 784 struct ring_buffer_event *event)
863extern struct ftrace_event_call __stop_ftrace_events[]; 785{
864 786 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
865#define for_each_event(event) \ 787 ring_buffer_discard_commit(buffer, event);
866 for (event = __start_ftrace_events; \ 788 return 1;
867 (unsigned long)event < (unsigned long)__stop_ftrace_events; \ 789 }
868 event++) 790
791 return 0;
792}
793
794#define DEFINE_COMPARISON_PRED(type) \
795static int filter_pred_##type(struct filter_pred *pred, void *event, \
796 int val1, int val2) \
797{ \
798 type *addr = (type *)(event + pred->offset); \
799 type val = (type)pred->val; \
800 int match = 0; \
801 \
802 switch (pred->op) { \
803 case OP_LT: \
804 match = (*addr < val); \
805 break; \
806 case OP_LE: \
807 match = (*addr <= val); \
808 break; \
809 case OP_GT: \
810 match = (*addr > val); \
811 break; \
812 case OP_GE: \
813 match = (*addr >= val); \
814 break; \
815 default: \
816 break; \
817 } \
818 \
819 return match; \
820}
821
822#define DEFINE_EQUALITY_PRED(size) \
823static int filter_pred_##size(struct filter_pred *pred, void *event, \
824 int val1, int val2) \
825{ \
826 u##size *addr = (u##size *)(event + pred->offset); \
827 u##size val = (u##size)pred->val; \
828 int match; \
829 \
830 match = (val == *addr) ^ pred->not; \
831 \
832 return match; \
833}
834
835extern struct mutex event_mutex;
836extern struct list_head ftrace_events;
869 837
870extern const char *__start___trace_bprintk_fmt[]; 838extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 839extern const char *__stop___trace_bprintk_fmt[];
872 840
873/* 841#undef TRACE_EVENT_FORMAT
874 * The double __builtin_constant_p is because gcc will give us an error 842#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
875 * if we try to allocate the static variable to fmt if it is not a 843 extern struct ftrace_event_call event_##call;
876 * constant. Even with the outer if statement optimizing out. 844#undef TRACE_EVENT_FORMAT_NOFILTER
877 */ 845#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
878#define event_trace_printk(ip, fmt, args...) \ 846#include "trace_event_types.h"
879do { \
880 __trace_printk_check_format(fmt, ##args); \
881 tracing_record_cmdline(current); \
882 if (__builtin_constant_p(fmt)) { \
883 static const char *trace_printk_fmt \
884 __attribute__((section("__trace_printk_fmt"))) = \
885 __builtin_constant_p(fmt) ? fmt : NULL; \
886 \
887 __trace_bprintk(ip, trace_printk_fmt, ##args); \
888 } else \
889 __trace_printk(ip, fmt, ##args); \
890} while (0)
891 847
892#endif /* _LINUX_KERNEL_TRACE_H */ 848#endif /* _LINUX_KERNEL_TRACE_H */
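
The DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED() helpers added above stamp out one comparison routine per field type, so the filter core can attach a type-appropriate callback to each predicate instead of switching on the field type for every record, and the new filter_check_discard() inline lets a tracer drop an already-reserved ring-buffer event when the active filter rejects it. Below is a minimal userspace sketch of the same macro pattern; the sample struct, field names and toy operator encoding are invented for illustration and are not part of the kernel code.

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct sample_event {               /* toy record standing in for a trace entry */
        int32_t  pid;
        uint64_t duration;
    };

    struct pred {                       /* cut-down predicate: field offset, value, operator */
        size_t   offset;
        uint64_t val;
        int      op;                    /* toy encoding: 0 '==', 1 '<', 2 '>' */
    };

    /* Generate one typed comparison routine per field type, as the kernel macro does. */
    #define DEFINE_CMP_PRED(type)                                   \
    static int pred_##type(struct pred *p, void *event)             \
    {                                                               \
        type *addr = (type *)((char *)event + p->offset);           \
        type val = (type)p->val;                                    \
                                                                    \
        switch (p->op) {                                            \
        case 0: return *addr == val;                                \
        case 1: return *addr <  val;                                \
        case 2: return *addr >  val;                                \
        }                                                           \
        return 0;                                                   \
    }

    DEFINE_CMP_PRED(int32_t)
    DEFINE_CMP_PRED(uint64_t)

    int main(void)
    {
        struct sample_event ev = { .pid = 42, .duration = 1500 };
        struct pred gt = { offsetof(struct sample_event, duration), 1000, 2 };
        struct pred eq = { offsetof(struct sample_event, pid), 42, 0 };

        /* "duration > 1000" and "pid == 42", each through its generated routine */
        printf("duration: %d, pid: %d\n",
               pred_uint64_t(&gt, &ev), pred_int32_t(&eq, &ev));
        return 0;
    }

Generating the routines once per type keeps the hot match path to a single indirect call per predicate.
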
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c3642..a29ef23ffb47 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/time.h>
12 13
13#include "trace.h" 14#include "trace.h"
14#include "trace_output.h" 15#include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
67 trace_assign_type(field, entry); 68 trace_assign_type(field, entry);
68 call = &field->boot_call; 69 call = &field->boot_call;
69 ts = iter->ts; 70 ts = iter->ts;
70 nsec_rem = do_div(ts, 1000000000); 71 nsec_rem = do_div(ts, NSEC_PER_SEC);
71 72
72 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 73 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
73 (unsigned long)ts, nsec_rem, call->func, call->caller); 74 (unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
92 trace_assign_type(field, entry); 93 trace_assign_type(field, entry);
93 init_ret = &field->boot_ret; 94 init_ret = &field->boot_ret;
94 ts = iter->ts; 95 ts = iter->ts;
95 nsec_rem = do_div(ts, 1000000000); 96 nsec_rem = do_div(ts, NSEC_PER_SEC);
96 97
97 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 98 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
98 "returned %d after %llu msecs\n", 99 "returned %d after %llu msecs\n",
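
The trace_boot.c hunks only replace the open-coded 1000000000 with NSEC_PER_SEC; the arithmetic is unchanged: do_div() splits the nanosecond timestamp into whole seconds, left in ts, and the sub-second remainder printed through the %09ld conversion. A userspace equivalent of that split, purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        uint64_t ts = 5123456789ULL;              /* 5.123456789 s expressed in ns */
        uint64_t nsec_rem = ts % NSEC_PER_SEC;    /* what do_div() hands back */

        ts /= NSEC_PER_SEC;                       /* do_div() leaves the quotient in ts */
        printf("[%5lu.%09lu]\n", (unsigned long)ts, (unsigned long)nsec_rem);
        return 0;
    }
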
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8333715e4066..7a7a9fd249a9 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
30static void 30static void
31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) 31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch;
33 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
34 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
35 struct trace_branch *entry; 36 struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
73 entry->line = f->line; 74 entry->line = f->line;
74 entry->correct = val == expect; 75 entry->correct = val == expect;
75 76
76 ring_buffer_unlock_commit(tr->buffer, event); 77 if (!filter_check_discard(call, entry, tr->buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event);
77 79
78 out: 80 out:
79 atomic_dec(&tr->data[cpu]->disabled); 81 atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
271 return 0; 273 return 0;
272} 274}
273 275
274static void *annotated_branch_stat_start(void) 276static void *annotated_branch_stat_start(struct tracer_stat *trace)
275{ 277{
276 return __start_annotated_branch_profile; 278 return __start_annotated_branch_profile;
277} 279}
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
346 return 0; 348 return 0;
347} 349}
348 350
349static void *all_branch_stat_start(void) 351static void *all_branch_stat_start(struct tracer_stat *trace)
350{ 352{
351 return __start_branch_profile; 353 return __start_branch_profile;
352} 354}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba9970776..11ba5bb4ed0a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,22 +10,30 @@
10int ftrace_profile_enable(int event_id) 10int ftrace_profile_enable(int event_id)
11{ 11{
12 struct ftrace_event_call *event; 12 struct ftrace_event_call *event;
13 int ret = -EINVAL;
13 14
14 for_each_event(event) { 15 mutex_lock(&event_mutex);
15 if (event->id == event_id) 16 list_for_each_entry(event, &ftrace_events, list) {
16 return event->profile_enable(event); 17 if (event->id == event_id && event->profile_enable) {
18 ret = event->profile_enable(event);
19 break;
20 }
17 } 21 }
22 mutex_unlock(&event_mutex);
18 23
19 return -EINVAL; 24 return ret;
20} 25}
21 26
22void ftrace_profile_disable(int event_id) 27void ftrace_profile_disable(int event_id)
23{ 28{
24 struct ftrace_event_call *event; 29 struct ftrace_event_call *event;
25 30
26 for_each_event(event) { 31 mutex_lock(&event_mutex);
27 if (event->id == event_id) 32 list_for_each_entry(event, &ftrace_events, list) {
28 return event->profile_disable(event); 33 if (event->id == event_id) {
34 event->profile_disable(event);
35 break;
36 }
29 } 37 }
38 mutex_unlock(&event_mutex);
30} 39}
31
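
ftrace_profile_enable() and ftrace_profile_disable() now walk the ftrace_events list under event_mutex instead of indexing the old static section, and enable() only succeeds when an event with the requested id also provides a profile_enable() hook. The same lock, walk, match-and-break shape as a standalone sketch; the hand-rolled singly linked list and the pthread mutex merely stand in for list_head and event_mutex and are not kernel interfaces.

    #include <stdio.h>
    #include <errno.h>
    #include <pthread.h>

    struct event {
        int id;
        int enabled;
        struct event *next;
    };

    static struct event *events;                    /* head of the registered-event list */
    static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Enable the event with a matching id; -EINVAL when none is registered. */
    static int profile_enable(int event_id)
    {
        struct event *ev;
        int ret = -EINVAL;

        pthread_mutex_lock(&event_mutex);
        for (ev = events; ev; ev = ev->next) {
            if (ev->id == event_id) {
                ev->enabled = 1;
                ret = 0;
                break;
            }
        }
        pthread_mutex_unlock(&event_mutex);

        return ret;
    }

    int main(void)
    {
        struct event e = { .id = 7, .enabled = 0, .next = NULL };

        events = &e;
        printf("enable 7: %d, enable 9: %d\n", profile_enable(7), profile_enable(9));
        return 0;
    }
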
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd7..6db005e12487 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore, 26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT( 27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func) 28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
29 TRACE_FIELD(int, ret.depth, depth) 32 TRACE_FIELD(int, ret.depth, depth)
30 ), 33 ),
31 TP_RAW_FMT("<-- %lx (%d)") 34 TP_RAW_FMT("<-- %lx (%d)")
@@ -57,7 +60,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") 60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58); 61);
59 62
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, 63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT( 64 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1) 65 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2) 66 TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +125,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, 125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT( 126 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line) 127 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) 128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
126 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) 129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
127 TRACE_FIELD(char, correct, correct) 132 TRACE_FIELD(char, correct, correct)
128 ), 133 ),
129 TP_RAW_FMT("%u:%s:%s (%u)") 134 TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +144,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
139 144
140TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, 145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
141 TRACE_STRUCT( 146 TRACE_STRUCT(
142 TRACE_FIELD(ktime_t, state_data.stamp, stamp) 147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
143 TRACE_FIELD(ktime_t, state_data.end, end) 148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
144 TRACE_FIELD(int, state_data.type, type) 149 TRACE_FIELD(int, state_data.type, type)
145 TRACE_FIELD(int, state_data.state, state) 150 TRACE_FIELD(int, state_data.state, state)
146 ), 151 ),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 576f4fa2af0d..e75276a49cf5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,19 +8,25 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/workqueue.h>
12#include <linux/spinlock.h>
13#include <linux/kthread.h>
11#include <linux/debugfs.h> 14#include <linux/debugfs.h>
12#include <linux/uaccess.h> 15#include <linux/uaccess.h>
13#include <linux/module.h> 16#include <linux/module.h>
14#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h>
15 19
16#include "trace_output.h" 20#include "trace_output.h"
17 21
18#define TRACE_SYSTEM "TRACE_SYSTEM" 22#define TRACE_SYSTEM "TRACE_SYSTEM"
19 23
20static DEFINE_MUTEX(event_mutex); 24DEFINE_MUTEX(event_mutex);
25
26LIST_HEAD(ftrace_events);
21 27
22int trace_define_field(struct ftrace_event_call *call, char *type, 28int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size) 29 char *name, int offset, int size, int is_signed)
24{ 30{
25 struct ftrace_event_field *field; 31 struct ftrace_event_field *field;
26 32
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
38 44
39 field->offset = offset; 45 field->offset = offset;
40 field->size = size; 46 field->size = size;
47 field->is_signed = is_signed;
41 list_add(&field->link, &call->fields); 48 list_add(&field->link, &call->fields);
42 49
43 return 0; 50 return 0;
@@ -51,47 +58,94 @@ err:
51 58
52 return -ENOMEM; 59 return -ENOMEM;
53} 60}
61EXPORT_SYMBOL_GPL(trace_define_field);
54 62
55static void ftrace_clear_events(void) 63#ifdef CONFIG_MODULES
56{
57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
58
59 64
60 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { 65static void trace_destroy_fields(struct ftrace_event_call *call)
66{
67 struct ftrace_event_field *field, *next;
61 68
62 if (call->enabled) { 69 list_for_each_entry_safe(field, next, &call->fields, link) {
63 call->enabled = 0; 70 list_del(&field->link);
64 call->unregfunc(); 71 kfree(field->type);
65 } 72 kfree(field->name);
66 call++; 73 kfree(field);
67 } 74 }
68} 75}
69 76
77#endif /* CONFIG_MODULES */
78
70static void ftrace_event_enable_disable(struct ftrace_event_call *call, 79static void ftrace_event_enable_disable(struct ftrace_event_call *call,
71 int enable) 80 int enable)
72{ 81{
73
74 switch (enable) { 82 switch (enable) {
75 case 0: 83 case 0:
76 if (call->enabled) { 84 if (call->enabled) {
77 call->enabled = 0; 85 call->enabled = 0;
86 tracing_stop_cmdline_record();
78 call->unregfunc(); 87 call->unregfunc();
79 } 88 }
80 break; 89 break;
81 case 1: 90 case 1:
82 if (!call->enabled) { 91 if (!call->enabled) {
83 call->enabled = 1; 92 call->enabled = 1;
93 tracing_start_cmdline_record();
84 call->regfunc(); 94 call->regfunc();
85 } 95 }
86 break; 96 break;
87 } 97 }
88} 98}
89 99
100static void ftrace_clear_events(void)
101{
102 struct ftrace_event_call *call;
103
104 mutex_lock(&event_mutex);
105 list_for_each_entry(call, &ftrace_events, list) {
106 ftrace_event_enable_disable(call, 0);
107 }
108 mutex_unlock(&event_mutex);
109}
110
111/*
112 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
113 */
114static int __ftrace_set_clr_event(const char *match, const char *sub,
115 const char *event, int set)
116{
117 struct ftrace_event_call *call;
118 int ret = -EINVAL;
119
120 mutex_lock(&event_mutex);
121 list_for_each_entry(call, &ftrace_events, list) {
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142
143 return ret;
144}
145
90static int ftrace_set_clr_event(char *buf, int set) 146static int ftrace_set_clr_event(char *buf, int set)
91{ 147{
92 struct ftrace_event_call *call = __start_ftrace_events;
93 char *event = NULL, *sub = NULL, *match; 148 char *event = NULL, *sub = NULL, *match;
94 int ret = -EINVAL;
95 149
96 /* 150 /*
97 * The buf format can be <subsystem>:<event-name> 151 * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
117 event = NULL; 171 event = NULL;
118 } 172 }
119 173
120 mutex_lock(&event_mutex); 174 return __ftrace_set_clr_event(match, sub, event, set);
121 for_each_event(call) { 175}
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142 176
143 return ret; 177/**
178 * trace_set_clr_event - enable or disable an event
179 * @system: system name to match (NULL for any system)
180 * @event: event name to match (NULL for all events, within system)
181 * @set: 1 to enable, 0 to disable
182 *
183 * This is a way for other parts of the kernel to enable or disable
184 * event recording.
185 *
186 * Returns 0 on success, -EINVAL if the parameters do not match any
187 * registered events.
188 */
189int trace_set_clr_event(const char *system, const char *event, int set)
190{
191 return __ftrace_set_clr_event(NULL, system, event, set);
144} 192}
145 193
146/* 128 should be much more than enough */ 194/* 128 should be much more than enough */
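
ftrace_set_clr_event() is reduced to splitting the written buffer into the match, sub and event strings and delegating to the new __ftrace_set_clr_event(), where a bare match word is compared against both the event name and the subsystem name while sub and event constrain each one individually; the exported trace_set_clr_event() gives the rest of the kernel the same interface. A rough standalone sketch of one way to do that <subsystem>:<event-name> split; it is not the kernel's parser, and the sample name is just a placeholder.

    #include <stdio.h>
    #include <string.h>

    /*
     * Toy splitter for the "<subsystem>:<event-name>" convention:
     *   "sub:event" -> sub + event
     *   "sub:"      -> subsystem only
     *   ":event"    -> event only
     *   "word"      -> a bare match word
     */
    static void split_event_spec(char *buf, const char **match,
                                 const char **sub, const char **event)
    {
        char *colon = strchr(buf, ':');

        *match = *sub = *event = NULL;

        if (!colon) {
            *match = buf;                 /* bare word: match name or system */
            return;
        }

        *colon = '\0';
        if (*buf)
            *sub = buf;                   /* text before ':' names the subsystem */
        if (*(colon + 1))
            *event = colon + 1;           /* text after ':' names the event */
    }

    int main(void)
    {
        char spec[] = "sched:sched_switch";
        const char *match, *sub, *event;

        split_event_spec(spec, &match, &sub, &event);
        printf("match=%s sub=%s event=%s\n",
               match ? match : "-", sub ? sub : "-", event ? event : "-");
        return 0;
    }
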
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
224static void * 272static void *
225t_next(struct seq_file *m, void *v, loff_t *pos) 273t_next(struct seq_file *m, void *v, loff_t *pos)
226{ 274{
227 struct ftrace_event_call *call = m->private; 275 struct list_head *list = m->private;
228 struct ftrace_event_call *next = call; 276 struct ftrace_event_call *call;
229 277
230 (*pos)++; 278 (*pos)++;
231 279
232 for (;;) { 280 for (;;) {
233 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 281 if (list == &ftrace_events)
234 return NULL; 282 return NULL;
235 283
284 call = list_entry(list, struct ftrace_event_call, list);
285
236 /* 286 /*
237 * The ftrace subsystem is for showing formats only. 287 * The ftrace subsystem is for showing formats only.
238 * They can not be enabled or disabled via the event files. 288 * They can not be enabled or disabled via the event files.
@@ -240,46 +290,68 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
240 if (call->regfunc) 290 if (call->regfunc)
241 break; 291 break;
242 292
243 call++; 293 list = list->next;
244 next = call;
245 } 294 }
246 295
247 m->private = ++next; 296 m->private = list->next;
248 297
249 return call; 298 return call;
250} 299}
251 300
252static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
253{ 302{
254 return t_next(m, NULL, pos); 303 struct ftrace_event_call *call = NULL;
304 loff_t l;
305
306 mutex_lock(&event_mutex);
307
308 m->private = ftrace_events.next;
309 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l);
311 if (!call)
312 break;
313 }
314 return call;
255} 315}
256 316
257static void * 317static void *
258s_next(struct seq_file *m, void *v, loff_t *pos) 318s_next(struct seq_file *m, void *v, loff_t *pos)
259{ 319{
260 struct ftrace_event_call *call = m->private; 320 struct list_head *list = m->private;
261 struct ftrace_event_call *next; 321 struct ftrace_event_call *call;
262 322
263 (*pos)++; 323 (*pos)++;
264 324
265 retry: 325 retry:
266 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 326 if (list == &ftrace_events)
267 return NULL; 327 return NULL;
268 328
329 call = list_entry(list, struct ftrace_event_call, list);
330
269 if (!call->enabled) { 331 if (!call->enabled) {
270 call++; 332 list = list->next;
271 goto retry; 333 goto retry;
272 } 334 }
273 335
274 next = call; 336 m->private = list->next;
275 m->private = ++next;
276 337
277 return call; 338 return call;
278} 339}
279 340
280static void *s_start(struct seq_file *m, loff_t *pos) 341static void *s_start(struct seq_file *m, loff_t *pos)
281{ 342{
282 return s_next(m, NULL, pos); 343 struct ftrace_event_call *call = NULL;
344 loff_t l;
345
346 mutex_lock(&event_mutex);
347
348 m->private = ftrace_events.next;
349 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l);
351 if (!call)
352 break;
353 }
354 return call;
283} 355}
284 356
285static int t_show(struct seq_file *m, void *v) 357static int t_show(struct seq_file *m, void *v)
@@ -295,26 +367,20 @@ static int t_show(struct seq_file *m, void *v)
295 367
296static void t_stop(struct seq_file *m, void *p) 368static void t_stop(struct seq_file *m, void *p)
297{ 369{
370 mutex_unlock(&event_mutex);
298} 371}
299 372
300static int 373static int
301ftrace_event_seq_open(struct inode *inode, struct file *file) 374ftrace_event_seq_open(struct inode *inode, struct file *file)
302{ 375{
303 int ret;
304 const struct seq_operations *seq_ops; 376 const struct seq_operations *seq_ops;
305 377
306 if ((file->f_mode & FMODE_WRITE) && 378 if ((file->f_mode & FMODE_WRITE) &&
307 !(file->f_flags & O_APPEND)) 379 (file->f_flags & O_TRUNC))
308 ftrace_clear_events(); 380 ftrace_clear_events();
309 381
310 seq_ops = inode->i_private; 382 seq_ops = inode->i_private;
311 ret = seq_open(file, seq_ops); 383 return seq_open(file, seq_ops);
312 if (!ret) {
313 struct seq_file *m = file->private_data;
314
315 m->private = __start_ftrace_events;
316 }
317 return ret;
318} 384}
319 385
320static ssize_t 386static ssize_t
@@ -374,8 +440,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
374 return cnt; 440 return cnt;
375} 441}
376 442
443static ssize_t
444system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
445 loff_t *ppos)
446{
447 const char set_to_char[4] = { '?', '0', '1', 'X' };
448 const char *system = filp->private_data;
449 struct ftrace_event_call *call;
450 char buf[2];
451 int set = 0;
452 int ret;
453
454 mutex_lock(&event_mutex);
455 list_for_each_entry(call, &ftrace_events, list) {
456 if (!call->name || !call->regfunc)
457 continue;
458
459 if (system && strcmp(call->system, system) != 0)
460 continue;
461
462 /*
463 * We need to find out if all the events are set
464 * or if all events or cleared, or if we have
465 * a mixture.
466 */
467 set |= (1 << !!call->enabled);
468
469 /*
470 * If we have a mixture, no need to look further.
471 */
472 if (set == 3)
473 break;
474 }
475 mutex_unlock(&event_mutex);
476
477 buf[0] = set_to_char[set];
478 buf[1] = '\n';
479
480 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
481
482 return ret;
483}
484
485static ssize_t
486system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
487 loff_t *ppos)
488{
489 const char *system = filp->private_data;
490 unsigned long val;
491 char buf[64];
492 ssize_t ret;
493
494 if (cnt >= sizeof(buf))
495 return -EINVAL;
496
497 if (copy_from_user(&buf, ubuf, cnt))
498 return -EFAULT;
499
500 buf[cnt] = 0;
501
502 ret = strict_strtoul(buf, 10, &val);
503 if (ret < 0)
504 return ret;
505
506 ret = tracing_update_buffers();
507 if (ret < 0)
508 return ret;
509
510 if (val != 0 && val != 1)
511 return -EINVAL;
512
513 ret = __ftrace_set_clr_event(NULL, system, NULL, val);
514 if (ret)
515 goto out;
516
517 ret = cnt;
518
519out:
520 *ppos += cnt;
521
522 return ret;
523}
524
525extern char *__bad_type_size(void);
526
377#undef FIELD 527#undef FIELD
378#define FIELD(type, name) \ 528#define FIELD(type, name) \
529 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
379 #type, "common_" #name, offsetof(typeof(field), name), \ 530 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name) 531 sizeof(field.name)
381 532
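
In the system_enable_read() added above, the per-event state is folded into two bits: set |= 1 << !!call->enabled raises bit 0 when a disabled event is seen and bit 1 when an enabled one is seen, so the final value indexes straight into { '?', '0', '1', 'X' }, 'X' reporting a mixture, and the walk stops early once both bits are set. The trick in isolation, with toy data:

    #include <stdio.h>

    int main(void)
    {
        const char set_to_char[4] = { '?', '0', '1', 'X' };
        int enabled[] = { 1, 0, 1 };              /* per-event enabled flags (toy data) */
        int set = 0;
        unsigned int i;

        for (i = 0; i < sizeof(enabled) / sizeof(enabled[0]); i++) {
            set |= 1 << !!enabled[i];             /* bit 0: saw disabled, bit 1: saw enabled */
            if (set == 3)                         /* both seen: mixture, no need to look further */
                break;
        }

        printf("%c\n", set_to_char[set]);         /* prints 'X' for this mix */
        return 0;
    }
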
@@ -391,7 +542,7 @@ static int trace_write_header(struct trace_seq *s)
391 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 542 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
392 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 543 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
393 "\n", 544 "\n",
394 FIELD(unsigned char, type), 545 FIELD(unsigned short, type),
395 FIELD(unsigned char, flags), 546 FIELD(unsigned char, flags),
396 FIELD(unsigned char, preempt_count), 547 FIELD(unsigned char, preempt_count),
397 FIELD(int, pid), 548 FIELD(int, pid),
@@ -481,7 +632,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
481 632
482 trace_seq_init(s); 633 trace_seq_init(s);
483 634
484 filter_print_preds(call->preds, s); 635 print_event_filter(call, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 636 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486 637
487 kfree(s); 638 kfree(s);
@@ -494,38 +645,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos) 645 loff_t *ppos)
495{ 646{
496 struct ftrace_event_call *call = filp->private_data; 647 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf; 648 char *buf;
498 struct filter_pred *pred;
499 int err; 649 int err;
500 650
501 if (cnt >= sizeof(buf)) 651 if (cnt >= PAGE_SIZE)
502 return -EINVAL; 652 return -EINVAL;
503 653
504 if (copy_from_user(&buf, ubuf, cnt)) 654 buf = (char *)__get_free_page(GFP_TEMPORARY);
505 return -EFAULT; 655 if (!buf)
506 buf[cnt] = '\0';
507
508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
509 if (!pred)
510 return -ENOMEM; 656 return -ENOMEM;
511 657
512 err = filter_parse(&pbuf, pred); 658 if (copy_from_user(buf, ubuf, cnt)) {
513 if (err < 0) { 659 free_page((unsigned long) buf);
514 filter_free_pred(pred); 660 return -EFAULT;
515 return err;
516 }
517
518 if (pred->clear) {
519 filter_free_preds(call);
520 filter_free_pred(pred);
521 return cnt;
522 } 661 }
662 buf[cnt] = '\0';
523 663
524 err = filter_add_pred(call, pred); 664 err = apply_event_filter(call, buf);
525 if (err < 0) { 665 free_page((unsigned long) buf);
526 filter_free_pred(pred); 666 if (err < 0)
527 return err; 667 return err;
528 }
529 668
530 *ppos += cnt; 669 *ppos += cnt;
531 670
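
event_filter_write() now takes up to a page of text, copies it from userspace, NUL-terminates it and hands the whole string to apply_event_filter(), which parses expressions built from the operators listed later in trace_events_filter.c (==, !=, <, <=, >, >=, && and ||); the old one-predicate-per-write parsing is gone. A hedged userspace sketch of setting such a filter; the debugfs path and the field names are assumptions about a typical tracing setup, not something defined by this patch.

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* Path and fields are assumptions (typical debugfs layout); adjust for the local system. */
        const char *path = "/sys/kernel/debug/tracing/events/sched/sched_switch/filter";
        const char *expr = "prev_pid == 0 || next_pid == 0";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, expr, strlen(expr)) < 0)    /* one write; the kernel parses the whole string */
            perror("write");
        close(fd);
        return 0;
    }
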
@@ -549,7 +688,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
549 688
550 trace_seq_init(s); 689 trace_seq_init(s);
551 690
552 filter_print_preds(system->preds, s); 691 print_subsystem_event_filter(system, s);
553 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 692 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
554 693
555 kfree(s); 694 kfree(s);
@@ -562,45 +701,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
562 loff_t *ppos) 701 loff_t *ppos)
563{ 702{
564 struct event_subsystem *system = filp->private_data; 703 struct event_subsystem *system = filp->private_data;
565 char buf[64], *pbuf = buf; 704 char *buf;
566 struct filter_pred *pred;
567 int err; 705 int err;
568 706
569 if (cnt >= sizeof(buf)) 707 if (cnt >= PAGE_SIZE)
570 return -EINVAL; 708 return -EINVAL;
571 709
572 if (copy_from_user(&buf, ubuf, cnt)) 710 buf = (char *)__get_free_page(GFP_TEMPORARY);
573 return -EFAULT; 711 if (!buf)
574 buf[cnt] = '\0';
575
576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
577 if (!pred)
578 return -ENOMEM; 712 return -ENOMEM;
579 713
580 err = filter_parse(&pbuf, pred); 714 if (copy_from_user(buf, ubuf, cnt)) {
581 if (err < 0) { 715 free_page((unsigned long) buf);
582 filter_free_pred(pred); 716 return -EFAULT;
583 return err;
584 }
585
586 if (pred->clear) {
587 filter_free_subsystem_preds(system);
588 filter_free_pred(pred);
589 return cnt;
590 } 717 }
718 buf[cnt] = '\0';
591 719
592 err = filter_add_subsystem_pred(system, pred); 720 err = apply_subsystem_event_filter(system, buf);
593 if (err < 0) { 721 free_page((unsigned long) buf);
594 filter_free_subsystem_preds(system); 722 if (err < 0)
595 filter_free_pred(pred);
596 return err; 723 return err;
597 }
598 724
599 *ppos += cnt; 725 *ppos += cnt;
600 726
601 return cnt; 727 return cnt;
602} 728}
603 729
730static ssize_t
731show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
732{
733 int (*func)(struct trace_seq *s) = filp->private_data;
734 struct trace_seq *s;
735 int r;
736
737 if (*ppos)
738 return 0;
739
740 s = kmalloc(sizeof(*s), GFP_KERNEL);
741 if (!s)
742 return -ENOMEM;
743
744 trace_seq_init(s);
745
746 func(s);
747 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
748
749 kfree(s);
750
751 return r;
752}
753
604static const struct seq_operations show_event_seq_ops = { 754static const struct seq_operations show_event_seq_ops = {
605 .start = t_start, 755 .start = t_start,
606 .next = t_next, 756 .next = t_next,
@@ -658,6 +808,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
658 .write = subsystem_filter_write, 808 .write = subsystem_filter_write,
659}; 809};
660 810
811static const struct file_operations ftrace_system_enable_fops = {
812 .open = tracing_open_generic,
813 .read = system_enable_read,
814 .write = system_enable_write,
815};
816
817static const struct file_operations ftrace_show_header_fops = {
818 .open = tracing_open_generic,
819 .read = show_header,
820};
821
661static struct dentry *event_trace_events_dir(void) 822static struct dentry *event_trace_events_dir(void)
662{ 823{
663 static struct dentry *d_tracer; 824 static struct dentry *d_tracer;
@@ -684,6 +845,7 @@ static struct dentry *
684event_subsystem_dir(const char *name, struct dentry *d_events) 845event_subsystem_dir(const char *name, struct dentry *d_events)
685{ 846{
686 struct event_subsystem *system; 847 struct event_subsystem *system;
848 struct dentry *entry;
687 849
688 /* First see if we did not already create this dir */ 850 /* First see if we did not already create this dir */
689 list_for_each_entry(system, &event_subsystems, list) { 851 list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +869,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
707 return d_events; 869 return d_events;
708 } 870 }
709 871
710 system->name = name; 872 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) {
874 debugfs_remove(system->entry);
875 kfree(system);
876 return d_events;
877 }
878
711 list_add(&system->list, &event_subsystems); 879 list_add(&system->list, &event_subsystems);
712 880
713 system->preds = NULL; 881 system->filter = NULL;
882
883 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
884 if (!system->filter) {
885 pr_warning("Could not allocate filter for subsystem "
886 "'%s'\n", name);
887 return system->entry;
888 }
889
890 entry = debugfs_create_file("filter", 0644, system->entry, system,
891 &ftrace_subsystem_filter_fops);
892 if (!entry) {
893 kfree(system->filter);
894 system->filter = NULL;
895 pr_warning("Could not create debugfs "
896 "'%s/filter' entry\n", name);
897 }
898
899 entry = trace_create_file("enable", 0644, system->entry,
900 (void *)system->name,
901 &ftrace_system_enable_fops);
714 902
715 return system->entry; 903 return system->entry;
716} 904}
717 905
718static int 906static int
719event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) 907event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
908 const struct file_operations *id,
909 const struct file_operations *enable,
910 const struct file_operations *filter,
911 const struct file_operations *format)
720{ 912{
721 struct dentry *entry; 913 struct dentry *entry;
722 int ret; 914 int ret;
@@ -725,7 +917,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
725 * If the trace point header did not define TRACE_SYSTEM 917 * If the trace point header did not define TRACE_SYSTEM
726 * then the system would be called "TRACE_SYSTEM". 918 * then the system would be called "TRACE_SYSTEM".
727 */ 919 */
728 if (strcmp(call->system, "TRACE_SYSTEM") != 0) 920 if (strcmp(call->system, TRACE_SYSTEM) != 0)
729 d_events = event_subsystem_dir(call->system, d_events); 921 d_events = event_subsystem_dir(call->system, d_events);
730 922
731 if (call->raw_init) { 923 if (call->raw_init) {
@@ -744,21 +936,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
744 return -1; 936 return -1;
745 } 937 }
746 938
747 if (call->regfunc) { 939 if (call->regfunc)
748 entry = debugfs_create_file("enable", 0644, call->dir, call, 940 entry = trace_create_file("enable", 0644, call->dir, call,
749 &ftrace_enable_fops); 941 enable);
750 if (!entry)
751 pr_warning("Could not create debugfs "
752 "'%s/enable' entry\n", call->name);
753 }
754 942
755 if (call->id) { 943 if (call->id && call->profile_enable)
756 entry = debugfs_create_file("id", 0444, call->dir, call, 944 entry = trace_create_file("id", 0444, call->dir, call,
757 &ftrace_event_id_fops); 945 id);
758 if (!entry)
759 pr_warning("Could not create debugfs '%s/id' entry\n",
760 call->name);
761 }
762 946
763 if (call->define_fields) { 947 if (call->define_fields) {
764 ret = call->define_fields(); 948 ret = call->define_fields();
@@ -767,32 +951,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
767 " events/%s\n", call->name); 951 " events/%s\n", call->name);
768 return ret; 952 return ret;
769 } 953 }
770 entry = debugfs_create_file("filter", 0644, call->dir, call, 954 entry = trace_create_file("filter", 0644, call->dir, call,
771 &ftrace_event_filter_fops); 955 filter);
772 if (!entry)
773 pr_warning("Could not create debugfs "
774 "'%s/filter' entry\n", call->name);
775 } 956 }
776 957
777 /* A trace may not want to export its format */ 958 /* A trace may not want to export its format */
778 if (!call->show_format) 959 if (!call->show_format)
779 return 0; 960 return 0;
780 961
781 entry = debugfs_create_file("format", 0444, call->dir, call, 962 entry = trace_create_file("format", 0444, call->dir, call,
782 &ftrace_event_format_fops); 963 format);
783 if (!entry)
784 pr_warning("Could not create debugfs "
785 "'%s/format' entry\n", call->name);
786 964
787 return 0; 965 return 0;
788} 966}
789 967
968#define for_each_event(event, start, end) \
969 for (event = start; \
970 (unsigned long)event < (unsigned long)end; \
971 event++)
972
973#ifdef CONFIG_MODULES
974
975static LIST_HEAD(ftrace_module_file_list);
976
977/*
978 * Modules must own their file_operations to keep up with
979 * reference counting.
980 */
981struct ftrace_module_file_ops {
982 struct list_head list;
983 struct module *mod;
984 struct file_operations id;
985 struct file_operations enable;
986 struct file_operations format;
987 struct file_operations filter;
988};
989
990static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod)
992{
993 struct ftrace_module_file_ops *file_ops;
994
995 /*
996 * This is a bit of a PITA. To allow for correct reference
997 * counting, modules must "own" their file_operations.
998 * To do this, we allocate the file operations that will be
999 * used in the event directory.
1000 */
1001
1002 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
1003 if (!file_ops)
1004 return NULL;
1005
1006 file_ops->mod = mod;
1007
1008 file_ops->id = ftrace_event_id_fops;
1009 file_ops->id.owner = mod;
1010
1011 file_ops->enable = ftrace_enable_fops;
1012 file_ops->enable.owner = mod;
1013
1014 file_ops->filter = ftrace_event_filter_fops;
1015 file_ops->filter.owner = mod;
1016
1017 file_ops->format = ftrace_event_format_fops;
1018 file_ops->format.owner = mod;
1019
1020 list_add(&file_ops->list, &ftrace_module_file_list);
1021
1022 return file_ops;
1023}
1024
1025static void trace_module_add_events(struct module *mod)
1026{
1027 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events;
1030
1031 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events;
1033
1034 if (start == end)
1035 return;
1036
1037 d_events = event_trace_events_dir();
1038 if (!d_events)
1039 return;
1040
1041 for_each_event(call, start, end) {
1042 /* The linker may leave blanks */
1043 if (!call->name)
1044 continue;
1045
1046 /*
1047 * This module has events, create file ops for this module
1048 * if not already done.
1049 */
1050 if (!file_ops) {
1051 file_ops = trace_create_file_ops(mod);
1052 if (!file_ops)
1053 return;
1054 }
1055 call->mod = mod;
1056 list_add(&call->list, &ftrace_events);
1057 event_create_dir(call, d_events,
1058 &file_ops->id, &file_ops->enable,
1059 &file_ops->filter, &file_ops->format);
1060 }
1061}
1062
1063static void trace_module_remove_events(struct module *mod)
1064{
1065 struct ftrace_module_file_ops *file_ops;
1066 struct ftrace_event_call *call, *p;
1067 bool found = false;
1068
1069 down_write(&trace_event_mutex);
1070 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1071 if (call->mod == mod) {
1072 found = true;
1073 ftrace_event_enable_disable(call, 0);
1074 if (call->event)
1075 __unregister_ftrace_event(call->event);
1076 debugfs_remove_recursive(call->dir);
1077 list_del(&call->list);
1078 trace_destroy_fields(call);
1079 destroy_preds(call);
1080 }
1081 }
1082
1083 /* Now free the file_operations */
1084 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1085 if (file_ops->mod == mod)
1086 break;
1087 }
1088 if (&file_ops->list != &ftrace_module_file_list) {
1089 list_del(&file_ops->list);
1090 kfree(file_ops);
1091 }
1092
1093 /*
1094 * It is safest to reset the ring buffer if the module being unloaded
1095 * registered any events.
1096 */
1097 if (found)
1098 tracing_reset_current_online_cpus();
1099 up_write(&trace_event_mutex);
1100}
1101
1102static int trace_module_notify(struct notifier_block *self,
1103 unsigned long val, void *data)
1104{
1105 struct module *mod = data;
1106
1107 mutex_lock(&event_mutex);
1108 switch (val) {
1109 case MODULE_STATE_COMING:
1110 trace_module_add_events(mod);
1111 break;
1112 case MODULE_STATE_GOING:
1113 trace_module_remove_events(mod);
1114 break;
1115 }
1116 mutex_unlock(&event_mutex);
1117
1118 return 0;
1119}
1120#else
1121static int trace_module_notify(struct notifier_block *self,
1122 unsigned long val, void *data)
1123{
1124 return 0;
1125}
1126#endif /* CONFIG_MODULES */
1127
1128struct notifier_block trace_module_nb = {
1129 .notifier_call = trace_module_notify,
1130 .priority = 0,
1131};
1132
1133extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[];
1135
790static __init int event_trace_init(void) 1136static __init int event_trace_init(void)
791{ 1137{
792 struct ftrace_event_call *call = __start_ftrace_events; 1138 struct ftrace_event_call *call;
793 struct dentry *d_tracer; 1139 struct dentry *d_tracer;
794 struct dentry *entry; 1140 struct dentry *entry;
795 struct dentry *d_events; 1141 struct dentry *d_events;
1142 int ret;
796 1143
797 d_tracer = tracing_init_dentry(); 1144 d_tracer = tracing_init_dentry();
798 if (!d_tracer) 1145 if (!d_tracer)
@@ -816,13 +1163,243 @@ static __init int event_trace_init(void)
816 if (!d_events) 1163 if (!d_events)
817 return 0; 1164 return 0;
818 1165
819 for_each_event(call) { 1166 /* ring buffer internal formats */
1167 trace_create_file("header_page", 0444, d_events,
1168 ring_buffer_print_page_header,
1169 &ftrace_show_header_fops);
1170
1171 trace_create_file("header_event", 0444, d_events,
1172 ring_buffer_print_entry_header,
1173 &ftrace_show_header_fops);
1174
1175 trace_create_file("enable", 0644, d_events,
1176 NULL, &ftrace_system_enable_fops);
1177
1178 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
820 /* The linker may leave blanks */ 1179 /* The linker may leave blanks */
821 if (!call->name) 1180 if (!call->name)
822 continue; 1181 continue;
823 event_create_dir(call, d_events); 1182 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops);
824 } 1186 }
825 1187
1188 ret = register_module_notifier(&trace_module_nb);
1189 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n");
1191
826 return 0; 1192 return 0;
827} 1193}
828fs_initcall(event_trace_init); 1194fs_initcall(event_trace_init);
1195
1196#ifdef CONFIG_FTRACE_STARTUP_TEST
1197
1198static DEFINE_SPINLOCK(test_spinlock);
1199static DEFINE_SPINLOCK(test_spinlock_irq);
1200static DEFINE_MUTEX(test_mutex);
1201
1202static __init void test_work(struct work_struct *dummy)
1203{
1204 spin_lock(&test_spinlock);
1205 spin_lock_irq(&test_spinlock_irq);
1206 udelay(1);
1207 spin_unlock_irq(&test_spinlock_irq);
1208 spin_unlock(&test_spinlock);
1209
1210 mutex_lock(&test_mutex);
1211 msleep(1);
1212 mutex_unlock(&test_mutex);
1213}
1214
1215static __init int event_test_thread(void *unused)
1216{
1217 void *test_malloc;
1218
1219 test_malloc = kmalloc(1234, GFP_KERNEL);
1220 if (!test_malloc)
1221 pr_info("failed to kmalloc\n");
1222
1223 schedule_on_each_cpu(test_work);
1224
1225 kfree(test_malloc);
1226
1227 set_current_state(TASK_INTERRUPTIBLE);
1228 while (!kthread_should_stop())
1229 schedule();
1230
1231 return 0;
1232}
1233
1234/*
1235 * Do various things that may trigger events.
1236 */
1237static __init void event_test_stuff(void)
1238{
1239 struct task_struct *test_thread;
1240
1241 test_thread = kthread_run(event_test_thread, NULL, "test-events");
1242 msleep(1);
1243 kthread_stop(test_thread);
1244}
1245
1246/*
1247 * For every trace event defined, we will test each trace point separately,
1248 * and then by groups, and finally all trace points.
1249 */
1250static __init void event_trace_self_tests(void)
1251{
1252 struct ftrace_event_call *call;
1253 struct event_subsystem *system;
1254 int ret;
1255
1256 pr_info("Running tests on trace events:\n");
1257
1258 list_for_each_entry(call, &ftrace_events, list) {
1259
1260 /* Only test those that have a regfunc */
1261 if (!call->regfunc)
1262 continue;
1263
1264 pr_info("Testing event %s: ", call->name);
1265
1266 /*
1267 * If an event is already enabled, someone is using
1268 * it and the self test should not be on.
1269 */
1270 if (call->enabled) {
1271 pr_warning("Enabled event during self test!\n");
1272 WARN_ON_ONCE(1);
1273 continue;
1274 }
1275
1276 ftrace_event_enable_disable(call, 1);
1277 event_test_stuff();
1278 ftrace_event_enable_disable(call, 0);
1279
1280 pr_cont("OK\n");
1281 }
1282
1283 /* Now test at the sub system level */
1284
1285 pr_info("Running tests on trace event systems:\n");
1286
1287 list_for_each_entry(system, &event_subsystems, list) {
1288
1289 /* the ftrace system is special, skip it */
1290 if (strcmp(system->name, "ftrace") == 0)
1291 continue;
1292
1293 pr_info("Testing event system %s: ", system->name);
1294
1295 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
1296 if (WARN_ON_ONCE(ret)) {
1297 pr_warning("error enabling system %s\n",
1298 system->name);
1299 continue;
1300 }
1301
1302 event_test_stuff();
1303
1304 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1305 if (WARN_ON_ONCE(ret))
1306 pr_warning("error disabling system %s\n",
1307 system->name);
1308
1309 pr_cont("OK\n");
1310 }
1311
1312 /* Test with all events enabled */
1313
1314 pr_info("Running tests on all trace events:\n");
1315 pr_info("Testing all events: ");
1316
1317 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
1318 if (WARN_ON_ONCE(ret)) {
1319 pr_warning("error enabling all events\n");
1320 return;
1321 }
1322
1323 event_test_stuff();
1324
1325	/* now disable all the events again */
1326 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
1327 if (WARN_ON_ONCE(ret)) {
1328 pr_warning("error disabling all events\n");
1329 return;
1330 }
1331
1332 pr_cont("OK\n");
1333}
1334
1335#ifdef CONFIG_FUNCTION_TRACER
1336
1337static DEFINE_PER_CPU(atomic_t, test_event_disable);
1338
1339static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{
1342 struct ring_buffer_event *event;
1343 struct ftrace_entry *entry;
1344 unsigned long flags;
1345 long disabled;
1346 int resched;
1347 int cpu;
1348 int pc;
1349
1350 pc = preempt_count();
1351 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
1354
1355 if (disabled != 1)
1356 goto out;
1357
1358 local_save_flags(flags);
1359
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
1361 flags, pc);
1362 if (!event)
1363 goto out;
1364 entry = ring_buffer_event_data(event);
1365 entry->ip = ip;
1366 entry->parent_ip = parent_ip;
1367
1368 trace_nowake_buffer_unlock_commit(event, flags, pc);
1369
1370 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu));
1372 ftrace_preempt_enable(resched);
1373}
1374
1375static struct ftrace_ops trace_ops __initdata =
1376{
1377 .func = function_test_events_call,
1378};
1379
1380static __init void event_trace_self_test_with_function(void)
1381{
1382 register_ftrace_function(&trace_ops);
1383 pr_info("Running tests again, along with the function tracer\n");
1384 event_trace_self_tests();
1385 unregister_ftrace_function(&trace_ops);
1386}
1387#else
1388static __init void event_trace_self_test_with_function(void)
1389{
1390}
1391#endif
1392
1393static __init int event_trace_self_tests_init(void)
1394{
1395
1396 event_trace_self_tests();
1397
1398 event_trace_self_test_with_function();
1399
1400 return 0;
1401}
1402
1403late_initcall(event_trace_self_tests_init);
1404
1405#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e03cbf1e38f3..f32dc9d1ea7b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,119 +22,295 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/mutex.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
28 29
29static int filter_pred_64(struct filter_pred *pred, void *event) 30enum filter_op_ids
30{ 31{
31 u64 *addr = (u64 *)(event + pred->offset); 32 OP_OR,
32 u64 val = (u64)pred->val; 33 OP_AND,
33 int match; 34 OP_NE,
35 OP_EQ,
36 OP_LT,
37 OP_LE,
38 OP_GT,
39 OP_GE,
40 OP_NONE,
41 OP_OPEN_PAREN,
42};
43
44struct filter_op {
45 int id;
46 char *string;
47 int precedence;
48};
49
50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 },
54 { OP_EQ, "==", 4 },
55 { OP_LT, "<", 5 },
56 { OP_LE, "<=", 5 },
57 { OP_GT, ">", 5 },
58 { OP_GE, ">=", 5 },
59 { OP_NONE, "OP_NONE", 0 },
60 { OP_OPEN_PAREN, "(", 0 },
61};
62
63enum {
64 FILT_ERR_NONE,
65 FILT_ERR_INVALID_OP,
66 FILT_ERR_UNBALANCED_PAREN,
67 FILT_ERR_TOO_MANY_OPERANDS,
68 FILT_ERR_OPERAND_TOO_LONG,
69 FILT_ERR_FIELD_NOT_FOUND,
70 FILT_ERR_ILLEGAL_FIELD_OP,
71 FILT_ERR_ILLEGAL_INTVAL,
72 FILT_ERR_BAD_SUBSYS_FILTER,
73 FILT_ERR_TOO_MANY_PREDS,
74 FILT_ERR_MISSING_FIELD,
75 FILT_ERR_INVALID_FILTER,
76};
77
78static char *err_text[] = {
79 "No error",
80 "Invalid operator",
81 "Unbalanced parens",
82 "Too many operands",
83 "Operand too long",
84 "Field not found",
85 "Illegal operation for field type",
86 "Illegal integer value",
87 "Couldn't find or set field in one of a subsystem's events",
88 "Too many terms in predicate expression",
89 "Missing field name and/or value",
90 "Meaningless filter expression",
91};
92
93struct opstack_op {
94 int op;
95 struct list_head list;
96};
97
98struct postfix_elt {
99 int op;
100 char *operand;
101 struct list_head list;
102};
103
104struct filter_parse_state {
105 struct filter_op *ops;
106 struct list_head opstack;
107 struct list_head postfix;
108 int lasterr;
109 int lasterr_pos;
110
111 struct {
112 char *string;
113 unsigned int cnt;
114 unsigned int tail;
115 } infix;
116
117 struct {
118 char string[MAX_FILTER_STR_VAL];
119 int pos;
120 unsigned int tail;
121 } operand;
122};
123
124DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32);
127DEFINE_COMPARISON_PRED(u32);
128DEFINE_COMPARISON_PRED(s16);
129DEFINE_COMPARISON_PRED(u16);
130DEFINE_COMPARISON_PRED(s8);
131DEFINE_COMPARISON_PRED(u8);
132
133DEFINE_EQUALITY_PRED(64);
134DEFINE_EQUALITY_PRED(32);
135DEFINE_EQUALITY_PRED(16);
136DEFINE_EQUALITY_PRED(8);
137
138static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
139 void *event __attribute((unused)),
140 int val1, int val2)
141{
142 return val1 && val2;
143}
144
145static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
146 void *event __attribute((unused)),
147 int val1, int val2)
148{
149 return val1 || val2;
150}
151
152/* Filter predicate for fixed sized arrays of characters */
153static int filter_pred_string(struct filter_pred *pred, void *event,
154 int val1, int val2)
155{
156 char *addr = (char *)(event + pred->offset);
157 int cmp, match;
34 158
35 match = (val == *addr) ^ pred->not; 159 cmp = strncmp(addr, pred->str_val, pred->str_len);
160
161 match = (!cmp) ^ pred->not;
36 162
37 return match; 163 return match;
38} 164}
39 165
40static int filter_pred_32(struct filter_pred *pred, void *event) 166/*
167 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end
169 * of the entry.
170 * Each of these strings also has a field in the entry which
171 * contains its offset from the beginning of the entry.
172 * So we first read that offset field, dereference it,
173 * and add it to the address of the entry; that gives us
174 * the address of the string.
175 */
176static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2)
41{ 178{
42 u32 *addr = (u32 *)(event + pred->offset); 179 unsigned short str_loc = *(unsigned short *)(event + pred->offset);
43 u32 val = (u32)pred->val; 180 char *addr = (char *)(event + str_loc);
44 int match; 181 int cmp, match;
45 182
46 match = (val == *addr) ^ pred->not; 183 cmp = strncmp(addr, pred->str_val, pred->str_len);
184
185 match = (!cmp) ^ pred->not;
47 186
48 return match; 187 return match;
49} 188}
50 189
51static int filter_pred_16(struct filter_pred *pred, void *event) 190static int filter_pred_none(struct filter_pred *pred, void *event,
191 int val1, int val2)
192{
193 return 0;
194}
195
196/* return 1 if event matches, 0 otherwise (discard) */
197int filter_match_preds(struct ftrace_event_call *call, void *rec)
52{ 198{
53 u16 *addr = (u16 *)(event + pred->offset); 199 struct event_filter *filter = call->filter;
54 u16 val = (u16)pred->val; 200 int match, top = 0, val1 = 0, val2 = 0;
55 int match; 201 int stack[MAX_FILTER_PRED];
202 struct filter_pred *pred;
203 int i;
56 204
57 match = (val == *addr) ^ pred->not; 205 for (i = 0; i < filter->n_preds; i++) {
206 pred = filter->preds[i];
207 if (!pred->pop_n) {
208 match = pred->fn(pred, rec, val1, val2);
209 stack[top++] = match;
210 continue;
211 }
212 if (pred->pop_n > top) {
213 WARN_ON_ONCE(1);
214 return 0;
215 }
216 val1 = stack[--top];
217 val2 = stack[--top];
218 match = pred->fn(pred, rec, val1, val2);
219 stack[top++] = match;
220 }
58 221
59 return match; 222 return stack[--top];
60} 223}
224EXPORT_SYMBOL_GPL(filter_match_preds);
61 225
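
The rewritten filter_match_preds() evaluates the predicate array as a postfix program: a leaf predicate (pop_n of zero) pushes its own result, while a logical-operator entry pops two results and pushes their combination, so an infix filter such as (a == 1 || b == 2) && c > 3 is stored with the OR after its two comparisons and the AND last. A compact userspace model of that stack walk; the encoding below is a toy, not the kernel's filter_pred layout.

    #include <stdio.h>

    #define MAX_PREDS 8

    /*
     * Toy postfix predicate program: LEAF entries carry a precomputed truth
     * value, AND/OR entries pop two results and push the combination, the
     * same stack discipline filter_match_preds() uses.
     */
    enum { LEAF, AND, OR };

    struct toy_pred {
        int kind;
        int value;                    /* only meaningful for LEAF */
    };

    static int eval(const struct toy_pred *prog, int n)
    {
        int stack[MAX_PREDS];
        int top = 0, i, a, b;

        for (i = 0; i < n; i++) {
            if (prog[i].kind == LEAF) {
                stack[top++] = prog[i].value;
                continue;
            }
            a = stack[--top];
            b = stack[--top];
            stack[top++] = (prog[i].kind == AND) ? (a && b) : (a || b);
        }
        return stack[--top];
    }

    int main(void)
    {
        /* (0 || 1) && 1, written in postfix order */
        const struct toy_pred prog[] = {
            { LEAF, 0 }, { LEAF, 1 }, { OR, 0 },
            { LEAF, 1 }, { AND, 0 },
        };

        printf("match: %d\n", eval(prog, 5));     /* prints 1 */
        return 0;
    }
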
62static int filter_pred_8(struct filter_pred *pred, void *event) 226static void parse_error(struct filter_parse_state *ps, int err, int pos)
63{ 227{
64 u8 *addr = (u8 *)(event + pred->offset); 228 ps->lasterr = err;
65 u8 val = (u8)pred->val; 229 ps->lasterr_pos = pos;
66 int match; 230}
67 231
68 match = (val == *addr) ^ pred->not; 232static void remove_filter_string(struct event_filter *filter)
233{
234 kfree(filter->filter_string);
235 filter->filter_string = NULL;
236}
69 237
70 return match; 238static int replace_filter_string(struct event_filter *filter,
239 char *filter_string)
240{
241 kfree(filter->filter_string);
242 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
243 if (!filter->filter_string)
244 return -ENOMEM;
245
246 return 0;
71} 247}
72 248
73static int filter_pred_string(struct filter_pred *pred, void *event) 249static int append_filter_string(struct event_filter *filter,
250 char *string)
74{ 251{
75 char *addr = (char *)(event + pred->offset); 252 int newlen;
76 int cmp, match; 253 char *new_filter_string;
77 254
78 cmp = strncmp(addr, pred->str_val, pred->str_len); 255 BUG_ON(!filter->filter_string);
256 newlen = strlen(filter->filter_string) + strlen(string) + 1;
257 new_filter_string = kmalloc(newlen, GFP_KERNEL);
258 if (!new_filter_string)
259 return -ENOMEM;
79 260
80 match = (!cmp) ^ pred->not; 261 strcpy(new_filter_string, filter->filter_string);
262 strcat(new_filter_string, string);
263 kfree(filter->filter_string);
264 filter->filter_string = new_filter_string;
81 265
82 return match; 266 return 0;
83} 267}
84 268
85/* return 1 if event matches, 0 otherwise (discard) */ 269static void append_filter_err(struct filter_parse_state *ps,
86int filter_match_preds(struct ftrace_event_call *call, void *rec) 270 struct event_filter *filter)
87{ 271{
88 int i, matched, and_failed = 0; 272 int pos = ps->lasterr_pos;
89 struct filter_pred *pred; 273 char *buf, *pbuf;
90 274
91 for (i = 0; i < MAX_FILTER_PRED; i++) { 275 buf = (char *)__get_free_page(GFP_TEMPORARY);
92 if (call->preds[i]) { 276 if (!buf)
93 pred = call->preds[i]; 277 return;
94 if (and_failed && !pred->or)
95 continue;
96 matched = pred->fn(pred, rec);
97 if (!matched && !pred->or) {
98 and_failed = 1;
99 continue;
100 } else if (matched && pred->or)
101 return 1;
102 } else
103 break;
104 }
105 278
106 if (and_failed) 279 append_filter_string(filter, "\n");
107 return 0; 280 memset(buf, ' ', PAGE_SIZE);
281 if (pos > PAGE_SIZE - 128)
282 pos = 0;
283 buf[pos] = '^';
284 pbuf = &buf[pos] + 1;
108 285
109 return 1; 286 sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
287 append_filter_string(filter, buf);
288 free_page((unsigned long) buf);
110} 289}
111 290
112void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) 291void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
113{ 292{
114 char *field_name; 293 struct event_filter *filter = call->filter;
115 struct filter_pred *pred;
116 int i;
117 294
118 if (!preds) { 295 mutex_lock(&event_mutex);
296 if (filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else
119 trace_seq_printf(s, "none\n"); 299 trace_seq_printf(s, "none\n");
120 return; 300 mutex_unlock(&event_mutex);
121 } 301}
122 302
123 for (i = 0; i < MAX_FILTER_PRED; i++) { 303void print_subsystem_event_filter(struct event_subsystem *system,
124 if (preds[i]) { 304 struct trace_seq *s)
125 pred = preds[i]; 305{
126 field_name = pred->field_name; 306 struct event_filter *filter = system->filter;
127 if (i) 307
128 trace_seq_printf(s, pred->or ? "|| " : "&& "); 308 mutex_lock(&event_mutex);
129 trace_seq_printf(s, "%s ", field_name); 309 if (filter->filter_string)
130 trace_seq_printf(s, pred->not ? "!= " : "== "); 310 trace_seq_printf(s, "%s\n", filter->filter_string);
131 if (pred->str_val) 311 else
132 trace_seq_printf(s, "%s\n", pred->str_val); 312 trace_seq_printf(s, "none\n");
133 else 313 mutex_unlock(&event_mutex);
134 trace_seq_printf(s, "%llu\n", pred->val);
135 } else
136 break;
137 }
138} 314}
139 315
140static struct ftrace_event_field * 316static struct ftrace_event_field *
@@ -150,284 +326,839 @@ find_event_field(struct ftrace_event_call *call, char *name)
150 return NULL; 326 return NULL;
151} 327}
152 328
153void filter_free_pred(struct filter_pred *pred) 329static void filter_free_pred(struct filter_pred *pred)
154{ 330{
155 if (!pred) 331 if (!pred)
156 return; 332 return;
157 333
158 kfree(pred->field_name); 334 kfree(pred->field_name);
159 kfree(pred->str_val);
160 kfree(pred); 335 kfree(pred);
161} 336}
162 337
163void filter_free_preds(struct ftrace_event_call *call) 338static void filter_clear_pred(struct filter_pred *pred)
164{ 339{
165 int i; 340 kfree(pred->field_name);
341 pred->field_name = NULL;
342 pred->str_len = 0;
343}
166 344
167 if (call->preds) { 345static int filter_set_pred(struct filter_pred *dest,
168 for (i = 0; i < MAX_FILTER_PRED; i++) 346 struct filter_pred *src,
169 filter_free_pred(call->preds[i]); 347 filter_pred_fn_t fn)
170 kfree(call->preds); 348{
171 call->preds = NULL; 349 *dest = *src;
350 if (src->field_name) {
351 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
352 if (!dest->field_name)
353 return -ENOMEM;
172 } 354 }
355 dest->fn = fn;
356
357 return 0;
173} 358}
174 359
175void filter_free_subsystem_preds(struct event_subsystem *system) 360static void filter_disable_preds(struct ftrace_event_call *call)
176{ 361{
177 struct ftrace_event_call *call = __start_ftrace_events; 362 struct event_filter *filter = call->filter;
178 int i; 363 int i;
179 364
180 if (system->preds) { 365 call->filter_active = 0;
181 for (i = 0; i < MAX_FILTER_PRED; i++) 366 filter->n_preds = 0;
182 filter_free_pred(system->preds[i]);
183 kfree(system->preds);
184 system->preds = NULL;
185 }
186 367
187 events_for_each(call) { 368 for (i = 0; i < MAX_FILTER_PRED; i++)
188 if (!call->name || !call->regfunc) 369 filter->preds[i]->fn = filter_pred_none;
189 continue; 370}
371
372void destroy_preds(struct ftrace_event_call *call)
373{
374 struct event_filter *filter = call->filter;
375 int i;
190 376
191 if (!strcmp(call->system, system->name)) 377 for (i = 0; i < MAX_FILTER_PRED; i++) {
192 filter_free_preds(call); 378 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]);
193 } 380 }
381 kfree(filter->preds);
382 kfree(filter->filter_string);
383 kfree(filter);
384 call->filter = NULL;
194} 385}
195 386
196static int __filter_add_pred(struct ftrace_event_call *call, 387int init_preds(struct ftrace_event_call *call)
197 struct filter_pred *pred)
198{ 388{
389 struct event_filter *filter;
390 struct filter_pred *pred;
199 int i; 391 int i;
200 392
201 if (call->preds && !pred->compound) 393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
202 filter_free_preds(call); 394 if (!call->filter)
395 return -ENOMEM;
203 396
204 if (!call->preds) { 397 call->filter_active = 0;
205 call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 398 filter->n_preds = 0;
206 GFP_KERNEL); 399
207 if (!call->preds) 400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
208 return -ENOMEM; 401 if (!filter->preds)
209 } 402 goto oom;
210 403
211 for (i = 0; i < MAX_FILTER_PRED; i++) { 404 for (i = 0; i < MAX_FILTER_PRED; i++) {
212 if (!call->preds[i]) { 405 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
213 call->preds[i] = pred; 406 if (!pred)
214 return 0; 407 goto oom;
408 pred->fn = filter_pred_none;
409 filter->preds[i] = pred;
410 }
411
412 return 0;
413
414oom:
415 destroy_preds(call);
416
417 return -ENOMEM;
418}
419EXPORT_SYMBOL_GPL(init_preds);
420
421static void filter_free_subsystem_preds(struct event_subsystem *system)
422{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call;
425 int i;
426
427 if (filter->n_preds) {
428 for (i = 0; i < filter->n_preds; i++)
429 filter_free_pred(filter->preds[i]);
430 kfree(filter->preds);
431 filter->preds = NULL;
432 filter->n_preds = 0;
433 }
434
435 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields)
437 continue;
438
439 if (!strcmp(call->system, system->name)) {
440 filter_disable_preds(call);
441 remove_filter_string(call->filter);
215 } 442 }
216 } 443 }
444}
445
446static int filter_add_pred_fn(struct filter_parse_state *ps,
447 struct ftrace_event_call *call,
448 struct filter_pred *pred,
449 filter_pred_fn_t fn)
450{
451 struct event_filter *filter = call->filter;
452 int idx, err;
453
454 if (filter->n_preds == MAX_FILTER_PRED) {
455 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
456 return -ENOSPC;
457 }
217 458
218 return -ENOSPC; 459 idx = filter->n_preds;
460 filter_clear_pred(filter->preds[idx]);
461 err = filter_set_pred(filter->preds[idx], pred, fn);
462 if (err)
463 return err;
464
465 filter->n_preds++;
466 call->filter_active = 1;
467
468 return 0;
219} 469}
220 470
471enum {
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
221static int is_string_field(const char *type) 476static int is_string_field(const char *type)
222{ 477{
478 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING;
480
223 if (strchr(type, '[') && strstr(type, "char")) 481 if (strchr(type, '[') && strstr(type, "char"))
224 return 1; 482 return FILTER_STATIC_STRING;
225 483
226 return 0; 484 return 0;
227} 485}
228 486
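The two return values distinguish fixed-size char arrays from __data_loc (dynamically located) string fields; both checks only inspect the type string recorded for the field. A self-contained sketch of the same classification, with example type strings chosen purely for illustration:

#include <stdio.h>
#include <string.h>

enum { NOT_STRING = 0, STATIC_STRING, DYN_STRING };	/* mirrors the FILTER_* values above */

static int classify(const char *type)
{
	if (strstr(type, "__data_loc") && strstr(type, "char"))
		return DYN_STRING;
	if (strchr(type, '[') && strstr(type, "char"))
		return STATIC_STRING;
	return NOT_STRING;
}

int main(void)
{
	printf("%d\n", classify("char comm[16]"));		/* 1: static string  */
	printf("%d\n", classify("__data_loc char[] file"));	/* 2: dynamic string */
	printf("%d\n", classify("unsigned long ip"));		/* 0: not a string   */
	return 0;
}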
229int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) 487static int is_legal_op(struct ftrace_event_field *field, int op)
488{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
490 return 0;
491
492 return 1;
493}
494
495static filter_pred_fn_t select_comparison_fn(int op, int field_size,
496 int field_is_signed)
497{
498 filter_pred_fn_t fn = NULL;
499
500 switch (field_size) {
501 case 8:
502 if (op == OP_EQ || op == OP_NE)
503 fn = filter_pred_64;
504 else if (field_is_signed)
505 fn = filter_pred_s64;
506 else
507 fn = filter_pred_u64;
508 break;
509 case 4:
510 if (op == OP_EQ || op == OP_NE)
511 fn = filter_pred_32;
512 else if (field_is_signed)
513 fn = filter_pred_s32;
514 else
515 fn = filter_pred_u32;
516 break;
517 case 2:
518 if (op == OP_EQ || op == OP_NE)
519 fn = filter_pred_16;
520 else if (field_is_signed)
521 fn = filter_pred_s16;
522 else
523 fn = filter_pred_u16;
524 break;
525 case 1:
526 if (op == OP_EQ || op == OP_NE)
527 fn = filter_pred_8;
528 else if (field_is_signed)
529 fn = filter_pred_s8;
530 else
531 fn = filter_pred_u8;
532 break;
533 }
534
535 return fn;
536}
537
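The filter_pred_{s,u}{8,16,32,64} callbacks selected here are generated earlier in this file and are not part of this hunk, and their real signature differs from what is shown below. As a rough, self-contained sketch of what an unsigned 32-bit comparison predicate does (simplified signature, made-up operator values, illustrative record layout):

#include <stdint.h>
#include <stdio.h>

enum { OP_LT = 1, OP_LE, OP_GT, OP_GE };	/* illustrative values only */

struct sketch_pred {
	int	 op;	 /* comparison operator                    */
	int	 offset; /* byte offset of the field in the record */
	uint64_t val;	 /* constant parsed from the filter string */
};

static int pred_u32(struct sketch_pred *pred, void *event)
{
	uint32_t v = *(uint32_t *)((char *)event + pred->offset);

	switch (pred->op) {
	case OP_LT: return v <  (uint32_t)pred->val;
	case OP_LE: return v <= (uint32_t)pred->val;
	case OP_GT: return v >  (uint32_t)pred->val;
	case OP_GE: return v >= (uint32_t)pred->val;
	}
	return 0;
}

int main(void)
{
	struct { uint32_t pid; uint32_t prio; } rec = { 42, 120 };
	struct sketch_pred p = { OP_LT, 4 /* offset of .prio */, 100 };

	printf("%d\n", pred_u32(&p, &rec));	/* 0: prio (120) is not < 100 */
	return 0;
}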
538static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call,
540 struct filter_pred *pred)
230{ 541{
231 struct ftrace_event_field *field; 542 struct ftrace_event_field *field;
543 filter_pred_fn_t fn;
544 unsigned long long val;
545 int string_type;
546 int ret;
547
548 pred->fn = filter_pred_none;
549
550 if (pred->op == OP_AND) {
551 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and);
553 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or);
556 }
232 557
233 field = find_event_field(call, pred->field_name); 558 field = find_event_field(call, pred->field_name);
234 if (!field) 559 if (!field) {
560 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
235 return -EINVAL; 561 return -EINVAL;
562 }
236 563
237 pred->offset = field->offset; 564 pred->offset = field->offset;
238 565
239 if (is_string_field(field->type)) { 566 if (!is_legal_op(field, pred->op)) {
240 if (!pred->str_val) 567 parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
241 return -EINVAL; 568 return -EINVAL;
242 pred->fn = filter_pred_string; 569 }
570
571 string_type = is_string_field(field->type);
572 if (string_type) {
573 if (string_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string;
575 else
576 fn = filter_pred_strloc;
243 pred->str_len = field->size; 577 pred->str_len = field->size;
244 return __filter_add_pred(call, pred); 578 if (pred->op == OP_NE)
579 pred->not = 1;
580 return filter_add_pred_fn(ps, call, pred, fn);
245 } else { 581 } else {
246 if (pred->str_val) 582 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val);
584 else
585 ret = strict_strtoull(pred->str_val, 0, &val);
586 if (ret) {
587 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
247 return -EINVAL; 588 return -EINVAL;
589 }
590 pred->val = val;
248 } 591 }
249 592
250 switch (field->size) { 593 fn = select_comparison_fn(pred->op, field->size, field->is_signed);
251 case 8: 594 if (!fn) {
252 pred->fn = filter_pred_64; 595 parse_error(ps, FILT_ERR_INVALID_OP, 0);
253 break;
254 case 4:
255 pred->fn = filter_pred_32;
256 break;
257 case 2:
258 pred->fn = filter_pred_16;
259 break;
260 case 1:
261 pred->fn = filter_pred_8;
262 break;
263 default:
264 return -EINVAL; 596 return -EINVAL;
265 } 597 }
266 598
267 return __filter_add_pred(call, pred); 599 if (pred->op == OP_NE)
600 pred->not = 1;
601
602 return filter_add_pred_fn(ps, call, pred, fn);
268} 603}
269 604
270static struct filter_pred *copy_pred(struct filter_pred *pred) 605static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system,
607 struct filter_pred *pred,
608 char *filter_string)
271{ 609{
272 struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); 610 struct event_filter *filter = system->filter;
273 if (!new_pred) 611 struct ftrace_event_call *call;
274 return NULL; 612 int err = 0;
275 613
276 memcpy(new_pred, pred, sizeof(*pred)); 614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
277 617
278 if (pred->field_name) { 618 if (!filter->preds)
279 new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 619 return -ENOMEM;
280 if (!new_pred->field_name) {
281 kfree(new_pred);
282 return NULL;
283 }
284 } 620 }
285 621
286 if (pred->str_val) { 622 if (filter->n_preds == MAX_FILTER_PRED) {
287 new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); 623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
288 if (!new_pred->str_val) { 624 return -ENOSPC;
289 filter_free_pred(new_pred); 625 }
290 return NULL; 626
627 list_for_each_entry(call, &ftrace_events, list) {
628
629 if (!call->define_fields)
630 continue;
631
632 if (strcmp(call->system, system->name))
633 continue;
634
635 err = filter_add_pred(ps, call, pred);
636 if (err) {
637 filter_free_subsystem_preds(system);
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
639 goto out;
291 } 640 }
641 replace_filter_string(call->filter, filter_string);
292 } 642 }
293 643
294 return new_pred; 644 filter->preds[filter->n_preds] = pred;
645 filter->n_preds++;
646out:
647 return err;
295} 648}
296 649
297int filter_add_subsystem_pred(struct event_subsystem *system, 650static void parse_init(struct filter_parse_state *ps,
298 struct filter_pred *pred) 651 struct filter_op *ops,
652 char *infix_string)
299{ 653{
300 struct ftrace_event_call *call = __start_ftrace_events; 654 memset(ps, '\0', sizeof(*ps));
301 struct filter_pred *event_pred;
302 int i;
303 655
304 if (system->preds && !pred->compound) 656 ps->infix.string = infix_string;
305 filter_free_subsystem_preds(system); 657 ps->infix.cnt = strlen(infix_string);
658 ps->ops = ops;
306 659
307 if (!system->preds) { 660 INIT_LIST_HEAD(&ps->opstack);
308 system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 661 INIT_LIST_HEAD(&ps->postfix);
309 GFP_KERNEL); 662}
310 if (!system->preds) 663
311 return -ENOMEM; 664static char infix_next(struct filter_parse_state *ps)
665{
666 ps->infix.cnt--;
667
668 return ps->infix.string[ps->infix.tail++];
669}
670
671static char infix_peek(struct filter_parse_state *ps)
672{
673 if (ps->infix.tail == strlen(ps->infix.string))
674 return 0;
675
676 return ps->infix.string[ps->infix.tail];
677}
678
679static void infix_advance(struct filter_parse_state *ps)
680{
681 ps->infix.cnt--;
682 ps->infix.tail++;
683}
684
685static inline int is_precedence_lower(struct filter_parse_state *ps,
686 int a, int b)
687{
688 return ps->ops[a].precedence < ps->ops[b].precedence;
689}
690
691static inline int is_op_char(struct filter_parse_state *ps, char c)
692{
693 int i;
694
695 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
696 if (ps->ops[i].string[0] == c)
697 return 1;
312 } 698 }
313 699
314 for (i = 0; i < MAX_FILTER_PRED; i++) { 700 return 0;
315 if (!system->preds[i]) { 701}
316 system->preds[i] = pred; 702
317 break; 703static int infix_get_op(struct filter_parse_state *ps, char firstc)
704{
705 char nextc = infix_peek(ps);
706 char opstr[3];
707 int i;
708
709 opstr[0] = firstc;
710 opstr[1] = nextc;
711 opstr[2] = '\0';
712
713 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
714 if (!strcmp(opstr, ps->ops[i].string)) {
715 infix_advance(ps);
716 return ps->ops[i].id;
318 } 717 }
319 } 718 }
320 719
321 if (i == MAX_FILTER_PRED) 720 opstr[1] = '\0';
322 return -ENOSPC; 721
722 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
723 if (!strcmp(opstr, ps->ops[i].string))
724 return ps->ops[i].id;
725 }
726
727 return OP_NONE;
728}
729
730static inline void clear_operand_string(struct filter_parse_state *ps)
731{
732 memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
733 ps->operand.tail = 0;
734}
735
736static inline int append_operand_char(struct filter_parse_state *ps, char c)
737{
738 if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
739 return -EINVAL;
740
741 ps->operand.string[ps->operand.tail++] = c;
742
743 return 0;
744}
745
746static int filter_opstack_push(struct filter_parse_state *ps, int op)
747{
748 struct opstack_op *opstack_op;
323 749
324 events_for_each(call) { 750 opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
325 int err; 751 if (!opstack_op)
752 return -ENOMEM;
753
754 opstack_op->op = op;
755 list_add(&opstack_op->list, &ps->opstack);
756
757 return 0;
758}
759
760static int filter_opstack_empty(struct filter_parse_state *ps)
761{
762 return list_empty(&ps->opstack);
763}
764
765static int filter_opstack_top(struct filter_parse_state *ps)
766{
767 struct opstack_op *opstack_op;
768
769 if (filter_opstack_empty(ps))
770 return OP_NONE;
771
772 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
773
774 return opstack_op->op;
775}
776
777static int filter_opstack_pop(struct filter_parse_state *ps)
778{
779 struct opstack_op *opstack_op;
780 int op;
781
782 if (filter_opstack_empty(ps))
783 return OP_NONE;
784
785 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
786 op = opstack_op->op;
787 list_del(&opstack_op->list);
788
789 kfree(opstack_op);
790
791 return op;
792}
793
794static void filter_opstack_clear(struct filter_parse_state *ps)
795{
796 while (!filter_opstack_empty(ps))
797 filter_opstack_pop(ps);
798}
326 799
327 if (!call->name || !call->regfunc) 800static char *curr_operand(struct filter_parse_state *ps)
801{
802 return ps->operand.string;
803}
804
805static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
806{
807 struct postfix_elt *elt;
808
809 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
810 if (!elt)
811 return -ENOMEM;
812
813 elt->op = OP_NONE;
814 elt->operand = kstrdup(operand, GFP_KERNEL);
815 if (!elt->operand) {
816 kfree(elt);
817 return -ENOMEM;
818 }
819
820 list_add_tail(&elt->list, &ps->postfix);
821
822 return 0;
823}
824
825static int postfix_append_op(struct filter_parse_state *ps, int op)
826{
827 struct postfix_elt *elt;
828
829 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
830 if (!elt)
831 return -ENOMEM;
832
833 elt->op = op;
834 elt->operand = NULL;
835
836 list_add_tail(&elt->list, &ps->postfix);
837
838 return 0;
839}
840
841static void postfix_clear(struct filter_parse_state *ps)
842{
843 struct postfix_elt *elt;
844
845 while (!list_empty(&ps->postfix)) {
846 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
847 kfree(elt->operand);
848 list_del(&elt->list);
849 }
850}
851
852static int filter_parse(struct filter_parse_state *ps)
853{
854 int in_string = 0;
855 int op, top_op;
856 char ch;
857
858 while ((ch = infix_next(ps))) {
859 if (ch == '"') {
860 in_string ^= 1;
328 continue; 861 continue;
862 }
329 863
330 if (strcmp(call->system, system->name)) 864 if (in_string)
865 goto parse_operand;
866
867 if (isspace(ch))
331 continue; 868 continue;
332 869
333 if (!find_event_field(call, pred->field_name)) 870 if (is_op_char(ps, ch)) {
871 op = infix_get_op(ps, ch);
872 if (op == OP_NONE) {
873 parse_error(ps, FILT_ERR_INVALID_OP, 0);
874 return -EINVAL;
875 }
876
877 if (strlen(curr_operand(ps))) {
878 postfix_append_operand(ps, curr_operand(ps));
879 clear_operand_string(ps);
880 }
881
882 while (!filter_opstack_empty(ps)) {
883 top_op = filter_opstack_top(ps);
884 if (!is_precedence_lower(ps, top_op, op)) {
885 top_op = filter_opstack_pop(ps);
886 postfix_append_op(ps, top_op);
887 continue;
888 }
889 break;
890 }
891
892 filter_opstack_push(ps, op);
334 continue; 893 continue;
894 }
335 895
336 event_pred = copy_pred(pred); 896 if (ch == '(') {
337 if (!event_pred) 897 filter_opstack_push(ps, OP_OPEN_PAREN);
338 goto oom; 898 continue;
899 }
339 900
340 err = filter_add_pred(call, event_pred); 901 if (ch == ')') {
341 if (err) 902 if (strlen(curr_operand(ps))) {
342 filter_free_pred(event_pred); 903 postfix_append_operand(ps, curr_operand(ps));
343 if (err == -ENOMEM) 904 clear_operand_string(ps);
344 goto oom; 905 }
906
907 top_op = filter_opstack_pop(ps);
908 while (top_op != OP_NONE) {
909 if (top_op == OP_OPEN_PAREN)
910 break;
911 postfix_append_op(ps, top_op);
912 top_op = filter_opstack_pop(ps);
913 }
914 if (top_op == OP_NONE) {
915 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
916 return -EINVAL;
917 }
918 continue;
919 }
920parse_operand:
921 if (append_operand_char(ps, ch)) {
922 parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
923 return -EINVAL;
924 }
925 }
926
927 if (strlen(curr_operand(ps)))
928 postfix_append_operand(ps, curr_operand(ps));
929
930 while (!filter_opstack_empty(ps)) {
931 top_op = filter_opstack_pop(ps);
932 if (top_op == OP_NONE)
933 break;
934 if (top_op == OP_OPEN_PAREN) {
935 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
936 return -EINVAL;
937 }
938 postfix_append_op(ps, top_op);
345 } 939 }
346 940
347 return 0; 941 return 0;
942}
348 943
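filter_parse() is a small shunting-yard pass: operands are flushed to ps->postfix as they complete, operators wait on ps->opstack until an incoming operator of equal or lower precedence (or a closing parenthesis) forces them out, and quote characters are stripped while their contents are kept. Assuming the usual precedence from filter_ops (comparisons bind tighter than &&, which binds tighter than ||; filter_ops itself is defined earlier in this file), an illustrative conversion with made-up field names:

/*
 * infix, as written to the filter file:
 *
 *     (bytes >= 4096 && prio == 5) || comm == "bash"
 *
 * postfix, as queued on ps->postfix:
 *
 *     bytes 4096 >=   prio 5 ==   &&   comm bash ==   ||
 *
 * replace_preds() below pairs each operand couple with its comparison
 * operator to build one filter_pred, and turns each && / || into a
 * logical predicate via create_logical_pred().
 */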
349oom: 944static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
350 system->preds[i] = NULL; 945{
351 return -ENOMEM; 946 struct filter_pred *pred;
947
948 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
949 if (!pred)
950 return NULL;
951
952 pred->field_name = kstrdup(operand1, GFP_KERNEL);
953 if (!pred->field_name) {
954 kfree(pred);
955 return NULL;
956 }
957
958 strcpy(pred->str_val, operand2);
959 pred->str_len = strlen(operand2);
960
961 pred->op = op;
962
963 return pred;
352} 964}
353 965
354int filter_parse(char **pbuf, struct filter_pred *pred) 966static struct filter_pred *create_logical_pred(int op)
355{ 967{
356 char *tmp, *tok, *val_str = NULL; 968 struct filter_pred *pred;
357 int tok_n = 0; 969
358 970 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
359 /* field ==/!= number, or/and field ==/!= number, number */ 971 if (!pred)
360 while ((tok = strsep(pbuf, " \n"))) { 972 return NULL;
361 if (tok_n == 0) { 973
362 if (!strcmp(tok, "0")) { 974 pred->op = op;
363 pred->clear = 1; 975
364 return 0; 976 return pred;
365 } else if (!strcmp(tok, "&&")) { 977}
366 pred->or = 0; 978
367 pred->compound = 1; 979static int check_preds(struct filter_parse_state *ps)
368 } else if (!strcmp(tok, "||")) { 980{
369 pred->or = 1; 981 int n_normal_preds = 0, n_logical_preds = 0;
370 pred->compound = 1; 982 struct postfix_elt *elt;
371 } else 983
372 pred->field_name = tok; 984 list_for_each_entry(elt, &ps->postfix, list) {
373 tok_n = 1; 985 if (elt->op == OP_NONE)
986 continue;
987
988 if (elt->op == OP_AND || elt->op == OP_OR) {
989 n_logical_preds++;
374 continue; 990 continue;
375 } 991 }
376 if (tok_n == 1) { 992 n_normal_preds++;
377 if (!pred->field_name) 993 }
378 pred->field_name = tok; 994
379 else if (!strcmp(tok, "!=")) 995 if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
380 pred->not = 1; 996 parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
381 else if (!strcmp(tok, "==")) 997 return -EINVAL;
382 pred->not = 0; 998 }
999
1000 return 0;
1001}
1002
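Expressed on the postfix list, the check above requires at least one comparison and strictly more comparisons than &&/|| operators. Two illustrative inputs, again with made-up field names and the filter_ops precedence assumed earlier:

/*
 *   "bytes >= 4096 && prio == 5"  ->  postfix: bytes 4096 >= prio 5 == &&
 *                                     2 comparisons, 1 logical op: accepted
 *
 *   "&& prio == 5"                ->  postfix: prio 5 == &&
 *                                     1 comparison, 1 logical op: rejected
 *                                     with FILT_ERR_INVALID_FILTER
 */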
1003static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps,
1006 char *filter_string)
1007{
1008 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred;
1010 struct postfix_elt *elt;
1011 int err;
1012
1013 err = check_preds(ps);
1014 if (err)
1015 return err;
1016
1017 list_for_each_entry(elt, &ps->postfix, list) {
1018 if (elt->op == OP_NONE) {
1019 if (!operand1)
1020 operand1 = elt->operand;
1021 else if (!operand2)
1022 operand2 = elt->operand;
383 else { 1023 else {
384 pred->field_name = NULL; 1024 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
385 return -EINVAL; 1025 return -EINVAL;
386 } 1026 }
387 tok_n = 2;
388 continue; 1027 continue;
389 } 1028 }
390 if (tok_n == 2) { 1029
391 if (pred->compound) { 1030 if (elt->op == OP_AND || elt->op == OP_OR) {
392 if (!strcmp(tok, "!=")) 1031 pred = create_logical_pred(elt->op);
393 pred->not = 1; 1032 if (!pred)
394 else if (!strcmp(tok, "==")) 1033 return -ENOMEM;
395 pred->not = 0; 1034 if (call) {
396 else { 1035 err = filter_add_pred(ps, call, pred);
397 pred->field_name = NULL; 1036 filter_free_pred(pred);
398 return -EINVAL;
399 }
400 } else { 1037 } else {
401 val_str = tok; 1038 err = filter_add_subsystem_pred(ps, system,
402 break; /* done */ 1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
403 } 1042 }
404 tok_n = 3; 1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
405 continue; 1047 continue;
406 } 1048 }
407 if (tok_n == 3) { 1049
408 val_str = tok; 1050 if (!operand1 || !operand2) {
409 break; /* done */ 1051 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1052 return -EINVAL;
1053 }
1054
1055 pred = create_pred(elt->op, operand1, operand2);
1056 if (!pred)
1057 return -ENOMEM;
1058 if (call) {
1059 err = filter_add_pred(ps, call, pred);
1060 filter_free_pred(pred);
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string);
1064 if (err)
1065 filter_free_pred(pred);
410 } 1066 }
1067 if (err)
1068 return err;
1069
1070 operand1 = operand2 = NULL;
411 } 1071 }
412 1072
413 if (!val_str) { 1073 return 0;
414 pred->field_name = NULL; 1074}
415 return -EINVAL; 1075
1076int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1077{
1078 int err;
1079
1080 struct filter_parse_state *ps;
1081
1082 mutex_lock(&event_mutex);
1083
1084 if (!strcmp(strstrip(filter_string), "0")) {
1085 filter_disable_preds(call);
1086 remove_filter_string(call->filter);
1087 mutex_unlock(&event_mutex);
1088 return 0;
416 } 1089 }
417 1090
418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 1091 err = -ENOMEM;
419 if (!pred->field_name) 1092 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
420 return -ENOMEM; 1093 if (!ps)
1094 goto out_unlock;
421 1095
422 pred->val = simple_strtoull(val_str, &tmp, 0); 1096 filter_disable_preds(call);
423 if (tmp == val_str) { 1097 replace_filter_string(call->filter, filter_string);
424 pred->str_val = kstrdup(val_str, GFP_KERNEL);
425 if (!pred->str_val)
426 return -ENOMEM;
427 } else if (*tmp != '\0')
428 return -EINVAL;
429 1098
430 return 0; 1099 parse_init(ps, filter_ops, filter_string);
1100 err = filter_parse(ps);
1101 if (err) {
1102 append_filter_err(ps, call->filter);
1103 goto out;
1104 }
1105
1106 err = replace_preds(NULL, call, ps, filter_string);
1107 if (err)
1108 append_filter_err(ps, call->filter);
1109
1110out:
1111 filter_opstack_clear(ps);
1112 postfix_clear(ps);
1113 kfree(ps);
1114out_unlock:
1115 mutex_unlock(&event_mutex);
1116
1117 return err;
431} 1118}
432 1119
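apply_event_filter() is the entry point behind the per-event 'filter' file: writing a predicate string installs it, and writing "0" (as handled above) clears it again. A minimal user-space sketch of that flow; the event path and the filter string are examples only and assume debugfs at /sys/kernel/debug:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return;
	n = write(fd, s, strlen(s));	/* parsed by apply_event_filter() */
	(void)n;
	close(fd);
}

int main(void)
{
	const char *f =
		"/sys/kernel/debug/tracing/events/sched/sched_wakeup/filter";

	write_str(f, "prio < 100");	/* install a filter */
	write_str(f, "0");		/* "0" disables it again */
	return 0;
}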
1120int apply_subsystem_event_filter(struct event_subsystem *system,
1121 char *filter_string)
1122{
1123 int err;
1124
1125 struct filter_parse_state *ps;
1126
1127 mutex_lock(&event_mutex);
1128
1129 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system);
1131 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex);
1133 return 0;
1134 }
1135
1136 err = -ENOMEM;
1137 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1138 if (!ps)
1139 goto out_unlock;
1140
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string);
1143
1144 parse_init(ps, filter_ops, filter_string);
1145 err = filter_parse(ps);
1146 if (err) {
1147 append_filter_err(ps, system->filter);
1148 goto out;
1149 }
1150
1151 err = replace_preds(system, NULL, ps, filter_string);
1152 if (err)
1153 append_filter_err(ps, system->filter);
1154
1155out:
1156 filter_opstack_clear(ps);
1157 postfix_clear(ps);
1158 kfree(ps);
1159out_unlock:
1160 mutex_unlock(&event_mutex);
1161
1162 return err;
1163}
433 1164
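The subsystem variant above applies one string to every event under events/<subsystem>/: a predicate on a field they all define (one of the common_* fields, for instance) is installed everywhere, while a field missing from any single event makes the whole write fail with FILT_ERR_BAD_SUBSYS_FILTER. Reusing write_str() from the previous sketch, and again with illustrative paths:

	write_str("/sys/kernel/debug/tracing/events/irq/filter", "common_pid != 0");
	write_str("/sys/kernel/debug/tracing/events/irq/filter", "0");	/* clear */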
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 38985f9b379c..000000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
22#undef __array
23#define __array(type, item, len) type item[len];
24
25#undef __field
26#define __field(type, item) type item;
27
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
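For a hypothetical event declared as TRACE_EVENT(foo_bar, ..., TP_STRUCT__entry(__field(int, bar) __array(char, name, 16)), ...), the stage-1 macros in the file removed above expanded to roughly:

struct ftrace_raw_foo_bar {
	struct trace_entry	ent;
	int			bar;
	char			name[16];
};
static struct ftrace_event_call event_foo_bar;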
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index d363c6672c6c..000000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
35#undef __entry
36#define __entry field
37
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
107#undef __entry
108#define __entry REC
109
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
132
133#undef __field
134#define __field(type, item) \
135 ret = trace_define_field(event_call, #type, #item, \
136 offsetof(typeof(field), item), \
137 sizeof(field.item)); \
138 if (ret) \
139 return ret;
140
141#undef __array
142#define __array(type, item, len) \
143 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
144 offsetof(typeof(field), item), \
145 sizeof(field.item)); \
146 if (ret) \
147 return ret;
148
149#define __common_field(type, item) \
150 ret = trace_define_field(event_call, #type, "common_" #item, \
151 offsetof(typeof(field.ent), item), \
152 sizeof(field.ent.item)); \
153 if (ret) \
154 return ret;
155
156#undef TRACE_EVENT
157#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
158int \
159ftrace_define_fields_##call(void) \
160{ \
161 struct ftrace_raw_##call field; \
162 struct ftrace_event_call *event_call = &event_##call; \
163 int ret; \
164 \
165 __common_field(unsigned char, type); \
166 __common_field(unsigned char, flags); \
167 __common_field(unsigned char, preempt_count); \
168 __common_field(int, pid); \
169 __common_field(int, tgid); \
170 \
171 tstruct; \
172 \
173 return ret; \
174}
175
176#include <trace/trace_event_types.h>
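Continuing the hypothetical foo_bar example, the generated ftrace_format_foo_bar() emitted one description line per field into the event's debugfs 'format' file, followed by the print format line; the offsets shown are only indicative, since they depend on struct trace_entry and padding on the build machine:

	field:int bar;	offset:12;	size:4;
	field:char name[16];	offset:16;	size:16;

print fmt: ...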
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 9d2fa78cecca..000000000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (!ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_current_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (!ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_reg_event_<call>,
103 * .unregfunc = ftrace_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
112#ifdef CONFIG_EVENT_PROFILE
113#define _TRACE_PROFILE(call, proto, args) \
114static void ftrace_profile_##call(proto) \
115{ \
116 extern void perf_tpcounter_event(int); \
117 perf_tpcounter_event(event_##call.id); \
118} \
119 \
120static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
121{ \
122 int ret = 0; \
123 \
124 if (!atomic_inc_return(&call->profile_count)) \
125 ret = register_trace_##call(ftrace_profile_##call); \
126 \
127 return ret; \
128} \
129 \
130static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
131{ \
132 if (atomic_add_negative(-1, &call->profile_count)) \
133 unregister_trace_##call(ftrace_profile_##call); \
134}
135
136#define _TRACE_PROFILE_INIT(call) \
137 .profile_count = ATOMIC_INIT(-1), \
138 .profile_enable = ftrace_profile_enable_##call, \
139 .profile_disable = ftrace_profile_disable_##call,
140
141#else
142#define _TRACE_PROFILE(call, proto, args)
143#define _TRACE_PROFILE_INIT(call)
144#endif
145
146#define _TRACE_FORMAT(call, proto, args, fmt) \
147static void ftrace_event_##call(proto) \
148{ \
149 event_trace_printk(_RET_IP_, #call ": " fmt); \
150} \
151 \
152static int ftrace_reg_event_##call(void) \
153{ \
154 int ret; \
155 \
156 ret = register_trace_##call(ftrace_event_##call); \
157 if (ret) \
158 pr_info("event trace: Could not activate trace point " \
159 "probe to " #call "\n"); \
160 return ret; \
161} \
162 \
163static void ftrace_unreg_event_##call(void) \
164{ \
165 unregister_trace_##call(ftrace_event_##call); \
166} \
167 \
168static struct ftrace_event_call event_##call; \
169 \
170static int ftrace_init_event_##call(void) \
171{ \
172 int id; \
173 \
174 id = register_ftrace_event(NULL); \
175 if (!id) \
176 return -ENODEV; \
177 event_##call.id = id; \
178 return 0; \
179}
180
181#undef TRACE_FORMAT
182#define TRACE_FORMAT(call, proto, args, fmt) \
183_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
184_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
185static struct ftrace_event_call __used \
186__attribute__((__aligned__(4))) \
187__attribute__((section("_ftrace_events"))) event_##call = { \
188 .name = #call, \
189 .system = __stringify(TRACE_SYSTEM), \
190 .raw_init = ftrace_init_event_##call, \
191 .regfunc = ftrace_reg_event_##call, \
192 .unregfunc = ftrace_unreg_event_##call, \
193 _TRACE_PROFILE_INIT(call) \
194}
195
196#undef __entry
197#define __entry entry
198
199#undef TRACE_EVENT
200#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
201_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
202 \
203static struct ftrace_event_call event_##call; \
204 \
205static void ftrace_raw_event_##call(proto) \
206{ \
207 struct ftrace_event_call *call = &event_##call; \
208 struct ring_buffer_event *event; \
209 struct ftrace_raw_##call *entry; \
210 unsigned long irq_flags; \
211 int pc; \
212 \
213 local_save_flags(irq_flags); \
214 pc = preempt_count(); \
215 \
216 event = trace_current_buffer_lock_reserve(event_##call.id, \
217 sizeof(struct ftrace_raw_##call), \
218 irq_flags, pc); \
219 if (!event) \
220 return; \
221 entry = ring_buffer_event_data(event); \
222 \
223 assign; \
224 \
225 if (call->preds && !filter_match_preds(call, entry)) \
226 ring_buffer_event_discard(event); \
227 \
228 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
229 \
230} \
231 \
232static int ftrace_raw_reg_event_##call(void) \
233{ \
234 int ret; \
235 \
236 ret = register_trace_##call(ftrace_raw_event_##call); \
237 if (ret) \
238 pr_info("event trace: Could not activate trace point " \
239 "probe to " #call "\n"); \
240 return ret; \
241} \
242 \
243static void ftrace_raw_unreg_event_##call(void) \
244{ \
245 unregister_trace_##call(ftrace_raw_event_##call); \
246} \
247 \
248static struct trace_event ftrace_event_type_##call = { \
249 .trace = ftrace_raw_output_##call, \
250}; \
251 \
252static int ftrace_raw_init_event_##call(void) \
253{ \
254 int id; \
255 \
256 id = register_ftrace_event(&ftrace_event_type_##call); \
257 if (!id) \
258 return -ENODEV; \
259 event_##call.id = id; \
260 INIT_LIST_HEAD(&event_##call.fields); \
261 return 0; \
262} \
263 \
264static struct ftrace_event_call __used \
265__attribute__((__aligned__(4))) \
266__attribute__((section("_ftrace_events"))) event_##call = { \
267 .name = #call, \
268 .system = __stringify(TRACE_SYSTEM), \
269 .raw_init = ftrace_raw_init_event_##call, \
270 .regfunc = ftrace_raw_reg_event_##call, \
271 .unregfunc = ftrace_raw_unreg_event_##call, \
272 .show_format = ftrace_format_##call, \
273 .define_fields = ftrace_define_fields_##call, \
274 _TRACE_PROFILE_INIT(call) \
275}
276
277#include <trace/trace_event_types.h>
278
279#undef _TRACE_PROFILE
280#undef _TRACE_PROFILE_INIT
281
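For completeness, a hypothetical tracepoint definition of the kind these three stages consumed; the event name, prototype and fields are invented for illustration:

TRACE_EVENT(foo_bar,

	TP_PROTO(int bar, const char *name),

	TP_ARGS(bar, name),

	TP_STRUCT__entry(
		__field(int,	bar)
		__array(char,	name,	16)
	),

	TP_fast_assign(
		__entry->bar = bar;
		strncpy(__entry->name, name, 16);
	),

	TP_printk("bar=%d name=%s", __entry->bar, __entry->name)
);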
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf3..d06cf898dc86 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
19#undef TRACE_STRUCT 19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args 20#define TRACE_STRUCT(args...) args
21 21
22extern void __bad_type_size(void);
23
22#undef TRACE_FIELD 24#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \ 25#define TRACE_FIELD(type, item, assign) \
26 if (sizeof(type) != sizeof(field.item)) \
27 __bad_type_size(); \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \ 29 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \ 30 (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
30 34
31 35
32#undef TRACE_FIELD_SPECIAL 36#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \ 39 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \ 40 (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
46 if (!ret) \ 50 if (!ret) \
47 return 0; 51 return 0;
48 52
53#undef TRACE_FIELD_SIGN
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
55 TRACE_FIELD(type, item, assign)
49 56
50#undef TP_RAW_FMT 57#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args 58#define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \
65 return ret; \ 72 return ret; \
66} 73}
67 74
75#undef TRACE_EVENT_FORMAT_NOFILTER
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \
78static int \
79ftrace_format_##call(struct trace_seq *s) \
80{ \
81 struct args field; \
82 int ret; \
83 \
84 tstruct; \
85 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
87 \
88 return ret; \
89}
90
68#include "trace_event_types.h" 91#include "trace_event_types.h"
69 92
70#undef TRACE_ZERO_CHAR 93#undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \
78#define TRACE_FIELD(type, item, assign)\ 101#define TRACE_FIELD(type, item, assign)\
79 entry->item = assign; 102 entry->item = assign;
80 103
104#undef TRACE_FIELD_SIGN
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
106 TRACE_FIELD(type, item, assign)
107
81#undef TP_CMD 108#undef TP_CMD
82#define TP_CMD(cmd...) cmd 109#define TP_CMD(cmd...) cmd
83 110
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \
85#define TRACE_ENTRY entry 112#define TRACE_ENTRY entry
86 113
87#undef TRACE_FIELD_SPECIAL 114#undef TRACE_FIELD_SPECIAL
88#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
89 cmd; 116 cmd;
90 117
91#undef TRACE_EVENT_FORMAT 118#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \
122 \
123struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \
127 .id = proto, \
128 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \
131 .define_fields = ftrace_define_fields_##call, \
132}; \
133static int ftrace_raw_init_event_##call(void) \
134{ \
135 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \
138} \
139
140#undef TRACE_EVENT_FORMAT_NOFILTER
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
93 \ 143 \
94static struct ftrace_event_call __used \ 144struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \ 145__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \ 146__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \ 147 .name = #call, \
98 .id = proto, \ 148 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \ 149 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \ 150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
101} 200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
102#include "trace_event_types.h" 206#include "trace_event_types.h"
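is_signed_type(), used in the TRACE_FIELD expansion above, is defined in the tracing headers rather than in this file; the idea is a compile-time test of whether the type's -1 compares below +1. A standalone sketch, with the macro name suffixed to make clear it is not the kernel's own definition:

#include <stdio.h>

#define is_signed_type_sketch(type)	(((type)-1) < (type)1)

int main(void)
{
	printf("%d\n", is_signed_type_sketch(int));		/* 1 */
	printf("%d\n", is_signed_type_sketch(unsigned char));	/* 0 */
	return 0;
}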
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..75ef000613c3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 302 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 303 seq_printf(m, ":unlimited\n");
302 else 304 else
303 seq_printf(m, ":count=%ld", count); 305 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 306
306 return 0; 307 return 0;
307} 308}
@@ -362,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
362 out_reg: 363 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count); 364 ret = register_ftrace_function_probe(glob, ops, count);
364 365
365 return ret; 366 return ret < 0 ? ret : 0;
366} 367}
367 368
368static struct ftrace_func_command ftrace_traceon_cmd = { 369static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a7..420ec3487579 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -65,6 +66,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
65 if (!current->ret_stack) 66 if (!current->ret_stack)
66 return -EBUSY; 67 return -EBUSY;
67 68
69 /*
70 * We must make sure the ret_stack is tested before we read
71 * anything else.
72 */
73 smp_rmb();
74
68 /* The return trace stack is full */ 75 /* The return trace stack is full */
69 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { 76 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
70 atomic_inc(&current->trace_overrun); 77 atomic_inc(&current->trace_overrun);
@@ -78,14 +85,17 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
78 current->ret_stack[index].ret = ret; 85 current->ret_stack[index].ret = ret;
79 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
80 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
81 *depth = index; 90 *depth = index;
82 91
83 return 0; 92 return 0;
84} 93}
85 94
86/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
87void 96static void
88ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
89{ 99{
90 int index; 100 int index;
91 101
@@ -99,28 +109,52 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
99 return; 109 return;
100 } 110 }
101 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
102 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
103 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
104 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
105 trace->overrun = atomic_read(&current->trace_overrun); 140 trace->overrun = atomic_read(&current->trace_overrun);
106 trace->depth = index; 141 trace->depth = index;
107 barrier();
108 current->curr_ret_stack--;
109
110} 142}
111 143
112/* 144/*
113 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
114 * @return the original return address. 146 * @return the original return address.
115 */ 147 */
116unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
117{ 149{
118 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
119 unsigned long ret; 151 unsigned long ret;
120 152
121 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
122 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
123 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
156 barrier();
157 current->curr_ret_stack--;
124 158
125 if (unlikely(!ret)) { 159 if (unlikely(!ret)) {
126 ftrace_graph_stop(); 160 ftrace_graph_stop();
@@ -426,8 +460,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
426 return TRACE_TYPE_HANDLED; 460 return TRACE_TYPE_HANDLED;
427} 461}
428 462
429static enum print_line_t 463enum print_line_t
430print_graph_duration(unsigned long long duration, struct trace_seq *s) 464trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
431{ 465{
432 unsigned long nsecs_rem = do_div(duration, 1000); 466 unsigned long nsecs_rem = do_div(duration, 1000);
433 /* log10(ULONG_MAX) + '\0' */ 467 /* log10(ULONG_MAX) + '\0' */
@@ -464,12 +498,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
464 if (!ret) 498 if (!ret)
465 return TRACE_TYPE_PARTIAL_LINE; 499 return TRACE_TYPE_PARTIAL_LINE;
466 } 500 }
501 return TRACE_TYPE_HANDLED;
502}
503
504static enum print_line_t
505print_graph_duration(unsigned long long duration, struct trace_seq *s)
506{
507 int ret;
508
509 ret = trace_print_graph_duration(duration, s);
510 if (ret != TRACE_TYPE_HANDLED)
511 return ret;
467 512
468 ret = trace_seq_printf(s, "| "); 513 ret = trace_seq_printf(s, "| ");
469 if (!ret) 514 if (!ret)
470 return TRACE_TYPE_PARTIAL_LINE; 515 return TRACE_TYPE_PARTIAL_LINE;
471 return TRACE_TYPE_HANDLED;
472 516
517 return TRACE_TYPE_HANDLED;
473} 518}
474 519
475/* Case of a leaf function on its call entry */ 520/* Case of a leaf function on its call entry */
@@ -798,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
798 843
799 switch (entry->type) { 844 switch (entry->type) {
800 case TRACE_GRAPH_ENT: { 845 case TRACE_GRAPH_ENT: {
801 struct ftrace_graph_ent_entry *field; 846 /*
847 * print_graph_entry() may consume the current event,
848 * thus @field may become invalid, so we need to save it.
849 * sizeof(struct ftrace_graph_ent_entry) is very small,
850 * it can be safely saved at the stack.
851 */
852 struct ftrace_graph_ent_entry *field, saved;
802 trace_assign_type(field, entry); 853 trace_assign_type(field, entry);
803 return print_graph_entry(field, s, iter); 854 saved = *field;
855 return print_graph_entry(&saved, s, iter);
804 } 856 }
805 case TRACE_GRAPH_RET: { 857 case TRACE_GRAPH_RET: {
806 struct ftrace_graph_ret_entry *field; 858 struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347f..ca7d7c4d0c2a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,10 +1,9 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on BTS
3 * 3 *
4 * Copyright (C) 2008-2009 Intel Corporation. 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7#include <linux/spinlock.h>
8#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
9#include <linux/debugfs.h> 8#include <linux/debugfs.h>
10#include <linux/ftrace.h> 9#include <linux/ftrace.h>
@@ -15,110 +14,119 @@
15 14
16#include <asm/ds.h> 15#include <asm/ds.h>
17 16
18#include "trace.h"
19#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h"
20 19
21 20
22#define SIZEOF_BTS (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
23 22
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
35static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, tracer);
36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
37 25
38#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(tracer, smp_processor_id())
39#define this_buffer per_cpu(buffer, smp_processor_id())
40 27
41static int __read_mostly trace_hw_branches_enabled; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
42static struct trace_array *hw_branch_trace __read_mostly; 30static struct trace_array *hw_branch_trace __read_mostly;
43 31
44 32
45/* 33static void bts_trace_init_cpu(int cpu)
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
51static void bts_trace_start_cpu(void *arg)
52{ 34{
53 if (this_tracer) 35 per_cpu(tracer, cpu) =
54 ds_release_bts(this_tracer); 36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
55 37 NULL, (size_t)-1, BTS_KERNEL);
56 this_tracer = 38
57 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, 39 if (IS_ERR(per_cpu(tracer, cpu)))
58 /* ovfl = */ NULL, /* th = */ (size_t)-1, 40 per_cpu(tracer, cpu) = NULL;
59 BTS_KERNEL);
60 if (IS_ERR(this_tracer)) {
61 this_tracer = NULL;
62 return;
63 }
64} 41}
65 42
66static void bts_trace_start(struct trace_array *tr) 43static int bts_trace_init(struct trace_array *tr)
67{ 44{
68 spin_lock(&bts_tracer_lock); 45 int cpu;
46
47 hw_branch_trace = tr;
48 trace_hw_branches_enabled = 0;
69 49
70 on_each_cpu(bts_trace_start_cpu, NULL, 1); 50 get_online_cpus();
71 trace_hw_branches_enabled = 1; 51 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu);
72 53
73 spin_unlock(&bts_tracer_lock); 54 if (likely(per_cpu(tracer, cpu)))
55 trace_hw_branches_enabled = 1;
56 }
57 trace_hw_branches_suspended = 0;
58 put_online_cpus();
59
60 /* If we could not enable tracing on a single cpu, we fail. */
61 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
74} 62}
75 63
76/* 64static void bts_trace_reset(struct trace_array *tr)
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
82static void bts_trace_stop_cpu(void *arg)
83{ 65{
84 if (this_tracer) { 66 int cpu;
85 ds_release_bts(this_tracer); 67
86 this_tracer = NULL; 68 get_online_cpus();
69 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu));
72 per_cpu(tracer, cpu) = NULL;
73 }
87 } 74 }
75 trace_hw_branches_enabled = 0;
76 trace_hw_branches_suspended = 0;
77 put_online_cpus();
88} 78}
89 79
90static void bts_trace_stop(struct trace_array *tr) 80static void bts_trace_start(struct trace_array *tr)
91{ 81{
92 spin_lock(&bts_tracer_lock); 82 int cpu;
93 83
94 trace_hw_branches_enabled = 0; 84 get_online_cpus();
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1); 85 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu));
88 trace_hw_branches_suspended = 0;
89 put_online_cpus();
90}
96 91
97 spin_unlock(&bts_tracer_lock); 92static void bts_trace_stop(struct trace_array *tr)
93{
94 int cpu;
95
96 get_online_cpus();
97 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu));
100 trace_hw_branches_suspended = 1;
101 put_online_cpus();
98} 102}
99 103
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, 104static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu) 105 unsigned long action, void *hcpu)
102{ 106{
103 unsigned int cpu = (unsigned long)hcpu; 107 int cpu = (long)hcpu;
104
105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109 108
110 switch (action) { 109 switch (action) {
111 case CPU_ONLINE: 110 case CPU_ONLINE:
112 case CPU_DOWN_FAILED: 111 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 112 /* The notification is sent with interrupts enabled. */
113 if (trace_hw_branches_enabled) {
114 bts_trace_init_cpu(cpu);
115
116 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu));
119 }
114 break; 120 break;
121
115 case CPU_DOWN_PREPARE: 122 case CPU_DOWN_PREPARE:
116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 123 /* The notification is sent with interrupts enabled. */
117 break; 124 if (likely(per_cpu(tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu));
126 per_cpu(tracer, cpu) = NULL;
127 }
118 } 128 }
119 129
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE; 130 return NOTIFY_DONE;
123} 131}
124 132
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler 134 .notifier_call = bts_hotcpu_handler
127}; 135};
128 136
129static int bts_trace_init(struct trace_array *tr)
130{
131 hw_branch_trace = tr;
132
133 bts_trace_start(tr);
134
135 return 0;
136}
137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
143static void bts_trace_print_header(struct seq_file *m) 137static void bts_trace_print_header(struct seq_file *m)
144{ 138{
145 seq_puts(m, "# CPU# TO <- FROM\n"); 139 seq_puts(m, "# CPU# TO <- FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
147 141
148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 142static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
149{ 143{
144 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
150 struct trace_entry *entry = iter->ent; 145 struct trace_entry *entry = iter->ent;
151 struct trace_seq *seq = &iter->seq; 146 struct trace_seq *seq = &iter->seq;
152 struct hw_branch_entry *it; 147 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
154 148
155 trace_assign_type(it, entry); 149 trace_assign_type(it, entry);
156 150
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
168 162
169void trace_hw_branch(u64 from, u64 to) 163void trace_hw_branch(u64 from, u64 to)
170{ 164{
165 struct ftrace_event_call *call = &event_hw_branch;
171 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
172 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
173 struct hw_branch_entry *entry; 168 struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
194 entry->ent.type = TRACE_HW_BRANCHES; 189 entry->ent.type = TRACE_HW_BRANCHES;
195 entry->from = from; 190 entry->from = from;
196 entry->to = to; 191 entry->to = to;
197 trace_buffer_unlock_commit(tr, event, 0, 0); 192 if (!filter_check_discard(call, entry, tr->buffer, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0);
198 194
199 out: 195 out:
200 atomic_dec(&tr->data[cpu]->disabled); 196 atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
224/* 220/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer. 221 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 * 222 *
227 * pre: bts_tracer_lock must be locked 223 * pre: tracing must be suspended on the current cpu
228 */ 224 */
229static void trace_bts_cpu(void *arg) 225static void trace_bts_cpu(void *arg)
230{ 226{
231 struct trace_array *tr = (struct trace_array *) arg; 227 struct trace_array *tr = (struct trace_array *)arg;
232 const struct bts_trace *trace; 228 const struct bts_trace *trace;
233 unsigned char *at; 229 unsigned char *at;
234 230
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
241 if (unlikely(!this_tracer)) 237 if (unlikely(!this_tracer))
242 return; 238 return;
243 239
244 ds_suspend_bts(this_tracer);
245 trace = ds_read_bts(this_tracer); 240 trace = ds_read_bts(this_tracer);
246 if (!trace) 241 if (!trace)
247 goto out; 242 return;
248 243
249 for (at = trace->ds.top; (void *)at < trace->ds.end; 244 for (at = trace->ds.top; (void *)at < trace->ds.end;
250 at += trace->ds.size) 245 at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
253 for (at = trace->ds.begin; (void *)at < trace->ds.top; 248 for (at = trace->ds.begin; (void *)at < trace->ds.top;
254 at += trace->ds.size) 249 at += trace->ds.size)
255 trace_bts_at(trace, at); 250 trace_bts_at(trace, at);
256
257out:
258 ds_resume_bts(this_tracer);
259} 251}
260 252
261static void trace_bts_prepare(struct trace_iterator *iter) 253static void trace_bts_prepare(struct trace_iterator *iter)
262{ 254{
263 spin_lock(&bts_tracer_lock); 255 int cpu;
264 256
257 get_online_cpus();
258 for_each_online_cpu(cpu)
259 if (likely(per_cpu(tracer, cpu)))
260 ds_suspend_bts(per_cpu(tracer, cpu));
261 /*
262 * We need to collect the trace on the respective cpu since ftrace
263 * implicitly adds the record for the current cpu.
264 * Once that is more flexible, we could collect the data from any cpu.
265 */
265 on_each_cpu(trace_bts_cpu, iter->tr, 1); 266 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266 267
267 spin_unlock(&bts_tracer_lock); 268 for_each_online_cpu(cpu)
269 if (likely(per_cpu(tracer, cpu)))
270 ds_resume_bts(per_cpu(tracer, cpu));
271 put_online_cpus();
268} 272}
269 273
270static void trace_bts_close(struct trace_iterator *iter) 274static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
274 278
275void trace_hw_branch_oops(void) 279void trace_hw_branch_oops(void)
276{ 280{
277 spin_lock(&bts_tracer_lock); 281 if (this_tracer) {
278 282 ds_suspend_bts_noirq(this_tracer);
279 trace_bts_cpu(hw_branch_trace); 283 trace_bts_cpu(hw_branch_trace);
280 284 ds_resume_bts_noirq(this_tracer);
281 spin_unlock(&bts_tracer_lock); 285 }
282} 286}
283 287
284struct tracer bts_tracer __read_mostly = 288struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
291 .start = bts_trace_start, 295 .start = bts_trace_start,
292 .stop = bts_trace_stop, 296 .stop = bts_trace_stop,
293 .open = trace_bts_prepare, 297 .open = trace_bts_prepare,
294 .close = trace_bts_close 298 .close = trace_bts_close,
299#ifdef CONFIG_FTRACE_SELFTEST
300 .selftest = trace_selftest_startup_hw_branches,
301#endif /* CONFIG_FTRACE_SELFTEST */
295}; 302};
296 303
297__init static int init_bts_trace(void) 304__init static int init_bts_trace(void)
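A note on the locking change above: the per-cpu BTS setup no longer takes bts_tracer_lock and sends IPIs; it simply brackets plain per-cpu accesses with get_online_cpus()/put_online_cpus() so CPUs cannot come or go underneath it. A minimal sketch of that pattern, with a made-up my_request_resource() standing in for ds_request_bts_cpu():

static DEFINE_PER_CPU(void *, my_res);

static int my_init_all(void)
{
	int cpu, ok = 0;

	get_online_cpus();			/* block cpu hotplug */
	for_each_online_cpu(cpu) {
		/* hypothetical per-cpu resource request */
		per_cpu(my_res, cpu) = my_request_resource(cpu);
		if (IS_ERR(per_cpu(my_res, cpu)))
			per_cpu(my_res, cpu) = NULL;
		else
			ok = 1;
	}
	put_online_cpus();

	/* like bts_trace_init(): fail only if no cpu could be set up */
	return ok ? 0 : -EOPNOTSUPP;
}

The hotplug notifier then only has to deal with the one CPU that is actually coming or going, which is why bts_hotcpu_handler() above shrank to a plain switch on the action.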
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b4..d53b45ed0806 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/time.h>
13
12#include <asm/atomic.h> 14#include <asm/atomic.h>
13 15
14#include "trace.h" 16#include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
174 struct mmiotrace_rw *rw; 176 struct mmiotrace_rw *rw;
175 struct trace_seq *s = &iter->seq; 177 struct trace_seq *s = &iter->seq;
176 unsigned long long t = ns2usecs(iter->ts); 178 unsigned long long t = ns2usecs(iter->ts);
177 unsigned long usec_rem = do_div(t, 1000000ULL); 179 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
178 unsigned secs = (unsigned long)t; 180 unsigned secs = (unsigned long)t;
179 int ret = 1; 181 int ret = 1;
180 182
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
221 struct mmiotrace_map *m; 223 struct mmiotrace_map *m;
222 struct trace_seq *s = &iter->seq; 224 struct trace_seq *s = &iter->seq;
223 unsigned long long t = ns2usecs(iter->ts); 225 unsigned long long t = ns2usecs(iter->ts);
224 unsigned long usec_rem = do_div(t, 1000000ULL); 226 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
225 unsigned secs = (unsigned long)t; 227 unsigned secs = (unsigned long)t;
226 int ret; 228 int ret;
227 229
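The substitution of USEC_PER_SEC for the literal 1000000ULL is purely cosmetic; do_div() still divides its 64-bit argument in place and returns the remainder. A small sketch of that contract, assuming <asm/div64.h> and <linux/time.h>:

#include <linux/kernel.h>
#include <linux/time.h>
#include <asm/div64.h>

/* split a microsecond count into "seconds.microseconds" */
static void split_usecs(unsigned long long usecs)
{
	unsigned long usec_rem;

	/* do_div() stores the quotient back into its first argument
	 * and returns the remainder */
	usec_rem = do_div(usecs, USEC_PER_SEC);

	pr_debug("%lu.%06lu\n", (unsigned long)usecs, usec_rem);
}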
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64b54a59c55b..e0c2545622e8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,11 +14,24 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17static DEFINE_MUTEX(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19 23
20static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
21 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29
30 seq_write(m, s->buffer, len);
31
32 trace_seq_init(s);
33}
34
22enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
23{ 36{
24 struct trace_seq *s = &iter->seq; 37 struct trace_seq *s = &iter->seq;
@@ -84,6 +97,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
84 97
85 return len; 98 return len;
86} 99}
100EXPORT_SYMBOL_GPL(trace_seq_printf);
101
102/**
103 * trace_seq_vprintf - sequence printing of trace information
104 * @s: trace sequence descriptor
105 * @fmt: printf format string
106 *
107 * The tracer may use either sequence operations or its own
 108 * copy to user routines. To simplify formatting of a trace,
109 * trace_seq_printf is used to store strings into a special
110 * buffer (@s). Then the output may be either used by
111 * the sequencer or pulled into another buffer.
112 */
113int
114trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
115{
116 int len = (PAGE_SIZE - 1) - s->len;
117 int ret;
118
119 if (!len)
120 return 0;
121
122 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
123
124 /* If we can't write it all, don't bother writing anything */
125 if (ret >= len)
126 return 0;
127
128 s->len += ret;
129
130 return len;
131}
132EXPORT_SYMBOL_GPL(trace_seq_vprintf);
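trace_seq_vprintf() writes into the fixed page-sized trace_seq buffer and deliberately drops output that would not fit rather than truncating it. A hedged usage sketch; my_trace_note() is made up and assumes the declarations from <linux/trace_seq.h>:

static int my_trace_note(struct trace_seq *s, const char *fmt, ...)
{
	va_list ap;
	int ret;

	va_start(ap, fmt);
	/* returns 0 when the formatted string would not fit in s */
	ret = trace_seq_vprintf(s, fmt, ap);
	va_end(ap);

	return ret;
}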
87 133
88int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 134int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
89{ 135{
@@ -201,6 +247,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
201 return 0; 247 return 0;
202} 248}
203 249
250const char *
251ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
252 unsigned long flags,
253 const struct trace_print_flags *flag_array)
254{
255 unsigned long mask;
256 const char *str;
257 const char *ret = p->buffer + p->len;
258 int i;
259
260 for (i = 0; flag_array[i].name && flags; i++) {
261
262 mask = flag_array[i].mask;
263 if ((flags & mask) != mask)
264 continue;
265
266 str = flag_array[i].name;
267 flags &= ~mask;
268 if (p->len && delim)
269 trace_seq_puts(p, delim);
270 trace_seq_puts(p, str);
271 }
272
273 /* check for left over flags */
274 if (flags) {
275 if (p->len && delim)
276 trace_seq_puts(p, delim);
277 trace_seq_printf(p, "0x%lx", flags);
278 }
279
280 trace_seq_putc(p, 0);
281
282 return ret;
283}
284EXPORT_SYMBOL(ftrace_print_flags_seq);
285
286const char *
287ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
288 const struct trace_print_flags *symbol_array)
289{
290 int i;
291 const char *ret = p->buffer + p->len;
292
293 for (i = 0; symbol_array[i].name; i++) {
294
295 if (val != symbol_array[i].mask)
296 continue;
297
298 trace_seq_puts(p, symbol_array[i].name);
299 break;
300 }
301
302 if (!p->len)
303 trace_seq_printf(p, "0x%lx", val);
304
305 trace_seq_putc(p, 0);
306
307 return ret;
308}
309EXPORT_SYMBOL(ftrace_print_symbols_seq);
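ftrace_print_flags_seq() walks a trace_print_flags table, emitting the name of every set bit separated by the delimiter and falling back to a hex value for any bits left without a name; ftrace_print_symbols_seq() does the same for a single value. A sketch of the kind of table a caller would pass (the flag values and names here are invented):

static const struct trace_print_flags mydrv_flags[] = {
	{ 0x01, "READ"  },
	{ 0x02, "WRITE" },
	{ 0x04, "SYNC"  },
	{ -1,   NULL    }	/* a NULL name terminates the walk */
};

/*
 * In an event's print handler, with p being a scratch trace_seq:
 *
 *	ftrace_print_flags_seq(p, "|", rec->flags, mydrv_flags);
 *
 * renders e.g. "READ|SYNC", or "READ|0x40" when bit 0x40 has no name.
 */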
310
204#ifdef CONFIG_KRETPROBES 311#ifdef CONFIG_KRETPROBES
205static inline const char *kretprobed(const char *name) 312static inline const char *kretprobed(const char *name)
206{ 313{
@@ -311,17 +418,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
311 418
312 if (ip == ULONG_MAX || !ret) 419 if (ip == ULONG_MAX || !ret)
313 break; 420 break;
314 if (i && ret) 421 if (ret)
315 ret = trace_seq_puts(s, " <- "); 422 ret = trace_seq_puts(s, " => ");
316 if (!ip) { 423 if (!ip) {
317 if (ret) 424 if (ret)
318 ret = trace_seq_puts(s, "??"); 425 ret = trace_seq_puts(s, "??");
426 if (ret)
427 ret = trace_seq_puts(s, "\n");
319 continue; 428 continue;
320 } 429 }
321 if (!ret) 430 if (!ret)
322 break; 431 break;
323 if (ret) 432 if (ret)
324 ret = seq_print_user_ip(s, mm, ip, sym_flags); 433 ret = seq_print_user_ip(s, mm, ip, sym_flags);
434 ret = trace_seq_puts(s, "\n");
325 } 435 }
326 436
327 if (mm) 437 if (mm)
@@ -455,6 +565,7 @@ static int task_state_char(unsigned long state)
455 * @type: the type of event to look for 565 * @type: the type of event to look for
456 * 566 *
457 * Returns an event of type @type otherwise NULL 567 * Returns an event of type @type otherwise NULL
568 * Called with trace_event_read_lock() held.
458 */ 569 */
459struct trace_event *ftrace_find_event(int type) 570struct trace_event *ftrace_find_event(int type)
460{ 571{
@@ -464,7 +575,7 @@ struct trace_event *ftrace_find_event(int type)
464 575
465 key = type & (EVENT_HASHSIZE - 1); 576 key = type & (EVENT_HASHSIZE - 1);
466 577
467 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { 578 hlist_for_each_entry(event, n, &event_hash[key], node) {
468 if (event->type == type) 579 if (event->type == type)
469 return event; 580 return event;
470 } 581 }
@@ -472,6 +583,46 @@ struct trace_event *ftrace_find_event(int type)
472 return NULL; 583 return NULL;
473} 584}
474 585
586static LIST_HEAD(ftrace_event_list);
587
588static int trace_search_list(struct list_head **list)
589{
590 struct trace_event *e;
591 int last = __TRACE_LAST_TYPE;
592
593 if (list_empty(&ftrace_event_list)) {
594 *list = &ftrace_event_list;
595 return last + 1;
596 }
597
598 /*
 599 * We have used up all of the possible events,
 600 * let's see if somebody freed one.
601 */
602 list_for_each_entry(e, &ftrace_event_list, list) {
603 if (e->type != last + 1)
604 break;
605 last++;
606 }
607
 608 /* Did we use up all 65 thousand events? */
609 if ((last + 1) > FTRACE_MAX_EVENT)
610 return 0;
611
612 *list = &e->list;
613 return last + 1;
614}
615
616void trace_event_read_lock(void)
617{
618 down_read(&trace_event_mutex);
619}
620
621void trace_event_read_unlock(void)
622{
623 up_read(&trace_event_mutex);
624}
625
475/** 626/**
476 * register_ftrace_event - register output for an event type 627 * register_ftrace_event - register output for an event type
477 * @event: the event type to register 628 * @event: the event type to register
@@ -492,22 +643,42 @@ int register_ftrace_event(struct trace_event *event)
492 unsigned key; 643 unsigned key;
493 int ret = 0; 644 int ret = 0;
494 645
495 mutex_lock(&trace_event_mutex); 646 down_write(&trace_event_mutex);
496 647
497 if (!event) { 648 if (WARN_ON(!event))
498 ret = next_event_type++;
499 goto out; 649 goto out;
500 }
501 650
502 if (!event->type) 651 INIT_LIST_HEAD(&event->list);
503 event->type = next_event_type++; 652
504 else if (event->type > __TRACE_LAST_TYPE) { 653 if (!event->type) {
654 struct list_head *list = NULL;
655
656 if (next_event_type > FTRACE_MAX_EVENT) {
657
658 event->type = trace_search_list(&list);
659 if (!event->type)
660 goto out;
661
662 } else {
663
664 event->type = next_event_type++;
665 list = &ftrace_event_list;
666 }
667
668 if (WARN_ON(ftrace_find_event(event->type)))
669 goto out;
670
671 list_add_tail(&event->list, list);
672
673 } else if (event->type > __TRACE_LAST_TYPE) {
505 printk(KERN_WARNING "Need to add type to trace.h\n"); 674 printk(KERN_WARNING "Need to add type to trace.h\n");
506 WARN_ON(1); 675 WARN_ON(1);
507 }
508
509 if (ftrace_find_event(event->type))
510 goto out; 676 goto out;
677 } else {
 678 /* Is this event already in use? */
679 if (ftrace_find_event(event->type))
680 goto out;
681 }
511 682
512 if (event->trace == NULL) 683 if (event->trace == NULL)
513 event->trace = trace_nop_print; 684 event->trace = trace_nop_print;
@@ -520,14 +691,25 @@ int register_ftrace_event(struct trace_event *event)
520 691
521 key = event->type & (EVENT_HASHSIZE - 1); 692 key = event->type & (EVENT_HASHSIZE - 1);
522 693
523 hlist_add_head_rcu(&event->node, &event_hash[key]); 694 hlist_add_head(&event->node, &event_hash[key]);
524 695
525 ret = event->type; 696 ret = event->type;
526 out: 697 out:
527 mutex_unlock(&trace_event_mutex); 698 up_write(&trace_event_mutex);
528 699
529 return ret; 700 return ret;
530} 701}
702EXPORT_SYMBOL_GPL(register_ftrace_event);
703
704/*
705 * Used by module code with the trace_event_mutex held for write.
706 */
707int __unregister_ftrace_event(struct trace_event *event)
708{
709 hlist_del(&event->node);
710 list_del(&event->list);
711 return 0;
712}
531 713
532/** 714/**
533 * unregister_ftrace_event - remove a no longer used event 715 * unregister_ftrace_event - remove a no longer used event
@@ -535,12 +717,13 @@ int register_ftrace_event(struct trace_event *event)
535 */ 717 */
536int unregister_ftrace_event(struct trace_event *event) 718int unregister_ftrace_event(struct trace_event *event)
537{ 719{
538 mutex_lock(&trace_event_mutex); 720 down_write(&trace_event_mutex);
539 hlist_del(&event->node); 721 __unregister_ftrace_event(event);
540 mutex_unlock(&trace_event_mutex); 722 up_write(&trace_event_mutex);
541 723
542 return 0; 724 return 0;
543} 725}
726EXPORT_SYMBOL_GPL(unregister_ftrace_event);
544 727
545/* 728/*
546 * Standard events 729 * Standard events
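register_ftrace_event() now assigns dynamic type numbers from ftrace_event_list, recycling freed ones once next_event_type would pass FTRACE_MAX_EVENT, and the hash is protected by the trace_event_mutex rw_semaphore rather than a mutex. A hedged sketch of a module using the newly exported pair (all my_* names are invented):

static enum print_line_t my_event_print(struct trace_iterator *iter,
					int flags)
{
	trace_seq_printf(&iter->seq, "my event\n");
	return TRACE_TYPE_HANDLED;
}

static struct trace_event my_event = {
	.type	= 0,			/* 0 = please assign a type */
	.trace	= my_event_print,	/* unset handlers default to trace_nop_print */
};

static int __init my_mod_init(void)
{
	/* returns the assigned type on success, 0 on failure */
	return register_ftrace_event(&my_event) ? 0 : -ENOSPC;
}

static void __exit my_mod_exit(void)
{
	unregister_ftrace_event(&my_event);
}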
@@ -833,14 +1016,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
833 1016
834 trace_assign_type(field, iter->ent); 1017 trace_assign_type(field, iter->ent);
835 1018
1019 if (!trace_seq_puts(s, "<stack trace>\n"))
1020 goto partial;
836 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1021 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
837 if (i) { 1022 if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
838 if (!trace_seq_puts(s, " <= ")) 1023 break;
839 goto partial; 1024 if (!trace_seq_puts(s, " => "))
1025 goto partial;
840 1026
841 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1027 if (!seq_print_ip_sym(s, field->caller[i], flags))
842 goto partial; 1028 goto partial;
843 }
844 if (!trace_seq_puts(s, "\n")) 1029 if (!trace_seq_puts(s, "\n"))
845 goto partial; 1030 goto partial;
846 } 1031 }
@@ -868,10 +1053,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
868 1053
869 trace_assign_type(field, iter->ent); 1054 trace_assign_type(field, iter->ent);
870 1055
871 if (!seq_print_userip_objs(field, s, flags)) 1056 if (!trace_seq_puts(s, "<user stack trace>\n"))
872 goto partial; 1057 goto partial;
873 1058
874 if (!trace_seq_putc(s, '\n')) 1059 if (!seq_print_userip_objs(field, s, flags))
875 goto partial; 1060 goto partial;
876 1061
877 return TRACE_TYPE_HANDLED; 1062 return TRACE_TYPE_HANDLED;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index e0bde39c2dd9..d38bec4a9c30 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,41 +1,17 @@
1#ifndef __TRACE_EVENTS_H 1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H 2#define __TRACE_EVENTS_H
3 3
4#include <linux/trace_seq.h>
4#include "trace.h" 5#include "trace.h"
5 6
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern enum print_line_t 7extern enum print_line_t
19trace_print_bprintk_msg_only(struct trace_iterator *iter); 8trace_print_bprintk_msg_only(struct trace_iterator *iter);
20extern enum print_line_t 9extern enum print_line_t
21trace_print_printk_msg_only(struct trace_iterator *iter); 10trace_print_printk_msg_only(struct trace_iterator *iter);
22 11
23extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
24 __attribute__ ((format (printf, 2, 3)));
25extern int
26trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
27extern int 12extern int
28seq_print_ip_sym(struct trace_seq *s, unsigned long ip, 13seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
29 unsigned long sym_flags); 14 unsigned long sym_flags);
30extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
31 size_t cnt);
32extern int trace_seq_puts(struct trace_seq *s, const char *str);
33extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
34extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
35extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
36 size_t len);
37extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
38extern int trace_seq_path(struct trace_seq *s, struct path *path);
39extern int seq_print_userip_objs(const struct userstack_entry *entry, 15extern int seq_print_userip_objs(const struct userstack_entry *entry,
40 struct trace_seq *s, unsigned long sym_flags); 16 struct trace_seq *s, unsigned long sym_flags);
41extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, 17extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
44extern int trace_print_context(struct trace_iterator *iter); 20extern int trace_print_context(struct trace_iterator *iter);
45extern int trace_print_lat_context(struct trace_iterator *iter); 21extern int trace_print_lat_context(struct trace_iterator *iter);
46 22
23extern void trace_event_read_lock(void);
24extern void trace_event_read_unlock(void);
47extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
48extern int register_ftrace_event(struct trace_event *event);
49extern int unregister_ftrace_event(struct trace_event *event);
50 26
51extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
52 int flags); 28 int flags);
53 29
30/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event);
32extern struct rw_semaphore trace_event_mutex;
33
54#define MAX_MEMHEX_BYTES 8 34#define MAX_MEMHEX_BYTES 8
55#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 35#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
56 36
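With trace_event_mutex exposed as an rw_semaphore, output paths are expected to hold it for reading across ftrace_find_event(), matching the "Called with trace_event_read_lock() held" note added above. A minimal, hypothetical caller:

static enum print_line_t my_print_entry(struct trace_iterator *iter)
{
	enum print_line_t ret = TRACE_TYPE_UNHANDLED;
	struct trace_event *ev;

	trace_event_read_lock();
	ev = ftrace_find_event(iter->ent->type);
	if (ev)
		ret = ev->trace(iter, 0);	/* call while still locked */
	trace_event_read_unlock();

	return ret;
}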
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 118439709fb7..8a30d9874cd4 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
36 36
37static void probe_power_end(struct power_trace *it) 37static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power;
39 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
40 struct trace_power *entry; 41 struct trace_power *entry;
41 struct trace_array_cpu *data; 42 struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
54 goto out; 55 goto out;
55 entry = ring_buffer_event_data(event); 56 entry = ring_buffer_event_data(event);
56 entry->state_data = *it; 57 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0); 58 if (!filter_check_discard(call, entry, tr->buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out: 60 out:
59 preempt_enable(); 61 preempt_enable();
60} 62}
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
62static void probe_power_mark(struct power_trace *it, unsigned int type, 64static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level) 65 unsigned int level)
64{ 66{
67 struct ftrace_event_call *call = &event_power;
65 struct ring_buffer_event *event; 68 struct ring_buffer_event *event;
66 struct trace_power *entry; 69 struct trace_power *entry;
67 struct trace_array_cpu *data; 70 struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
84 goto out; 87 goto out;
85 entry = ring_buffer_event_data(event); 88 entry = ring_buffer_event_data(event);
86 entry->state_data = *it; 89 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0); 90 if (!filter_check_discard(call, entry, tr->buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out: 92 out:
89 preempt_enable(); 93 preempt_enable();
90} 94}
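Both power probes now pass the filled entry through filter_check_discard() before committing, so events rejected by the per-event filter never reach the ring buffer. A condensed sketch of the sequence, assuming the trace_buffer_lock_reserve() call these probes already use; my_power_commit() is a made-up helper:

static void my_power_commit(struct trace_array *tr,
			    struct ftrace_event_call *call,
			    struct power_trace *sample)
{
	struct ring_buffer_event *event;
	struct trace_power *entry;

	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
					  sizeof(*entry), 0, 0);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	entry->state_data = *sample;

	/* dropped silently when the event's filter says no */
	if (!filter_check_discard(call, entry, tr->buffer, event))
		trace_buffer_unlock_commit(tr, event, 0, 0);
}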
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107fe..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
@@ -245,17 +231,13 @@ static const struct file_operations ftrace_formats_fops = {
245static __init int init_trace_printk_function_export(void) 231static __init int init_trace_printk_function_export(void)
246{ 232{
247 struct dentry *d_tracer; 233 struct dentry *d_tracer;
248 struct dentry *entry;
249 234
250 d_tracer = tracing_init_dentry(); 235 d_tracer = tracing_init_dentry();
251 if (!d_tracer) 236 if (!d_tracer)
252 return 0; 237 return 0;
253 238
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer, 239 trace_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops); 240 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259 241
260 return 0; 242 return 0;
261} 243}
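The printk_formats rework makes t_start() compute its cursor directly from *pos against the linker-section bounds, so the iterator keeps no state in m->private and ftrace_formats_open() shrinks to a bare seq_open(). The same stateless shape for a hypothetical array-backed seq_file:

static const char *my_items[] = { "one", "two", "three" };

static void *my_start(struct seq_file *m, loff_t *pos)
{
	if (*pos >= ARRAY_SIZE(my_items))
		return NULL;
	return (void *)&my_items[*pos];
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return my_start(m, pos);	/* recompute from *pos, no saved state */
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char **)v);
	return 0;
}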
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1ae..a98106dd979c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h> 13#include <trace/events/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
29 int cpu; 29 int cpu;
30 int pc; 30 int pc;
31 31
32 if (!sched_ref || sched_stopped) 32 if (unlikely(!sched_ref))
33 return; 33 return;
34 34
35 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
36 tracing_record_cmdline(next); 36 tracing_record_cmdline(next);
37 37
38 if (!tracer_enabled) 38 if (!tracer_enabled || sched_stopped)
39 return; 39 return;
40 40
41 pc = preempt_count(); 41 pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
56 unsigned long flags; 56 unsigned long flags;
57 int cpu, pc; 57 int cpu, pc;
58 58
59 if (!likely(tracer_enabled)) 59 if (unlikely(!sched_ref))
60 return; 60 return;
61 61
62 pc = preempt_count();
63 tracing_record_cmdline(current); 62 tracing_record_cmdline(current);
64 63
65 if (sched_stopped) 64 if (!tracer_enabled || sched_stopped)
66 return; 65 return;
67 66
67 pc = preempt_count();
68 local_irq_save(flags); 68 local_irq_save(flags);
69 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
70 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153e..eacb27225173 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <trace/sched.h> 18#include <trace/events/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
138 138
139 pc = preempt_count(); 139 pc = preempt_count();
140 140
141 /* The task we are waiting for is waking up */
142 data = wakeup_trace->data[wakeup_cpu];
143
144 /* disable local data, not wakeup_cpu data */ 141 /* disable local data, not wakeup_cpu data */
145 cpu = raw_smp_processor_id(); 142 cpu = raw_smp_processor_id();
146 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 143 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
154 if (unlikely(!tracer_enabled || next != wakeup_task)) 151 if (unlikely(!tracer_enabled || next != wakeup_task))
155 goto out_unlock; 152 goto out_unlock;
156 153
154 /* The task we are waiting for is waking up */
155 data = wakeup_trace->data[wakeup_cpu];
156
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 159
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 08f4eb2763d1..00dd6485bdd7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
19 return 1; 20 return 1;
20 } 21 }
21 return 0; 22 return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
188#else 189#else
189# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 190# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
190#endif /* CONFIG_DYNAMIC_FTRACE */ 191#endif /* CONFIG_DYNAMIC_FTRACE */
192
191/* 193/*
192 * Simple verification test of ftrace function tracer. 194 * Simple verification test of ftrace function tracer.
193 * Enable ftrace, sleep 1/10 second, and then read the trace 195 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
749 return ret; 751 return ret;
750} 752}
751#endif /* CONFIG_BRANCH_TRACER */ 753#endif /* CONFIG_BRANCH_TRACER */
754
755#ifdef CONFIG_HW_BRANCH_TRACER
756int
757trace_selftest_startup_hw_branches(struct tracer *trace,
758 struct trace_array *tr)
759{
760 struct trace_iterator *iter;
761 struct tracer tracer;
762 unsigned long count;
763 int ret;
764
765 if (!trace->open) {
766 printk(KERN_CONT "missing open function...");
767 return -1;
768 }
769
770 ret = tracer_init(trace, tr);
771 if (ret) {
772 warn_failed_init_tracer(trace, ret);
773 return ret;
774 }
775
776 /*
777 * The hw-branch tracer needs to collect the trace from the various
778 * cpu trace buffers - before tracing is stopped.
779 */
780 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
781 if (!iter)
782 return -ENOMEM;
783
784 memcpy(&tracer, trace, sizeof(tracer));
785
786 iter->trace = &tracer;
787 iter->tr = tr;
788 iter->pos = -1;
789 mutex_init(&iter->mutex);
790
791 trace->open(iter);
792
793 mutex_destroy(&iter->mutex);
794 kfree(iter);
795
796 tracing_stop();
797
798 ret = trace_test_buffer(tr, &count);
799 trace->reset(tr);
800 tracing_start();
801
802 if (!ret && !count) {
803 printk(KERN_CONT "no entries found..");
804 ret = -1;
805 }
806
807 return ret;
808}
809#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f9661..6a2a9d484cd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
265 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
266 " (%d entries)\n" 266 " (%d entries)\n"
267 " ----- ---- --------\n", 267 " ----- ---- --------\n",
268 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries - 1);
269 269
270 if (!stack_tracer_enabled && !max_stack_size) 270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m); 271 print_disabled(m);
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 301
302static int stack_trace_open(struct inode *inode, struct file *file) 302static int stack_trace_open(struct inode *inode, struct file *file)
303{ 303{
304 int ret; 304 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 305}
310 306
311static const struct file_operations stack_trace_fops = { 307static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 308 .open = stack_trace_open,
313 .read = seq_read, 309 .read = seq_read,
314 .llseek = seq_lseek, 310 .llseek = seq_lseek,
311 .release = seq_release,
315}; 312};
316 313
317int 314int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 324
328 if (ret || !write || 325 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 326 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 327 goto out;
331 328
332 last_stack_tracer_enabled = stack_tracer_enabled; 329 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 330
334 if (stack_tracer_enabled) 331 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 332 register_ftrace_function(&trace_ops);
@@ -352,19 +349,14 @@ __setup("stacktrace", enable_stacktrace);
352static __init int stack_trace_init(void) 349static __init int stack_trace_init(void)
353{ 350{
354 struct dentry *d_tracer; 351 struct dentry *d_tracer;
355 struct dentry *entry;
356 352
357 d_tracer = tracing_init_dentry(); 353 d_tracer = tracing_init_dentry();
358 354
359 entry = debugfs_create_file("stack_max_size", 0644, d_tracer, 355 trace_create_file("stack_max_size", 0644, d_tracer,
360 &max_stack_size, &stack_max_size_fops); 356 &max_stack_size, &stack_max_size_fops);
361 if (!entry)
362 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
363 357
364 entry = debugfs_create_file("stack_trace", 0444, d_tracer, 358 trace_create_file("stack_trace", 0444, d_tracer,
365 NULL, &stack_trace_fops); 359 NULL, &stack_trace_fops);
366 if (!entry)
367 pr_warning("Could not create debugfs 'stack_trace' entry\n");
368 360
369 if (stack_tracer_enabled) 361 if (stack_tracer_enabled)
370 register_ftrace_function(&trace_ops); 362 register_ftrace_function(&trace_ops);
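In the stack_trace_sysctl() hunk above, stack_tracer_enabled is folded through !! before it is compared and cached, so writing 2 or 10 to the sysctl behaves exactly like writing 1 and does not re-register the ftrace ops. The idiom in isolation (my_update() is invented):

/* treat any non-zero sysctl value as "on" */
static int my_update(int *last, int raw_value)
{
	int enabled = !!raw_value;	/* fold 2, 10, ... down to 1 */

	if (*last == enabled)
		return 0;		/* no state change */

	*last = enabled;
	return 1;			/* caller should (un)register */
}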
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index acdebd771a93..aea321c82fa0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Infrastructure for statistic tracing (histogram output). 2 * Infrastructure for statistic tracing (histogram output).
3 * 3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * 5 *
6 * Based on the code from trace_branch.c which is 6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,22 +10,27 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/rbtree.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
14#include "trace_stat.h" 15#include "trace_stat.h"
15#include "trace.h" 16#include "trace.h"
16 17
17 18
18/* List of stat entries from a tracer */ 19/*
19struct trace_stat_list { 20 * List of stat red-black nodes from a tracer
 20 struct list_head list; 21 * We use such a tree to quickly sort the stat
22 * entries from the tracer.
23 */
24struct stat_node {
25 struct rb_node node;
21 void *stat; 26 void *stat;
22}; 27};
23 28
24/* A stat session is the stats output in one file */ 29/* A stat session is the stats output in one file */
25struct tracer_stat_session { 30struct stat_session {
26 struct list_head session_list; 31 struct list_head session_list;
27 struct tracer_stat *ts; 32 struct tracer_stat *ts;
28 struct list_head stat_list; 33 struct rb_root stat_root;
29 struct mutex stat_mutex; 34 struct mutex stat_mutex;
30 struct dentry *file; 35 struct dentry *file;
31}; 36};
@@ -37,77 +42,136 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
37/* The root directory for all stat files */ 42/* The root directory for all stat files */
38static struct dentry *stat_dir; 43static struct dentry *stat_dir;
39 44
45/*
46 * Iterate through the rbtree using a post order traversal path
47 * to release the next node.
 48 * It won't necessarily release one at each iteration
49 * but it will at least advance closer to the next one
50 * to be released.
51 */
52static struct rb_node *release_next(struct rb_node *node)
53{
54 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node);
56
57 if (node->rb_left)
58 return node->rb_left;
59 else if (node->rb_right)
60 return node->rb_right;
61 else {
62 if (!parent)
63 ;
64 else if (parent->rb_left == node)
65 parent->rb_left = NULL;
66 else
67 parent->rb_right = NULL;
68
69 snode = container_of(node, struct stat_node, node);
70 kfree(snode);
71
72 return parent;
73 }
74}
40 75
41static void reset_stat_session(struct tracer_stat_session *session) 76static void __reset_stat_session(struct stat_session *session)
42{ 77{
43 struct trace_stat_list *node, *next; 78 struct rb_node *node = session->stat_root.rb_node;
44 79
45 list_for_each_entry_safe(node, next, &session->stat_list, list) 80 while (node)
46 kfree(node); 81 node = release_next(node);
47 82
48 INIT_LIST_HEAD(&session->stat_list); 83 session->stat_root = RB_ROOT;
49} 84}
50 85
51static void destroy_session(struct tracer_stat_session *session) 86static void reset_stat_session(struct stat_session *session)
87{
88 mutex_lock(&session->stat_mutex);
89 __reset_stat_session(session);
90 mutex_unlock(&session->stat_mutex);
91}
92
93static void destroy_session(struct stat_session *session)
52{ 94{
53 debugfs_remove(session->file); 95 debugfs_remove(session->file);
54 reset_stat_session(session); 96 __reset_stat_session(session);
55 mutex_destroy(&session->stat_mutex); 97 mutex_destroy(&session->stat_mutex);
56 kfree(session); 98 kfree(session);
57} 99}
58 100
101typedef int (*cmp_stat_t)(void *, void *);
102
103static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
104{
105 struct rb_node **new = &(root->rb_node), *parent = NULL;
106 struct stat_node *data;
107
108 data = kzalloc(sizeof(*data), GFP_KERNEL);
109 if (!data)
110 return -ENOMEM;
111 data->stat = stat;
112
113 /*
 114 * Figure out where to put the new node.
 115 * This is a descending sort.
116 */
117 while (*new) {
118 struct stat_node *this;
119 int result;
120
121 this = container_of(*new, struct stat_node, node);
122 result = cmp(data->stat, this->stat);
123
124 parent = *new;
125 if (result >= 0)
126 new = &((*new)->rb_left);
127 else
128 new = &((*new)->rb_right);
129 }
130
131 rb_link_node(&data->node, parent, new);
132 rb_insert_color(&data->node, root);
133 return 0;
134}
135
59/* 136/*
60 * For tracers that don't provide a stat_cmp callback. 137 * For tracers that don't provide a stat_cmp callback.
61 * This one will force an immediate insertion on tail of 138 * This one will force an insertion as right-most node
62 * the list. 139 * in the rbtree.
63 */ 140 */
64static int dummy_cmp(void *p1, void *p2) 141static int dummy_cmp(void *p1, void *p2)
65{ 142{
66 return 1; 143 return -1;
67} 144}
68 145
69/* 146/*
70 * Initialize the stat list at each trace_stat file opening. 147 * Initialize the stat rbtree at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening 148 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions. 149 * since the stats could have changed between two file sessions.
73 */ 150 */
74static int stat_seq_init(struct tracer_stat_session *session) 151static int stat_seq_init(struct stat_session *session)
75{ 152{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts; 153 struct tracer_stat *ts = session->ts;
154 struct rb_root *root = &session->stat_root;
78 void *stat; 155 void *stat;
79 int ret = 0; 156 int ret = 0;
80 int i; 157 int i;
81 158
82 mutex_lock(&session->stat_mutex); 159 mutex_lock(&session->stat_mutex);
83 reset_stat_session(session); 160 __reset_stat_session(session);
84 161
85 if (!ts->stat_cmp) 162 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp; 163 ts->stat_cmp = dummy_cmp;
87 164
88 stat = ts->stat_start(); 165 stat = ts->stat_start(ts);
89 if (!stat) 166 if (!stat)
90 goto exit; 167 goto exit;
91 168
92 /* 169 ret = insert_stat(root, stat, ts->stat_cmp);
93 * The first entry. Actually this is the second, but the first 170 if (ret)
94 * one (the stat_list head) is pointless.
95 */
96 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
97 if (!new_entry) {
98 ret = -ENOMEM;
99 goto exit; 171 goto exit;
100 }
101
102 INIT_LIST_HEAD(&new_entry->list);
103
104 list_add(&new_entry->list, &session->stat_list);
105
106 new_entry->stat = stat;
107 172
108 /* 173 /*
109 * Iterate over the tracer stat entries and store them in a sorted 174 * Iterate over the tracer stat entries and store them in an rbtree.
110 * list.
111 */ 175 */
112 for (i = 1; ; i++) { 176 for (i = 1; ; i++) {
113 stat = ts->stat_next(stat, i); 177 stat = ts->stat_next(stat, i);
@@ -116,37 +180,17 @@ static int stat_seq_init(struct tracer_stat_session *session)
116 if (!stat) 180 if (!stat)
117 break; 181 break;
118 182
119 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); 183 ret = insert_stat(root, stat, ts->stat_cmp);
120 if (!new_entry) { 184 if (ret)
121 ret = -ENOMEM; 185 goto exit_free_rbtree;
122 goto exit_free_list;
123 }
124
125 INIT_LIST_HEAD(&new_entry->list);
126 new_entry->stat = stat;
127
128 list_for_each_entry_reverse(iter_entry, &session->stat_list,
129 list) {
130
131 /* Insertion with a descendent sorting */
132 if (ts->stat_cmp(iter_entry->stat,
133 new_entry->stat) >= 0) {
134
135 list_add(&new_entry->list, &iter_entry->list);
136 break;
137 }
138 }
139
140 /* The current larger value */
141 if (list_empty(&new_entry->list))
142 list_add(&new_entry->list, &session->stat_list);
143 } 186 }
187
144exit: 188exit:
145 mutex_unlock(&session->stat_mutex); 189 mutex_unlock(&session->stat_mutex);
146 return ret; 190 return ret;
147 191
148exit_free_list: 192exit_free_rbtree:
149 reset_stat_session(session); 193 __reset_stat_session(session);
150 mutex_unlock(&session->stat_mutex); 194 mutex_unlock(&session->stat_mutex);
151 return ret; 195 return ret;
152} 196}
@@ -154,38 +198,47 @@ exit_free_list:
154 198
155static void *stat_seq_start(struct seq_file *s, loff_t *pos) 199static void *stat_seq_start(struct seq_file *s, loff_t *pos)
156{ 200{
157 struct tracer_stat_session *session = s->private; 201 struct stat_session *session = s->private;
202 struct rb_node *node;
203 int i;
158 204
159 /* Prevent from tracer switch or stat_list modification */ 205 /* Prevent from tracer switch or rbtree modification */
160 mutex_lock(&session->stat_mutex); 206 mutex_lock(&session->stat_mutex);
161 207
162 /* If we are in the beginning of the file, print the headers */ 208 /* If we are in the beginning of the file, print the headers */
163 if (!*pos && session->ts->stat_headers) 209 if (!*pos && session->ts->stat_headers)
164 return SEQ_START_TOKEN; 210 return SEQ_START_TOKEN;
165 211
166 return seq_list_start(&session->stat_list, *pos); 212 node = rb_first(&session->stat_root);
213 for (i = 0; node && i < *pos; i++)
214 node = rb_next(node);
215
216 return node;
167} 217}
168 218
169static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) 219static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
170{ 220{
171 struct tracer_stat_session *session = s->private; 221 struct stat_session *session = s->private;
222 struct rb_node *node = p;
223
224 (*pos)++;
172 225
173 if (p == SEQ_START_TOKEN) 226 if (p == SEQ_START_TOKEN)
174 return seq_list_start(&session->stat_list, *pos); 227 return rb_first(&session->stat_root);
175 228
176 return seq_list_next(p, &session->stat_list, pos); 229 return rb_next(node);
177} 230}
178 231
179static void stat_seq_stop(struct seq_file *s, void *p) 232static void stat_seq_stop(struct seq_file *s, void *p)
180{ 233{
181 struct tracer_stat_session *session = s->private; 234 struct stat_session *session = s->private;
182 mutex_unlock(&session->stat_mutex); 235 mutex_unlock(&session->stat_mutex);
183} 236}
184 237
185static int stat_seq_show(struct seq_file *s, void *v) 238static int stat_seq_show(struct seq_file *s, void *v)
186{ 239{
187 struct tracer_stat_session *session = s->private; 240 struct stat_session *session = s->private;
188 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); 241 struct stat_node *l = container_of(v, struct stat_node, node);
189 242
190 if (v == SEQ_START_TOKEN) 243 if (v == SEQ_START_TOKEN)
191 return session->ts->stat_headers(s); 244 return session->ts->stat_headers(s);
@@ -204,31 +257,34 @@ static const struct seq_operations trace_stat_seq_ops = {
204static int tracing_stat_open(struct inode *inode, struct file *file) 257static int tracing_stat_open(struct inode *inode, struct file *file)
205{ 258{
206 int ret; 259 int ret;
260 struct seq_file *m;
261 struct stat_session *session = inode->i_private;
207 262
208 struct tracer_stat_session *session = inode->i_private; 263 ret = stat_seq_init(session);
264 if (ret)
265 return ret;
209 266
210 ret = seq_open(file, &trace_stat_seq_ops); 267 ret = seq_open(file, &trace_stat_seq_ops);
211 if (!ret) { 268 if (ret) {
212 struct seq_file *m = file->private_data; 269 reset_stat_session(session);
213 m->private = session; 270 return ret;
214 ret = stat_seq_init(session);
215 } 271 }
216 272
273 m = file->private_data;
274 m->private = session;
217 return ret; 275 return ret;
218} 276}
219 277
220/* 278/*
221 * Avoid consuming memory with our now useless list. 279 * Avoid consuming memory with our now useless rbtree.
222 */ 280 */
223static int tracing_stat_release(struct inode *i, struct file *f) 281static int tracing_stat_release(struct inode *i, struct file *f)
224{ 282{
225 struct tracer_stat_session *session = i->i_private; 283 struct stat_session *session = i->i_private;
226 284
227 mutex_lock(&session->stat_mutex);
228 reset_stat_session(session); 285 reset_stat_session(session);
229 mutex_unlock(&session->stat_mutex);
230 286
231 return 0; 287 return seq_release(i, f);
232} 288}
233 289
234static const struct file_operations tracing_stat_fops = { 290static const struct file_operations tracing_stat_fops = {
@@ -251,7 +307,7 @@ static int tracing_stat_init(void)
251 return 0; 307 return 0;
252} 308}
253 309
254static int init_stat_file(struct tracer_stat_session *session) 310static int init_stat_file(struct stat_session *session)
255{ 311{
256 if (!stat_dir && tracing_stat_init()) 312 if (!stat_dir && tracing_stat_init())
257 return -ENODEV; 313 return -ENODEV;
@@ -266,7 +322,7 @@ static int init_stat_file(struct tracer_stat_session *session)
266 322
267int register_stat_tracer(struct tracer_stat *trace) 323int register_stat_tracer(struct tracer_stat *trace)
268{ 324{
269 struct tracer_stat_session *session, *node, *tmp; 325 struct stat_session *session, *node;
270 int ret; 326 int ret;
271 327
272 if (!trace) 328 if (!trace)
@@ -277,7 +333,7 @@ int register_stat_tracer(struct tracer_stat *trace)
277 333
278 /* Already registered? */ 334 /* Already registered? */
279 mutex_lock(&all_stat_sessions_mutex); 335 mutex_lock(&all_stat_sessions_mutex);
280 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 336 list_for_each_entry(node, &all_stat_sessions, session_list) {
281 if (node->ts == trace) { 337 if (node->ts == trace) {
282 mutex_unlock(&all_stat_sessions_mutex); 338 mutex_unlock(&all_stat_sessions_mutex);
283 return -EINVAL; 339 return -EINVAL;
@@ -286,15 +342,13 @@ int register_stat_tracer(struct tracer_stat *trace)
286 mutex_unlock(&all_stat_sessions_mutex); 342 mutex_unlock(&all_stat_sessions_mutex);
287 343
288 /* Init the session */ 344 /* Init the session */
289 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); 345 session = kzalloc(sizeof(*session), GFP_KERNEL);
290 if (!session) 346 if (!session)
291 return -ENOMEM; 347 return -ENOMEM;
292 348
293 session->ts = trace; 349 session->ts = trace;
294 INIT_LIST_HEAD(&session->session_list); 350 INIT_LIST_HEAD(&session->session_list);
295 INIT_LIST_HEAD(&session->stat_list);
296 mutex_init(&session->stat_mutex); 351 mutex_init(&session->stat_mutex);
297 session->file = NULL;
298 352
299 ret = init_stat_file(session); 353 ret = init_stat_file(session);
300 if (ret) { 354 if (ret) {
@@ -312,7 +366,7 @@ int register_stat_tracer(struct tracer_stat *trace)
312 366
313void unregister_stat_tracer(struct tracer_stat *trace) 367void unregister_stat_tracer(struct tracer_stat *trace)
314{ 368{
315 struct tracer_stat_session *node, *tmp; 369 struct stat_session *node, *tmp;
316 370
317 mutex_lock(&all_stat_sessions_mutex); 371 mutex_lock(&all_stat_sessions_mutex);
318 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 372 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
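With the stat entries held in an rbtree, insert_stat() places each node using the tracer's stat_cmp() (a result >= 0 descends left, so larger entries come first), and dummy_cmp()'s constant -1 pushes uncompared entries to the right-most slot, preserving insertion order. A hedged sketch of a comparison callback and of walking the finished tree (struct my_stat is invented):

struct my_stat {
	unsigned long hits;
};

/* sort descending by hit count */
static int my_stat_cmp(void *p1, void *p2)
{
	struct my_stat *a = p1, *b = p2;

	if (a->hits > b->hits)
		return 1;
	if (a->hits < b->hits)
		return -1;
	return 0;
}

/* in-order walk: largest first, thanks to the descending insertion */
static void my_dump(struct rb_root *root)
{
	struct rb_node *node;

	for (node = rb_first(root); node; node = rb_next(node)) {
		struct stat_node *sn = rb_entry(node, struct stat_node, node);
		struct my_stat *st = sn->stat;

		pr_info("%lu hits\n", st->hits);
	}
}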
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3d..f3546a2cd826 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
12 /* The name of your stat file */ 12 /* The name of your stat file */
13 const char *name; 13 const char *name;
14 /* Iteration over statistic entries */ 14 /* Iteration over statistic entries */
15 void *(*stat_start)(void); 15 void *(*stat_start)(struct tracer_stat *trace);
16 void *(*stat_next)(void *prev, int idx); 16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */ 17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
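Since stat_start() now receives the tracer_stat itself, one set of callbacks can back several registered stat files. A minimal, hypothetical tracer_stat built on the callbacks declared here (stat_show and register_stat_tracer() come from the same trace_stat API):

static unsigned long my_counts[3];

static void *my_stat_start(struct tracer_stat *trace)
{
	return &my_counts[0];
}

static void *my_stat_next(void *prev, int idx)
{
	if (idx >= (int)ARRAY_SIZE(my_counts))
		return NULL;
	return &my_counts[idx];
}

static int my_stat_show(struct seq_file *s, void *p)
{
	seq_printf(s, "%lu\n", *(unsigned long *)p);
	return 0;
}

static struct tracer_stat my_stats __read_mostly = {
	.name		= "my_stats",
	.stat_start	= my_stat_start,
	.stat_next	= my_stat_next,
	.stat_show	= my_stat_show,
	/* no .stat_cmp: dummy_cmp() keeps insertion order */
};

/* register_stat_tracer(&my_stats) creates trace_stat/my_stats in debugfs */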
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149f..f6693969287d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 205
206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
207 HRTIMER_MODE_REL_PINNED);
207} 208}
208 209
209static void start_stack_timers(void) 210static void start_stack_timers(void)
@@ -321,11 +322,7 @@ static const struct file_operations sysprof_sample_fops = {
321 322
322void init_tracer_sysprof_debugfs(struct dentry *d_tracer) 323void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
323{ 324{
324 struct dentry *entry;
325 325
326 entry = debugfs_create_file("sysprof_sample_period", 0644, 326 trace_create_file("sysprof_sample_period", 0644,
327 d_tracer, NULL, &sysprof_sample_fops); 327 d_tracer, NULL, &sysprof_sample_fops);
328 if (entry)
329 return;
330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
331} 328}
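start_stack_timer() now arms each per-cpu hrtimer with HRTIMER_MODE_REL_PINNED so the sampling timer is not migrated off the CPU it is meant to profile. The pattern in isolation, with invented names and a fixed 1 ms period:

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* per-cpu sampling work would go here */
	hrtimer_forward_now(timer, ns_to_ktime(1000000));
	return HRTIMER_RESTART;
}

static void my_start_timer(struct hrtimer *timer)
{
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer->function = my_timer_fn;

	/* _PINNED keeps the timer on the cpu that started it */
	hrtimer_start(timer, ns_to_ktime(1000000),
		      HRTIMER_MODE_REL_PINNED);
}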
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 797201e4a137..97fcea4acce1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8 8
9#include <trace/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include "trace_stat.h" 12#include "trace_stat.h"
@@ -16,8 +16,6 @@
16/* A cpu workqueue thread */ 16/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 17struct cpu_workqueue_stats {
18 struct list_head list; 18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu; 19 int cpu;
22 pid_t pid; 20 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */ 21/* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work) 45 struct work_struct *work)
48{ 46{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed); 47 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next; 48 struct cpu_workqueue_stats *node;
51 unsigned long flags; 49 unsigned long flags;
52 50
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 51 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 52 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
55 list) {
56 if (node->pid == wq_thread->pid) { 53 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted); 54 atomic_inc(&node->inserted);
58 goto found; 55 goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work) 66 struct work_struct *work)
70{ 67{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed); 68 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next; 69 struct cpu_workqueue_stats *node;
73 unsigned long flags; 70 unsigned long flags;
74 71
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 72 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 73 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
77 list) {
78 if (node->pid == wq_thread->pid) { 74 if (node->pid == wq_thread->pid) {
79 node->executed++; 75 node->executed++;
80 goto found; 76 goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
105 cws->pid = wq_thread->pid; 101 cws->pid = wq_thread->pid;
106 102
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); 104 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 105 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112} 106}
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
152 return ret; 146 return ret;
153} 147}
154 148
155static void *workqueue_stat_start(void) 149static void *workqueue_stat_start(struct tracer_stat *trace)
156{ 150{
157 int cpu; 151 int cpu;
158 void *ret = NULL; 152 void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
191static int workqueue_stat_show(struct seq_file *s, void *p) 185static int workqueue_stat_show(struct seq_file *s, void *p)
192{ 186{
193 struct cpu_workqueue_stats *cws = p; 187 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid; 188 struct pid *pid;
197 struct task_struct *tsk; 189 struct task_struct *tsk;
198 190
199 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
200 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
201 seq_printf(s, "\n");
202 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
203
204 pid = find_get_pid(cws->pid); 191 pid = find_get_pid(cws->pid);
205 if (pid) { 192 if (pid) {
206 tsk = get_pid_task(pid, PIDTYPE_PID); 193 tsk = get_pid_task(pid, PIDTYPE_PID);
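
Besides following the tracepoint header move (trace/events/workqueue.h) and the new stat_start() signature, trace_workqueue.c drops the first_entry bookkeeping and switches both probes from list_for_each_entry_safe() to list_for_each_entry(): the _safe variant is only needed when the loop may delete the node it is standing on, which these read-only lookups never do. A generic illustration of the distinction, using example types (assumes <linux/list.h> and <linux/slab.h>):

struct example_item {
        struct list_head list;
        int key;
};

/* read-only search: the plain iterator is enough */
static struct example_item *example_lookup(struct list_head *head, int key)
{
        struct example_item *it;

        list_for_each_entry(it, head, list)
                if (it->key == key)
                        return it;
        return NULL;
}

/* deletes nodes while walking: must use the _safe variant */
static void example_remove_all(struct list_head *head, int key)
{
        struct example_item *it, *next;

        list_for_each_entry_safe(it, next, head, list)
                if (it->key == key) {
                        list_del(&it->list);
                        kfree(it);
                }
}
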
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 75 put_user_ns(up->user_ns);
76} 76}
77 77
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
79{
80 struct user_struct *user;
81 struct hlist_node *h;
82
83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) {
85 atomic_inc(&user->__count);
86 return user;
87 }
88 }
89
90 return NULL;
91}
92
93#ifdef CONFIG_USER_SCHED 78#ifdef CONFIG_USER_SCHED
94 79
95static void sched_destroy_user(struct user_struct *up) 80static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
119 104
120#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) 105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
121 106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
122static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
123static DEFINE_MUTEX(uids_mutex); 125static DEFINE_MUTEX(uids_mutex);
124 126
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
283 return uids_user_create(&root_user); 285 return uids_user_create(&root_user);
284} 286}
285 287
286/* work function to remove sysfs directory for a user and free up 288/* delayed work function to remove sysfs directory for a user and free up
287 * corresponding structures. 289 * corresponding structures.
288 */ 290 */
289static void cleanup_user_struct(struct work_struct *w) 291static void cleanup_user_struct(struct work_struct *w)
290{ 292{
291 struct user_struct *up = container_of(w, struct user_struct, work); 293 struct user_struct *up = container_of(w, struct user_struct, work.work);
292 unsigned long flags; 294 unsigned long flags;
293 int remove_user = 0; 295 int remove_user = 0;
294 296
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
297 */ 299 */
298 uids_mutex_lock(); 300 uids_mutex_lock();
299 301
300 local_irq_save(flags); 302 spin_lock_irqsave(&uidhash_lock, flags);
301 303 if (atomic_read(&up->__count) == 0) {
302 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
303 uid_hash_remove(up); 304 uid_hash_remove(up);
304 remove_user = 1; 305 remove_user = 1;
305 spin_unlock_irqrestore(&uidhash_lock, flags);
306 } else {
307 local_irq_restore(flags);
308 } 306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
309 308
310 if (!remove_user) 309 if (!remove_user)
311 goto done; 310 goto done;
@@ -331,16 +330,28 @@ done:
331 */ 330 */
332static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
333{ 332{
334 /* restore back the count */
335 atomic_inc(&up->__count);
336 spin_unlock_irqrestore(&uidhash_lock, flags); 333 spin_unlock_irqrestore(&uidhash_lock, flags);
337 334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, cleanup_user_struct); 335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
339 schedule_work(&up->work);
340} 336}
341 337
342#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
343 339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{
342 struct user_struct *user;
343 struct hlist_node *h;
344
345 hlist_for_each_entry(user, h, hashent, uidhash_node) {
346 if (user->uid == uid) {
347 atomic_inc(&user->__count);
348 return user;
349 }
350 }
351
352 return NULL;
353}
354
344int uids_sysfs_init(void) { return 0; } 355int uids_sysfs_init(void) { return 0; }
345static inline int uids_user_create(struct user_struct *up) { return 0; } 356static inline int uids_user_create(struct user_struct *up) { return 0; }
346static inline void uids_mutex_lock(void) { } 357static inline void uids_mutex_lock(void) { }
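
The user.c changes turn user_struct teardown into delayed work: free_user() no longer re-takes the reference it just dropped, cleanup_user_struct() re-checks the count under uidhash_lock before removing the hash entry, and the sysfs-aware uid_hash_find() can resurrect an object whose cleanup is still pending by cancelling the delayed work when the count goes 0 -> 1. A stripped-down sketch of that deferred-free pattern, with illustrative types (the real code additionally serializes against uidhash_lock and uids_mutex, which this sketch elides; assumes <linux/workqueue.h>, <linux/slab.h>):

struct example_cached {
        atomic_t                refcnt;
        struct delayed_work     cleanup;
        /* ... payload ... */
};

static void example_cleanup(struct work_struct *w)
{
        struct example_cached *obj =
                container_of(w, struct example_cached, cleanup.work);

        /* nobody resurrected it while the work was queued */
        if (atomic_read(&obj->refcnt) == 0)
                kfree(obj);
}

static void example_put(struct example_cached *obj)
{
        if (atomic_dec_and_test(&obj->refcnt)) {
                INIT_DELAYED_WORK(&obj->cleanup, example_cleanup);
                schedule_delayed_work(&obj->cleanup, msecs_to_jiffies(1000));
        }
}

static struct example_cached *example_get(struct example_cached *obj)
{
        /* 0 -> 1 means a cleanup is pending: call it off */
        if (atomic_inc_return(&obj->refcnt) == 1)
                cancel_delayed_work(&obj->cleanup);
        return obj;
}
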
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af8..8a82b4b8ea52 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18static struct uts_namespace *create_uts_ns(void)
19{
20 struct uts_namespace *uts_ns;
21
22 uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
23 if (uts_ns)
24 kref_init(&uts_ns->kref);
25 return uts_ns;
26}
27
18/* 28/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 29 * Clone a new ns copying an original utsname, setting refcount to 1
20 * @old_ns: namespace to clone 30 * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24{ 34{
25 struct uts_namespace *ns; 35 struct uts_namespace *ns;
26 36
27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 37 ns = create_uts_ns();
28 if (!ns) 38 if (!ns)
29 return ERR_PTR(-ENOMEM); 39 return ERR_PTR(-ENOMEM);
30 40
31 down_read(&uts_sem); 41 down_read(&uts_sem);
32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem); 43 up_read(&uts_sem);
34 kref_init(&ns->kref);
35 return ns; 44 return ns;
36} 45}
37 46
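
Factoring allocation and kref_init() into create_uts_ns() means every allocation path starts with a properly initialised refcount, instead of each caller having to remember kref_init() after kmalloc(). For completeness, the matching put side of such a kref lifetime looks roughly like this (example_release / example_put_uts_ns are illustrative names, not the helpers utsname.c actually uses; assumes <linux/kref.h> and <linux/slab.h>):

static void example_release(struct kref *kref)
{
        struct uts_namespace *ns =
                container_of(kref, struct uts_namespace, kref);

        kfree(ns);
}

static void example_put_uts_ns(struct uts_namespace *ns)
{
        kref_put(&ns->kref, example_release);
}
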
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{
@@ -154,7 +155,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
154 if (!list_empty(&wait->task_list)) 155 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list); 156 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q)) 157 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key); 158 __wake_up_locked_key(q, mode, key);
158 spin_unlock_irqrestore(&q->lock, flags); 159 spin_unlock_irqrestore(&q->lock, flags);
159} 160}
160EXPORT_SYMBOL(abort_exclusive_wait); 161EXPORT_SYMBOL(abort_exclusive_wait);
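
kernel/wait.c now exports __init_waitqueue_head(), which takes a lock_class_key and assigns it to the waitqueue spinlock, so every init site gets its own lockdep class and unrelated waitqueues no longer share (and cross-report) lock dependencies; abort_exclusive_wait() likewise switches to the key-aware __wake_up_locked_key(). The caller-facing macro lives in <linux/wait.h>, outside this hunk, but presumably supplies a static per-call-site key along these lines:

#define init_waitqueue_head(q)                          \
        do {                                            \
                static struct lock_class_key __key;     \
                                                        \
                __init_waitqueue_head((q), &__key);     \
        } while (0)
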
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a08950..0668795d8818 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h> 36#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h>
37 38
38/* 39/*
39 * The per-CPU workqueue (if single thread, we always use the first 40 * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
124 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 125 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
125} 126}
126 127
127DEFINE_TRACE(workqueue_insertion);
128
129static void insert_work(struct cpu_workqueue_struct *cwq, 128static void insert_work(struct cpu_workqueue_struct *cwq,
130 struct work_struct *work, struct list_head *head) 129 struct work_struct *work, struct list_head *head)
131{ 130{
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
262} 261}
263EXPORT_SYMBOL_GPL(queue_delayed_work_on); 262EXPORT_SYMBOL_GPL(queue_delayed_work_on);
264 263
265DEFINE_TRACE(workqueue_execution);
266
267static void run_workqueue(struct cpu_workqueue_struct *cwq) 264static void run_workqueue(struct cpu_workqueue_struct *cwq)
268{ 265{
269 spin_lock_irq(&cwq->lock); 266 spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
753 return cwq; 750 return cwq;
754} 751}
755 752
756DEFINE_TRACE(workqueue_creation);
757
758static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 753static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
759{ 754{
760 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 755 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
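
With trace/events/workqueue.h included under CREATE_TRACE_POINTS at the top of the file, the generated header now emits the tracepoint definitions itself, so the hand-written DEFINE_TRACE(workqueue_insertion/execution/creation) lines removed here (and workqueue_destruction in the hunk below) become redundant. The events-header convention looks roughly like the following hypothetical include/trace/events/foo.h; the event is illustrative, not the real workqueue one:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM foo

#if !defined(_TRACE_FOO_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FOO_H

#include <linux/tracepoint.h>

TRACE_EVENT(foo_event,
        TP_PROTO(int value),
        TP_ARGS(value),
        TP_STRUCT__entry(
                __field(int, value)
        ),
        TP_fast_assign(
                __entry->value = value;
        ),
        TP_printk("value=%d", __entry->value)
);

#endif /* _TRACE_FOO_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

Exactly one compilation unit then defines CREATE_TRACE_POINTS before including the header — the role the first workqueue.c hunk above takes on — while every other user simply includes the header to get the trace_*() calls.
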
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
860} 855}
861EXPORT_SYMBOL_GPL(__create_workqueue_key); 856EXPORT_SYMBOL_GPL(__create_workqueue_key);
862 857
863DEFINE_TRACE(workqueue_destruction);
864
865static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 858static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
866{ 859{
867 /* 860 /*