Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 12
-rw-r--r--  kernel/acct.c | 14
-rw-r--r--  kernel/audit.c | 146
-rw-r--r--  kernel/audit.h | 43
-rw-r--r--  kernel/audit_tree.c | 66
-rw-r--r--  kernel/audit_watch.c | 543
-rw-r--r--  kernel/auditfilter.c | 518
-rw-r--r--  kernel/auditsc.c | 33
-rw-r--r--  kernel/cgroup.c | 166
-rw-r--r--  kernel/cpu.c | 28
-rw-r--r--  kernel/cpuset.c | 260
-rw-r--r--  kernel/cred.c | 296
-rw-r--r--  kernel/dma-coherent.c | 176
-rw-r--r--  kernel/exit.c | 299
-rw-r--r--  kernel/fork.c | 76
-rw-r--r--  kernel/freezer.c | 7
-rw-r--r--  kernel/futex.c | 121
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/gcov/Kconfig | 48
-rw-r--r--  kernel/gcov/Makefile | 3
-rw-r--r--  kernel/gcov/base.c | 148
-rw-r--r--  kernel/gcov/fs.c | 673
-rw-r--r--  kernel/gcov/gcc_3_4.c | 447
-rw-r--r--  kernel/gcov/gcov.h | 128
-rw-r--r--  kernel/groups.c | 288
-rw-r--r--  kernel/hrtimer.c | 114
-rw-r--r--  kernel/irq/chip.c | 74
-rw-r--r--  kernel/irq/handle.c | 5
-rw-r--r--  kernel/irq/internals.h | 16
-rw-r--r--  kernel/irq/manage.c | 186
-rw-r--r--  kernel/irq/migration.c | 2
-rw-r--r--  kernel/irq/numa_migrate.c | 4
-rw-r--r--  kernel/irq/pm.c | 8
-rw-r--r--  kernel/irq/resend.c | 3
-rw-r--r--  kernel/irq/spurious.c | 1
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kfifo.c | 4
-rw-r--r--  kernel/kmod.c | 10
-rw-r--r--  kernel/kprobes.c | 38
-rw-r--r--  kernel/kthread.c | 96
-rw-r--r--  kernel/lockdep.c | 792
-rw-r--r--  kernel/lockdep_internals.h | 2
-rw-r--r--  kernel/lockdep_proc.c | 131
-rw-r--r--  kernel/module.c | 52
-rw-r--r--  kernel/nsproxy.c | 19
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/perf_counter.c | 1570
-rw-r--r--  kernel/pid.c | 17
-rw-r--r--  kernel/pid_namespace.c | 24
-rw-r--r--  kernel/posix-cpu-timers.c | 7
-rw-r--r--  kernel/posix-timers.c | 7
-rw-r--r--  kernel/power/Kconfig | 14
-rw-r--r--  kernel/power/hibernate.c | 21
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 412
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/printk.c | 181
-rw-r--r--  kernel/profile.c | 13
-rw-r--r--  kernel/ptrace.c | 165
-rw-r--r--  kernel/rcuclassic.c | 807
-rw-r--r--  kernel/rcupdate.c | 44
-rw-r--r--  kernel/rcupreempt.c | 1539
-rw-r--r--  kernel/rcupreempt_trace.c | 334
-rw-r--r--  kernel/rcutorture.c | 202
-rw-r--r--  kernel/rcutree.c | 281
-rw-r--r--  kernel/rcutree.h | 253
-rw-r--r--  kernel/rcutree_plugin.h | 532
-rw-r--r--  kernel/rcutree_trace.c | 88
-rw-r--r--  kernel/res_counter.c | 12
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex.c | 4
-rw-r--r--  kernel/sched.c | 1712
-rw-r--r--  kernel/sched_cpupri.c | 47
-rw-r--r--  kernel/sched_debug.c | 11
-rw-r--r--  kernel/sched_fair.c | 518
-rw-r--r--  kernel/sched_features.h | 122
-rw-r--r--  kernel/sched_idletask.c | 4
-rw-r--r--  kernel/sched_rt.c | 83
-rw-r--r--  kernel/signal.c | 48
-rw-r--r--  kernel/slow-work.c | 23
-rw-r--r--  kernel/smp.c | 42
-rw-r--r--  kernel/softirq.c | 82
-rw-r--r--  kernel/spinlock.c | 230
-rw-r--r--  kernel/sys.c | 283
-rw-r--r--  kernel/sysctl.c | 73
-rw-r--r--  kernel/taskstats.c | 10
-rw-r--r--  kernel/time/clockevents.c | 27
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 7
-rw-r--r--  kernel/time/tick-sched.c | 12
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 7
-rw-r--r--  kernel/trace/Kconfig | 65
-rw-r--r--  kernel/trace/blktrace.c | 25
-rw-r--r--  kernel/trace/ftrace.c | 347
-rw-r--r--  kernel/trace/kmemtrace.c | 149
-rw-r--r--  kernel/trace/ring_buffer.c | 1449
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 45
-rw-r--r--  kernel/trace/trace.c | 860
-rw-r--r--  kernel/trace/trace.h | 363
-rw-r--r--  kernel/trace/trace_boot.c | 20
-rw-r--r--  kernel/trace/trace_clock.c | 24
-rw-r--r--  kernel/trace/trace_entries.h | 383
-rw-r--r--  kernel/trace/trace_event_profile.c | 5
-rw-r--r--  kernel/trace/trace_event_types.h | 175
-rw-r--r--  kernel/trace/trace_events.c | 261
-rw-r--r--  kernel/trace/trace_events_filter.c | 333
-rw-r--r--  kernel/trace/trace_export.c | 290
-rw-r--r--  kernel/trace/trace_functions.c | 17
-rw-r--r--  kernel/trace/trace_functions_graph.c | 273
-rw-r--r--  kernel/trace/trace_irqsoff.c | 19
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 16
-rw-r--r--  kernel/trace/trace_output.c | 45
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_power.c | 22
-rw-r--r--  kernel/trace/trace_printk.c | 28
-rw-r--r--  kernel/trace/trace_sched_switch.c | 59
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 59
-rw-r--r--  kernel/trace/trace_selftest.c | 1
-rw-r--r--  kernel/trace/trace_stack.c | 54
-rw-r--r--  kernel/trace/trace_stat.c | 53
-rw-r--r--  kernel/trace/trace_stat.h | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 471
-rw-r--r--  kernel/trace/trace_workqueue.c | 32
-rw-r--r--  kernel/tracepoint.c | 50
-rw-r--r--  kernel/user.c | 67
-rw-r--r--  kernel/utsname.c | 13
-rw-r--r--  kernel/wait.c | 5
-rw-r--r--  kernel/workqueue.c | 9
132 files changed, 12969 insertions(+), 9142 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 90b53f6dc226..3d9c7e27e3f9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
@@ -68,8 +69,9 @@ obj-$(CONFIG_IKCONFIG) += configs.o
68obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
69obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
70obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
71obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
72obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/
73obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
74obj-$(CONFIG_KPROBES) += kprobes.o 76obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_KGDB) += kgdb.o 77obj-$(CONFIG_KGDB) += kgdb.o
@@ -78,11 +80,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
78obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
79obj-$(CONFIG_SECCOMP) += seccomp.o 81obj-$(CONFIG_SECCOMP) += seccomp.o
80obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
81obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
82obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
83obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
85obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
86obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -90,10 +90,10 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
90obj-$(CONFIG_MARKERS) += marker.o 90obj-$(CONFIG_MARKERS) += marker.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 92obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 93obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 94obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 95obj-$(CONFIG_X86_DS) += trace/
96obj-$(CONFIG_RING_BUFFER) += trace/
97obj-$(CONFIG_SMP) += sched_cpupri.o 97obj-$(CONFIG_SMP) += sched_cpupri.o
98obj-$(CONFIG_SLOW_WORK) += slow-work.o 98obj-$(CONFIG_SLOW_WORK) += slow-work.o
99obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 99obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
@@ -116,7 +116,7 @@ $(obj)/config_data.gz: .config FORCE
116 $(call if_changed,gzip) 116 $(call if_changed,gzip)
117 117
118quiet_cmd_ikconfiggz = IKCFG $@ 118quiet_cmd_ikconfiggz = IKCFG $@
119 cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ 119 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
120targets += config_data.h 120targets += config_data.h
121$(obj)/config_data.h: $(obj)/config_data.gz FORCE 121$(obj)/config_data.h: $(obj)/config_data.gz FORCE
122 $(call if_changed,ikconfiggz) 122 $(call if_changed,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9a4715a2f6bf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
215static int acct_on(char *name) 215static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt;
218 int error; 219 int error;
219 struct pid_namespace *ns; 220 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL; 221 struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
256 acct = NULL; 257 acct = NULL;
257 } 258 }
258 259
259 mnt_pin(file->f_path.mnt); 260 mnt = file->f_path.mnt;
261 mnt_pin(mnt);
260 acct_file_reopen(ns->bacct, file, ns); 262 acct_file_reopen(ns->bacct, file, ns);
261 spin_unlock(&acct_lock); 263 spin_unlock(&acct_lock);
262 264
263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 265 mntput(mnt); /* it's pinned, now give up active reference */
264 kfree(acct); 266 kfree(acct);
265 267
266 return 0; 268 return 0;
@@ -489,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
489 u64 run_time; 491 u64 run_time;
490 struct timespec uptime; 492 struct timespec uptime;
491 struct tty_struct *tty; 493 struct tty_struct *tty;
494 const struct cred *orig_cred;
495
496 /* Perform file operations on behalf of whoever enabled accounting */
497 orig_cred = override_creds(file->f_cred);
492 498
493 /* 499 /*
494 * First check to see if there is enough free_space to continue 500 * First check to see if there is enough free_space to continue
495 * the process accounting system. 501 * the process accounting system.
496 */ 502 */
497 if (!check_free_space(acct, file)) 503 if (!check_free_space(acct, file))
498 return; 504 goto out;
499 505
500 /* 506 /*
501 * Fill the accounting struct with the needed info as recorded 507 * Fill the accounting struct with the needed info as recorded
@@ -576,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
576 sizeof(acct_t), &file->f_pos); 582 sizeof(acct_t), &file->f_pos);
577 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 583 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
578 set_fs(fs); 584 set_fs(fs);
585out:
586 revert_creds(orig_cred);
579} 587}
580 588
581/** 589/**
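
The acct.c hunks above bracket do_acct_process() with override_creds()/revert_creds() so the record is written with the credentials of whoever enabled accounting (saved in file->f_cred), and the early return becomes a goto so the credentials are always restored. A minimal sketch of that bracket, using a hypothetical helper name and eliding the actual accounting write:

#include <linux/cred.h>
#include <linux/fs.h>

/* Hypothetical helper: not part of the patch, illustration only. */
static void write_as_file_opener(struct file *file)
{
	const struct cred *orig_cred;

	/* Act with the credentials captured when the file was opened. */
	orig_cred = override_creds(file->f_cred);

	/* ... perform the writes that must be permission-checked against
	 * the opener, e.g. appending the accounting record ... */

	/* Restore the caller's credentials on every exit path. */
	revert_creds(orig_cred);
}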
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba9..defc2e6f1e3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
115/* The netlink socket. */ 115/* The netlink socket. */
116static struct sock *audit_sock; 116static struct sock *audit_sock;
117 117
118/* Inotify handle. */
119struct inotify_handle *audit_ih;
120
121/* Hash for inode-based rules */ 118/* Hash for inode-based rules */
122struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 119struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
123 120
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
136static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 133static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
137 134
138/* Serialize requests from userspace. */ 135/* Serialize requests from userspace. */
139static DEFINE_MUTEX(audit_cmd_mutex); 136DEFINE_MUTEX(audit_cmd_mutex);
140 137
141/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 138/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
142 * audit records. Since printk uses a 1024 byte buffer, this buffer 139 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
375 kfree_skb(skb); 372 kfree_skb(skb);
376} 373}
377 374
375/*
376 * For one reason or another this nlh isn't getting delivered to the userspace
377 * audit daemon, just send it to printk.
378 */
379static void audit_printk_skb(struct sk_buff *skb)
380{
381 struct nlmsghdr *nlh = nlmsg_hdr(skb);
382 char *data = NLMSG_DATA(nlh);
383
384 if (nlh->nlmsg_type != AUDIT_EOE) {
385 if (printk_ratelimit())
386 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
387 else
388 audit_log_lost("printk limit exceeded\n");
389 }
390
391 audit_hold_skb(skb);
392}
393
378static void kauditd_send_skb(struct sk_buff *skb) 394static void kauditd_send_skb(struct sk_buff *skb)
379{ 395{
380 int err; 396 int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
427 if (skb) { 443 if (skb) {
428 if (audit_pid) 444 if (audit_pid)
429 kauditd_send_skb(skb); 445 kauditd_send_skb(skb);
430 else { 446 else
431 if (printk_ratelimit()) 447 audit_printk_skb(skb);
432 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
433 else
434 audit_log_lost("printk limit exceeded\n");
435
436 audit_hold_skb(skb);
437 }
438 } else { 448 } else {
439 DECLARE_WAITQUEUE(wait, current); 449 DECLARE_WAITQUEUE(wait, current);
440 set_current_state(TASK_INTERRUPTIBLE); 450 set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
495 return 0; 505 return 0;
496} 506}
497 507
498#ifdef CONFIG_AUDIT_TREE
499static int prune_tree_thread(void *unused)
500{
501 mutex_lock(&audit_cmd_mutex);
502 audit_prune_trees();
503 mutex_unlock(&audit_cmd_mutex);
504 return 0;
505}
506
507void audit_schedule_prune(void)
508{
509 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
510}
511#endif
512
513struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 508struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
514 int multi, void *payload, int size) 509 int multi, void *payload, int size)
515{ 510{
516 struct sk_buff *skb; 511 struct sk_buff *skb;
517 struct nlmsghdr *nlh; 512 struct nlmsghdr *nlh;
518 int len = NLMSG_SPACE(size);
519 void *data; 513 void *data;
520 int flags = multi ? NLM_F_MULTI : 0; 514 int flags = multi ? NLM_F_MULTI : 0;
521 int t = done ? NLMSG_DONE : type; 515 int t = done ? NLMSG_DONE : type;
522 516
523 skb = alloc_skb(len, GFP_KERNEL); 517 skb = nlmsg_new(size, GFP_KERNEL);
524 if (!skb) 518 if (!skb)
525 return NULL; 519 return NULL;
526 520
527 nlh = NLMSG_PUT(skb, pid, seq, t, size); 521 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
528 nlh->nlmsg_flags = flags; 522 data = NLMSG_DATA(nlh);
529 data = NLMSG_DATA(nlh);
530 memcpy(data, payload, size); 523 memcpy(data, payload, size);
531 return skb; 524 return skb;
532 525
533nlmsg_failure: /* Used by NLMSG_PUT */ 526nlmsg_failure: /* Used by NLMSG_NEW */
534 if (skb) 527 if (skb)
535 kfree_skb(skb); 528 kfree_skb(skb);
536 return NULL; 529 return NULL;
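
Assembled from the new side of the hunk above, the rewritten audit_make_reply() now relies on nlmsg_new()/NLMSG_NEW() instead of sizing and filling the netlink header by hand; NLMSG_NEW() jumps to the nlmsg_failure label if the message does not fit:

struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
				 int multi, void *payload, int size)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	void *data;
	int flags = multi ? NLM_F_MULTI : 0;
	int t = done ? NLMSG_DONE : type;

	skb = nlmsg_new(size, GFP_KERNEL);
	if (!skb)
		return NULL;

	nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
	data = NLMSG_DATA(nlh);
	memcpy(data, payload, size);
	return skb;

nlmsg_failure:			/* Used by NLMSG_NEW */
	if (skb)
		kfree_skb(skb);
	return NULL;
}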
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
926} 919}
927 920
928/* 921/*
929 * Get message from skb (based on rtnetlink_rcv_skb). Each message is 922 * Get message from skb. Each message is processed by audit_receive_msg.
930 * processed by audit_receive_msg. Malformed skbs with wrong length are 923 * Malformed skbs with wrong length are discarded silently.
931 * discarded silently.
932 */ 924 */
933static void audit_receive_skb(struct sk_buff *skb) 925static void audit_receive_skb(struct sk_buff *skb)
934{ 926{
935 int err; 927 struct nlmsghdr *nlh;
936 struct nlmsghdr *nlh; 928 /*
937 u32 rlen; 929 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
930 * if the nlmsg_len was not aligned
931 */
932 int len;
933 int err;
938 934
939 while (skb->len >= NLMSG_SPACE(0)) { 935 nlh = nlmsg_hdr(skb);
940 nlh = nlmsg_hdr(skb); 936 len = skb->len;
941 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) 937
942 return; 938 while (NLMSG_OK(nlh, len)) {
943 rlen = NLMSG_ALIGN(nlh->nlmsg_len); 939 err = audit_receive_msg(skb, nlh);
944 if (rlen > skb->len) 940 /* if err or if this message says it wants a response */
945 rlen = skb->len; 941 if (err || (nlh->nlmsg_flags & NLM_F_ACK))
946 if ((err = audit_receive_msg(skb, nlh))) {
947 netlink_ack(skb, nlh, err); 942 netlink_ack(skb, nlh, err);
948 } else if (nlh->nlmsg_flags & NLM_F_ACK) 943
949 netlink_ack(skb, nlh, 0); 944 nlh = NLMSG_NEXT(nlh, len);
950 skb_pull(skb, rlen);
951 } 945 }
952} 946}
953 947
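
Likewise, the post-patch audit_receive_skb() reads more clearly once assembled from the hunk: NLMSG_OK()/NLMSG_NEXT() replace the hand-rolled length arithmetic, and an acknowledgement is sent when a message fails or explicitly requests one:

static void audit_receive_skb(struct sk_buff *skb)
{
	struct nlmsghdr *nlh;
	/*
	 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
	 * if the nlmsg_len was not aligned
	 */
	int len;
	int err;

	nlh = nlmsg_hdr(skb);
	len = skb->len;

	while (NLMSG_OK(nlh, len)) {
		err = audit_receive_msg(skb, nlh);
		/* if err or if this message says it wants a response */
		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
			netlink_ack(skb, nlh, err);

		nlh = NLMSG_NEXT(nlh, len);
	}
}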
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
959 mutex_unlock(&audit_cmd_mutex); 953 mutex_unlock(&audit_cmd_mutex);
960} 954}
961 955
962#ifdef CONFIG_AUDITSYSCALL
963static const struct inotify_operations audit_inotify_ops = {
964 .handle_event = audit_handle_ievent,
965 .destroy_watch = audit_free_parent,
966};
967#endif
968
969/* Initialize audit support at boot time. */ 956/* Initialize audit support at boot time. */
970static int __init audit_init(void) 957static int __init audit_init(void)
971{ 958{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
991 978
992 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 979 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
993 980
994#ifdef CONFIG_AUDITSYSCALL
995 audit_ih = inotify_init(&audit_inotify_ops);
996 if (IS_ERR(audit_ih))
997 audit_panic("cannot initialize inotify handle");
998#endif
999
1000 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 981 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
1001 INIT_LIST_HEAD(&audit_inode_hash[i]); 982 INIT_LIST_HEAD(&audit_inode_hash[i]);
1002 983
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1070 goto err; 1051 goto err;
1071 } 1052 }
1072 1053
1073 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
1074 if (!ab->skb)
1075 goto err;
1076
1077 ab->ctx = ctx; 1054 ab->ctx = ctx;
1078 ab->gfp_mask = gfp_mask; 1055 ab->gfp_mask = gfp_mask;
1079 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); 1056
1080 nlh->nlmsg_type = type; 1057 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1081 nlh->nlmsg_flags = 0; 1058 if (!ab->skb)
1082 nlh->nlmsg_pid = 0; 1059 goto nlmsg_failure;
1083 nlh->nlmsg_seq = 0; 1060
1061 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1062
1084 return ab; 1063 return ab;
1064
1065nlmsg_failure: /* Used by NLMSG_NEW */
1066 kfree_skb(ab->skb);
1067 ab->skb = NULL;
1085err: 1068err:
1086 audit_buffer_free(ab); 1069 audit_buffer_free(ab);
1087 return NULL; 1070 return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1452 kfree(pathname); 1435 kfree(pathname);
1453} 1436}
1454 1437
1438void audit_log_key(struct audit_buffer *ab, char *key)
1439{
1440 audit_log_format(ab, " key=");
1441 if (key)
1442 audit_log_untrustedstring(ab, key);
1443 else
1444 audit_log_format(ab, "(null)");
1445}
1446
1455/** 1447/**
1456 * audit_log_end - end one audit record 1448 * audit_log_end - end one audit record
1457 * @ab: the audit_buffer 1449 * @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
1475 skb_queue_tail(&audit_skb_queue, ab->skb); 1467 skb_queue_tail(&audit_skb_queue, ab->skb);
1476 wake_up_interruptible(&kauditd_wait); 1468 wake_up_interruptible(&kauditd_wait);
1477 } else { 1469 } else {
1478 if (nlh->nlmsg_type != AUDIT_EOE) { 1470 audit_printk_skb(ab->skb);
1479 if (printk_ratelimit()) {
1480 printk(KERN_NOTICE "type=%d %s\n",
1481 nlh->nlmsg_type,
1482 ab->skb->data + NLMSG_SPACE(0));
1483 } else
1484 audit_log_lost("printk limit exceeded\n");
1485 }
1486 audit_hold_skb(ab->skb);
1487 } 1471 }
1488 ab->skb = NULL; 1472 ab->skb = NULL;
1489 } 1473 }
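
Two small helpers introduced above absorb logic that used to be open-coded at each call site: audit_printk_skb() handles the "no auditd, fall back to printk" path for both kauditd_thread() and audit_log_end(), and audit_log_key() emits the trailing " key=" field. The kill_rules() conversion in the audit_tree.c diff further down shows the intended audit_log_key() call pattern, roughly (with ab and rule in scope as in kill_rules()):

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	audit_log_format(ab, "op=");
	audit_log_string(ab, "remove rule");
	audit_log_format(ab, " dir=");
	audit_log_untrustedstring(ab, rule->tree->pathname);
	audit_log_key(ab, rule->filterkey);	/* " key=<value>" or " key=(null)" */
	audit_log_format(ab, " list=%d res=1", rule->listnr);
	audit_log_end(ab);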
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661b..208687be4f30 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent; 56struct audit_watch;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
68struct audit_tree; 57struct audit_tree;
69struct audit_chunk; 58struct audit_chunk;
70 59
@@ -108,19 +97,28 @@ struct audit_netlink_list {
108 97
109int audit_send_list(void *); 98int audit_send_list(void *);
110 99
111struct inotify_watch;
112/* Inotify handle */
113extern struct inotify_handle *audit_ih;
114
115extern void audit_free_parent(struct inotify_watch *);
116extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
117 const char *, struct inode *);
118extern int selinux_audit_rule_update(void); 100extern int selinux_audit_rule_update(void);
119 101
120extern struct mutex audit_filter_mutex; 102extern struct mutex audit_filter_mutex;
121extern void audit_free_rule_rcu(struct rcu_head *); 103extern void audit_free_rule_rcu(struct rcu_head *);
122extern struct list_head audit_filter_list[]; 104extern struct list_head audit_filter_list[];
123 105
106/* audit watch functions */
107extern unsigned long audit_watch_inode(struct audit_watch *watch);
108extern dev_t audit_watch_dev(struct audit_watch *watch);
109extern void audit_put_watch(struct audit_watch *watch);
110extern void audit_get_watch(struct audit_watch *watch);
111extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
112extern int audit_add_watch(struct audit_krule *krule);
113extern void audit_remove_watch(struct audit_watch *watch);
114extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
115extern void audit_inotify_unregister(struct list_head *in_list);
116extern char *audit_watch_path(struct audit_watch *watch);
117extern struct list_head *audit_watch_rules(struct audit_watch *watch);
118
119extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
120 struct audit_watch *watch);
121
124#ifdef CONFIG_AUDIT_TREE 122#ifdef CONFIG_AUDIT_TREE
125extern struct audit_chunk *audit_tree_lookup(const struct inode *); 123extern struct audit_chunk *audit_tree_lookup(const struct inode *);
126extern void audit_put_chunk(struct audit_chunk *); 124extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
130extern int audit_remove_tree_rule(struct audit_krule *); 128extern int audit_remove_tree_rule(struct audit_krule *);
131extern void audit_trim_trees(void); 129extern void audit_trim_trees(void);
132extern int audit_tag_tree(char *old, char *new); 130extern int audit_tag_tree(char *old, char *new);
133extern void audit_schedule_prune(void);
134extern void audit_prune_trees(void);
135extern const char *audit_tree_path(struct audit_tree *); 131extern const char *audit_tree_path(struct audit_tree *);
136extern void audit_put_tree(struct audit_tree *); 132extern void audit_put_tree(struct audit_tree *);
133extern void audit_kill_trees(struct list_head *);
137#else 134#else
138#define audit_remove_tree_rule(rule) BUG() 135#define audit_remove_tree_rule(rule) BUG()
139#define audit_add_tree_rule(rule) -EINVAL 136#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
142#define audit_put_tree(tree) (void)0 139#define audit_put_tree(tree) (void)0
143#define audit_tag_tree(old, new) -EINVAL 140#define audit_tag_tree(old, new) -EINVAL
144#define audit_tree_path(rule) "" /* never called */ 141#define audit_tree_path(rule) "" /* never called */
142#define audit_kill_trees(list) BUG()
145#endif 143#endif
146 144
147extern char *audit_unpack_string(void **, size_t *, size_t); 145extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
160 return 0; 158 return 0;
161} 159}
162extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 160extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
161extern struct list_head *audit_killed_trees(void);
163#else 162#else
164#define audit_signal_info(s,t) AUDIT_DISABLED 163#define audit_signal_info(s,t) AUDIT_DISABLED
165#define audit_filter_inodes(t,c) AUDIT_DISABLED 164#define audit_filter_inodes(t,c) AUDIT_DISABLED
166#endif 165#endif
166
167extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1f6396d76687..2451dc6f3282 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h>
5 6
6struct audit_tree; 7struct audit_tree;
7struct audit_chunk; 8struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
441 if (rule->tree) { 442 if (rule->tree) {
442 /* not a half-baked one */ 443 /* not a half-baked one */
443 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 444 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
444 audit_log_format(ab, "op=remove rule dir="); 445 audit_log_format(ab, "op=");
446 audit_log_string(ab, "remove rule");
447 audit_log_format(ab, " dir=");
445 audit_log_untrustedstring(ab, rule->tree->pathname); 448 audit_log_untrustedstring(ab, rule->tree->pathname);
446 if (rule->filterkey) { 449 audit_log_key(ab, rule->filterkey);
447 audit_log_format(ab, " key=");
448 audit_log_untrustedstring(ab, rule->filterkey);
449 } else
450 audit_log_format(ab, " key=(null)");
451 audit_log_format(ab, " list=%d res=1", rule->listnr); 450 audit_log_format(ab, " list=%d res=1", rule->listnr);
452 audit_log_end(ab); 451 audit_log_end(ab);
453 rule->tree = NULL; 452 rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
519 } 518 }
520} 519}
521 520
521static void audit_schedule_prune(void);
522
522/* called with audit_filter_mutex */ 523/* called with audit_filter_mutex */
523int audit_remove_tree_rule(struct audit_krule *rule) 524int audit_remove_tree_rule(struct audit_krule *rule)
524{ 525{
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
824 825
825/* 826/*
826 * That gets run when evict_chunk() ends up needing to kill audit_tree. 827 * That gets run when evict_chunk() ends up needing to kill audit_tree.
827 * Runs from a separate thread, with audit_cmd_mutex held. 828 * Runs from a separate thread.
828 */ 829 */
829void audit_prune_trees(void) 830static int prune_tree_thread(void *unused)
830{ 831{
832 mutex_lock(&audit_cmd_mutex);
831 mutex_lock(&audit_filter_mutex); 833 mutex_lock(&audit_filter_mutex);
832 834
833 while (!list_empty(&prune_list)) { 835 while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
844 } 846 }
845 847
846 mutex_unlock(&audit_filter_mutex); 848 mutex_unlock(&audit_filter_mutex);
849 mutex_unlock(&audit_cmd_mutex);
850 return 0;
851}
852
853static void audit_schedule_prune(void)
854{
855 kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
856}
857
858/*
859 * ... and that one is done if evict_chunk() decides to delay until the end
860 * of syscall. Runs synchronously.
861 */
862void audit_kill_trees(struct list_head *list)
863{
864 mutex_lock(&audit_cmd_mutex);
865 mutex_lock(&audit_filter_mutex);
866
867 while (!list_empty(list)) {
868 struct audit_tree *victim;
869
870 victim = list_entry(list->next, struct audit_tree, list);
871 kill_rules(victim);
872 list_del_init(&victim->list);
873
874 mutex_unlock(&audit_filter_mutex);
875
876 prune_one(victim);
877
878 mutex_lock(&audit_filter_mutex);
879 }
880
881 mutex_unlock(&audit_filter_mutex);
882 mutex_unlock(&audit_cmd_mutex);
847} 883}
848 884
849/* 885/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
854static void evict_chunk(struct audit_chunk *chunk) 890static void evict_chunk(struct audit_chunk *chunk)
855{ 891{
856 struct audit_tree *owner; 892 struct audit_tree *owner;
893 struct list_head *postponed = audit_killed_trees();
894 int need_prune = 0;
857 int n; 895 int n;
858 896
859 if (chunk->dead) 897 if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
869 owner->root = NULL; 907 owner->root = NULL;
870 list_del_init(&owner->same_root); 908 list_del_init(&owner->same_root);
871 spin_unlock(&hash_lock); 909 spin_unlock(&hash_lock);
872 kill_rules(owner); 910 if (!postponed) {
873 list_move(&owner->list, &prune_list); 911 kill_rules(owner);
874 audit_schedule_prune(); 912 list_move(&owner->list, &prune_list);
913 need_prune = 1;
914 } else {
915 list_move(&owner->list, postponed);
916 }
875 spin_lock(&hash_lock); 917 spin_lock(&hash_lock);
876 } 918 }
877 list_del_rcu(&chunk->hash); 919 list_del_rcu(&chunk->hash);
878 for (n = 0; n < chunk->count; n++) 920 for (n = 0; n < chunk->count; n++)
879 list_del_init(&chunk->owners[n].list); 921 list_del_init(&chunk->owners[n].list);
880 spin_unlock(&hash_lock); 922 spin_unlock(&hash_lock);
923 if (need_prune)
924 audit_schedule_prune();
881 mutex_unlock(&audit_filter_mutex); 925 mutex_unlock(&audit_filter_mutex);
882} 926}
883 927
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000000..0e96dbc60ea9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
1/* audit_watch.c -- watching inodes
2 *
3 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h>
32#include "audit.h"
33
34/*
35 * Reference counting:
36 *
37 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
38 * event. Each audit_watch holds a reference to its associated parent.
39 *
40 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
41 * audit_remove_watch(). Additionally, an audit_watch may exist
42 * temporarily to assist in searching existing filter data. Each
43 * audit_krule holds a reference to its associated watch.
44 */
45
46struct audit_watch {
47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */
50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */
53 struct list_head rules; /* associated rules */
54};
55
56struct audit_parent {
57 struct list_head ilist; /* entry in inotify registration list */
58 struct list_head watches; /* associated watches */
59 struct inotify_watch wdata; /* inotify watch data */
60 unsigned flags; /* status flags */
61};
62
63/* Inotify handle. */
64struct inotify_handle *audit_ih;
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Inotify events we care about. */
78#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
79
80static void audit_free_parent(struct inotify_watch *i_watch)
81{
82 struct audit_parent *parent;
83
84 parent = container_of(i_watch, struct audit_parent, wdata);
85 WARN_ON(!list_empty(&parent->watches));
86 kfree(parent);
87}
88
89void audit_get_watch(struct audit_watch *watch)
90{
91 atomic_inc(&watch->count);
92}
93
94void audit_put_watch(struct audit_watch *watch)
95{
96 if (atomic_dec_and_test(&watch->count)) {
97 WARN_ON(watch->parent);
98 WARN_ON(!list_empty(&watch->rules));
99 kfree(watch->path);
100 kfree(watch);
101 }
102}
103
104void audit_remove_watch(struct audit_watch *watch)
105{
106 list_del(&watch->wlist);
107 put_inotify_watch(&watch->parent->wdata);
108 watch->parent = NULL;
109 audit_put_watch(watch); /* match initial get */
110}
111
112char *audit_watch_path(struct audit_watch *watch)
113{
114 return watch->path;
115}
116
117struct list_head *audit_watch_rules(struct audit_watch *watch)
118{
119 return &watch->rules;
120}
121
122unsigned long audit_watch_inode(struct audit_watch *watch)
123{
124 return watch->ino;
125}
126
127dev_t audit_watch_dev(struct audit_watch *watch)
128{
129 return watch->dev;
130}
131
132/* Initialize a parent watch entry. */
133static struct audit_parent *audit_init_parent(struct nameidata *ndp)
134{
135 struct audit_parent *parent;
136 s32 wd;
137
138 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
139 if (unlikely(!parent))
140 return ERR_PTR(-ENOMEM);
141
142 INIT_LIST_HEAD(&parent->watches);
143 parent->flags = 0;
144
145 inotify_init_watch(&parent->wdata);
146 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
147 get_inotify_watch(&parent->wdata);
148 wd = inotify_add_watch(audit_ih, &parent->wdata,
149 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
150 if (wd < 0) {
151 audit_free_parent(&parent->wdata);
152 return ERR_PTR(wd);
153 }
154
155 return parent;
156}
157
158/* Initialize a watch entry. */
159static struct audit_watch *audit_init_watch(char *path)
160{
161 struct audit_watch *watch;
162
163 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
164 if (unlikely(!watch))
165 return ERR_PTR(-ENOMEM);
166
167 INIT_LIST_HEAD(&watch->rules);
168 atomic_set(&watch->count, 1);
169 watch->path = path;
170 watch->dev = (dev_t)-1;
171 watch->ino = (unsigned long)-1;
172
173 return watch;
174}
175
176/* Translate a watch string to kernel respresentation. */
177int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
178{
179 struct audit_watch *watch;
180
181 if (!audit_ih)
182 return -EOPNOTSUPP;
183
184 if (path[0] != '/' || path[len-1] == '/' ||
185 krule->listnr != AUDIT_FILTER_EXIT ||
186 op != Audit_equal ||
187 krule->inode_f || krule->watch || krule->tree)
188 return -EINVAL;
189
190 watch = audit_init_watch(path);
191 if (IS_ERR(watch))
192 return PTR_ERR(watch);
193
194 audit_get_watch(watch);
195 krule->watch = watch;
196
197 return 0;
198}
199
200/* Duplicate the given audit watch. The new watch's rules list is initialized
201 * to an empty list and wlist is undefined. */
202static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
203{
204 char *path;
205 struct audit_watch *new;
206
207 path = kstrdup(old->path, GFP_KERNEL);
208 if (unlikely(!path))
209 return ERR_PTR(-ENOMEM);
210
211 new = audit_init_watch(path);
212 if (IS_ERR(new)) {
213 kfree(path);
214 goto out;
215 }
216
217 new->dev = old->dev;
218 new->ino = old->ino;
219 get_inotify_watch(&old->parent->wdata);
220 new->parent = old->parent;
221
222out:
223 return new;
224}
225
226static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
227{
228 if (audit_enabled) {
229 struct audit_buffer *ab;
230 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
231 audit_log_format(ab, "auid=%u ses=%u op=",
232 audit_get_loginuid(current),
233 audit_get_sessionid(current));
234 audit_log_string(ab, op);
235 audit_log_format(ab, " path=");
236 audit_log_untrustedstring(ab, w->path);
237 audit_log_key(ab, r->filterkey);
238 audit_log_format(ab, " list=%d res=1", r->listnr);
239 audit_log_end(ab);
240 }
241}
242
243/* Update inode info in audit rules based on filesystem event. */
244static void audit_update_watch(struct audit_parent *parent,
245 const char *dname, dev_t dev,
246 unsigned long ino, unsigned invalidating)
247{
248 struct audit_watch *owatch, *nwatch, *nextw;
249 struct audit_krule *r, *nextr;
250 struct audit_entry *oentry, *nentry;
251
252 mutex_lock(&audit_filter_mutex);
253 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
254 if (audit_compare_dname_path(dname, owatch->path, NULL))
255 continue;
256
257 /* If the update involves invalidating rules, do the inode-based
258 * filtering now, so we don't omit records. */
259 if (invalidating && current->audit_context)
260 audit_filter_inodes(current, current->audit_context);
261
262 nwatch = audit_dupe_watch(owatch);
263 if (IS_ERR(nwatch)) {
264 mutex_unlock(&audit_filter_mutex);
265 audit_panic("error updating watch, skipping");
266 return;
267 }
268 nwatch->dev = dev;
269 nwatch->ino = ino;
270
271 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
272
273 oentry = container_of(r, struct audit_entry, rule);
274 list_del(&oentry->rule.rlist);
275 list_del_rcu(&oentry->list);
276
277 nentry = audit_dupe_rule(&oentry->rule, nwatch);
278 if (IS_ERR(nentry)) {
279 list_del(&oentry->rule.list);
280 audit_panic("error updating watch, removing");
281 } else {
282 int h = audit_hash_ino((u32)ino);
283 list_add(&nentry->rule.rlist, &nwatch->rules);
284 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
285 list_replace(&oentry->rule.list,
286 &nentry->rule.list);
287 }
288
289 audit_watch_log_rule_change(r, owatch, "updated rules");
290
291 call_rcu(&oentry->rcu, audit_free_rule_rcu);
292 }
293
294 audit_remove_watch(owatch);
295 goto add_watch_to_parent; /* event applies to a single watch */
296 }
297 mutex_unlock(&audit_filter_mutex);
298 return;
299
300add_watch_to_parent:
301 list_add(&nwatch->wlist, &parent->watches);
302 mutex_unlock(&audit_filter_mutex);
303 return;
304}
305
306/* Remove all watches & rules associated with a parent that is going away. */
307static void audit_remove_parent_watches(struct audit_parent *parent)
308{
309 struct audit_watch *w, *nextw;
310 struct audit_krule *r, *nextr;
311 struct audit_entry *e;
312
313 mutex_lock(&audit_filter_mutex);
314 parent->flags |= AUDIT_PARENT_INVALID;
315 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
316 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
317 e = container_of(r, struct audit_entry, rule);
318 audit_watch_log_rule_change(r, w, "remove rule");
319 list_del(&r->rlist);
320 list_del(&r->list);
321 list_del_rcu(&e->list);
322 call_rcu(&e->rcu, audit_free_rule_rcu);
323 }
324 audit_remove_watch(w);
325 }
326 mutex_unlock(&audit_filter_mutex);
327}
328
329/* Unregister inotify watches for parents on in_list.
330 * Generates an IN_IGNORED event. */
331void audit_inotify_unregister(struct list_head *in_list)
332{
333 struct audit_parent *p, *n;
334
335 list_for_each_entry_safe(p, n, in_list, ilist) {
336 list_del(&p->ilist);
337 inotify_rm_watch(audit_ih, &p->wdata);
338 /* the unpin matching the pin in audit_do_del_rule() */
339 unpin_inotify_watch(&p->wdata);
340 }
341}
342
343/* Get path information necessary for adding watches. */
344static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
345{
346 struct nameidata *ndparent, *ndwatch;
347 int err;
348
349 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
350 if (unlikely(!ndparent))
351 return -ENOMEM;
352
353 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
354 if (unlikely(!ndwatch)) {
355 kfree(ndparent);
356 return -ENOMEM;
357 }
358
359 err = path_lookup(path, LOOKUP_PARENT, ndparent);
360 if (err) {
361 kfree(ndparent);
362 kfree(ndwatch);
363 return err;
364 }
365
366 err = path_lookup(path, 0, ndwatch);
367 if (err) {
368 kfree(ndwatch);
369 ndwatch = NULL;
370 }
371
372 *ndp = ndparent;
373 *ndw = ndwatch;
374
375 return 0;
376}
377
378/* Release resources used for watch path information. */
379static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
380{
381 if (ndp) {
382 path_put(&ndp->path);
383 kfree(ndp);
384 }
385 if (ndw) {
386 path_put(&ndw->path);
387 kfree(ndw);
388 }
389}
390
391/* Associate the given rule with an existing parent inotify_watch.
392 * Caller must hold audit_filter_mutex. */
393static void audit_add_to_parent(struct audit_krule *krule,
394 struct audit_parent *parent)
395{
396 struct audit_watch *w, *watch = krule->watch;
397 int watch_found = 0;
398
399 list_for_each_entry(w, &parent->watches, wlist) {
400 if (strcmp(watch->path, w->path))
401 continue;
402
403 watch_found = 1;
404
405 /* put krule's and initial refs to temporary watch */
406 audit_put_watch(watch);
407 audit_put_watch(watch);
408
409 audit_get_watch(w);
410 krule->watch = watch = w;
411 break;
412 }
413
414 if (!watch_found) {
415 get_inotify_watch(&parent->wdata);
416 watch->parent = parent;
417
418 list_add(&watch->wlist, &parent->watches);
419 }
420 list_add(&krule->rlist, &watch->rules);
421}
422
423/* Find a matching watch entry, or add this one.
424 * Caller must hold audit_filter_mutex. */
425int audit_add_watch(struct audit_krule *krule)
426{
427 struct audit_watch *watch = krule->watch;
428 struct inotify_watch *i_watch;
429 struct audit_parent *parent;
430 struct nameidata *ndp = NULL, *ndw = NULL;
431 int ret = 0;
432
433 mutex_unlock(&audit_filter_mutex);
434
435 /* Avoid calling path_lookup under audit_filter_mutex. */
436 ret = audit_get_nd(watch->path, &ndp, &ndw);
437 if (ret) {
438 /* caller expects mutex locked */
439 mutex_lock(&audit_filter_mutex);
440 goto error;
441 }
442
443 /* update watch filter fields */
444 if (ndw) {
445 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
446 watch->ino = ndw->path.dentry->d_inode->i_ino;
447 }
448
449 /* The audit_filter_mutex must not be held during inotify calls because
450 * we hold it during inotify event callback processing. If an existing
451 * inotify watch is found, inotify_find_watch() grabs a reference before
452 * returning.
453 */
454 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
455 &i_watch) < 0) {
456 parent = audit_init_parent(ndp);
457 if (IS_ERR(parent)) {
458 /* caller expects mutex locked */
459 mutex_lock(&audit_filter_mutex);
460 ret = PTR_ERR(parent);
461 goto error;
462 }
463 } else
464 parent = container_of(i_watch, struct audit_parent, wdata);
465
466 mutex_lock(&audit_filter_mutex);
467
468 /* parent was moved before we took audit_filter_mutex */
469 if (parent->flags & AUDIT_PARENT_INVALID)
470 ret = -ENOENT;
471 else
472 audit_add_to_parent(krule, parent);
473
474 /* match get in audit_init_parent or inotify_find_watch */
475 put_inotify_watch(&parent->wdata);
476
477error:
478 audit_put_nd(ndp, ndw); /* NULL args OK */
479 return ret;
480
481}
482
483void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
484{
485 struct audit_watch *watch = krule->watch;
486 struct audit_parent *parent = watch->parent;
487
488 list_del(&krule->rlist);
489
490 if (list_empty(&watch->rules)) {
491 audit_remove_watch(watch);
492
493 if (list_empty(&parent->watches)) {
494 /* Put parent on the inotify un-registration
495 * list. Grab a reference before releasing
496 * audit_filter_mutex, to be released in
497 * audit_inotify_unregister().
498 * If filesystem is going away, just leave
499 * the sucker alone, eviction will take
500 * care of it. */
501 if (pin_inotify_watch(&parent->wdata))
502 list_add(&parent->ilist, list);
503 }
504 }
505}
506
507/* Update watch data in audit rules based on inotify events. */
508static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
509 u32 cookie, const char *dname, struct inode *inode)
510{
511 struct audit_parent *parent;
512
513 parent = container_of(i_watch, struct audit_parent, wdata);
514
515 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
516 audit_update_watch(parent, dname, inode->i_sb->s_dev,
517 inode->i_ino, 0);
518 else if (mask & (IN_DELETE|IN_MOVED_FROM))
519 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
520 /* inotify automatically removes the watch and sends IN_IGNORED */
521 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
522 audit_remove_parent_watches(parent);
523 /* inotify does not remove the watch, so remove it manually */
524 else if(mask & IN_MOVE_SELF) {
525 audit_remove_parent_watches(parent);
526 inotify_remove_watch_locked(audit_ih, i_watch);
527 } else if (mask & IN_IGNORED)
528 put_inotify_watch(i_watch);
529}
530
531static const struct inotify_operations audit_inotify_ops = {
532 .handle_event = audit_handle_ievent,
533 .destroy_watch = audit_free_parent,
534};
535
536static int __init audit_watch_init(void)
537{
538 audit_ih = inotify_init(&audit_inotify_ops);
539 if (IS_ERR(audit_ih))
540 audit_panic("cannot initialize inotify handle");
541 return 0;
542}
543subsys_initcall(audit_watch_init);
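
With the watch machinery now private to audit_watch.c, auditfilter.c (diffed next) only sees the entry points declared in audit.h above. A hypothetical condensation of how the two main calls fit together when a rule carries an AUDIT_WATCH field, assuming an entry under construction and a path string already unpacked:

/* Hypothetical condensation; the real call sites are split between
 * rule parsing and rule insertion in auditfilter.c. */
static int attach_watch_rule(struct audit_entry *entry, char *path,
			     int path_len, u32 op)
{
	int err;

	/* Parse time: wrap the path in a temporary audit_watch and hang
	 * it off the rule (entry->rule.watch). */
	err = audit_to_watch(&entry->rule, path, path_len, op);
	if (err)
		return err;

	/* Insert time, entered with audit_filter_mutex held: resolve the
	 * path, find or create the parent inotify watch, and link the
	 * rule onto the watch's rule list. */
	return audit_add_watch(&entry->rule);
}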
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a02..a70604047f3c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h>
31#include <linux/security.h> 30#include <linux/security.h>
32#include "audit.h" 31#include "audit.h"
33 32
@@ -44,36 +43,6 @@
44 * be written directly provided audit_filter_mutex is held. 43 * be written directly provided audit_filter_mutex is held.
45 */ 44 */
46 45
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */ 46/* Audit filter lists, defined in <linux/audit.h> */
78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 47struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
79 LIST_HEAD_INIT(audit_filter_list[0]), 48 LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
97 66
98DEFINE_MUTEX(audit_filter_mutex); 67DEFINE_MUTEX(audit_filter_mutex);
99 68
100/* Inotify events we care about. */
101#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
102
103void audit_free_parent(struct inotify_watch *i_watch)
104{
105 struct audit_parent *parent;
106
107 parent = container_of(i_watch, struct audit_parent, wdata);
108 WARN_ON(!list_empty(&parent->watches));
109 kfree(parent);
110}
111
112static inline void audit_get_watch(struct audit_watch *watch)
113{
114 atomic_inc(&watch->count);
115}
116
117static void audit_put_watch(struct audit_watch *watch)
118{
119 if (atomic_dec_and_test(&watch->count)) {
120 WARN_ON(watch->parent);
121 WARN_ON(!list_empty(&watch->rules));
122 kfree(watch->path);
123 kfree(watch);
124 }
125}
126
127static void audit_remove_watch(struct audit_watch *watch)
128{
129 list_del(&watch->wlist);
130 put_inotify_watch(&watch->parent->wdata);
131 watch->parent = NULL;
132 audit_put_watch(watch); /* match initial get */
133}
134
135static inline void audit_free_rule(struct audit_entry *e) 69static inline void audit_free_rule(struct audit_entry *e)
136{ 70{
137 int i; 71 int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
156 audit_free_rule(e); 90 audit_free_rule(e);
157} 91}
158 92
159/* Initialize a parent watch entry. */
160static struct audit_parent *audit_init_parent(struct nameidata *ndp)
161{
162 struct audit_parent *parent;
163 s32 wd;
164
165 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
166 if (unlikely(!parent))
167 return ERR_PTR(-ENOMEM);
168
169 INIT_LIST_HEAD(&parent->watches);
170 parent->flags = 0;
171
172 inotify_init_watch(&parent->wdata);
173 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
174 get_inotify_watch(&parent->wdata);
175 wd = inotify_add_watch(audit_ih, &parent->wdata,
176 ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
177 if (wd < 0) {
178 audit_free_parent(&parent->wdata);
179 return ERR_PTR(wd);
180 }
181
182 return parent;
183}
184
185/* Initialize a watch entry. */
186static struct audit_watch *audit_init_watch(char *path)
187{
188 struct audit_watch *watch;
189
190 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
191 if (unlikely(!watch))
192 return ERR_PTR(-ENOMEM);
193
194 INIT_LIST_HEAD(&watch->rules);
195 atomic_set(&watch->count, 1);
196 watch->path = path;
197 watch->dev = (dev_t)-1;
198 watch->ino = (unsigned long)-1;
199
200 return watch;
201}
202
203/* Initialize an audit filterlist entry. */ 93/* Initialize an audit filterlist entry. */
204static inline struct audit_entry *audit_init_entry(u32 field_count) 94static inline struct audit_entry *audit_init_entry(u32 field_count)
205{ 95{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
260 return 0; 150 return 0;
261} 151}
262 152
263/* Translate a watch string to kernel respresentation. */
264static int audit_to_watch(struct audit_krule *krule, char *path, int len,
265 u32 op)
266{
267 struct audit_watch *watch;
268
269 if (!audit_ih)
270 return -EOPNOTSUPP;
271
272 if (path[0] != '/' || path[len-1] == '/' ||
273 krule->listnr != AUDIT_FILTER_EXIT ||
274 op != Audit_equal ||
275 krule->inode_f || krule->watch || krule->tree)
276 return -EINVAL;
277
278 watch = audit_init_watch(path);
279 if (IS_ERR(watch))
280 return PTR_ERR(watch);
281
282 audit_get_watch(watch);
283 krule->watch = watch;
284
285 return 0;
286}
287
288static __u32 *classes[AUDIT_SYSCALL_CLASSES]; 153static __u32 *classes[AUDIT_SYSCALL_CLASSES];
289 154
290int __init audit_register_class(int class, unsigned *list) 155int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
766 break; 631 break;
767 case AUDIT_WATCH: 632 case AUDIT_WATCH:
768 data->buflen += data->values[i] = 633 data->buflen += data->values[i] =
769 audit_pack_string(&bufp, krule->watch->path); 634 audit_pack_string(&bufp,
635 audit_watch_path(krule->watch));
770 break; 636 break;
771 case AUDIT_DIR: 637 case AUDIT_DIR:
772 data->buflen += data->values[i] = 638 data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
818 return 1; 684 return 1;
819 break; 685 break;
820 case AUDIT_WATCH: 686 case AUDIT_WATCH:
821 if (strcmp(a->watch->path, b->watch->path)) 687 if (strcmp(audit_watch_path(a->watch),
688 audit_watch_path(b->watch)))
822 return 1; 689 return 1;
823 break; 690 break;
824 case AUDIT_DIR: 691 case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
844 return 0; 711 return 0;
845} 712}
846 713
847/* Duplicate the given audit watch. The new watch's rules list is initialized
848 * to an empty list and wlist is undefined. */
849static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
850{
851 char *path;
852 struct audit_watch *new;
853
854 path = kstrdup(old->path, GFP_KERNEL);
855 if (unlikely(!path))
856 return ERR_PTR(-ENOMEM);
857
858 new = audit_init_watch(path);
859 if (IS_ERR(new)) {
860 kfree(path);
861 goto out;
862 }
863
864 new->dev = old->dev;
865 new->ino = old->ino;
866 get_inotify_watch(&old->parent->wdata);
867 new->parent = old->parent;
868
869out:
870 return new;
871}
872
873/* Duplicate LSM field information. The lsm_rule is opaque, so must be 714/* Duplicate LSM field information. The lsm_rule is opaque, so must be
874 * re-initialized. */ 715 * re-initialized. */
875static inline int audit_dupe_lsm_field(struct audit_field *df, 716static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
904 * rule with the new rule in the filterlist, then free the old rule. 745 * rule with the new rule in the filterlist, then free the old rule.
905 * The rlist element is undefined; list manipulations are handled apart from 746 * The rlist element is undefined; list manipulations are handled apart from
906 * the initial copy. */ 747 * the initial copy. */
907static struct audit_entry *audit_dupe_rule(struct audit_krule *old, 748struct audit_entry *audit_dupe_rule(struct audit_krule *old,
908 struct audit_watch *watch) 749 struct audit_watch *watch)
909{ 750{
910 u32 fcount = old->field_count; 751 u32 fcount = old->field_count;
911 struct audit_entry *entry; 752 struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
977 return entry; 818 return entry;
978} 819}
979 820
980/* Update inode info in audit rules based on filesystem event. */
981static void audit_update_watch(struct audit_parent *parent,
982 const char *dname, dev_t dev,
983 unsigned long ino, unsigned invalidating)
984{
985 struct audit_watch *owatch, *nwatch, *nextw;
986 struct audit_krule *r, *nextr;
987 struct audit_entry *oentry, *nentry;
988
989 mutex_lock(&audit_filter_mutex);
990 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
991 if (audit_compare_dname_path(dname, owatch->path, NULL))
992 continue;
993
994 /* If the update involves invalidating rules, do the inode-based
995 * filtering now, so we don't omit records. */
996 if (invalidating && current->audit_context)
997 audit_filter_inodes(current, current->audit_context);
998
999 nwatch = audit_dupe_watch(owatch);
1000 if (IS_ERR(nwatch)) {
1001 mutex_unlock(&audit_filter_mutex);
1002 audit_panic("error updating watch, skipping");
1003 return;
1004 }
1005 nwatch->dev = dev;
1006 nwatch->ino = ino;
1007
1008 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
1009
1010 oentry = container_of(r, struct audit_entry, rule);
1011 list_del(&oentry->rule.rlist);
1012 list_del_rcu(&oentry->list);
1013
1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1017 audit_panic("error updating watch, removing");
1018 } else {
1019 int h = audit_hash_ino((u32)ino);
1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1024 }
1025
1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1027 }
1028
1029 if (audit_enabled) {
1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current),
1035 audit_get_sessionid(current));
1036 audit_log_format(ab,
1037 " op=updated rules specifying path=");
1038 audit_log_untrustedstring(ab, owatch->path);
1039 audit_log_format(ab, " with dev=%u ino=%lu\n",
1040 dev, ino);
1041 audit_log_format(ab, " list=%d res=1", r->listnr);
1042 audit_log_end(ab);
1043 }
1044 audit_remove_watch(owatch);
1045 goto add_watch_to_parent; /* event applies to a single watch */
1046 }
1047 mutex_unlock(&audit_filter_mutex);
1048 return;
1049
1050add_watch_to_parent:
1051 list_add(&nwatch->wlist, &parent->watches);
1052 mutex_unlock(&audit_filter_mutex);
1053 return;
1054}
1055
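The removed loop above (apparently moved out of this file) swaps each affected rule for a freshly duplicated copy tied to the watch's new inode: the old entry is unhooked with list_del_rcu(), the duplicate goes into the hash bucket for the new inode with list_add_rcu(), and the old entry is freed only after a grace period via call_rcu(), so lock-free readers never walk onto a freed rule. A minimal standalone sketch of that replace-then-defer-free idiom, with hypothetical my_* names rather than the audit structures:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_entry {
        struct list_head list;
        struct rcu_head rcu;
        unsigned long ino;
};

static void my_entry_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct my_entry, rcu));
}

/* caller holds the update-side lock protecting both lists */
static void my_entry_replace(struct my_entry *old, struct my_entry *new,
                             struct list_head *new_bucket)
{
        list_del_rcu(&old->list);               /* readers may still see old */
        list_add_rcu(&new->list, new_bucket);   /* publish the duplicate */
        call_rcu(&old->rcu, my_entry_free_rcu); /* free after a grace period */
}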
1056/* Remove all watches & rules associated with a parent that is going away. */
1057static void audit_remove_parent_watches(struct audit_parent *parent)
1058{
1059 struct audit_watch *w, *nextw;
1060 struct audit_krule *r, *nextr;
1061 struct audit_entry *e;
1062
1063 mutex_lock(&audit_filter_mutex);
1064 parent->flags |= AUDIT_PARENT_INVALID;
1065 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1066 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) {
1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current),
1074 audit_get_sessionid(current));
1075 audit_log_format(ab, " op=remove rule path=");
1076 audit_log_untrustedstring(ab, w->path);
1077 if (r->filterkey) {
1078 audit_log_format(ab, " key=");
1079 audit_log_untrustedstring(ab,
1080 r->filterkey);
1081 } else
1082 audit_log_format(ab, " key=(null)");
1083 audit_log_format(ab, " list=%d res=1",
1084 r->listnr);
1085 audit_log_end(ab);
1086 }
1087 list_del(&r->rlist);
1088 list_del(&r->list);
1089 list_del_rcu(&e->list);
1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1091 }
1092 audit_remove_watch(w);
1093 }
1094 mutex_unlock(&audit_filter_mutex);
1095}
1096
1097/* Unregister inotify watches for parents on in_list.
1098 * Generates an IN_IGNORED event. */
1099static void audit_inotify_unregister(struct list_head *in_list)
1100{
1101 struct audit_parent *p, *n;
1102
1103 list_for_each_entry_safe(p, n, in_list, ilist) {
1104 list_del(&p->ilist);
1105 inotify_rm_watch(audit_ih, &p->wdata);
1106 /* the unpin matching the pin in audit_do_del_rule() */
1107 unpin_inotify_watch(&p->wdata);
1108 }
1109}
1110
1111/* Find an existing audit rule. 821/* Find an existing audit rule.
1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 822 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1113static struct audit_entry *audit_find_rule(struct audit_entry *entry, 823static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
1145 return found; 855 return found;
1146} 856}
1147 857
1148/* Get path information necessary for adding watches. */
1149static int audit_get_nd(char *path, struct nameidata **ndp,
1150 struct nameidata **ndw)
1151{
1152 struct nameidata *ndparent, *ndwatch;
1153 int err;
1154
1155 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1156 if (unlikely(!ndparent))
1157 return -ENOMEM;
1158
1159 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1160 if (unlikely(!ndwatch)) {
1161 kfree(ndparent);
1162 return -ENOMEM;
1163 }
1164
1165 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1166 if (err) {
1167 kfree(ndparent);
1168 kfree(ndwatch);
1169 return err;
1170 }
1171
1172 err = path_lookup(path, 0, ndwatch);
1173 if (err) {
1174 kfree(ndwatch);
1175 ndwatch = NULL;
1176 }
1177
1178 *ndp = ndparent;
1179 *ndw = ndwatch;
1180
1181 return 0;
1182}
1183
1184/* Release resources used for watch path information. */
1185static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1186{
1187 if (ndp) {
1188 path_put(&ndp->path);
1189 kfree(ndp);
1190 }
1191 if (ndw) {
1192 path_put(&ndw->path);
1193 kfree(ndw);
1194 }
1195}
1196
1197/* Associate the given rule with an existing parent inotify_watch.
1198 * Caller must hold audit_filter_mutex. */
1199static void audit_add_to_parent(struct audit_krule *krule,
1200 struct audit_parent *parent)
1201{
1202 struct audit_watch *w, *watch = krule->watch;
1203 int watch_found = 0;
1204
1205 list_for_each_entry(w, &parent->watches, wlist) {
1206 if (strcmp(watch->path, w->path))
1207 continue;
1208
1209 watch_found = 1;
1210
1211 /* put krule's and initial refs to temporary watch */
1212 audit_put_watch(watch);
1213 audit_put_watch(watch);
1214
1215 audit_get_watch(w);
1216 krule->watch = watch = w;
1217 break;
1218 }
1219
1220 if (!watch_found) {
1221 get_inotify_watch(&parent->wdata);
1222 watch->parent = parent;
1223
1224 list_add(&watch->wlist, &parent->watches);
1225 }
1226 list_add(&krule->rlist, &watch->rules);
1227}
1228
1229/* Find a matching watch entry, or add this one.
1230 * Caller must hold audit_filter_mutex. */
1231static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1232 struct nameidata *ndw)
1233{
1234 struct audit_watch *watch = krule->watch;
1235 struct inotify_watch *i_watch;
1236 struct audit_parent *parent;
1237 int ret = 0;
1238
1239 /* update watch filter fields */
1240 if (ndw) {
1241 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
1242 watch->ino = ndw->path.dentry->d_inode->i_ino;
1243 }
1244
1245 /* The audit_filter_mutex must not be held during inotify calls because
1246 * we hold it during inotify event callback processing. If an existing
1247 * inotify watch is found, inotify_find_watch() grabs a reference before
1248 * returning.
1249 */
1250 mutex_unlock(&audit_filter_mutex);
1251
1252 if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
1253 &i_watch) < 0) {
1254 parent = audit_init_parent(ndp);
1255 if (IS_ERR(parent)) {
1256 /* caller expects mutex locked */
1257 mutex_lock(&audit_filter_mutex);
1258 return PTR_ERR(parent);
1259 }
1260 } else
1261 parent = container_of(i_watch, struct audit_parent, wdata);
1262
1263 mutex_lock(&audit_filter_mutex);
1264
1265 /* parent was moved before we took audit_filter_mutex */
1266 if (parent->flags & AUDIT_PARENT_INVALID)
1267 ret = -ENOENT;
1268 else
1269 audit_add_to_parent(krule, parent);
1270
1271 /* match get in audit_init_parent or inotify_find_watch */
1272 put_inotify_watch(&parent->wdata);
1273 return ret;
1274}
1275
1276static u64 prio_low = ~0ULL/2; 858static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1; 859static u64 prio_high = ~0ULL/2 - 1;
1278 860
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
1282 struct audit_entry *e; 864 struct audit_entry *e;
1283 struct audit_watch *watch = entry->rule.watch; 865 struct audit_watch *watch = entry->rule.watch;
1284 struct audit_tree *tree = entry->rule.tree; 866 struct audit_tree *tree = entry->rule.tree;
1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list; 867 struct list_head *list;
1287 int h, err; 868 int h, err;
1288#ifdef CONFIG_AUDITSYSCALL 869#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
1296 877
1297 mutex_lock(&audit_filter_mutex); 878 mutex_lock(&audit_filter_mutex);
1298 e = audit_find_rule(entry, &list); 879 e = audit_find_rule(entry, &list);
1299 mutex_unlock(&audit_filter_mutex);
1300 if (e) { 880 if (e) {
881 mutex_unlock(&audit_filter_mutex);
1301 err = -EEXIST; 882 err = -EEXIST;
1302 /* normally audit_add_tree_rule() will free it on failure */ 883 /* normally audit_add_tree_rule() will free it on failure */
1303 if (tree) 884 if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
1305 goto error; 886 goto error;
1306 } 887 }
1307 888
1308 /* Avoid calling path_lookup under audit_filter_mutex. */
1309 if (watch) {
1310 err = audit_get_nd(watch->path, &ndp, &ndw);
1311 if (err)
1312 goto error;
1313 }
1314
1315 mutex_lock(&audit_filter_mutex);
1316 if (watch) { 889 if (watch) {
1317 /* audit_filter_mutex is dropped and re-taken during this call */ 890 /* audit_filter_mutex is dropped and re-taken during this call */
1318 err = audit_add_watch(&entry->rule, ndp, ndw); 891 err = audit_add_watch(&entry->rule);
1319 if (err) { 892 if (err) {
1320 mutex_unlock(&audit_filter_mutex); 893 mutex_unlock(&audit_filter_mutex);
1321 goto error; 894 goto error;
1322 } 895 }
1323 h = audit_hash_ino((u32)watch->ino); 896 /* entry->rule.watch may have changed during audit_add_watch() */
897 watch = entry->rule.watch;
898 h = audit_hash_ino((u32)audit_watch_inode(watch));
1324 list = &audit_inode_hash[h]; 899 list = &audit_inode_hash[h];
1325 } 900 }
1326 if (tree) { 901 if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
1358#endif 933#endif
1359 mutex_unlock(&audit_filter_mutex); 934 mutex_unlock(&audit_filter_mutex);
1360 935
1361 audit_put_nd(ndp, ndw); /* NULL args OK */
1362 return 0; 936 return 0;
1363 937
1364error: 938error:
1365 audit_put_nd(ndp, ndw); /* NULL args OK */
1366 if (watch) 939 if (watch)
1367 audit_put_watch(watch); /* tmp watch, matches initial get */ 940 audit_put_watch(watch); /* tmp watch, matches initial get */
1368 return err; 941 return err;
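With the nameidata handling gone from this function, audit_add_watch() now drops and re-takes audit_filter_mutex internally so the path lookups and inotify calls happen unlocked, which is why the caller re-reads entry->rule.watch afterwards instead of trusting the pointer it captured before the call. A hedged, generic sketch of that drop-and-revalidate pattern (my_* names are hypothetical stand-ins, not the audit API):

#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);

struct my_watch;

struct my_rule {
        struct my_watch *watch;
};

/* Drops and re-takes my_mutex internally; may replace rule->watch. */
int my_attach_watch(struct my_rule *rule);

int my_add_rule(struct my_rule *rule)
{
        struct my_watch *watch;
        int err;

        mutex_lock(&my_mutex);
        err = my_attach_watch(rule);    /* lock released and re-taken inside */
        if (err) {
                mutex_unlock(&my_mutex);
                return err;
        }
        watch = rule->watch;            /* re-read: may have been replaced */
        /* ... hash and insert using the fresh watch, still under my_mutex ... */
        mutex_unlock(&my_mutex);
        return 0;
}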
@@ -1372,7 +945,7 @@ error:
1372static inline int audit_del_rule(struct audit_entry *entry) 945static inline int audit_del_rule(struct audit_entry *entry)
1373{ 946{
1374 struct audit_entry *e; 947 struct audit_entry *e;
1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 948 struct audit_watch *watch = entry->rule.watch;
1376 struct audit_tree *tree = entry->rule.tree; 949 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list; 950 struct list_head *list;
1378 LIST_HEAD(inotify_list); 951 LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1394 goto out; 967 goto out;
1395 } 968 }
1396 969
1397 watch = e->rule.watch; 970 if (e->rule.watch)
1398 if (watch) { 971 audit_remove_watch_rule(&e->rule, &inotify_list);
1399 struct audit_parent *parent = watch->parent;
1400
1401 list_del(&e->rule.rlist);
1402
1403 if (list_empty(&watch->rules)) {
1404 audit_remove_watch(watch);
1405
1406 if (list_empty(&parent->watches)) {
1407 /* Put parent on the inotify un-registration
1408 * list. Grab a reference before releasing
1409 * audit_filter_mutex, to be released in
1410 * audit_inotify_unregister().
1411 * If filesystem is going away, just leave
1412 * the sucker alone, eviction will take
1413 * care of it.
1414 */
1415 if (pin_inotify_watch(&parent->wdata))
1416 list_add(&parent->ilist, &inotify_list);
1417 }
1418 }
1419 }
1420 972
1421 if (e->rule.tree) 973 if (e->rule.tree)
1422 audit_remove_tree_rule(&e->rule); 974 audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
1438 audit_inotify_unregister(&inotify_list); 990 audit_inotify_unregister(&inotify_list);
1439 991
1440out: 992out:
1441 if (tmp_watch) 993 if (watch)
1442 audit_put_watch(tmp_watch); /* match initial get */ 994 audit_put_watch(watch); /* match initial get */
1443 if (tree) 995 if (tree)
1444 audit_put_tree(tree); /* that's the temporary one */ 996 audit_put_tree(tree); /* that's the temporary one */
1445 997
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1527 security_release_secctx(ctx, len); 1079 security_release_secctx(ctx, len);
1528 } 1080 }
1529 } 1081 }
1530 audit_log_format(ab, " op=%s rule key=", action); 1082 audit_log_format(ab, " op=");
1531 if (rule->filterkey) 1083 audit_log_string(ab, action);
1532 audit_log_untrustedstring(ab, rule->filterkey); 1084 audit_log_key(ab, rule->filterkey);
1533 else
1534 audit_log_format(ab, "(null)");
1535 audit_log_format(ab, " list=%d res=%d", rule->listnr, res); 1085 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1536 audit_log_end(ab); 1086 audit_log_end(ab);
1537} 1087}
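The removed else-branch shows exactly what the new audit_log_key() helper is expected to do for this caller; a presumed sketch of its behaviour (the real definition lives in the shared audit code, outside this hunk):

/* presumed shape only -- not the actual helper */
static void audit_log_key_sketch(struct audit_buffer *ab, char *key)
{
        audit_log_format(ab, " key=");
        if (key)
                audit_log_untrustedstring(ab, key);
        else
                audit_log_format(ab, "(null)");
}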
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1595 return PTR_ERR(entry); 1145 return PTR_ERR(entry);
1596 1146
1597 err = audit_add_rule(entry); 1147 err = audit_add_rule(entry);
1598 audit_log_rule_change(loginuid, sessionid, sid, "add", 1148 audit_log_rule_change(loginuid, sessionid, sid, "add rule",
1599 &entry->rule, !err); 1149 &entry->rule, !err);
1600 1150
1601 if (err) 1151 if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1611 return PTR_ERR(entry); 1161 return PTR_ERR(entry);
1612 1162
1613 err = audit_del_rule(entry); 1163 err = audit_del_rule(entry);
1614 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1164 audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
1615 &entry->rule, !err); 1165 &entry->rule, !err);
1616 1166
1617 audit_free_rule(entry); 1167 audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
1793 list_del(&r->list); 1343 list_del(&r->list);
1794 } else { 1344 } else {
1795 if (watch) { 1345 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules); 1346 list_add(&nentry->rule.rlist, audit_watch_rules(watch));
1797 list_del(&r->rlist); 1347 list_del(&r->rlist);
1798 } else if (tree) 1348 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist); 1349 list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
1829 1379
1830 return err; 1380 return err;
1831} 1381}
1832
1833/* Update watch data in audit rules based on inotify events. */
1834void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1835 u32 cookie, const char *dname, struct inode *inode)
1836{
1837 struct audit_parent *parent;
1838
1839 parent = container_of(i_watch, struct audit_parent, wdata);
1840
1841 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1842 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1843 inode->i_ino, 0);
1844 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1845 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1846 /* inotify automatically removes the watch and sends IN_IGNORED */
1847 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1848 audit_remove_parent_watches(parent);
1849 /* inotify does not remove the watch, so remove it manually */
1850 else if(mask & IN_MOVE_SELF) {
1851 audit_remove_parent_watches(parent);
1852 inotify_remove_watch_locked(audit_ih, i_watch);
1853 } else if (mask & IN_IGNORED)
1854 put_inotify_watch(i_watch);
1855}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f414..68d3c6a0ecd6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count; 201 int tree_count;
202 struct list_head killed_trees;
202 203
203 int type; 204 int type;
204 union { 205 union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
548 } 549 }
549 break; 550 break;
550 case AUDIT_WATCH: 551 case AUDIT_WATCH:
551 if (name && rule->watch->ino != (unsigned long)-1) 552 if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
552 result = (name->dev == rule->watch->dev && 553 result = (name->dev == audit_watch_dev(rule->watch) &&
553 name->ino == rule->watch->ino); 554 name->ino == audit_watch_inode(rule->watch));
554 break; 555 break;
555 case AUDIT_DIR: 556 case AUDIT_DIR:
556 if (ctx) 557 if (ctx)
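rule->watch is no longer dereferenced directly here; its fields are reached through audit_watch_inode() and audit_watch_dev(), accessors that presumably do no more than return the corresponding members now that struct audit_watch is private to the watch code. Roughly (assumed, not shown in this diff):

unsigned long audit_watch_inode(struct audit_watch *watch)
{
        return watch->ino;
}

dev_t audit_watch_dev(struct audit_watch *watch)
{
        return watch->dev;
}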
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
853 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) 854 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
854 return NULL; 855 return NULL;
855 audit_zero_context(context, state); 856 audit_zero_context(context, state);
857 INIT_LIST_HEAD(&context->killed_trees);
856 return context; 858 return context;
857} 859}
858 860
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1026{
1025 char arg_num_len_buf[12]; 1027 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1028 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of " a=" */ 1029 /* how many digits are in arg_num? 5 is the length of ' a=""' */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1030 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
1029 size_t len, len_left, to_send; 1031 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1032 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
1031 unsigned int i, has_cntl = 0, too_long = 0; 1033 unsigned int i, has_cntl = 0, too_long = 0;
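The corrected constant is easy to check by hand: the fixed characters around each argument are the leading space, 'a', '=', and the two quotes.

/* worked example with a hypothetical arg_num of 12:
 *   " a12=\"...\""  ->  ' ' 'a' '=' '"' '"'  =  5 fixed characters,
 *   plus the 2 digits of "12", so arg_num_len = 2 + 5 = 7;
 *   the old "+ 3" under-counted by missing the two quote characters. */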
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 if (has_cntl) 1139 if (has_cntl)
1138 audit_log_n_hex(*ab, buf, to_send); 1140 audit_log_n_hex(*ab, buf, to_send);
1139 else 1141 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1142 audit_log_string(*ab, buf);
1141 1143
1142 p += to_send; 1144 p += to_send;
1143 len_left -= to_send; 1145 len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1372 1374
1373 1375
1374 audit_log_task_info(ab, tsk); 1376 audit_log_task_info(ab, tsk);
1375 if (context->filterkey) { 1377 audit_log_key(ab, context->filterkey);
1376 audit_log_format(ab, " key=");
1377 audit_log_untrustedstring(ab, context->filterkey);
1378 } else
1379 audit_log_format(ab, " key=(null)");
1380 audit_log_end(ab); 1378 audit_log_end(ab);
1381 1379
1382 for (aux = context->aux; aux; aux = aux->next) { 1380 for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
1549 /* that can happen only if we are called from do_exit() */ 1547 /* that can happen only if we are called from do_exit() */
1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1548 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1551 audit_log_exit(context, tsk); 1549 audit_log_exit(context, tsk);
1550 if (!list_empty(&context->killed_trees))
1551 audit_kill_trees(&context->killed_trees);
1552 1552
1553 audit_free_context(context); 1553 audit_free_context(context);
1554} 1554}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
1692 context->in_syscall = 0; 1692 context->in_syscall = 0;
1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1694 1694
1695 if (!list_empty(&context->killed_trees))
1696 audit_kill_trees(&context->killed_trees);
1697
1695 if (context->previous) { 1698 if (context->previous) {
1696 struct audit_context *new_context = context->previous; 1699 struct audit_context *new_context = context->previous;
1697 context->previous = NULL; 1700 context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
2525 audit_log_format(ab, " sig=%ld", signr); 2528 audit_log_format(ab, " sig=%ld", signr);
2526 audit_log_end(ab); 2529 audit_log_end(ab);
2527} 2530}
2531
2532struct list_head *audit_killed_trees(void)
2533{
2534 struct audit_context *ctx = current->audit_context;
2535 if (likely(!ctx || !ctx->in_syscall))
2536 return NULL;
2537 return &ctx->killed_trees;
2538}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fb789f6df94..c7ece8f027f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -599,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
599static struct file_operations proc_cgroupstats_operations; 600static struct file_operations proc_cgroupstats_operations;
600 601
601static struct backing_dev_info cgroup_backing_dev_info = { 602static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup",
602 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 604 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
603}; 605};
604 606
@@ -734,16 +736,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to go down 736 * reference to css->refcnt. In general, this refcnt is expected to go down
735 * to zero, soon. 737 * to zero, soon.
736 * 738 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 739 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 740 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 741DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 742
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 743static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 744{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 745 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 746 wake_up_all(&cgroup_rmdir_waitq);
745} 747}
746 748
749void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
750{
751 css_get(css);
752}
753
754void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
755{
756 cgroup_wakeup_rmdir_waiter(css->cgroup);
757 css_put(css);
758}
759
760
747static int rebind_subsystems(struct cgroupfs_root *root, 761static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 762 unsigned long final_bits)
749{ 763{
@@ -843,6 +857,11 @@ static int parse_cgroupfs_options(char *data,
843 struct cgroup_sb_opts *opts) 857 struct cgroup_sb_opts *opts)
844{ 858{
845 char *token, *o = data ?: "all"; 859 char *token, *o = data ?: "all";
860 unsigned long mask = (unsigned long)-1;
861
862#ifdef CONFIG_CPUSETS
863 mask = ~(1UL << cpuset_subsys_id);
864#endif
846 865
847 opts->subsys_bits = 0; 866 opts->subsys_bits = 0;
848 opts->flags = 0; 867 opts->flags = 0;
@@ -887,6 +906,15 @@ static int parse_cgroupfs_options(char *data,
887 } 906 }
888 } 907 }
889 908
909 /*
910 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just
912 * the cpuset subsystem.
913 */
914 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
915 (opts->subsys_bits & mask))
916 return -EINVAL;
917
890 /* We can't have an empty hierarchy */ 918 /* We can't have an empty hierarchy */
891 if (!opts->subsys_bits) 919 if (!opts->subsys_bits)
892 return -EINVAL; 920 return -EINVAL;
@@ -946,6 +974,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
946 INIT_LIST_HEAD(&cgrp->children); 974 INIT_LIST_HEAD(&cgrp->children);
947 INIT_LIST_HEAD(&cgrp->css_sets); 975 INIT_LIST_HEAD(&cgrp->css_sets);
948 INIT_LIST_HEAD(&cgrp->release_list); 976 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list);
949 init_rwsem(&cgrp->pids_mutex); 978 init_rwsem(&cgrp->pids_mutex);
950} 979}
951static void init_cgroup_root(struct cgroupfs_root *root) 980static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1343,7 +1372,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1343 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1372 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1344 * is no longer empty. 1373 * is no longer empty.
1345 */ 1374 */
1346 cgroup_wakeup_rmdir_waiters(cgrp); 1375 cgroup_wakeup_rmdir_waiter(cgrp);
1347 return 0; 1376 return 0;
1348} 1377}
1349 1378
@@ -2187,12 +2216,30 @@ err:
2187 return ret; 2216 return ret;
2188} 2217}
2189 2218
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228 /* The namespace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232 /* How many files are using this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2190static int cmppid(const void *a, const void *b) 2238static int cmppid(const void *a, const void *b)
2191{ 2239{
2192 return *(pid_t *)a - *(pid_t *)b; 2240 return *(pid_t *)a - *(pid_t *)b;
2193} 2241}
2194 2242
2195
2196/* 2243/*
2197 * seq_file methods for the "tasks" file. The seq_file position is the 2244 * seq_file methods for the "tasks" file. The seq_file position is the
2198 * next pid to display; the seq_file iterator is a pointer to the pid 2245 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2207,45 +2254,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2207 * after a seek to the start). Use a binary-search to find the 2254 * after a seek to the start). Use a binary-search to find the
2208 * next pid to display, if any 2255 * next pid to display, if any
2209 */ 2256 */
2210 struct cgroup *cgrp = s->private; 2257 struct cgroup_pids *cp = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2211 int index = 0, pid = *pos; 2259 int index = 0, pid = *pos;
2212 int *iter; 2260 int *iter;
2213 2261
2214 down_read(&cgrp->pids_mutex); 2262 down_read(&cgrp->pids_mutex);
2215 if (pid) { 2263 if (pid) {
2216 int end = cgrp->pids_length; 2264 int end = cp->length;
2217 2265
2218 while (index < end) { 2266 while (index < end) {
2219 int mid = (index + end) / 2; 2267 int mid = (index + end) / 2;
2220 if (cgrp->tasks_pids[mid] == pid) { 2268 if (cp->tasks_pids[mid] == pid) {
2221 index = mid; 2269 index = mid;
2222 break; 2270 break;
2223 } else if (cgrp->tasks_pids[mid] <= pid) 2271 } else if (cp->tasks_pids[mid] <= pid)
2224 index = mid + 1; 2272 index = mid + 1;
2225 else 2273 else
2226 end = mid; 2274 end = mid;
2227 } 2275 }
2228 } 2276 }
2229 /* If we're off the end of the array, we're done */ 2277 /* If we're off the end of the array, we're done */
2230 if (index >= cgrp->pids_length) 2278 if (index >= cp->length)
2231 return NULL; 2279 return NULL;
2232 /* Update the abstract position to be the actual pid that we found */ 2280 /* Update the abstract position to be the actual pid that we found */
2233 iter = cgrp->tasks_pids + index; 2281 iter = cp->tasks_pids + index;
2234 *pos = *iter; 2282 *pos = *iter;
2235 return iter; 2283 return iter;
2236} 2284}
2237 2285
2238static void cgroup_tasks_stop(struct seq_file *s, void *v) 2286static void cgroup_tasks_stop(struct seq_file *s, void *v)
2239{ 2287{
2240 struct cgroup *cgrp = s->private; 2288 struct cgroup_pids *cp = s->private;
2289 struct cgroup *cgrp = cp->cgrp;
2241 up_read(&cgrp->pids_mutex); 2290 up_read(&cgrp->pids_mutex);
2242} 2291}
2243 2292
2244static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2245{ 2294{
2246 struct cgroup *cgrp = s->private; 2295 struct cgroup_pids *cp = s->private;
2247 int *p = v; 2296 int *p = v;
2248 int *end = cgrp->tasks_pids + cgrp->pids_length; 2297 int *end = cp->tasks_pids + cp->length;
2249 2298
2250 /* 2299 /*
2251 * Advance to the next pid in the array. If this goes off the 2300 * Advance to the next pid in the array. If this goes off the
@@ -2272,26 +2321,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2272 .show = cgroup_tasks_show, 2321 .show = cgroup_tasks_show,
2273}; 2322};
2274 2323
2275static void release_cgroup_pid_array(struct cgroup *cgrp) 2324static void release_cgroup_pid_array(struct cgroup_pids *cp)
2276{ 2325{
2326 struct cgroup *cgrp = cp->cgrp;
2327
2277 down_write(&cgrp->pids_mutex); 2328 down_write(&cgrp->pids_mutex);
2278 BUG_ON(!cgrp->pids_use_count); 2329 BUG_ON(!cp->use_count);
2279 if (!--cgrp->pids_use_count) { 2330 if (!--cp->use_count) {
2280 kfree(cgrp->tasks_pids); 2331 list_del(&cp->list);
2281 cgrp->tasks_pids = NULL; 2332 put_pid_ns(cp->ns);
2282 cgrp->pids_length = 0; 2333 kfree(cp->tasks_pids);
2334 kfree(cp);
2283 } 2335 }
2284 up_write(&cgrp->pids_mutex); 2336 up_write(&cgrp->pids_mutex);
2285} 2337}
2286 2338
2287static int cgroup_tasks_release(struct inode *inode, struct file *file) 2339static int cgroup_tasks_release(struct inode *inode, struct file *file)
2288{ 2340{
2289 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2341 struct seq_file *seq;
2342 struct cgroup_pids *cp;
2290 2343
2291 if (!(file->f_mode & FMODE_READ)) 2344 if (!(file->f_mode & FMODE_READ))
2292 return 0; 2345 return 0;
2293 2346
2294 release_cgroup_pid_array(cgrp); 2347 seq = file->private_data;
2348 cp = seq->private;
2349
2350 release_cgroup_pid_array(cp);
2295 return seq_release(inode, file); 2351 return seq_release(inode, file);
2296} 2352}
2297 2353
@@ -2310,6 +2366,8 @@ static struct file_operations cgroup_tasks_operations = {
2310static int cgroup_tasks_open(struct inode *unused, struct file *file) 2366static int cgroup_tasks_open(struct inode *unused, struct file *file)
2311{ 2367{
2312 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns;
2370 struct cgroup_pids *cp;
2313 pid_t *pidarray; 2371 pid_t *pidarray;
2314 int npids; 2372 int npids;
2315 int retval; 2373 int retval;
@@ -2336,20 +2394,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2336 * array if necessary 2394 * array if necessary
2337 */ 2395 */
2338 down_write(&cgrp->pids_mutex); 2396 down_write(&cgrp->pids_mutex);
2339 kfree(cgrp->tasks_pids); 2397
2340 cgrp->tasks_pids = pidarray; 2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2341 cgrp->pids_length = npids; 2399 if (ns == cp->ns)
2342 cgrp->pids_use_count++; 2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2343 up_write(&cgrp->pids_mutex); 2418 up_write(&cgrp->pids_mutex);
2344 2419
2345 file->f_op = &cgroup_tasks_operations; 2420 file->f_op = &cgroup_tasks_operations;
2346 2421
2347 retval = seq_open(file, &cgroup_tasks_seq_operations); 2422 retval = seq_open(file, &cgroup_tasks_seq_operations);
2348 if (retval) { 2423 if (retval) {
2349 release_cgroup_pid_array(cgrp); 2424 release_cgroup_pid_array(cp);
2350 return retval; 2425 return retval;
2351 } 2426 }
2352 ((struct seq_file *)file->private_data)->private = cgrp; 2427 ((struct seq_file *)file->private_data)->private = cp;
2353 return 0; 2428 return 0;
2354} 2429}
2355 2430
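The open path above is a lookup-or-create cache keyed by pid namespace: under cgrp->pids_mutex it scans cgrp->pids_list for an existing cgroup_pids entry owned by the caller's namespace and only allocates one on a miss, so each namespace gets a single refcounted pid array per cgroup. Reduced to the bare idiom (hypothetical my_* types):

#include <linux/list.h>
#include <linux/slab.h>

struct my_key;

struct my_cache {
        struct list_head list;
        struct my_key *key;
        int use_count;
};

/* caller holds the lock protecting cache_list */
static struct my_cache *my_cache_get(struct list_head *cache_list,
                                     struct my_key *key)
{
        struct my_cache *c;

        list_for_each_entry(c, cache_list, list)
                if (c->key == key)
                        goto found;

        c = kzalloc(sizeof(*c), GFP_KERNEL);
        if (!c)
                return NULL;
        c->key = key;
        list_add(&c->list, cache_list);
found:
        c->use_count++;
        return c;
}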
@@ -2682,33 +2757,42 @@ again:
2682 mutex_unlock(&cgroup_mutex); 2757 mutex_unlock(&cgroup_mutex);
2683 2758
2684 /* 2759 /*
2760 * In general, a subsystem has no css->refcnt left after pre_destroy(). But
2761 * in racy cases a subsystem may still take a css reference after
2762 * pre_destroy(), which makes rmdir() return -EBUSY more often than it
2763 * should. To avoid that, we use a waitqueue for cgroup's rmdir.
2764 * CGRP_WAIT_ON_RMDIR synchronizes rmdir() with the subsystem's
2765 * reference-count handling; see css_get/put, css_tryget() and the
2766 * cgroup_wakeup_rmdir_waiter() implementation.
2767 */
2768 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2769
2770 /*
2685 * Call pre_destroy handlers of subsys. Notify subsystems 2771 * Call pre_destroy handlers of subsys. Notify subsystems
2686 * that rmdir() request comes. 2772 * that rmdir() request comes.
2687 */ 2773 */
2688 ret = cgroup_call_pre_destroy(cgrp); 2774 ret = cgroup_call_pre_destroy(cgrp);
2689 if (ret) 2775 if (ret) {
2776 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2690 return ret; 2777 return ret;
2778 }
2691 2779
2692 mutex_lock(&cgroup_mutex); 2780 mutex_lock(&cgroup_mutex);
2693 parent = cgrp->parent; 2781 parent = cgrp->parent;
2694 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2782 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2783 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2695 mutex_unlock(&cgroup_mutex); 2784 mutex_unlock(&cgroup_mutex);
2696 return -EBUSY; 2785 return -EBUSY;
2697 } 2786 }
2698 /*
2699 * css_put/get is provided for subsys to grab refcnt to css. In typical
2700 * case, subsystem has no reference after pre_destroy(). But, under
2701 * hierarchy management, some *temporal* refcnt can be hold.
2702 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2703 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2704 * is called when css_put() is called and refcnt goes down to 0.
2705 */
2706 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2707 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2787 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2708
2709 if (!cgroup_clear_css_refs(cgrp)) { 2788 if (!cgroup_clear_css_refs(cgrp)) {
2710 mutex_unlock(&cgroup_mutex); 2789 mutex_unlock(&cgroup_mutex);
2711 schedule(); 2790 /*
2791 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2792 * prepare_to_wait(), we need to check this flag.
2793 */
2794 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2795 schedule();
2712 finish_wait(&cgroup_rmdir_waitq, &wait); 2796 finish_wait(&cgroup_rmdir_waitq, &wait);
2713 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2797 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2714 if (signal_pending(current)) 2798 if (signal_pending(current))
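Stripped of the cgroup specifics, cgroup_rmdir() and cgroup_wakeup_rmdir_waiter() now follow the classic flag-plus-waitqueue protocol: the sleeper sets a flag, calls prepare_to_wait(), re-checks the flag before scheduling in case the wakeup already happened, and the waker clears the flag with test_and_clear_bit() so only a waiter that actually armed it gets woken. A generic sketch with hypothetical names:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static unsigned long my_flags;
#define MY_WAITING 0

static void my_waker(void)
{
        if (test_and_clear_bit(MY_WAITING, &my_flags))
                wake_up_all(&my_waitq);
}

static void my_waiter(void)
{
        DEFINE_WAIT(wait);

        set_bit(MY_WAITING, &my_flags);
        prepare_to_wait(&my_waitq, &wait, TASK_INTERRUPTIBLE);
        if (test_bit(MY_WAITING, &my_flags))    /* wakeup not seen yet? */
                schedule();
        finish_wait(&my_waitq, &wait);
        clear_bit(MY_WAITING, &my_flags);
}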
@@ -3280,7 +3364,7 @@ void __css_put(struct cgroup_subsys_state *css)
3280 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3364 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3281 check_for_release(cgrp); 3365 check_for_release(cgrp);
3282 } 3366 }
3283 cgroup_wakeup_rmdir_waiters(cgrp); 3367 cgroup_wakeup_rmdir_waiter(cgrp);
3284 } 3368 }
3285 rcu_read_unlock(); 3369 rcu_read_unlock();
3286} 3370}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8d..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
34 * an ongoing cpu hotplug operation. 34 * an ongoing cpu hotplug operation.
35 */ 35 */
36 int refcount; 36 int refcount;
37} cpu_hotplug; 37} cpu_hotplug = {
38 38 .active_writer = NULL,
39void __init cpu_hotplug_init(void) 39 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
40{ 40 .refcount = 0,
41 cpu_hotplug.active_writer = NULL; 41};
42 mutex_init(&cpu_hotplug.lock);
43 cpu_hotplug.refcount = 0;
44}
45 42
46#ifdef CONFIG_HOTPLUG_CPU 43#ifdef CONFIG_HOTPLUG_CPU
47 44
@@ -404,6 +401,7 @@ int disable_nonboot_cpus(void)
404 break; 401 break;
405 } 402 }
406 } 403 }
404
407 if (!error) { 405 if (!error) {
408 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
409 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -416,6 +414,14 @@ int disable_nonboot_cpus(void)
416 return error; 414 return error;
417} 415}
418 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
419void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
420{ 426{
421 int cpu, error; 427 int cpu, error;
@@ -427,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
427 goto out; 433 goto out;
428 434
429 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
430 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
431 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
432 if (!error) { 441 if (!error) {
@@ -435,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
435 } 444 }
436 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
437 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
438 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
439out: 451out:
440 cpu_maps_update_done(); 452 cpu_maps_update_done();
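The two __weak stubs exist so the generic path always has something to call; an architecture that needs to bracket the mass-onlining of CPUs provides strong definitions, which replace the empty weak ones at link time. A hypothetical override, purely for illustration:

void arch_enable_nonboot_cpus_begin(void)
{
        /* e.g. defer expensive per-CPU state synchronization until ..._end() */
        printk(KERN_INFO "arch: bringing non-boot CPUs online\n");
}

void arch_enable_nonboot_cpus_end(void)
{
        printk(KERN_INFO "arch: non-boot CPUs are online\n");
}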
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5a7e17474ee..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
232 * be accessed in the context of that task, so require no locks. 205 * by another task, so we use alloc_lock in the task_struct to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be bound to
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
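The ordering in the helper is the whole point: by OR-ing the new nodes in before the final assignment, a task racing with this update always observes a non-empty mems_allowed even when the old and new node sets are disjoint. A small illustrative walk-through with made-up node numbers (not kernel code):

#include <linux/nodemask.h>

static void nodemask_update_example(void)
{
        nodemask_t cur  = NODE_MASK_NONE;       /* task's current mems_allowed */
        nodemask_t want = NODE_MASK_NONE;       /* cpuset's new mems_allowed   */

        node_set(0, cur);  node_set(1, cur);    /* old placement: nodes {0,1}  */
        node_set(2, want); node_set(3, want);   /* new placement: nodes {2,3}  */

        nodes_or(cur, cur, want);       /* grow first: cur == {0,1,2,3}, never empty */
        /* the real code rebinds the mempolicy here, against cur and then want */
        cur = want;                     /* only now shrink: cur == {2,3} */
}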
965/*
966 * Update the task's mems_allowed, rebind its mempolicy and its vmas'
967 * mempolicies to the cpuset's new mems_allowed, and migrate pages to the
968 * new nodes if the memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1147 * @cs: the cpuset in which each task's spread flags needs to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..d7f7a01082eb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
18#include <linux/cn_proc.h> 18#include <linux/cn_proc.h>
19#include "cred-internals.h" 19#include "cred-internals.h"
20 20
21#if 0
22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif
32
21static struct kmem_cache *cred_jar; 33static struct kmem_cache *cred_jar;
22 34
23/* 35/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
36 */ 48 */
37struct cred init_cred = { 49struct cred init_cred = {
38 .usage = ATOMIC_INIT(4), 50 .usage = ATOMIC_INIT(4),
51#ifdef CONFIG_DEBUG_CREDENTIALS
52 .subscribers = ATOMIC_INIT(2),
53 .magic = CRED_MAGIC,
54#endif
39 .securebits = SECUREBITS_DEFAULT, 55 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET, 56 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET, 57 .cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
48#endif 64#endif
49}; 65};
50 66
67static inline void set_cred_subscribers(struct cred *cred, int n)
68{
69#ifdef CONFIG_DEBUG_CREDENTIALS
70 atomic_set(&cred->subscribers, n);
71#endif
72}
73
74static inline int read_cred_subscribers(const struct cred *cred)
75{
76#ifdef CONFIG_DEBUG_CREDENTIALS
77 return atomic_read(&cred->subscribers);
78#else
79 return 0;
80#endif
81}
82
83static inline void alter_cred_subscribers(const struct cred *_cred, int n)
84{
85#ifdef CONFIG_DEBUG_CREDENTIALS
86 struct cred *cred = (struct cred *) _cred;
87
88 atomic_add(n, &cred->subscribers);
89#endif
90}
91
51/* 92/*
52 * Dispose of the shared task group credentials 93 * Dispose of the shared task group credentials
53 */ 94 */
@@ -85,15 +126,29 @@ static void put_cred_rcu(struct rcu_head *rcu)
85{ 126{
86 struct cred *cred = container_of(rcu, struct cred, rcu); 127 struct cred *cred = container_of(rcu, struct cred, rcu);
87 128
129 kdebug("put_cred_rcu(%p)", cred);
130
131#ifdef CONFIG_DEBUG_CREDENTIALS
132 if (cred->magic != CRED_MAGIC_DEAD ||
133 atomic_read(&cred->usage) != 0 ||
134 read_cred_subscribers(cred) != 0)
135 panic("CRED: put_cred_rcu() sees %p with"
136 " mag %x, put %p, usage %d, subscr %d\n",
137 cred, cred->magic, cred->put_addr,
138 atomic_read(&cred->usage),
139 read_cred_subscribers(cred));
140#else
88 if (atomic_read(&cred->usage) != 0) 141 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n", 142 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage)); 143 cred, atomic_read(&cred->usage));
144#endif
91 145
92 security_cred_free(cred); 146 security_cred_free(cred);
93 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth); 148 key_put(cred->request_key_auth);
95 release_tgcred(cred); 149 release_tgcred(cred);
96 put_group_info(cred->group_info); 150 if (cred->group_info)
151 put_group_info(cred->group_info);
97 free_uid(cred->user); 152 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred); 153 kmem_cache_free(cred_jar, cred);
99} 154}
@@ -106,12 +161,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
106 */ 161 */
107void __put_cred(struct cred *cred) 162void __put_cred(struct cred *cred)
108{ 163{
164 kdebug("__put_cred(%p{%d,%d})", cred,
165 atomic_read(&cred->usage),
166 read_cred_subscribers(cred));
167
109 BUG_ON(atomic_read(&cred->usage) != 0); 168 BUG_ON(atomic_read(&cred->usage) != 0);
169#ifdef CONFIG_DEBUG_CREDENTIALS
170 BUG_ON(read_cred_subscribers(cred) != 0);
171 cred->magic = CRED_MAGIC_DEAD;
172 cred->put_addr = __builtin_return_address(0);
173#endif
174 BUG_ON(cred == current->cred);
175 BUG_ON(cred == current->real_cred);
110 176
111 call_rcu(&cred->rcu, put_cred_rcu); 177 call_rcu(&cred->rcu, put_cred_rcu);
112} 178}
113EXPORT_SYMBOL(__put_cred); 179EXPORT_SYMBOL(__put_cred);
114 180
181/*
182 * Clean up a task's credentials when it exits
183 */
184void exit_creds(struct task_struct *tsk)
185{
186 struct cred *cred;
187
188 kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
189 atomic_read(&tsk->cred->usage),
190 read_cred_subscribers(tsk->cred));
191
192 cred = (struct cred *) tsk->real_cred;
193 tsk->real_cred = NULL;
194 validate_creds(cred);
195 alter_cred_subscribers(cred, -1);
196 put_cred(cred);
197
198 cred = (struct cred *) tsk->cred;
199 tsk->cred = NULL;
200 validate_creds(cred);
201 alter_cred_subscribers(cred, -1);
202 put_cred(cred);
203
204 cred = (struct cred *) tsk->replacement_session_keyring;
205 if (cred) {
206 tsk->replacement_session_keyring = NULL;
207 validate_creds(cred);
208 put_cred(cred);
209 }
210}
211
212/*
213 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM.
215 */
216struct cred *cred_alloc_blank(void)
217{
218 struct cred *new;
219
220 new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
221 if (!new)
222 return NULL;
223
224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) {
227 kfree(new);
228 return NULL;
229 }
230 atomic_set(&new->tgcred->usage, 1);
231#endif
232
233 atomic_set(&new->usage, 1);
234
235 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
236 goto error;
237
238#ifdef CONFIG_DEBUG_CREDENTIALS
239 new->magic = CRED_MAGIC;
240#endif
241 return new;
242
243error:
244 abort_creds(new);
245 return NULL;
246}
247
115/** 248/**
116 * prepare_creds - Prepare a new set of credentials for modification 249 * prepare_creds - Prepare a new set of credentials for modification
117 * 250 *
@@ -132,16 +265,19 @@ struct cred *prepare_creds(void)
132 const struct cred *old; 265 const struct cred *old;
133 struct cred *new; 266 struct cred *new;
134 267
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1); 268 validate_process_creds();
136 269
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL); 270 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new) 271 if (!new)
139 return NULL; 272 return NULL;
140 273
274 kdebug("prepare_creds() alloc %p", new);
275
141 old = task->cred; 276 old = task->cred;
142 memcpy(new, old, sizeof(struct cred)); 277 memcpy(new, old, sizeof(struct cred));
143 278
144 atomic_set(&new->usage, 1); 279 atomic_set(&new->usage, 1);
280 set_cred_subscribers(new, 0);
145 get_group_info(new->group_info); 281 get_group_info(new->group_info);
146 get_uid(new->user); 282 get_uid(new->user);
147 283
@@ -157,6 +293,7 @@ struct cred *prepare_creds(void)
157 293
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 294 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error; 295 goto error;
296 validate_creds(new);
160 return new; 297 return new;
161 298
162error: 299error:
@@ -229,9 +366,12 @@ struct cred *prepare_usermodehelper_creds(void)
229 if (!new) 366 if (!new)
230 return NULL; 367 return NULL;
231 368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
232 memcpy(new, &init_cred, sizeof(struct cred)); 371 memcpy(new, &init_cred, sizeof(struct cred));
233 372
234 atomic_set(&new->usage, 1); 373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
235 get_group_info(new->group_info); 375 get_group_info(new->group_info);
236 get_uid(new->user); 376 get_uid(new->user);
237 377
@@ -250,6 +390,7 @@ struct cred *prepare_usermodehelper_creds(void)
250#endif 390#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) 391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error; 392 goto error;
393 validate_creds(new);
253 394
254 BUG_ON(atomic_read(&new->usage) != 1); 395 BUG_ON(atomic_read(&new->usage) != 1);
255 return new; 396 return new;
@@ -286,6 +427,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
286 ) { 427 ) {
287 p->real_cred = get_cred(p->cred); 428 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred); 429 get_cred(p->cred);
430 alter_cred_subscribers(p->cred, 2);
431 kdebug("share_creds(%p{%d,%d})",
432 p->cred, atomic_read(&p->cred->usage),
433 read_cred_subscribers(p->cred));
289 atomic_inc(&p->cred->user->processes); 434 atomic_inc(&p->cred->user->processes);
290 return 0; 435 return 0;
291 } 436 }
@@ -331,6 +476,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
331 476
332 atomic_inc(&new->user->processes); 477 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new); 478 p->cred = p->real_cred = get_cred(new);
479 alter_cred_subscribers(new, 2);
480 validate_creds(new);
334 return 0; 481 return 0;
335 482
336error_put: 483error_put:
@@ -355,13 +502,20 @@ error_put:
355int commit_creds(struct cred *new) 502int commit_creds(struct cred *new)
356{ 503{
357 struct task_struct *task = current; 504 struct task_struct *task = current;
358 const struct cred *old; 505 const struct cred *old = task->real_cred;
359 506
360 BUG_ON(task->cred != task->real_cred); 507 kdebug("commit_creds(%p{%d,%d})", new,
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2); 508 atomic_read(&new->usage),
509 read_cred_subscribers(new));
510
511 BUG_ON(task->cred != old);
512#ifdef CONFIG_DEBUG_CREDENTIALS
513 BUG_ON(read_cred_subscribers(old) < 2);
514 validate_creds(old);
515 validate_creds(new);
516#endif
362 BUG_ON(atomic_read(&new->usage) < 1); 517 BUG_ON(atomic_read(&new->usage) < 1);
363 518
364 old = task->real_cred;
365 security_commit_creds(new, old); 519 security_commit_creds(new, old);
366 520
367 get_cred(new); /* we will require a ref for the subj creds too */ 521 get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +544,14 @@ int commit_creds(struct cred *new)
390 * cheaply with the new uid cache, so if it matters 544 * cheaply with the new uid cache, so if it matters
391 * we should be checking for it. -DaveM 545 * we should be checking for it. -DaveM
392 */ 546 */
547 alter_cred_subscribers(new, 2);
393 if (new->user != old->user) 548 if (new->user != old->user)
394 atomic_inc(&new->user->processes); 549 atomic_inc(&new->user->processes);
395 rcu_assign_pointer(task->real_cred, new); 550 rcu_assign_pointer(task->real_cred, new);
396 rcu_assign_pointer(task->cred, new); 551 rcu_assign_pointer(task->cred, new);
397 if (new->user != old->user) 552 if (new->user != old->user)
398 atomic_dec(&old->user->processes); 553 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2);
399 555
400 sched_switch_user(task); 556 sched_switch_user(task);
401 557
@@ -428,6 +584,13 @@ EXPORT_SYMBOL(commit_creds);
428 */ 584 */
429void abort_creds(struct cred *new) 585void abort_creds(struct cred *new)
430{ 586{
587 kdebug("abort_creds(%p{%d,%d})", new,
588 atomic_read(&new->usage),
589 read_cred_subscribers(new));
590
591#ifdef CONFIG_DEBUG_CREDENTIALS
592 BUG_ON(read_cred_subscribers(new) != 0);
593#endif
431 BUG_ON(atomic_read(&new->usage) < 1); 594 BUG_ON(atomic_read(&new->usage) < 1);
432 put_cred(new); 595 put_cred(new);
433} 596}
@@ -444,7 +607,20 @@ const struct cred *override_creds(const struct cred *new)
444{ 607{
445 const struct cred *old = current->cred; 608 const struct cred *old = current->cred;
446 609
447 rcu_assign_pointer(current->cred, get_cred(new)); 610 kdebug("override_creds(%p{%d,%d})", new,
611 atomic_read(&new->usage),
612 read_cred_subscribers(new));
613
614 validate_creds(old);
615 validate_creds(new);
616 get_cred(new);
617 alter_cred_subscribers(new, 1);
618 rcu_assign_pointer(current->cred, new);
619 alter_cred_subscribers(old, -1);
620
621 kdebug("override_creds() = %p{%d,%d}", old,
622 atomic_read(&old->usage),
623 read_cred_subscribers(old));
448 return old; 624 return old;
449} 625}
450EXPORT_SYMBOL(override_creds); 626EXPORT_SYMBOL(override_creds);
@@ -460,7 +636,15 @@ void revert_creds(const struct cred *old)
460{ 636{
461 const struct cred *override = current->cred; 637 const struct cred *override = current->cred;
462 638
639 kdebug("revert_creds(%p{%d,%d})", old,
640 atomic_read(&old->usage),
641 read_cred_subscribers(old));
642
643 validate_creds(old);
644 validate_creds(override);
645 alter_cred_subscribers(old, 1);
463 rcu_assign_pointer(current->cred, old); 646 rcu_assign_pointer(current->cred, old);
647 alter_cred_subscribers(override, -1);
464 put_cred(override); 648 put_cred(override);
465} 649}
466EXPORT_SYMBOL(revert_creds); 650EXPORT_SYMBOL(revert_creds);
@@ -502,11 +686,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
502 if (!new) 686 if (!new)
503 return NULL; 687 return NULL;
504 688
689 kdebug("prepare_kernel_cred() alloc %p", new);
690
505 if (daemon) 691 if (daemon)
506 old = get_task_cred(daemon); 692 old = get_task_cred(daemon);
507 else 693 else
508 old = get_cred(&init_cred); 694 old = get_cred(&init_cred);
509 695
696 validate_creds(old);
697
510 *new = *old; 698 *new = *old;
511 get_uid(new->user); 699 get_uid(new->user);
512 get_group_info(new->group_info); 700 get_group_info(new->group_info);
@@ -526,7 +714,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
526 goto error; 714 goto error;
527 715
528 atomic_set(&new->usage, 1); 716 atomic_set(&new->usage, 1);
717 set_cred_subscribers(new, 0);
529 put_cred(old); 718 put_cred(old);
719 validate_creds(new);
530 return new; 720 return new;
531 721
532error: 722error:
@@ -589,3 +779,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
589 return security_kernel_create_files_as(new, inode); 779 return security_kernel_create_files_as(new, inode);
590} 780}
591EXPORT_SYMBOL(set_create_files_as); 781EXPORT_SYMBOL(set_create_files_as);
782
783#ifdef CONFIG_DEBUG_CREDENTIALS
784
785/*
786 * dump invalid credentials
787 */
788static void dump_invalid_creds(const struct cred *cred, const char *label,
789 const struct task_struct *tsk)
790{
791 printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
792 label, cred,
793 cred == &init_cred ? "[init]" : "",
794 cred == tsk->real_cred ? "[real]" : "",
795 cred == tsk->cred ? "[eff]" : "");
796 printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
797 cred->magic, cred->put_addr);
798 printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
799 atomic_read(&cred->usage),
800 read_cred_subscribers(cred));
801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
802 cred->uid, cred->euid, cred->suid, cred->fsuid);
803 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
804 cred->gid, cred->egid, cred->sgid, cred->fsgid);
805#ifdef CONFIG_SECURITY
806 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
807 if ((unsigned long) cred->security >= PAGE_SIZE &&
808 (((unsigned long) cred->security & 0xffffff00) !=
809 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
810 printk(KERN_ERR "CRED: ->security {%x, %x}\n",
811 ((u32*)cred->security)[0],
812 ((u32*)cred->security)[1]);
813#endif
814}
815
816/*
817 * report use of invalid credentials
818 */
819void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
820{
821 printk(KERN_ERR "CRED: Invalid credentials\n");
822 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
823 dump_invalid_creds(cred, "Specified", current);
824 BUG();
825}
826EXPORT_SYMBOL(__invalid_creds);
827
828/*
829 * check the credentials on a process
830 */
831void __validate_process_creds(struct task_struct *tsk,
832 const char *file, unsigned line)
833{
834 if (tsk->cred == tsk->real_cred) {
835 if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
836 creds_are_invalid(tsk->cred)))
837 goto invalid_creds;
838 } else {
839 if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
840 read_cred_subscribers(tsk->cred) < 1 ||
841 creds_are_invalid(tsk->real_cred) ||
842 creds_are_invalid(tsk->cred)))
843 goto invalid_creds;
844 }
845 return;
846
847invalid_creds:
848 printk(KERN_ERR "CRED: Invalid process credentials\n");
849 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
850
851 dump_invalid_creds(tsk->real_cred, "Real", tsk);
852 if (tsk->cred != tsk->real_cred)
853 dump_invalid_creds(tsk->cred, "Effective", tsk);
854 else
855 printk(KERN_ERR "CRED: Effective creds == Real creds\n");
856 BUG();
857}
858EXPORT_SYMBOL(__validate_process_creds);
859
860/*
861 * check creds for do_exit()
862 */
863void validate_creds_for_do_exit(struct task_struct *tsk)
864{
865 kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
866 tsk->real_cred, tsk->cred,
867 atomic_read(&tsk->cred->usage),
868 read_cred_subscribers(tsk->cred));
869
870 __validate_process_creds(tsk, __FILE__, __LINE__);
871}
872
873#endif /* CONFIG_DEBUG_CREDENTIALS */
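
The CONFIG_DEBUG_CREDENTIALS additions above track two counters per cred: the usage refcount that every reference holds, and a subscribers count for long-lived task pointers, plus a magic word that is poisoned when the object dies. The standalone C sketch below illustrates that double-accounting idea; it is not kernel code, and every name and constant in it is invented for illustration.

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define CRED_MAGIC      0x43726564u   /* arbitrary "alive" marker  */
    #define CRED_MAGIC_DEAD 0x44656144u   /* poisoned once freed       */

    struct cred {
        atomic_int usage;        /* references of any kind             */
        atomic_int subscribers;  /* long-lived task pointers only      */
        unsigned   magic;
    };

    static struct cred *cred_new(void)
    {
        struct cred *c = calloc(1, sizeof(*c));
        if (!c)
            abort();
        atomic_init(&c->usage, 1);
        atomic_init(&c->subscribers, 0);
        c->magic = CRED_MAGIC;
        return c;
    }

    static void cred_put(struct cred *c)
    {
        if (atomic_fetch_sub(&c->usage, 1) == 1) {
            /* A subscriber must never outlive the last reference. */
            assert(atomic_load(&c->subscribers) == 0);
            c->magic = CRED_MAGIC_DEAD;
            free(c);
        }
    }

    int main(void)
    {
        struct cred *c = cred_new();

        atomic_fetch_add(&c->usage, 1);        /* task takes a reference ... */
        atomic_fetch_add(&c->subscribers, 1);  /* ... and subscribes to it   */
        assert(c->magic == CRED_MAGIC);

        atomic_fetch_sub(&c->subscribers, 1);  /* task drops the cred        */
        cred_put(c);
        cred_put(c);                           /* last put poisons and frees */
        printf("ok\n");
        return 0;
    }
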
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
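
The file deleted above hands out naturally aligned, power-of-two runs of pages from a per-device bitmap via bitmap_find_free_region() and bitmap_release_region(). The toy allocator below sketches the same region-allocation idea in plain C; the helper names and the one-byte-per-page map are simplifications for illustration, not the kernel's bitmap API.

    #include <stdio.h>
    #include <string.h>

    struct region_pool {
        unsigned char *map;   /* 1 = page in use */
        int            pages;
    };

    /* Find a naturally aligned run of (1 << order) free pages, mark it
     * used, and return its first page index, or -1 if nothing fits. */
    static int pool_alloc(struct region_pool *p, int order)
    {
        int run = 1 << order;
        for (int base = 0; base + run <= p->pages; base += run) {
            int busy = 0;
            for (int i = 0; i < run; i++)
                busy |= p->map[base + i];
            if (!busy) {
                memset(p->map + base, 1, run);
                return base;
            }
        }
        return -1;
    }

    static void pool_free(struct region_pool *p, int base, int order)
    {
        memset(p->map + base, 0, 1 << order);
    }

    int main(void)
    {
        unsigned char map[16] = { 0 };
        struct region_pool pool = { map, 16 };

        int a = pool_alloc(&pool, 2);   /* 4 pages */
        int b = pool_alloc(&pool, 1);   /* 2 pages */
        printf("a=%d b=%d\n", a, b);    /* prints a=0 b=4 */
        pool_free(&pool, a, 2);
        pool_free(&pool, b, 1);
        return 0;
    }
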
diff --git a/kernel/exit.c b/kernel/exit.c
index b6c90b5ef509..ae5d8660ddff 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -375,9 +374,8 @@ static void set_special_pids(struct pid *pid)
375} 374}
376 375
377/* 376/*
378 * Let kernel threads use this to say that they 377 * Let kernel threads use this to say that they allow a certain signal.
379 * allow a certain signal (since daemonize() will 378 * Must not be used if kthread was cloned with CLONE_SIGHAND.
380 * have disabled all of them by default).
381 */ 379 */
382int allow_signal(int sig) 380int allow_signal(int sig)
383{ 381{
@@ -385,14 +383,14 @@ int allow_signal(int sig)
385 return -EINVAL; 383 return -EINVAL;
386 384
387 spin_lock_irq(&current->sighand->siglock); 385 spin_lock_irq(&current->sighand->siglock);
386 /* This is only needed for daemonize()'ed kthreads */
388 sigdelset(&current->blocked, sig); 387 sigdelset(&current->blocked, sig);
389 if (!current->mm) { 388 /*
390 /* Kernel threads handle their own signals. 389 * Kernel threads handle their own signals. Let the signal code
391 Let the signal code know it'll be handled, so 390 * know it'll be handled, so that they don't get converted to
392 that they don't get converted to SIGKILL or 391 * SIGKILL or just silently dropped.
393 just silently dropped */ 392 */
394 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
395 }
396 recalc_sigpending(); 394 recalc_sigpending();
397 spin_unlock_irq(&current->sighand->siglock); 395 spin_unlock_irq(&current->sighand->siglock);
398 return 0; 396 return 0;
@@ -591,7 +589,7 @@ retry:
591 /* 589 /*
592 * Search in the siblings 590 * Search in the siblings
593 */ 591 */
594 list_for_each_entry(c, &p->parent->children, sibling) { 592 list_for_each_entry(c, &p->real_parent->children, sibling) {
595 if (c->mm == mm) 593 if (c->mm == mm)
596 goto assign_new_owner; 594 goto assign_new_owner;
597 } 595 }
@@ -758,7 +756,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
758 p->exit_signal = SIGCHLD; 756 p->exit_signal = SIGCHLD;
759 757
760 /* If it has exited notify the new parent about this child's death. */ 758 /* If it has exited notify the new parent about this child's death. */
761 if (!p->ptrace && 759 if (!task_ptrace(p) &&
762 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 760 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
763 do_notify_parent(p, p->exit_signal); 761 do_notify_parent(p, p->exit_signal);
764 if (task_detached(p)) { 762 if (task_detached(p)) {
@@ -783,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
783 list_for_each_entry_safe(p, n, &father->children, sibling) { 781 list_for_each_entry_safe(p, n, &father->children, sibling) {
784 p->real_parent = reaper; 782 p->real_parent = reaper;
785 if (p->parent == father) { 783 if (p->parent == father) {
786 BUG_ON(p->ptrace); 784 BUG_ON(task_ptrace(p));
787 p->parent = p->real_parent; 785 p->parent = p->real_parent;
788 } 786 }
789 reparent_thread(father, p, &dead_children); 787 reparent_thread(father, p, &dead_children);
@@ -903,6 +901,8 @@ NORET_TYPE void do_exit(long code)
903 901
904 tracehook_report_exit(&code); 902 tracehook_report_exit(&code);
905 903
904 validate_creds_for_do_exit(tsk);
905
906 /* 906 /*
907 * We're taking recursive faults here in do_exit. Safest is to just 907 * We're taking recursive faults here in do_exit. Safest is to just
908 * leave this task alone and wait for reboot. 908 * leave this task alone and wait for reboot.
@@ -1011,7 +1011,10 @@ NORET_TYPE void do_exit(long code)
1011 if (tsk->splice_pipe) 1011 if (tsk->splice_pipe)
1012 __free_pipe_info(tsk->splice_pipe); 1012 __free_pipe_info(tsk->splice_pipe);
1013 1013
1014 validate_creds_for_do_exit(tsk);
1015
1014 preempt_disable(); 1016 preempt_disable();
1017 exit_rcu();
1015 /* causes final put_task_struct in finish_task_switch(). */ 1018 /* causes final put_task_struct in finish_task_switch(). */
1016 tsk->state = TASK_DEAD; 1019 tsk->state = TASK_DEAD;
1017 schedule(); 1020 schedule();
@@ -1081,6 +1084,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
1081 return 0; 1084 return 0;
1082} 1085}
1083 1086
1087struct wait_opts {
1088 enum pid_type wo_type;
1089 int wo_flags;
1090 struct pid *wo_pid;
1091
1092 struct siginfo __user *wo_info;
1093 int __user *wo_stat;
1094 struct rusage __user *wo_rusage;
1095
1096 int notask_error;
1097};
1098
1084static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1099static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1085{ 1100{
1086 struct pid *pid = NULL; 1101 struct pid *pid = NULL;
@@ -1091,13 +1106,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1091 return pid; 1106 return pid;
1092} 1107}
1093 1108
1094static int eligible_child(enum pid_type type, struct pid *pid, int options, 1109static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1095 struct task_struct *p)
1096{ 1110{
1097 int err; 1111 int err;
1098 1112
1099 if (type < PIDTYPE_MAX) { 1113 if (wo->wo_type < PIDTYPE_MAX) {
1100 if (task_pid_type(p, type) != pid) 1114 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1101 return 0; 1115 return 0;
1102 } 1116 }
1103 1117
@@ -1106,8 +1120,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1106 * set; otherwise, wait for non-clone children *only*. (Note: 1120 * set; otherwise, wait for non-clone children *only*. (Note:
1107 * A "clone" child here is one that reports to its parent 1121 * A "clone" child here is one that reports to its parent
1108 * using a signal other than SIGCHLD.) */ 1122 * using a signal other than SIGCHLD.) */
1109 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1123 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1110 && !(options & __WALL)) 1124 && !(wo->wo_flags & __WALL))
1111 return 0; 1125 return 0;
1112 1126
1113 err = security_task_wait(p); 1127 err = security_task_wait(p);
@@ -1117,14 +1131,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1117 return 1; 1131 return 1;
1118} 1132}
1119 1133
1120static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1134static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1121 int why, int status, 1135 pid_t pid, uid_t uid, int why, int status)
1122 struct siginfo __user *infop,
1123 struct rusage __user *rusagep)
1124{ 1136{
1125 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1137 struct siginfo __user *infop;
1138 int retval = wo->wo_rusage
1139 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1126 1140
1127 put_task_struct(p); 1141 put_task_struct(p);
1142 infop = wo->wo_info;
1128 if (!retval) 1143 if (!retval)
1129 retval = put_user(SIGCHLD, &infop->si_signo); 1144 retval = put_user(SIGCHLD, &infop->si_signo);
1130 if (!retval) 1145 if (!retval)
@@ -1148,19 +1163,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1148 * the lock and this task is uninteresting. If we return nonzero, we have 1163 * the lock and this task is uninteresting. If we return nonzero, we have
1149 * released the lock and the system call should return. 1164 * released the lock and the system call should return.
1150 */ 1165 */
1151static int wait_task_zombie(struct task_struct *p, int options, 1166static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1152 struct siginfo __user *infop,
1153 int __user *stat_addr, struct rusage __user *ru)
1154{ 1167{
1155 unsigned long state; 1168 unsigned long state;
1156 int retval, status, traced; 1169 int retval, status, traced;
1157 pid_t pid = task_pid_vnr(p); 1170 pid_t pid = task_pid_vnr(p);
1158 uid_t uid = __task_cred(p)->uid; 1171 uid_t uid = __task_cred(p)->uid;
1172 struct siginfo __user *infop;
1159 1173
1160 if (!likely(options & WEXITED)) 1174 if (!likely(wo->wo_flags & WEXITED))
1161 return 0; 1175 return 0;
1162 1176
1163 if (unlikely(options & WNOWAIT)) { 1177 if (unlikely(wo->wo_flags & WNOWAIT)) {
1164 int exit_code = p->exit_code; 1178 int exit_code = p->exit_code;
1165 int why, status; 1179 int why, status;
1166 1180
@@ -1173,8 +1187,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1173 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1187 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1174 status = exit_code & 0x7f; 1188 status = exit_code & 0x7f;
1175 } 1189 }
1176 return wait_noreap_copyout(p, pid, uid, why, 1190 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1177 status, infop, ru);
1178 } 1191 }
1179 1192
1180 /* 1193 /*
@@ -1188,11 +1201,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
1188 } 1201 }
1189 1202
1190 traced = ptrace_reparented(p); 1203 traced = ptrace_reparented(p);
1191 1204 /*
1192 if (likely(!traced)) { 1205 * It can be ptraced but not reparented, check
1206 * !task_detached() to filter out sub-threads.
1207 */
1208 if (likely(!traced) && likely(!task_detached(p))) {
1193 struct signal_struct *psig; 1209 struct signal_struct *psig;
1194 struct signal_struct *sig; 1210 struct signal_struct *sig;
1195 struct task_cputime cputime;
1196 1211
1197 /* 1212 /*
1198 * The resource counters for the group leader are in its 1213 * The resource counters for the group leader are in its
@@ -1205,26 +1220,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1205 * p->signal fields, because they are only touched by 1220 * p->signal fields, because they are only touched by
1206 * __exit_signal, which runs with tasklist_lock 1221 * __exit_signal, which runs with tasklist_lock
1207 * write-locked anyway, and so is excluded here. We do 1222 * write-locked anyway, and so is excluded here. We do
1208 * need to protect the access to p->parent->signal fields, 1223 * need to protect the access to parent->signal fields,
1209 * as other threads in the parent group can be right 1224 * as other threads in the parent group can be right
1210 * here reaping other children at the same time. 1225 * here reaping other children at the same time.
1211 *
1212 * We use thread_group_cputime() to get times for the thread
1213 * group, which consolidates times for all threads in the
1214 * group including the group leader.
1215 */ 1226 */
1216 thread_group_cputime(p, &cputime); 1227 spin_lock_irq(&p->real_parent->sighand->siglock);
1217 spin_lock_irq(&p->parent->sighand->siglock); 1228 psig = p->real_parent->signal;
1218 psig = p->parent->signal;
1219 sig = p->signal; 1229 sig = p->signal;
1220 psig->cutime = 1230 psig->cutime =
1221 cputime_add(psig->cutime, 1231 cputime_add(psig->cutime,
1222 cputime_add(cputime.utime, 1232 cputime_add(p->utime,
1223 sig->cutime)); 1233 cputime_add(sig->utime,
1234 sig->cutime)));
1224 psig->cstime = 1235 psig->cstime =
1225 cputime_add(psig->cstime, 1236 cputime_add(psig->cstime,
1226 cputime_add(cputime.stime, 1237 cputime_add(p->stime,
1227 sig->cstime)); 1238 cputime_add(sig->stime,
1239 sig->cstime)));
1228 psig->cgtime = 1240 psig->cgtime =
1229 cputime_add(psig->cgtime, 1241 cputime_add(psig->cgtime,
1230 cputime_add(p->gtime, 1242 cputime_add(p->gtime,
@@ -1246,7 +1258,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1246 sig->oublock + sig->coublock; 1258 sig->oublock + sig->coublock;
1247 task_io_accounting_add(&psig->ioac, &p->ioac); 1259 task_io_accounting_add(&psig->ioac, &p->ioac);
1248 task_io_accounting_add(&psig->ioac, &sig->ioac); 1260 task_io_accounting_add(&psig->ioac, &sig->ioac);
1249 spin_unlock_irq(&p->parent->sighand->siglock); 1261 spin_unlock_irq(&p->real_parent->sighand->siglock);
1250 } 1262 }
1251 1263
1252 /* 1264 /*
@@ -1255,11 +1267,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
1255 */ 1267 */
1256 read_unlock(&tasklist_lock); 1268 read_unlock(&tasklist_lock);
1257 1269
1258 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1270 retval = wo->wo_rusage
1271 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1259 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1272 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1260 ? p->signal->group_exit_code : p->exit_code; 1273 ? p->signal->group_exit_code : p->exit_code;
1261 if (!retval && stat_addr) 1274 if (!retval && wo->wo_stat)
1262 retval = put_user(status, stat_addr); 1275 retval = put_user(status, wo->wo_stat);
1276
1277 infop = wo->wo_info;
1263 if (!retval && infop) 1278 if (!retval && infop)
1264 retval = put_user(SIGCHLD, &infop->si_signo); 1279 retval = put_user(SIGCHLD, &infop->si_signo);
1265 if (!retval && infop) 1280 if (!retval && infop)
@@ -1327,15 +1342,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1327 * the lock and this task is uninteresting. If we return nonzero, we have 1342 * the lock and this task is uninteresting. If we return nonzero, we have
1328 * released the lock and the system call should return. 1343 * released the lock and the system call should return.
1329 */ 1344 */
1330static int wait_task_stopped(int ptrace, struct task_struct *p, 1345static int wait_task_stopped(struct wait_opts *wo,
1331 int options, struct siginfo __user *infop, 1346 int ptrace, struct task_struct *p)
1332 int __user *stat_addr, struct rusage __user *ru)
1333{ 1347{
1348 struct siginfo __user *infop;
1334 int retval, exit_code, *p_code, why; 1349 int retval, exit_code, *p_code, why;
1335 uid_t uid = 0; /* unneeded, required by compiler */ 1350 uid_t uid = 0; /* unneeded, required by compiler */
1336 pid_t pid; 1351 pid_t pid;
1337 1352
1338 if (!(options & WUNTRACED)) 1353 /*
1354 * Traditionally we see ptrace'd stopped tasks regardless of options.
1355 */
1356 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1339 return 0; 1357 return 0;
1340 1358
1341 exit_code = 0; 1359 exit_code = 0;
@@ -1349,7 +1367,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1349 if (!exit_code) 1367 if (!exit_code)
1350 goto unlock_sig; 1368 goto unlock_sig;
1351 1369
1352 if (!unlikely(options & WNOWAIT)) 1370 if (!unlikely(wo->wo_flags & WNOWAIT))
1353 *p_code = 0; 1371 *p_code = 0;
1354 1372
1355 /* don't need the RCU readlock here as we're holding a spinlock */ 1373 /* don't need the RCU readlock here as we're holding a spinlock */
@@ -1371,14 +1389,15 @@ unlock_sig:
1371 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1389 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1372 read_unlock(&tasklist_lock); 1390 read_unlock(&tasklist_lock);
1373 1391
1374 if (unlikely(options & WNOWAIT)) 1392 if (unlikely(wo->wo_flags & WNOWAIT))
1375 return wait_noreap_copyout(p, pid, uid, 1393 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1376 why, exit_code, 1394
1377 infop, ru); 1395 retval = wo->wo_rusage
1396 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1397 if (!retval && wo->wo_stat)
1398 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1378 1399
1379 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1400 infop = wo->wo_info;
1380 if (!retval && stat_addr)
1381 retval = put_user((exit_code << 8) | 0x7f, stat_addr);
1382 if (!retval && infop) 1401 if (!retval && infop)
1383 retval = put_user(SIGCHLD, &infop->si_signo); 1402 retval = put_user(SIGCHLD, &infop->si_signo);
1384 if (!retval && infop) 1403 if (!retval && infop)
@@ -1405,15 +1424,13 @@ unlock_sig:
1405 * the lock and this task is uninteresting. If we return nonzero, we have 1424 * the lock and this task is uninteresting. If we return nonzero, we have
1406 * released the lock and the system call should return. 1425 * released the lock and the system call should return.
1407 */ 1426 */
1408static int wait_task_continued(struct task_struct *p, int options, 1427static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1409 struct siginfo __user *infop,
1410 int __user *stat_addr, struct rusage __user *ru)
1411{ 1428{
1412 int retval; 1429 int retval;
1413 pid_t pid; 1430 pid_t pid;
1414 uid_t uid; 1431 uid_t uid;
1415 1432
1416 if (!unlikely(options & WCONTINUED)) 1433 if (!unlikely(wo->wo_flags & WCONTINUED))
1417 return 0; 1434 return 0;
1418 1435
1419 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1436 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1425,7 +1442,7 @@ static int wait_task_continued(struct task_struct *p, int options,
1425 spin_unlock_irq(&p->sighand->siglock); 1442 spin_unlock_irq(&p->sighand->siglock);
1426 return 0; 1443 return 0;
1427 } 1444 }
1428 if (!unlikely(options & WNOWAIT)) 1445 if (!unlikely(wo->wo_flags & WNOWAIT))
1429 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1446 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1430 uid = __task_cred(p)->uid; 1447 uid = __task_cred(p)->uid;
1431 spin_unlock_irq(&p->sighand->siglock); 1448 spin_unlock_irq(&p->sighand->siglock);
@@ -1434,17 +1451,17 @@ static int wait_task_continued(struct task_struct *p, int options,
1434 get_task_struct(p); 1451 get_task_struct(p);
1435 read_unlock(&tasklist_lock); 1452 read_unlock(&tasklist_lock);
1436 1453
1437 if (!infop) { 1454 if (!wo->wo_info) {
1438 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1455 retval = wo->wo_rusage
1456 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1439 put_task_struct(p); 1457 put_task_struct(p);
1440 if (!retval && stat_addr) 1458 if (!retval && wo->wo_stat)
1441 retval = put_user(0xffff, stat_addr); 1459 retval = put_user(0xffff, wo->wo_stat);
1442 if (!retval) 1460 if (!retval)
1443 retval = pid; 1461 retval = pid;
1444 } else { 1462 } else {
1445 retval = wait_noreap_copyout(p, pid, uid, 1463 retval = wait_noreap_copyout(wo, p, pid, uid,
1446 CLD_CONTINUED, SIGCONT, 1464 CLD_CONTINUED, SIGCONT);
1447 infop, ru);
1448 BUG_ON(retval == 0); 1465 BUG_ON(retval == 0);
1449 } 1466 }
1450 1467
@@ -1454,19 +1471,16 @@ static int wait_task_continued(struct task_struct *p, int options,
1454/* 1471/*
1455 * Consider @p for a wait by @parent. 1472 * Consider @p for a wait by @parent.
1456 * 1473 *
1457 * -ECHILD should be in *@notask_error before the first call. 1474 * -ECHILD should be in ->notask_error before the first call.
1458 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1475 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1459 * Returns zero if the search for a child should continue; 1476 * Returns zero if the search for a child should continue;
1460 * then *@notask_error is 0 if @p is an eligible child, 1477 * then ->notask_error is 0 if @p is an eligible child,
1461 * or another error from security_task_wait(), or still -ECHILD. 1478 * or another error from security_task_wait(), or still -ECHILD.
1462 */ 1479 */
1463static int wait_consider_task(struct task_struct *parent, int ptrace, 1480static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
1464 struct task_struct *p, int *notask_error, 1481 int ptrace, struct task_struct *p)
1465 enum pid_type type, struct pid *pid, int options,
1466 struct siginfo __user *infop,
1467 int __user *stat_addr, struct rusage __user *ru)
1468{ 1482{
1469 int ret = eligible_child(type, pid, options, p); 1483 int ret = eligible_child(wo, p);
1470 if (!ret) 1484 if (!ret)
1471 return ret; 1485 return ret;
1472 1486
@@ -1478,17 +1492,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1478 * to look for security policy problems, rather 1492 * to look for security policy problems, rather
1479 * than for mysterious wait bugs. 1493 * than for mysterious wait bugs.
1480 */ 1494 */
1481 if (*notask_error) 1495 if (wo->notask_error)
1482 *notask_error = ret; 1496 wo->notask_error = ret;
1483 return 0; 1497 return 0;
1484 } 1498 }
1485 1499
1486 if (likely(!ptrace) && unlikely(p->ptrace)) { 1500 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1487 /* 1501 /*
1488 * This child is hidden by ptrace. 1502 * This child is hidden by ptrace.
1489 * We aren't allowed to see it now, but eventually we will. 1503 * We aren't allowed to see it now, but eventually we will.
1490 */ 1504 */
1491 *notask_error = 0; 1505 wo->notask_error = 0;
1492 return 0; 1506 return 0;
1493 } 1507 }
1494 1508
@@ -1499,34 +1513,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1499 * We don't reap group leaders with subthreads. 1513 * We don't reap group leaders with subthreads.
1500 */ 1514 */
1501 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1515 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1502 return wait_task_zombie(p, options, infop, stat_addr, ru); 1516 return wait_task_zombie(wo, p);
1503 1517
1504 /* 1518 /*
1505 * It's stopped or running now, so it might 1519 * It's stopped or running now, so it might
1506 * later continue, exit, or stop again. 1520 * later continue, exit, or stop again.
1507 */ 1521 */
1508 *notask_error = 0; 1522 wo->notask_error = 0;
1509 1523
1510 if (task_stopped_code(p, ptrace)) 1524 if (task_stopped_code(p, ptrace))
1511 return wait_task_stopped(ptrace, p, options, 1525 return wait_task_stopped(wo, ptrace, p);
1512 infop, stat_addr, ru);
1513 1526
1514 return wait_task_continued(p, options, infop, stat_addr, ru); 1527 return wait_task_continued(wo, p);
1515} 1528}
1516 1529
1517/* 1530/*
1518 * Do the work of do_wait() for one thread in the group, @tsk. 1531 * Do the work of do_wait() for one thread in the group, @tsk.
1519 * 1532 *
1520 * -ECHILD should be in *@notask_error before the first call. 1533 * -ECHILD should be in ->notask_error before the first call.
1521 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1534 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1522 * Returns zero if the search for a child should continue; then 1535 * Returns zero if the search for a child should continue; then
1523 * *@notask_error is 0 if there were any eligible children, 1536 * ->notask_error is 0 if there were any eligible children,
1524 * or another error from security_task_wait(), or still -ECHILD. 1537 * or another error from security_task_wait(), or still -ECHILD.
1525 */ 1538 */
1526static int do_wait_thread(struct task_struct *tsk, int *notask_error, 1539static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1527 enum pid_type type, struct pid *pid, int options,
1528 struct siginfo __user *infop, int __user *stat_addr,
1529 struct rusage __user *ru)
1530{ 1540{
1531 struct task_struct *p; 1541 struct task_struct *p;
1532 1542
@@ -1535,9 +1545,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1535 * Do not consider detached threads. 1545 * Do not consider detached threads.
1536 */ 1546 */
1537 if (!task_detached(p)) { 1547 if (!task_detached(p)) {
1538 int ret = wait_consider_task(tsk, 0, p, notask_error, 1548 int ret = wait_consider_task(wo, tsk, 0, p);
1539 type, pid, options,
1540 infop, stat_addr, ru);
1541 if (ret) 1549 if (ret)
1542 return ret; 1550 return ret;
1543 } 1551 }
@@ -1546,22 +1554,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1546 return 0; 1554 return 0;
1547} 1555}
1548 1556
1549static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, 1557static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1550 enum pid_type type, struct pid *pid, int options,
1551 struct siginfo __user *infop, int __user *stat_addr,
1552 struct rusage __user *ru)
1553{ 1558{
1554 struct task_struct *p; 1559 struct task_struct *p;
1555 1560
1556 /*
1557 * Traditionally we see ptrace'd stopped tasks regardless of options.
1558 */
1559 options |= WUNTRACED;
1560
1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1562 int ret = wait_consider_task(tsk, 1, p, notask_error, 1562 int ret = wait_consider_task(wo, tsk, 1, p);
1563 type, pid, options,
1564 infop, stat_addr, ru);
1565 if (ret) 1563 if (ret)
1566 return ret; 1564 return ret;
1567 } 1565 }
@@ -1569,65 +1567,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1569 return 0; 1567 return 0;
1570} 1568}
1571 1569
1572static long do_wait(enum pid_type type, struct pid *pid, int options, 1570static long do_wait(struct wait_opts *wo)
1573 struct siginfo __user *infop, int __user *stat_addr,
1574 struct rusage __user *ru)
1575{ 1571{
1576 DECLARE_WAITQUEUE(wait, current); 1572 DECLARE_WAITQUEUE(wait, current);
1577 struct task_struct *tsk; 1573 struct task_struct *tsk;
1578 int retval; 1574 int retval;
1579 1575
1580 trace_sched_process_wait(pid); 1576 trace_sched_process_wait(wo->wo_pid);
1581 1577
1582 add_wait_queue(&current->signal->wait_chldexit,&wait); 1578 add_wait_queue(&current->signal->wait_chldexit,&wait);
1583repeat: 1579repeat:
1584 /* 1580 /*
 1585 * If there is nothing that can match our criteria just get out. 1581 * If there is nothing that can match our criteria just get out.
1586 * We will clear @retval to zero if we see any child that might later 1582 * We will clear ->notask_error to zero if we see any child that
1587 * match our criteria, even if we are not able to reap it yet. 1583 * might later match our criteria, even if we are not able to reap
1584 * it yet.
1588 */ 1585 */
1589 retval = -ECHILD; 1586 wo->notask_error = -ECHILD;
1590 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1587 if ((wo->wo_type < PIDTYPE_MAX) &&
1591 goto end; 1588 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1589 goto notask;
1592 1590
1593 current->state = TASK_INTERRUPTIBLE; 1591 set_current_state(TASK_INTERRUPTIBLE);
1594 read_lock(&tasklist_lock); 1592 read_lock(&tasklist_lock);
1595 tsk = current; 1593 tsk = current;
1596 do { 1594 do {
1597 int tsk_result = do_wait_thread(tsk, &retval, 1595 retval = do_wait_thread(wo, tsk);
1598 type, pid, options, 1596 if (retval)
1599 infop, stat_addr, ru); 1597 goto end;
1600 if (!tsk_result) 1598
1601 tsk_result = ptrace_do_wait(tsk, &retval, 1599 retval = ptrace_do_wait(wo, tsk);
1602 type, pid, options, 1600 if (retval)
1603 infop, stat_addr, ru);
1604 if (tsk_result) {
1605 /*
1606 * tasklist_lock is unlocked and we have a final result.
1607 */
1608 retval = tsk_result;
1609 goto end; 1601 goto end;
1610 }
1611 1602
1612 if (options & __WNOTHREAD) 1603 if (wo->wo_flags & __WNOTHREAD)
1613 break; 1604 break;
1614 tsk = next_thread(tsk); 1605 } while_each_thread(current, tsk);
1615 BUG_ON(tsk->signal != current->signal);
1616 } while (tsk != current);
1617 read_unlock(&tasklist_lock); 1606 read_unlock(&tasklist_lock);
1618 1607
1619 if (!retval && !(options & WNOHANG)) { 1608notask:
1609 retval = wo->notask_error;
1610 if (!retval && !(wo->wo_flags & WNOHANG)) {
1620 retval = -ERESTARTSYS; 1611 retval = -ERESTARTSYS;
1621 if (!signal_pending(current)) { 1612 if (!signal_pending(current)) {
1622 schedule(); 1613 schedule();
1623 goto repeat; 1614 goto repeat;
1624 } 1615 }
1625 } 1616 }
1626
1627end: 1617end:
1628 current->state = TASK_RUNNING; 1618 __set_current_state(TASK_RUNNING);
1629 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1619 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1630 if (infop) { 1620 if (wo->wo_info) {
1621 struct siginfo __user *infop = wo->wo_info;
1622
1631 if (retval > 0) 1623 if (retval > 0)
1632 retval = 0; 1624 retval = 0;
1633 else { 1625 else {
@@ -1656,6 +1648,7 @@ end:
1656SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1648SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1657 infop, int, options, struct rusage __user *, ru) 1649 infop, int, options, struct rusage __user *, ru)
1658{ 1650{
1651 struct wait_opts wo;
1659 struct pid *pid = NULL; 1652 struct pid *pid = NULL;
1660 enum pid_type type; 1653 enum pid_type type;
1661 long ret; 1654 long ret;
@@ -1685,7 +1678,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1685 1678
1686 if (type < PIDTYPE_MAX) 1679 if (type < PIDTYPE_MAX)
1687 pid = find_get_pid(upid); 1680 pid = find_get_pid(upid);
1688 ret = do_wait(type, pid, options, infop, NULL, ru); 1681
1682 wo.wo_type = type;
1683 wo.wo_pid = pid;
1684 wo.wo_flags = options;
1685 wo.wo_info = infop;
1686 wo.wo_stat = NULL;
1687 wo.wo_rusage = ru;
1688 ret = do_wait(&wo);
1689 put_pid(pid); 1689 put_pid(pid);
1690 1690
1691 /* avoid REGPARM breakage on x86: */ 1691 /* avoid REGPARM breakage on x86: */
@@ -1696,6 +1696,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1696SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1696SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1697 int, options, struct rusage __user *, ru) 1697 int, options, struct rusage __user *, ru)
1698{ 1698{
1699 struct wait_opts wo;
1699 struct pid *pid = NULL; 1700 struct pid *pid = NULL;
1700 enum pid_type type; 1701 enum pid_type type;
1701 long ret; 1702 long ret;
@@ -1717,7 +1718,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1717 pid = find_get_pid(upid); 1718 pid = find_get_pid(upid);
1718 } 1719 }
1719 1720
1720 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); 1721 wo.wo_type = type;
1722 wo.wo_pid = pid;
1723 wo.wo_flags = options | WEXITED;
1724 wo.wo_info = NULL;
1725 wo.wo_stat = stat_addr;
1726 wo.wo_rusage = ru;
1727 ret = do_wait(&wo);
1721 put_pid(pid); 1728 put_pid(pid);
1722 1729
1723 /* avoid REGPARM breakage on x86: */ 1730 /* avoid REGPARM breakage on x86: */
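
The exit.c changes above collapse the options, siginfo, stat and rusage arguments, together with the in/out notask_error word, into a single struct wait_opts that is threaded through do_wait() and its helpers. The sketch below shows that parameter-object refactoring in miniature; the struct and function names are invented and only mimic the shape of the real code.

    #include <stdio.h>

    /* Before: every helper took the caller's options, output pointers and
     * an in/out error word.  After: the request travels as one object. */
    struct wait_request {
        int   flags;          /* option bits, WNOHANG-style              */
        int  *stat_out;       /* where to store an exit status, if asked */
        int   notask_error;   /* running "no eligible child" result      */
    };

    static int consider_child(struct wait_request *wr, int child_status)
    {
        if (child_status < 0)            /* not eligible */
            return 0;
        wr->notask_error = 0;            /* saw something waitable */
        if (wr->stat_out)
            *wr->stat_out = child_status;
        return 1;                        /* final answer, stop scanning */
    }

    int main(void)
    {
        int status = 0;
        struct wait_request wr = { .flags = 0, .stat_out = &status,
                                   .notask_error = -10 /* ECHILD-like */ };

        if (!consider_child(&wr, -1))    /* first child: ineligible */
            consider_child(&wr, 42);     /* second child: reaped    */

        printf("status=%d notask_error=%d\n", status, wr.notask_error);
        return 0;
    }
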
diff --git a/kernel/fork.c b/kernel/fork.c
index 4430eb1376f2..bfee931ee3fb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -153,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
153 WARN_ON(atomic_read(&tsk->usage)); 152 WARN_ON(atomic_read(&tsk->usage));
154 WARN_ON(tsk == current); 153 WARN_ON(tsk == current);
155 154
156 put_cred(tsk->real_cred); 155 exit_creds(tsk);
157 put_cred(tsk->cred);
158 delayacct_tsk_free(tsk); 156 delayacct_tsk_free(tsk);
159 157
160 if (!profile_handoff_task(tsk)) 158 if (!profile_handoff_task(tsk))
@@ -178,7 +176,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 176 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 177 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 178 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 179 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 180#endif
183 181
184 /* do the arch specific task caches init */ 182 /* do the arch specific task caches init */
@@ -568,18 +566,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 566 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 567 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 568 */
571 if (tsk->clear_child_tid 569 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 570 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 571 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 572 /*
573 * We don't check the error code - if userspace has
574 * not set up a proper pointer then tough luck.
575 */
576 put_user(0, tsk->clear_child_tid);
577 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
578 1, NULL, NULL, 0);
579 }
575 tsk->clear_child_tid = NULL; 580 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 581 }
584} 582}
585 583
@@ -816,11 +814,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 814{
817 struct signal_struct *sig; 815 struct signal_struct *sig;
818 816
819 if (clone_flags & CLONE_THREAD) { 817 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 818 return 0;
823 }
824 819
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 820 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 821 tsk->signal = sig;
@@ -878,16 +873,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 873 kmem_cache_free(signal_cachep, sig);
879} 874}
880 875
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 876static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 877{
893 unsigned long new_flags = p->flags; 878 unsigned long new_flags = p->flags;
@@ -1022,14 +1007,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1022 copy_flags(clone_flags, p); 1007 copy_flags(clone_flags, p);
1023 INIT_LIST_HEAD(&p->children); 1008 INIT_LIST_HEAD(&p->children);
1024 INIT_LIST_HEAD(&p->sibling); 1009 INIT_LIST_HEAD(&p->sibling);
1025#ifdef CONFIG_PREEMPT_RCU 1010 rcu_copy_process(p);
1026 p->rcu_read_lock_nesting = 0;
1027 p->rcu_flipctr_idx = 0;
1028#endif /* #ifdef CONFIG_PREEMPT_RCU */
1029 p->vfork_done = NULL; 1011 p->vfork_done = NULL;
1030 spin_lock_init(&p->alloc_lock); 1012 spin_lock_init(&p->alloc_lock);
1031 1013
1032 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1033 init_sigpending(&p->pending); 1014 init_sigpending(&p->pending);
1034 1015
1035 p->utime = cputime_zero; 1016 p->utime = cputime_zero;
@@ -1241,6 +1222,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 } 1222 }
1242 1223
1243 if (clone_flags & CLONE_THREAD) { 1224 if (clone_flags & CLONE_THREAD) {
1225 atomic_inc(&current->signal->count);
1226 atomic_inc(&current->signal->live);
1244 p->group_leader = current->group_leader; 1227 p->group_leader = current->group_leader;
1245 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1228 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1246 } 1229 }
@@ -1270,6 +1253,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1270 write_unlock_irq(&tasklist_lock); 1253 write_unlock_irq(&tasklist_lock);
1271 proc_fork_connector(p); 1254 proc_fork_connector(p);
1272 cgroup_post_fork(p); 1255 cgroup_post_fork(p);
1256 perf_counter_fork(p);
1273 return p; 1257 return p;
1274 1258
1275bad_fork_free_pid: 1259bad_fork_free_pid:
@@ -1283,7 +1267,8 @@ bad_fork_cleanup_mm:
1283 if (p->mm) 1267 if (p->mm)
1284 mmput(p->mm); 1268 mmput(p->mm);
1285bad_fork_cleanup_signal: 1269bad_fork_cleanup_signal:
1286 cleanup_signal(p); 1270 if (!(clone_flags & CLONE_THREAD))
1271 __cleanup_signal(p->signal);
1287bad_fork_cleanup_sighand: 1272bad_fork_cleanup_sighand:
1288 __cleanup_sighand(p->sighand); 1273 __cleanup_sighand(p->sighand);
1289bad_fork_cleanup_fs: 1274bad_fork_cleanup_fs:
@@ -1308,8 +1293,7 @@ bad_fork_cleanup_put_domain:
1308 module_put(task_thread_info(p)->exec_domain->module); 1293 module_put(task_thread_info(p)->exec_domain->module);
1309bad_fork_cleanup_count: 1294bad_fork_cleanup_count:
1310 atomic_dec(&p->cred->user->processes); 1295 atomic_dec(&p->cred->user->processes);
1311 put_cred(p->real_cred); 1296 exit_creds(p);
1312 put_cred(p->cred);
1313bad_fork_free: 1297bad_fork_free:
1314 free_task(p); 1298 free_task(p);
1315fork_out: 1299fork_out:
@@ -1409,12 +1393,6 @@ long do_fork(unsigned long clone_flags,
1409 if (clone_flags & CLONE_VFORK) { 1393 if (clone_flags & CLONE_VFORK) {
1410 p->vfork_done = &vfork; 1394 p->vfork_done = &vfork;
1411 init_completion(&vfork); 1395 init_completion(&vfork);
1412 } else if (!(clone_flags & CLONE_VM)) {
1413 /*
1414 * vfork will do an exec which will call
1415 * set_task_comm()
1416 */
1417 perf_counter_fork(p);
1418 } 1396 }
1419 1397
1420 audit_finish_fork(p); 1398 audit_finish_fork(p);
@@ -1470,20 +1448,20 @@ void __init proc_caches_init(void)
1470{ 1448{
1471 sighand_cachep = kmem_cache_create("sighand_cache", 1449 sighand_cachep = kmem_cache_create("sighand_cache",
1472 sizeof(struct sighand_struct), 0, 1450 sizeof(struct sighand_struct), 0,
1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1451 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1474 sighand_ctor); 1452 SLAB_NOTRACK, sighand_ctor);
1475 signal_cachep = kmem_cache_create("signal_cache", 1453 signal_cachep = kmem_cache_create("signal_cache",
1476 sizeof(struct signal_struct), 0, 1454 sizeof(struct signal_struct), 0,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1455 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 files_cachep = kmem_cache_create("files_cache", 1456 files_cachep = kmem_cache_create("files_cache",
1479 sizeof(struct files_struct), 0, 1457 sizeof(struct files_struct), 0,
1480 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1458 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1481 fs_cachep = kmem_cache_create("fs_cache", 1459 fs_cachep = kmem_cache_create("fs_cache",
1482 sizeof(struct fs_struct), 0, 1460 sizeof(struct fs_struct), 0,
1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1461 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1484 mm_cachep = kmem_cache_create("mm_struct", 1462 mm_cachep = kmem_cache_create("mm_struct",
1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1463 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1464 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1487 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1465 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1488 mmap_init(); 1466 mmap_init();
1489} 1467}
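The CONFIG_PREEMPT_RCU initialisation removed from copy_process() above is folded into a single rcu_copy_process() call. As a rough sketch built only from the fields visible in the removed lines (the real helper belongs to the new RCU code and may set up different or additional per-task state), the consolidation amounts to something like:

static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
	/* Same per-task state the removed #ifdef block set up inline. */
	p->rcu_read_lock_nesting = 0;
	p->rcu_flipctr_idx = 0;
#endif
}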
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
 47 /* prevent accounting of this task to the load average */
48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
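PF_FREEZING is set so that a task parked in the refrigerator in TASK_UNINTERRUPTIBLE stops inflating the load average. A hedged sketch of how the accounting side can honour the flag; the helper name is illustrative and the real check lives in the scheduler headers and may differ in detail:

/* Illustrative only: count a task toward loadavg unless it is being frozen. */
static inline int task_counts_toward_load(struct task_struct *p)
{
	return (p->state & TASK_UNINTERRUPTIBLE) != 0 &&
	       (p->flags & PF_FREEZING) == 0;
}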
diff --git a/kernel/futex.c b/kernel/futex.c
index 80b5ce716596..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
115 /* rt_waiter storage for requeue_pi: */ 115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 116 struct rt_mutex_waiter *rt_waiter;
117 117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key;
120
118 /* Bitset for the optional bitmasked wakeup */ 121 /* Bitset for the optional bitmasked wakeup */
119 u32 bitset; 122 u32 bitset;
120}; 123};
@@ -247,6 +250,7 @@ again:
247 if (err < 0) 250 if (err < 0)
248 return err; 251 return err;
249 252
253 page = compound_head(page);
250 lock_page(page); 254 lock_page(page);
251 if (!page->mapping) { 255 if (!page->mapping) {
252 unlock_page(page); 256 unlock_page(page);
@@ -284,6 +288,25 @@ void put_futex_key(int fshared, union futex_key *key)
284 drop_futex_key_refs(key); 288 drop_futex_key_refs(key);
285} 289}
286 290
291/*
292 * fault_in_user_writeable - fault in user address and verify RW access
293 * @uaddr: pointer to faulting user space address
294 *
295 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr.
297 *
 298 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault
 300 * disabled section so we might as well avoid the #PF overhead by
301 * calling get_user_pages() right away.
302 */
303static int fault_in_user_writeable(u32 __user *uaddr)
304{
305 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
306 1, 1, 0, NULL, NULL);
307 return ret < 0 ? ret : 0;
308}
309
287/** 310/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex 311 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in 312 * @hb: the hash bucket the futex_q's reside in
@@ -896,7 +919,6 @@ retry:
896retry_private: 919retry_private:
897 op_ret = futex_atomic_op_inuser(op, uaddr2); 920 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 if (unlikely(op_ret < 0)) { 921 if (unlikely(op_ret < 0)) {
899 u32 dummy;
900 922
901 double_unlock_hb(hb1, hb2); 923 double_unlock_hb(hb1, hb2);
902 924
@@ -914,7 +936,7 @@ retry_private:
914 goto out_put_keys; 936 goto out_put_keys;
915 } 937 }
916 938
917 ret = get_user(dummy, uaddr2); 939 ret = fault_in_user_writeable(uaddr2);
918 if (ret) 940 if (ret)
919 goto out_put_keys; 941 goto out_put_keys;
920 942
@@ -991,15 +1013,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * q: the futex_q 1014 * q: the futex_q
993 * key: the key of the requeue target futex 1015 * key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex
994 * 1017 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right 1020 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1021 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held. 1022 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1023 * to protect access to the pi_state to fixup the owner later. Must be called
1024 * with both q->lock_ptr and hb->lock held.
1000 */ 1025 */
1001static inline 1026static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) 1027void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1028 struct futex_hash_bucket *hb)
1003{ 1029{
1004 drop_futex_key_refs(&q->key); 1030 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key); 1031 get_futex_key_refs(key);
@@ -1011,6 +1037,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1011 WARN_ON(!q->rt_waiter); 1037 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL; 1038 q->rt_waiter = NULL;
1013 1039
1040 q->lock_ptr = &hb->lock;
1041#ifdef CONFIG_DEBUG_PI_LIST
1042 q->list.plist.lock = &hb->lock;
1043#endif
1044
1014 wake_up_state(q->task, TASK_NORMAL); 1045 wake_up_state(q->task, TASK_NORMAL);
1015} 1046}
1016 1047
@@ -1061,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1061 if (!top_waiter) 1092 if (!top_waiter)
1062 return 0; 1093 return 0;
1063 1094
1095 /* Ensure we requeue to the expected futex. */
1096 if (!match_futex(top_waiter->requeue_pi_key, key2))
1097 return -EINVAL;
1098
1064 /* 1099 /*
1065 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1100 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1066 * the contended case or if set_waiters is 1. The pi_state is returned 1101 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1069,7 +1104,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1104 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters); 1105 set_waiters);
1071 if (ret == 1) 1106 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2); 1107 requeue_pi_wake_futex(top_waiter, key2, hb2);
1073 1108
1074 return ret; 1109 return ret;
1075} 1110}
@@ -1204,7 +1239,7 @@ retry_private:
1204 double_unlock_hb(hb1, hb2); 1239 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2); 1240 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1); 1241 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2); 1242 ret = fault_in_user_writeable(uaddr2);
1208 if (!ret) 1243 if (!ret)
1209 goto retry; 1244 goto retry;
1210 goto out; 1245 goto out;
@@ -1228,8 +1263,15 @@ retry_private:
1228 if (!match_futex(&this->key, &key1)) 1263 if (!match_futex(&this->key, &key1))
1229 continue; 1264 continue;
1230 1265
1231 WARN_ON(!requeue_pi && this->rt_waiter); 1266 /*
1232 WARN_ON(requeue_pi && !this->rt_waiter); 1267 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1268 * be paired with each other and no other futex ops.
1269 */
1270 if ((requeue_pi && !this->rt_waiter) ||
1271 (!requeue_pi && this->rt_waiter)) {
1272 ret = -EINVAL;
1273 break;
1274 }
1233 1275
1234 /* 1276 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1277 * Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1241,6 +1283,12 @@ retry_private:
1241 continue; 1283 continue;
1242 } 1284 }
1243 1285
1286 /* Ensure we requeue to the expected futex for requeue_pi. */
1287 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1288 ret = -EINVAL;
1289 break;
1290 }
1291
1244 /* 1292 /*
1245 * Requeue nr_requeue waiters and possibly one more in the case 1293 * Requeue nr_requeue waiters and possibly one more in the case
1246 * of requeue_pi if we couldn't acquire the lock atomically. 1294 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1254,7 +1302,7 @@ retry_private:
1254 this->task, 1); 1302 this->task, 1);
1255 if (ret == 1) { 1303 if (ret == 1) {
1256 /* We got the lock. */ 1304 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2); 1305 requeue_pi_wake_futex(this, &key2, hb2);
1258 continue; 1306 continue;
1259 } else if (ret) { 1307 } else if (ret) {
1260 /* -EDEADLK */ 1308 /* -EDEADLK */
@@ -1482,7 +1530,7 @@ retry:
1482handle_fault: 1530handle_fault:
1483 spin_unlock(q->lock_ptr); 1531 spin_unlock(q->lock_ptr);
1484 1532
1485 ret = get_user(uval, uaddr); 1533 ret = fault_in_user_writeable(uaddr);
1486 1534
1487 spin_lock(q->lock_ptr); 1535 spin_lock(q->lock_ptr);
1488 1536
@@ -1716,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1716 q.pi_state = NULL; 1764 q.pi_state = NULL;
1717 q.bitset = bitset; 1765 q.bitset = bitset;
1718 q.rt_waiter = NULL; 1766 q.rt_waiter = NULL;
1767 q.requeue_pi_key = NULL;
1719 1768
1720 if (abs_time) { 1769 if (abs_time) {
1721 to = &timeout; 1770 to = &timeout;
@@ -1807,7 +1856,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1807{ 1856{
1808 struct hrtimer_sleeper timeout, *to = NULL; 1857 struct hrtimer_sleeper timeout, *to = NULL;
1809 struct futex_hash_bucket *hb; 1858 struct futex_hash_bucket *hb;
1810 u32 uval;
1811 struct futex_q q; 1859 struct futex_q q;
1812 int res, ret; 1860 int res, ret;
1813 1861
@@ -1824,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1824 1872
1825 q.pi_state = NULL; 1873 q.pi_state = NULL;
1826 q.rt_waiter = NULL; 1874 q.rt_waiter = NULL;
1875 q.requeue_pi_key = NULL;
1827retry: 1876retry:
1828 q.key = FUTEX_KEY_INIT; 1877 q.key = FUTEX_KEY_INIT;
1829 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1878 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1909,16 +1958,9 @@ out:
1909 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1958 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1959
1911uaddr_faulted: 1960uaddr_faulted:
1912 /*
1913 * We have to r/w *(int __user *)uaddr, and we have to modify it
1914 * atomically. Therefore, if we continue to fault after get_user()
1915 * below, we need to handle the fault ourselves, while still holding
1916 * the mmap_sem. This can occur if the uaddr is under contention as
1917 * we have to drop the mmap_sem in order to call get_user().
1918 */
1919 queue_unlock(&q, hb); 1961 queue_unlock(&q, hb);
1920 1962
1921 ret = get_user(uval, uaddr); 1963 ret = fault_in_user_writeable(uaddr);
1922 if (ret) 1964 if (ret)
1923 goto out_put_key; 1965 goto out_put_key;
1924 1966
@@ -2013,17 +2055,10 @@ out:
2013 return ret; 2055 return ret;
2014 2056
2015pi_faulted: 2057pi_faulted:
2016 /*
2017 * We have to r/w *(int __user *)uaddr, and we have to modify it
2018 * atomically. Therefore, if we continue to fault after get_user()
2019 * below, we need to handle the fault ourselves, while still holding
2020 * the mmap_sem. This can occur if the uaddr is under contention as
2021 * we have to drop the mmap_sem in order to call get_user().
2022 */
2023 spin_unlock(&hb->lock); 2058 spin_unlock(&hb->lock);
2024 put_futex_key(fshared, &key); 2059 put_futex_key(fshared, &key);
2025 2060
2026 ret = get_user(uval, uaddr); 2061 ret = fault_in_user_writeable(uaddr);
2027 if (!ret) 2062 if (!ret)
2028 goto retry; 2063 goto retry;
2029 2064
@@ -2098,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2098 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2133 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2099 * via the following: 2134 * via the following:
2100 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2135 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2101 * 2) wakeup on uaddr2 after a requeue and subsequent unlock 2136 * 2) wakeup on uaddr2 after a requeue
2102 * 3) signal (before or after requeue) 2137 * 3) signal
2103 * 4) timeout (before or after requeue) 2138 * 4) timeout
2104 * 2139 *
2105 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. 2140 * If 3, cleanup and return -ERESTARTNOINTR.
2106 * 2141 *
2107 * If 2, we may then block on trying to take the rt_mutex and return via: 2142 * If 2, we may then block on trying to take the rt_mutex and return via:
2108 * 5) successful lock 2143 * 5) successful lock
@@ -2110,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2110 * 7) timeout 2145 * 7) timeout
2111 * 8) other lock acquisition failure 2146 * 8) other lock acquisition failure
2112 * 2147 *
2113 * If 6, we setup a restart_block with futex_lock_pi() as the function. 2148 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2114 * 2149 *
2115 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2150 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2116 * 2151 *
@@ -2149,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2149 debug_rt_mutex_init_waiter(&rt_waiter); 2184 debug_rt_mutex_init_waiter(&rt_waiter);
2150 rt_waiter.task = NULL; 2185 rt_waiter.task = NULL;
2151 2186
2152 q.pi_state = NULL;
2153 q.bitset = bitset;
2154 q.rt_waiter = &rt_waiter;
2155
2156 key2 = FUTEX_KEY_INIT; 2187 key2 = FUTEX_KEY_INIT;
2157 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2188 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2158 if (unlikely(ret != 0)) 2189 if (unlikely(ret != 0))
2159 goto out; 2190 goto out;
2160 2191
2192 q.pi_state = NULL;
2193 q.bitset = bitset;
2194 q.rt_waiter = &rt_waiter;
2195 q.requeue_pi_key = &key2;
2196
2161 /* Prepare to wait on uaddr. */ 2197 /* Prepare to wait on uaddr. */
2162 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2198 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2163 if (ret) 2199 if (ret)
@@ -2228,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2228 rt_mutex_unlock(pi_mutex); 2264 rt_mutex_unlock(pi_mutex);
2229 } else if (ret == -EINTR) { 2265 } else if (ret == -EINTR) {
2230 /* 2266 /*
2231 * We've already been requeued, but we have no way to 2267 * We've already been requeued, but cannot restart by calling
2232 * restart by calling futex_lock_pi() directly. We 2268 * futex_lock_pi() directly. We could restart this syscall, but
2233 * could restart the syscall, but that will look at 2269 * it would detect that the user space "val" changed and return
2234 * the user space value and return right away. So we 2270 * -EWOULDBLOCK. Save the overhead of the restart and return
2235 * drop back with EWOULDBLOCK to tell user space that 2271 * -EWOULDBLOCK directly.
2236 * "val" has been changed. That's the same what the
2237 * restart of the syscall would do in
2238 * futex_wait_setup().
2239 */ 2272 */
2240 ret = -EWOULDBLOCK; 2273 ret = -EWOULDBLOCK;
2241 } 2274 }
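For context, the new requeue_pi_key check makes the kernel verify that FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI name the same target PI futex; a mismatch now fails with -EINVAL instead of tripping the old WARN_ONs. A hedged userspace sketch of the intended pairing (argument order follows the futex(2) convention; the FUTEX_*_REQUEUE_PI constants are assumed to be exported by the uapi headers that accompany this feature):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static uint32_t wait_futex, pi_futex;	/* illustrative shared words */

/* Waiter: block on &wait_futex, to be requeued onto &pi_futex later. */
static long waiter(void)
{
	return syscall(SYS_futex, &wait_futex, FUTEX_WAIT_REQUEUE_PI,
		       0 /* expected value of wait_futex */,
		       NULL /* no timeout */, &pi_futex, 0);
}

/* Waker: must name the same PI futex, otherwise the kernel returns -EINVAL. */
static long waker(void)
{
	return syscall(SYS_futex, &wait_futex, FUTEX_CMP_REQUEUE_PI,
		       1 /* nr_wake, must be 1 */,
		       (void *)1 /* nr_requeue, passed in the timeout slot */,
		       &pi_futex, 0 /* expected value of wait_futex */);
}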
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000000..654efd09f6a9
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
1menu "GCOV-based kernel profiling"
2
3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS
6 default n
7 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage
9 measurements).
10
11 If unsure, say N.
12
13 Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
14 for the entire kernel. To enable profiling for specific files or
15 directories, add a line similar to the following to the respective
16 Makefile:
17
18 For a single file (e.g. main.o):
19 GCOV_PROFILE_main.o := y
20
21 For all files in one directory:
22 GCOV_PROFILE := y
23
24 To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
25 is specified, use:
26
27 GCOV_PROFILE_main.o := n
28 and:
29 GCOV_PROFILE := n
30
31 Note that the debugfs filesystem has to be mounted to access
32 profiling data.
33
34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL)
38 default n
39 ---help---
 40 This option activates profiling for the entire kernel.
41
42 If unsure, say N.
43
44 Note that a kernel compiled with profiling flags will be significantly
45 larger and run slower. Also be sure to exclude files from profiling
46 which are not linked to the kernel image to prevent linker errors.
47
48endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000000..3f761001d517
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000000..9b22d03cc581
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
1/*
2 * This code maintains a list of active profiling data structures.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 */
15
16#define pr_fmt(fmt) "gcov: " fmt
17
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include "gcov.h"
22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock);
26
27/*
28 * __gcov_init is called by gcc-generated constructor code for each object
29 * file compiled with -fprofile-arcs.
30 */
31void __gcov_init(struct gcov_info *info)
32{
33 static unsigned int gcov_version;
34
35 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) {
37 gcov_version = info->version;
38 /*
39 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports.
41 */
42 pr_info("version magic: 0x%x\n", gcov_version);
43 }
44 /*
45 * Add new profiling data structure to list and inform event
46 * listener.
47 */
48 info->next = gcov_info_head;
49 gcov_info_head = info;
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84/**
85 * gcov_enable_events - enable event reporting through gcov_event()
86 *
87 * Turn on reporting of profiling data load/unload-events through the
88 * gcov_event() callback. Also replay all previous events once. This function
89 * is needed because some events are potentially generated too early for the
90 * callback implementation to handle them initially.
91 */
92void gcov_enable_events(void)
93{
94 struct gcov_info *info;
95
96 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1;
98 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next)
100 gcov_event(GCOV_ADD, info);
101 mutex_unlock(&gcov_lock);
102}
103
104#ifdef CONFIG_MODULES
105static inline int within(void *addr, void *start, unsigned long size)
106{
107 return ((addr >= start) && (addr < start + size));
108}
109
110/* Update list and generate events when modules are unloaded. */
111static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data)
113{
114 struct module *mod = data;
115 struct gcov_info *info;
116 struct gcov_info *prev;
117
118 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK;
120 mutex_lock(&gcov_lock);
121 prev = NULL;
122 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) {
124 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev)
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info);
131 } else
132 prev = info;
133 }
134 mutex_unlock(&gcov_lock);
135
136 return NOTIFY_OK;
137}
138
139static struct notifier_block gcov_nb = {
140 .notifier_call = gcov_module_notifier,
141};
142
143static int __init gcov_init(void)
144{
145 return register_module_notifier(&gcov_nb);
146}
147device_initcall(gcov_init);
148#endif /* CONFIG_MODULES */
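For orientation, __gcov_init() is called from constructors that gcc emits into every object built with -fprofile-arcs (which is why GCOV_KERNEL depends on CONSTRUCTORS). A conceptual stand-in for such a generated constructor follows; the names, version value and initialiser layout are illustrative, not gcc's actual output:

/* Sketch of gcc's per-object profiling registration, not real emitted code. */
static struct gcov_info this_object_info = {
	.version	= 0x3034342a,		/* placeholder version magic */
	.filename	= "kernel/example.gcda",
	/* n_functions, functions, ctr_mask and counts are filled in by gcc. */
};

static void __attribute__((constructor)) gcov_register_this_object(void)
{
	__gcov_init(&this_object_info);		/* ends up on gcov_info_head */
}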
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000000..ef3c3f88a7a3
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
1/*
2 * This code exports profiling data as debugfs files to userspace.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 * Yi CDL Yang
15 */
16
17#define pr_fmt(fmt) "gcov: " fmt
18
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/debugfs.h>
22#include <linux/fs.h>
23#include <linux/list.h>
24#include <linux/string.h>
25#include <linux/slab.h>
26#include <linux/mutex.h>
27#include <linux/seq_file.h>
28#include "gcov.h"
29
30/**
31 * struct gcov_node - represents a debugfs entry
32 * @list: list head for child node list
33 * @children: child nodes
34 * @all: list head for list of all nodes
35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory
37 * @ghost: when an object file containing profiling data is unloaded we keep a
38 * copy of the profiling data here to allow collecting coverage data
39 * for cleanup code. Such a node is called a "ghost".
40 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links
42 * @name: data file basename
43 *
44 * struct gcov_node represents an entity within the gcov/ subdirectory
45 * of debugfs. There are directory and data file nodes. The latter represent
46 * the actual synthesized data file plus any associated symbolic links which
47 * are needed by the gcov tool to work correctly.
48 */
49struct gcov_node {
50 struct list_head list;
51 struct list_head children;
52 struct list_head all;
53 struct gcov_node *parent;
54 struct gcov_info *info;
55 struct gcov_info *ghost;
56 struct dentry *dentry;
57 struct dentry **links;
58 char name[0];
59};
60
61static const char objtree[] = OBJTREE;
62static const char srctree[] = SRCTREE;
63static struct gcov_node root_node;
64static struct dentry *reset_dentry;
65static LIST_HEAD(all_head);
66static DEFINE_MUTEX(node_lock);
67
68/* If non-zero, keep copies of profiling data for unloaded modules. */
69static int gcov_persist = 1;
70
71static int __init gcov_persist_setup(char *str)
72{
73 unsigned long val;
74
75 if (strict_strtoul(str, 0, &val)) {
76 pr_warning("invalid gcov_persist parameter '%s'\n", str);
77 return 0;
78 }
79 gcov_persist = val;
80 pr_info("setting gcov_persist to %d\n", gcov_persist);
81
82 return 1;
83}
84__setup("gcov_persist=", gcov_persist_setup);
85
86/*
87 * seq_file.start() implementation for gcov data files. Note that the
88 * gcov_iterator interface is designed to be more restrictive than seq_file
89 * (no start from arbitrary position, etc.), to simplify the iterator
90 * implementation.
91 */
92static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
93{
94 loff_t i;
95
96 gcov_iter_start(seq->private);
97 for (i = 0; i < *pos; i++) {
98 if (gcov_iter_next(seq->private))
99 return NULL;
100 }
101 return seq->private;
102}
103
104/* seq_file.next() implementation for gcov data files. */
105static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
106{
107 struct gcov_iterator *iter = data;
108
109 if (gcov_iter_next(iter))
110 return NULL;
111 (*pos)++;
112
113 return iter;
114}
115
116/* seq_file.show() implementation for gcov data files. */
117static int gcov_seq_show(struct seq_file *seq, void *data)
118{
119 struct gcov_iterator *iter = data;
120
121 if (gcov_iter_write(iter, seq))
122 return -EINVAL;
123 return 0;
124}
125
126static void gcov_seq_stop(struct seq_file *seq, void *data)
127{
128 /* Unused. */
129}
130
131static const struct seq_operations gcov_seq_ops = {
132 .start = gcov_seq_start,
133 .next = gcov_seq_next,
134 .show = gcov_seq_show,
135 .stop = gcov_seq_stop,
136};
137
138/*
139 * Return the profiling data set for a given node. This can either be the
140 * original profiling data structure or a duplicate (also called "ghost")
141 * in case the associated object file has been unloaded.
142 */
143static struct gcov_info *get_node_info(struct gcov_node *node)
144{
145 if (node->info)
146 return node->info;
147
148 return node->ghost;
149}
150
151/*
152 * open() implementation for gcov data files. Create a copy of the profiling
153 * data set and initialize the iterator and seq_file interface.
154 */
155static int gcov_seq_open(struct inode *inode, struct file *file)
156{
157 struct gcov_node *node = inode->i_private;
158 struct gcov_iterator *iter;
159 struct seq_file *seq;
160 struct gcov_info *info;
161 int rc = -ENOMEM;
162
163 mutex_lock(&node_lock);
164 /*
165 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access.
167 */
168 info = gcov_info_dup(get_node_info(node));
169 if (!info)
170 goto out_unlock;
171 iter = gcov_iter_new(info);
172 if (!iter)
173 goto err_free_info;
174 rc = seq_open(file, &gcov_seq_ops);
175 if (rc)
176 goto err_free_iter_info;
177 seq = file->private_data;
178 seq->private = iter;
179out_unlock:
180 mutex_unlock(&node_lock);
181 return rc;
182
183err_free_iter_info:
184 gcov_iter_free(iter);
185err_free_info:
186 gcov_info_free(info);
187 goto out_unlock;
188}
189
190/*
191 * release() implementation for gcov data files. Release resources allocated
192 * by open().
193 */
194static int gcov_seq_release(struct inode *inode, struct file *file)
195{
196 struct gcov_iterator *iter;
197 struct gcov_info *info;
198 struct seq_file *seq;
199
200 seq = file->private_data;
201 iter = seq->private;
202 info = gcov_iter_get_info(iter);
203 gcov_iter_free(iter);
204 gcov_info_free(info);
205 seq_release(inode, file);
206
207 return 0;
208}
209
210/*
211 * Find a node by the associated data file name. Needs to be called with
212 * node_lock held.
213 */
214static struct gcov_node *get_node_by_name(const char *name)
215{
216 struct gcov_node *node;
217 struct gcov_info *info;
218
219 list_for_each_entry(node, &all_head, all) {
220 info = get_node_info(node);
221 if (info && (strcmp(info->filename, name) == 0))
222 return node;
223 }
224
225 return NULL;
226}
227
228static void remove_node(struct gcov_node *node);
229
230/*
231 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is
233 * a "ghost" node), remove the debug fs node as well.
234 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos)
237{
238 struct seq_file *seq;
239 struct gcov_info *info;
240 struct gcov_node *node;
241
242 seq = file->private_data;
243 info = gcov_iter_get_info(seq->private);
244 mutex_lock(&node_lock);
245 node = get_node_by_name(info->filename);
246 if (node) {
247 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost)
249 remove_node(node);
250 else
251 gcov_info_reset(node->info);
252 }
253 /* Reset counts for open file. */
254 gcov_info_reset(info);
255 mutex_unlock(&node_lock);
256
257 return len;
258}
259
260/*
261 * Given a string <path> representing a file path of format:
262 * path/to/file.gcda
263 * construct and return a new string:
264 * <dir/>path/to/file.<ext>
265 */
266static char *link_target(const char *dir, const char *path, const char *ext)
267{
268 char *target;
269 char *old_ext;
270 char *copy;
271
272 copy = kstrdup(path, GFP_KERNEL);
273 if (!copy)
274 return NULL;
275 old_ext = strrchr(copy, '.');
276 if (old_ext)
277 *old_ext = '\0';
278 if (dir)
279 target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
280 else
281 target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
282 kfree(copy);
283
284 return target;
285}
286
287/*
288 * Construct a string representing the symbolic link target for the given
289 * gcov data file name and link type. Depending on the link type and the
290 * location of the data file, the link target can either point to a
291 * subdirectory of srctree, objtree or in an external location.
292 */
293static char *get_link_target(const char *filename, const struct gcov_link *ext)
294{
295 const char *rel;
296 char *result;
297
298 if (strncmp(filename, objtree, strlen(objtree)) == 0) {
299 rel = filename + strlen(objtree) + 1;
300 if (ext->dir == SRC_TREE)
301 result = link_target(srctree, rel, ext->ext);
302 else
303 result = link_target(objtree, rel, ext->ext);
304 } else {
305 /* External compilation. */
306 result = link_target(NULL, filename, ext->ext);
307 }
308
309 return result;
310}
311
312#define SKEW_PREFIX ".tmp_"
313
314/*
315 * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
316 * for filename skewing caused by the mod-versioning mechanism.
317 */
318static const char *deskew(const char *basename)
319{
320 if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
321 return basename + sizeof(SKEW_PREFIX) - 1;
322 return basename;
323}
324
325/*
326 * Create links to additional files (usually .c and .gcno files) which the
327 * gcov tool expects to find in the same directory as the gcov data file.
328 */
329static void add_links(struct gcov_node *node, struct dentry *parent)
330{
331 char *basename;
332 char *target;
333 int num;
334 int i;
335
336 for (num = 0; gcov_link[num].ext; num++)
337 /* Nothing. */;
338 node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
339 if (!node->links)
340 return;
341 for (i = 0; i < num; i++) {
342 target = get_link_target(get_node_info(node)->filename,
343 &gcov_link[i]);
344 if (!target)
345 goto out_err;
346 basename = strrchr(target, '/');
347 if (!basename)
348 goto out_err;
349 basename++;
350 node->links[i] = debugfs_create_symlink(deskew(basename),
351 parent, target);
352 if (!node->links[i])
353 goto out_err;
354 kfree(target);
355 }
356
357 return;
358out_err:
359 kfree(target);
360 while (i-- > 0)
361 debugfs_remove(node->links[i]);
362 kfree(node->links);
363 node->links = NULL;
364}
365
366static const struct file_operations gcov_data_fops = {
367 .open = gcov_seq_open,
368 .release = gcov_seq_release,
369 .read = seq_read,
370 .llseek = seq_lseek,
371 .write = gcov_seq_write,
372};
373
374/* Basic initialization of a new node. */
375static void init_node(struct gcov_node *node, struct gcov_info *info,
376 const char *name, struct gcov_node *parent)
377{
378 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all);
381 node->info = info;
382 node->parent = parent;
383 if (name)
384 strcpy(node->name, name);
385}
386
387/*
388 * Create a new node and associated debugfs entry. Needs to be called with
389 * node_lock held.
390 */
391static struct gcov_node *new_node(struct gcov_node *parent,
392 struct gcov_info *info, const char *name)
393{
394 struct gcov_node *node;
395
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) {
398 pr_warning("out of memory\n");
399 return NULL;
400 }
401 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */
403 if (info) {
404 node->dentry = debugfs_create_file(deskew(node->name), 0600,
405 parent->dentry, node, &gcov_data_fops);
406 } else
407 node->dentry = debugfs_create_dir(node->name, parent->dentry);
408 if (!node->dentry) {
409 pr_warning("could not create file\n");
410 kfree(node);
411 return NULL;
412 }
413 if (info)
414 add_links(node, parent->dentry);
415 list_add(&node->list, &parent->children);
416 list_add(&node->all, &all_head);
417
418 return node;
419}
420
421/* Remove symbolic links associated with node. */
422static void remove_links(struct gcov_node *node)
423{
424 int i;
425
426 if (!node->links)
427 return;
428 for (i = 0; gcov_link[i].ext; i++)
429 debugfs_remove(node->links[i]);
430 kfree(node->links);
431 node->links = NULL;
432}
433
434/*
435 * Remove node from all lists and debugfs and release associated resources.
436 * Needs to be called with node_lock held.
437 */
438static void release_node(struct gcov_node *node)
439{
440 list_del(&node->list);
441 list_del(&node->all);
442 debugfs_remove(node->dentry);
443 remove_links(node);
444 if (node->ghost)
445 gcov_info_free(node->ghost);
446 kfree(node);
447}
448
449/* Release node and empty parents. Needs to be called with node_lock held. */
450static void remove_node(struct gcov_node *node)
451{
452 struct gcov_node *parent;
453
454 while ((node != &root_node) && list_empty(&node->children)) {
455 parent = node->parent;
456 release_node(node);
457 node = parent;
458 }
459}
460
461/*
462 * Find child node with given basename. Needs to be called with node_lock
463 * held.
464 */
465static struct gcov_node *get_child_by_name(struct gcov_node *parent,
466 const char *name)
467{
468 struct gcov_node *node;
469
470 list_for_each_entry(node, &parent->children, list) {
471 if (strcmp(node->name, name) == 0)
472 return node;
473 }
474
475 return NULL;
476}
477
478/*
479 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes.
481 */
482static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos)
484{
485 struct gcov_node *node;
486
487 mutex_lock(&node_lock);
488restart:
489 list_for_each_entry(node, &all_head, all) {
490 if (node->info)
491 gcov_info_reset(node->info);
492 else if (list_empty(&node->children)) {
493 remove_node(node);
494 /* Several nodes may have gone - restart loop. */
495 goto restart;
496 }
497 }
498 mutex_unlock(&node_lock);
499
500 return len;
501}
502
503/* read() implementation for reset file. Unused. */
504static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
505 loff_t *pos)
506{
507 /* Allow read operation so that a recursive copy won't fail. */
508 return 0;
509}
510
511static const struct file_operations gcov_reset_fops = {
512 .write = reset_write,
513 .read = reset_read,
514};
515
516/*
517 * Create a node for a given profiling data set and add it to all lists and
518 * debugfs. Needs to be called with node_lock held.
519 */
520static void add_node(struct gcov_info *info)
521{
522 char *filename;
523 char *curr;
524 char *next;
525 struct gcov_node *parent;
526 struct gcov_node *node;
527
528 filename = kstrdup(info->filename, GFP_KERNEL);
529 if (!filename)
530 return;
531 parent = &root_node;
532 /* Create directory nodes along the path. */
533 for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
534 if (curr == next)
535 continue;
536 *next = 0;
537 if (strcmp(curr, ".") == 0)
538 continue;
539 if (strcmp(curr, "..") == 0) {
540 if (!parent->parent)
541 goto err_remove;
542 parent = parent->parent;
543 continue;
544 }
545 node = get_child_by_name(parent, curr);
546 if (!node) {
547 node = new_node(parent, NULL, curr);
548 if (!node)
549 goto err_remove;
550 }
551 parent = node;
552 }
553 /* Create file node. */
554 node = new_node(parent, info, curr);
555 if (!node)
556 goto err_remove;
557out:
558 kfree(filename);
559 return;
560
561err_remove:
562 remove_node(parent);
563 goto out;
564}
565
566/*
567 * The profiling data set associated with this node is being unloaded. Store a
568 * copy of the profiling data and turn this node into a "ghost".
569 */
570static int ghost_node(struct gcov_node *node)
571{
572 node->ghost = gcov_info_dup(node->info);
573 if (!node->ghost) {
574 pr_warning("could not save data for '%s' (out of memory)\n",
575 node->info->filename);
576 return -ENOMEM;
577 }
578 node->info = NULL;
579
580 return 0;
581}
582
583/*
584 * Profiling data for this node has been loaded again. Add profiling data
585 * from previous instantiation and turn this node into a regular node.
586 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info)
588{
589 if (gcov_info_is_compatible(node->ghost, info))
590 gcov_info_add(info, node->ghost);
591 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n",
593 info->filename);
594 }
595 gcov_info_free(node->ghost);
596 node->ghost = NULL;
597 node->info = info;
598}
599
600/*
601 * Callback to create/remove profiling files when code compiled with
602 * -fprofile-arcs is loaded/unloaded.
603 */
604void gcov_event(enum gcov_action action, struct gcov_info *info)
605{
606 struct gcov_node *node;
607
608 mutex_lock(&node_lock);
609 node = get_node_by_name(info->filename);
610 switch (action) {
611 case GCOV_ADD:
612 /* Add new node or revive ghost. */
613 if (!node) {
614 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break;
624 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */
626 if (!node) {
627 pr_warning("could not remove '%s' (not found)\n",
628 info->filename);
629 break;
630 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break;
637 }
638 mutex_unlock(&node_lock);
639}
640
641/* Create debugfs entries. */
642static __init int gcov_fs_init(void)
643{
644 int rc = -EIO;
645
646 init_node(&root_node, NULL, NULL, NULL);
647 /*
648 * /sys/kernel/debug/gcov will be parent for the reset control file
649 * and all profiling files.
650 */
651 root_node.dentry = debugfs_create_dir("gcov", NULL);
652 if (!root_node.dentry)
653 goto err_remove;
654 /*
655 * Create reset file which resets all profiling counts when written
656 * to.
657 */
658 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
659 NULL, &gcov_reset_fops);
660 if (!reset_dentry)
661 goto err_remove;
662 /* Replay previous events to get our fs hierarchy up-to-date. */
663 gcov_enable_events();
664 return 0;
665
666err_remove:
667 pr_err("init failed\n");
668 if (root_node.dentry)
669 debugfs_remove(root_node.dentry);
670
671 return rc;
672}
673device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000000..ae5bb4260033
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 3.4. Future versions of gcc may change the gcov
4 * format (as happened before), so all format-specific information needs
5 * to be kept modular and easily exchangeable.
6 *
7 * This file is based on gcc-internal definitions. Functions and data
8 * structures are defined to be compatible with gcc counterparts.
9 * For a better understanding, refer to gcc source: gcc/gcov-io.h.
10 *
11 * Copyright IBM Corp. 2009
12 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 *
14 * Uses gcc-internal data definitions.
15 */
16
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/string.h>
20#include <linux/seq_file.h>
21#include <linux/vmalloc.h>
22#include "gcov.h"
23
24/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
27 { 0, NULL},
28};
29
30/*
31 * Determine whether a counter is active. Based on gcc magic. Doesn't change
32 * at run-time.
33 */
34static int counter_active(struct gcov_info *info, unsigned int type)
35{
36 return (1 << type) & info->ctr_mask;
37}
38
39/* Determine number of active counters. Based on gcc magic. */
40static unsigned int num_counter_active(struct gcov_info *info)
41{
42 unsigned int i;
43 unsigned int result = 0;
44
45 for (i = 0; i < GCOV_COUNTERS; i++) {
46 if (counter_active(info, i))
47 result++;
48 }
49 return result;
50}
51
52/**
53 * gcov_info_reset - reset profiling data to zero
54 * @info: profiling data set
55 */
56void gcov_info_reset(struct gcov_info *info)
57{
58 unsigned int active = num_counter_active(info);
59 unsigned int i;
60
61 for (i = 0; i < active; i++) {
62 memset(info->counts[i].values, 0,
63 info->counts[i].num * sizeof(gcov_type));
64 }
65}
66
67/**
68 * gcov_info_is_compatible - check if profiling data can be added
69 * @info1: first profiling data set
70 * @info2: second profiling data set
71 *
72 * Returns non-zero if profiling data can be added, zero otherwise.
73 */
74int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
75{
76 return (info1->stamp == info2->stamp);
77}
78
79/**
80 * gcov_info_add - add up profiling data
81 * @dest: profiling data set to which data is added
82 * @source: profiling data set which is added
83 *
84 * Adds profiling counts of @source to @dest.
85 */
86void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
87{
88 unsigned int i;
89 unsigned int j;
90
91 for (i = 0; i < num_counter_active(dest); i++) {
92 for (j = 0; j < dest->counts[i].num; j++) {
93 dest->counts[i].values[j] +=
94 source->counts[i].values[j];
95 }
96 }
97}
98
99/* Get size of function info entry. Based on gcc magic. */
100static size_t get_fn_size(struct gcov_info *info)
101{
102 size_t size;
103
104 size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
105 sizeof(unsigned int);
106 if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
107 size = ALIGN(size, __alignof__(struct gcov_fn_info));
108 return size;
109}
110
111/* Get address of function info entry. Based on gcc magic. */
112static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
113{
114 return (struct gcov_fn_info *)
115 ((char *) info->functions + fn * get_fn_size(info));
116}
117
118/**
119 * gcov_info_dup - duplicate profiling data set
120 * @info: profiling data set to duplicate
121 *
122 * Return newly allocated duplicate on success, %NULL on error.
123 */
124struct gcov_info *gcov_info_dup(struct gcov_info *info)
125{
126 struct gcov_info *dup;
127 unsigned int i;
128 unsigned int active;
129
130 /* Duplicate gcov_info. */
131 active = num_counter_active(info);
132 dup = kzalloc(sizeof(struct gcov_info) +
133 sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
134 if (!dup)
135 return NULL;
136 dup->version = info->version;
137 dup->stamp = info->stamp;
138 dup->n_functions = info->n_functions;
139 dup->ctr_mask = info->ctr_mask;
140 /* Duplicate filename. */
141 dup->filename = kstrdup(info->filename, GFP_KERNEL);
142 if (!dup->filename)
143 goto err_free;
144 /* Duplicate table of functions. */
145 dup->functions = kmemdup(info->functions, info->n_functions *
146 get_fn_size(info), GFP_KERNEL);
147 if (!dup->functions)
148 goto err_free;
149 /* Duplicate counter arrays. */
150 for (i = 0; i < active ; i++) {
151 struct gcov_ctr_info *ctr = &info->counts[i];
152 size_t size = ctr->num * sizeof(gcov_type);
153
154 dup->counts[i].num = ctr->num;
155 dup->counts[i].merge = ctr->merge;
156 dup->counts[i].values = vmalloc(size);
157 if (!dup->counts[i].values)
158 goto err_free;
159 memcpy(dup->counts[i].values, ctr->values, size);
160 }
161 return dup;
162
163err_free:
164 gcov_info_free(dup);
165 return NULL;
166}
167
168/**
169 * gcov_info_free - release memory for profiling data set duplicate
170 * @info: profiling data set duplicate to free
171 */
172void gcov_info_free(struct gcov_info *info)
173{
174 unsigned int active = num_counter_active(info);
175 unsigned int i;
176
177 for (i = 0; i < active ; i++)
178 vfree(info->counts[i].values);
179 kfree(info->functions);
180 kfree(info->filename);
181 kfree(info);
182}
183
184/**
185 * struct type_info - iterator helper array
186 * @ctr_type: counter type
187 * @offset: index of the first value of the current function for this type
188 *
189 * This array is needed to convert the in-memory data format into the in-file
190 * data format:
191 *
192 * In-memory:
193 * for each counter type
194 * for each function
195 * values
196 *
197 * In-file:
198 * for each function
199 * for each counter type
200 * values
201 *
202 * See gcc source gcc/gcov-io.h for more information on data organization.
203 */
204struct type_info {
205 int ctr_type;
206 unsigned int offset;
207};
208
209/**
210 * struct gcov_iterator - specifies current file position in logical records
211 * @info: associated profiling data
212 * @record: record type
213 * @function: function number
214 * @type: counter type
215 * @count: index into values array
216 * @num_types: number of counter types
217 * @type_info: helper array to get values-array offset for current function
218 */
219struct gcov_iterator {
220 struct gcov_info *info;
221
222 int record;
223 unsigned int function;
224 unsigned int type;
225 unsigned int count;
226
227 int num_types;
228 struct type_info type_info[0];
229};
230
231static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
232{
233 return get_fn_info(iter->info, iter->function);
234}
235
236static struct type_info *get_type(struct gcov_iterator *iter)
237{
238 return &iter->type_info[iter->type];
239}
240
241/**
242 * gcov_iter_new - allocate and initialize profiling data iterator
243 * @info: profiling data set to be iterated
244 *
245 * Return file iterator on success, %NULL otherwise.
246 */
247struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
248{
249 struct gcov_iterator *iter;
250
251 iter = kzalloc(sizeof(struct gcov_iterator) +
252 num_counter_active(info) * sizeof(struct type_info),
253 GFP_KERNEL);
254 if (iter)
255 iter->info = info;
256
257 return iter;
258}
259
260/**
261 * gcov_iter_free - release memory for iterator
262 * @iter: file iterator to free
263 */
264void gcov_iter_free(struct gcov_iterator *iter)
265{
266 kfree(iter);
267}
268
269/**
270 * gcov_iter_get_info - return profiling data set for given file iterator
271 * @iter: file iterator
272 */
273struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
274{
275 return iter->info;
276}
277
278/**
279 * gcov_iter_start - reset file iterator to starting position
280 * @iter: file iterator
281 */
282void gcov_iter_start(struct gcov_iterator *iter)
283{
284 int i;
285
286 iter->record = 0;
287 iter->function = 0;
288 iter->type = 0;
289 iter->count = 0;
290 iter->num_types = 0;
291 for (i = 0; i < GCOV_COUNTERS; i++) {
292 if (counter_active(iter->info, i)) {
293 iter->type_info[iter->num_types].ctr_type = i;
294 iter->type_info[iter->num_types++].offset = 0;
295 }
296 }
297}
298
299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6
307#define RECORD_COUNT_TAG 7
308#define RECORD_COUNT_LEN 8
309#define RECORD_COUNT 9
310
311/**
312 * gcov_iter_next - advance file iterator to next logical record
313 * @iter: file iterator
314 *
315 * Return zero if new position is valid, non-zero if iterator has reached end.
316 */
317int gcov_iter_next(struct gcov_iterator *iter)
318{
319 switch (iter->record) {
320 case RECORD_FILE_MAGIC:
321 case RECORD_GCOV_VERSION:
322 case RECORD_FUNCTION_TAG:
323 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG:
326 /* Advance to next record */
327 iter->record++;
328 break;
329 case RECORD_COUNT:
330 /* Advance to next count */
331 iter->count++;
332 /* fall through */
333 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9;
336 break;
337 }
338 /* Advance to next counter type */
339 get_type(iter)->offset += iter->count;
340 iter->count = 0;
341 iter->type++;
342 /* fall through */
343 case RECORD_FUNCTION_CHECK:
344 if (iter->type < iter->num_types) {
345 iter->record = 7;
346 break;
347 }
348 /* Advance to next function */
349 iter->type = 0;
350 iter->function++;
351 /* fall through */
352 case RECORD_TIME_STAMP:
353 if (iter->function < iter->info->n_functions)
354 iter->record = 3;
355 else
356 iter->record = -1;
357 break;
358 }
359 /* Check for EOF. */
360 if (iter->record == -1)
361 return -EINVAL;
362 else
363 return 0;
364}
365
366/**
367 * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
368 * @seq: seq_file handle
369 * @v: value to be stored
370 *
371 * Number format defined by gcc: numbers are recorded in the 32 bit
372 * unsigned binary form of the endianness of the machine generating the
373 * file.
374 */
375static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
376{
377 return seq_write(seq, &v, sizeof(v));
378}
379
380/**
381 * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
382 * @seq: seq_file handle
383 * @v: value to be stored
384 *
385 * Number format defined by gcc: numbers are recorded in the 32 bit
386 * unsigned binary form of the endianness of the machine generating the
387 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
388 * first.
389 */
390static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
391{
392 u32 data[2];
393
394 data[0] = (v & 0xffffffffUL);
395 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data));
397}
398
399/**
400 * gcov_iter_write - write data for current pos to seq_file
401 * @iter: file iterator
402 * @seq: seq_file handle
403 *
404 * Return zero on success, non-zero otherwise.
405 */
406int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
407{
408 int rc = -EINVAL;
409
410 switch (iter->record) {
411 case RECORD_FILE_MAGIC:
412 rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
413 break;
414 case RECORD_GCOV_VERSION:
415 rc = seq_write_gcov_u32(seq, iter->info->version);
416 break;
417 case RECORD_TIME_STAMP:
418 rc = seq_write_gcov_u32(seq, iter->info->stamp);
419 break;
420 case RECORD_FUNCTION_TAG:
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break;
423 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2);
425 break;
426 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break;
429 case RECORD_FUNCTION_CHECK:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
431 break;
432 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq,
434 GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
435 break;
436 case RECORD_COUNT_LEN:
437 rc = seq_write_gcov_u32(seq,
438 get_func(iter)->n_ctrs[iter->type] * 2);
439 break;
440 case RECORD_COUNT:
441 rc = seq_write_gcov_u64(seq,
442 iter->info->counts[iter->type].
443 values[iter->count + get_type(iter)->offset]);
444 break;
445 }
446 return rc;
447}
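To make the iterator's offset bookkeeping concrete, here is a schematic example with two functions and two active counter types; the counts are invented purely for illustration:

/*
 * In-memory layout (info->counts[type].values[]):
 *   type 0: [ f0_a, f0_b, f1_a ]
 *   type 1: [ f0_a, f1_a, f1_b ]
 *
 * In-file order produced by the iterator:
 *   function 0: type 0 -> f0_a, f0_b;   type 1 -> f0_a
 *   function 1: type 0 -> f1_a;         type 1 -> f1_a, f1_b
 *
 * When gcov_iter_next() finishes a counter type for one function it adds
 * iter->count to get_type(iter)->offset, so gcov_iter_write()'s
 * values[iter->count + offset] indexes the next function's slice.
 */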
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000000..060073ebf7a6
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
1/*
2 * Profiling infrastructure declarations.
3 *
4 * This file is based on gcc-internal definitions. Data structures are
5 * defined to be compatible with gcc counterparts. For a better
6 * understanding, refer to gcc source: gcc/gcov-io.h.
7 *
8 * Copyright IBM Corp. 2009
9 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#ifndef GCOV_H
15#define GCOV_H GCOV_H
16
17#include <linux/types.h>
18
19/*
20 * Profiling data types used for gcc 3.4 and above - these are defined by
21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible.
23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
30
31#if BITS_PER_LONG >= 64
32typedef long gcov_type;
33#else
34typedef long long gcov_type;
35#endif
36
37/**
38 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier
40 * @checksum: function checksum
41 * @n_ctrs: number of values per counter type belonging to this function
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66
67/**
68 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation
70 * @next: list head for a singly-linked list
71 * @stamp: time stamp
72 * @filename: name of the associated gcov data file
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91
92/* Base interface. */
93enum gcov_action {
94 GCOV_ADD,
95 GCOV_REMOVE,
96};
97
98void gcov_event(enum gcov_action action, struct gcov_info *info);
99void gcov_enable_events(void);
100
101/* Iterator control. */
102struct seq_file;
103struct gcov_iterator;
104
105struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
106void gcov_iter_free(struct gcov_iterator *iter);
107void gcov_iter_start(struct gcov_iterator *iter);
108int gcov_iter_next(struct gcov_iterator *iter);
109int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
110struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
111
112/* gcov_info control. */
113void gcov_info_reset(struct gcov_info *info);
114int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
115void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
116struct gcov_info *gcov_info_dup(struct gcov_info *info);
117void gcov_info_free(struct gcov_info *info);
118
119struct gcov_link {
120 enum {
121 OBJ_TREE,
122 SRC_TREE,
123 } dir;
124 const char *ext;
125};
126extern const struct gcov_link gcov_link[];
127
128#endif /* GCOV_H */
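GCOV_TAG_FOR_COUNTER() derives a record tag for each counter type by shifting the counter index into bits 17 and up of the counter base tag. A stand-alone sketch of that arithmetic, reusing the constants from the header above so the output is easy to check by hand:

#include <stdio.h>

#define GCOV_COUNTERS		5
#define GCOV_TAG_COUNTER_BASE	((unsigned int) 0x01a10000)
#define GCOV_TAG_FOR_COUNTER(count) \
	(GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))

int main(void)
{
	unsigned int i;

	/* counter 0 -> 0x01a10000, counter 1 -> 0x01a30000, ... */
	for (i = 0; i < GCOV_COUNTERS; i++)
		printf("counter %u -> tag 0x%08x\n", i, GCOV_TAG_FOR_COUNTER(i));
	return 0;
}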
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
262
263/*
264 * Check whether we're fsgid/egid or in the supplemental group.
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
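groups_sort() above is a Shell sort using the 3*h+1 gap sequence, and groups_search() is a plain binary search over the sorted gid list. The kernel reaches individual gids through per-block pages via GROUP_AT(); the flat-array sketch below keeps only the algorithms, and all demo_* names are invented:

#include <stdio.h>

static void demo_groups_sort(unsigned int *g, int n)
{
	int stride;

	for (stride = 1; stride < n; stride = 3 * stride + 1)
		;			/* find the largest gap below n */
	stride /= 3;

	while (stride) {
		int base;

		for (base = 0; base < n - stride; base++) {
			int left = base;
			int right = left + stride;
			unsigned int tmp = g[right];

			while (left >= 0 && g[left] > tmp) {
				g[right] = g[left];
				right = left;
				left -= stride;
			}
			g[right] = tmp;
		}
		stride /= 3;
	}
}

static int demo_groups_search(const unsigned int *g, int n, unsigned int grp)
{
	int left = 0, right = n;

	while (left < right) {
		int mid = (left + right) / 2;

		if (g[mid] < grp)
			left = mid + 1;
		else if (g[mid] > grp)
			right = mid;
		else
			return 1;	/* found */
	}
	return 0;
}

int main(void)
{
	unsigned int groups[] = { 1000, 27, 4, 44, 20 };
	int n = sizeof(groups) / sizeof(groups[0]);

	demo_groups_sort(groups, n);
	printf("44 in set: %d, 5 in set: %d\n",
	       demo_groups_search(groups, n, 44),
	       demo_groups_search(groups, n, 5));
	return 0;
}

Sorting once in set_groups() is what keeps the later in_group_p()/in_egroup_p() lookups logarithmic.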
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b675a67c9ac3..05071bf6a37b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 191 }
192} 192}
193 193
194
195/*
196 * Get the preferred target CPU for NOHZ
197 */
198static int hrtimer_get_target(int this_cpu, int pinned)
199{
200#ifdef CONFIG_NO_HZ
201 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
202 int preferred_cpu = get_nohz_load_balancer();
203
204 if (preferred_cpu >= 0)
205 return preferred_cpu;
206 }
207#endif
208 return this_cpu;
209}
210
211/*
212 * With HIGHRES=y we do not migrate the timer when it is expiring
213 * before the next event on the target cpu because we cannot reprogram
214 * the target cpu hardware and we would cause it to fire late.
215 *
216 * Called with cpu_base->lock of target cpu held.
217 */
218static int
219hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
220{
221#ifdef CONFIG_HIGH_RES_TIMERS
222 ktime_t expires;
223
224 if (!new_base->cpu_base->hres_active)
225 return 0;
226
227 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
228 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
229#else
230 return 0;
231#endif
232}
233
194/* 234/*
195 * Switch the timer base to the current CPU when possible. 235 * Switch the timer base to the current CPU when possible.
196 */ 236 */
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 240{
201 struct hrtimer_clock_base *new_base; 241 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 242 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 243 int this_cpu = smp_processor_id();
204 244 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 245
214again: 246again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 247 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
217 249
218 if (base != new_base) { 250 if (base != new_base) {
219 /* 251 /*
220 * We are trying to schedule the timer on the local CPU. 252 * We are trying to move the timer to new_base.
221 * However we can't change timer's base while it is running, 253 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 254 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 255 * the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 265 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 266 spin_lock(&new_base->cpu_base->lock);
235 267
236 /* Optimized away for NOHZ=n SMP=n */ 268 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 269 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 270 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 271 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 272 timer->base = base;
241 new_base->offset); 273 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 274 }
269 timer->base = new_base; 275 timer->base = new_base;
270 } 276 }
@@ -380,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
380 return res; 386 return res;
381} 387}
382 388
389EXPORT_SYMBOL_GPL(ktime_add_safe);
390
383#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 391#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
384 392
385static struct debug_obj_descr hrtimer_debug_descr; 393static struct debug_obj_descr hrtimer_debug_descr;
@@ -477,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
477 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 485 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
478 __hrtimer_init(timer, clock_id, mode); 486 __hrtimer_init(timer, clock_id, mode);
479} 487}
488EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
480 489
481void destroy_hrtimer_on_stack(struct hrtimer *timer) 490void destroy_hrtimer_on_stack(struct hrtimer *timer)
482{ 491{
@@ -1274,14 +1283,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1274 1283
1275 expires_next.tv64 = KTIME_MAX; 1284 expires_next.tv64 = KTIME_MAX;
1276 1285
1286 spin_lock(&cpu_base->lock);
1287 /*
1288 * We set expires_next to KTIME_MAX here with cpu_base->lock
1289 * held to prevent a timer from being enqueued in our queue via
1290 * the migration code. This does not affect enqueueing of
1291 * timers which run their callback and need to be requeued on
1292 * this CPU.
1293 */
1294 cpu_base->expires_next.tv64 = KTIME_MAX;
1295
1277 base = cpu_base->clock_base; 1296 base = cpu_base->clock_base;
1278 1297
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1298 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1299 ktime_t basenow;
1281 struct rb_node *node; 1300 struct rb_node *node;
1282 1301
1283 spin_lock(&cpu_base->lock);
1284
1285 basenow = ktime_add(now, base->offset); 1302 basenow = ktime_add(now, base->offset);
1286 1303
1287 while ((node = base->first)) { 1304 while ((node = base->first)) {
@@ -1314,11 +1331,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1314 1331
1315 __run_hrtimer(timer); 1332 __run_hrtimer(timer);
1316 } 1333 }
1317 spin_unlock(&cpu_base->lock);
1318 base++; 1334 base++;
1319 } 1335 }
1320 1336
1337 /*
1338 * Store the new expiry value so the migration code can verify
1339 * against it.
1340 */
1321 cpu_base->expires_next = expires_next; 1341 cpu_base->expires_next = expires_next;
1342 spin_unlock(&cpu_base->lock);
1322 1343
1323 /* Reprogramming necessary ? */ 1344 /* Reprogramming necessary ? */
1324 if (expires_next.tv64 != KTIME_MAX) { 1345 if (expires_next.tv64 != KTIME_MAX) {
@@ -1457,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1457 sl->timer.function = hrtimer_wakeup; 1478 sl->timer.function = hrtimer_wakeup;
1458 sl->task = task; 1479 sl->task = task;
1459} 1480}
1481EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1460 1482
1461static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1483static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1462{ 1484{
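The new hrtimer_get_target()/hrtimer_check_target() pair only lets switch_hrtimer_base() move a timer to the NOHZ load-balancer CPU when the timer expires after that CPU's next programmed event; otherwise the remote hardware could not be reprogrammed and the timer would fire late. A simplified model of that decision, using plain nanosecond integers instead of ktime_t and ignoring the per-clock-base offset (demo_* names are invented):

#include <stdint.h>
#include <stdio.h>

struct demo_cpu_base {
	int	hres_active;	/* high resolution mode active on that CPU? */
	int64_t	expires_next;	/* next event already programmed there, ns   */
};

/* Nonzero means: keep the timer on the current CPU, do not migrate it. */
static int demo_check_target(int64_t timer_expires, const struct demo_cpu_base *target)
{
	if (!target->hres_active)
		return 0;	/* no reprogramming constraint to violate */
	return timer_expires <= target->expires_next;
}

int main(void)
{
	struct demo_cpu_base cpu1 = { .hres_active = 1, .expires_next = 5000 };

	printf("expires at 4000ns -> keep local: %d\n", demo_check_target(4000, &cpu1));
	printf("expires at 9000ns -> keep local: %d\n", demo_check_target(9000, &cpu1));
	return 0;
}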
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
222} 222}
223EXPORT_SYMBOL(set_irq_chip_data); 223EXPORT_SYMBOL(set_irq_chip_data);
224 224
225/**
226 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
227 *
228 * @irq: Interrupt number
229 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
230 *
231 * The IRQ_NESTED_THREAD flag indicates that on
232 * request_threaded_irq() no separate interrupt thread should be
233 * created for the irq as the handlers are called nested in the
234 * context of a demultiplexing interrupt handler thread.
235 */
236void set_irq_nested_thread(unsigned int irq, int nest)
237{
238 struct irq_desc *desc = irq_to_desc(irq);
239 unsigned long flags;
240
241 if (!desc)
242 return;
243
244 spin_lock_irqsave(&desc->lock, flags);
245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD;
247 else
248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags);
250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252
225/* 253/*
226 * default enable function 254 * default enable function
227 */ 255 */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
299 } 327 }
300} 328}
301 329
330/*
331 * handle_nested_irq - Handle a nested irq from an irq thread
332 * @irq: the interrupt number
333 *
334 * Handle interrupts which are nested into a threaded interrupt
335 * handler. The handler function is called inside the calling
336 * threads context.
337 */
338void handle_nested_irq(unsigned int irq)
339{
340 struct irq_desc *desc = irq_to_desc(irq);
341 struct irqaction *action;
342 irqreturn_t action_ret;
343
344 might_sleep();
345
346 spin_lock_irq(&desc->lock);
347
348 kstat_incr_irqs_this_cpu(irq, desc);
349
350 action = desc->action;
351 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
352 goto out_unlock;
353
354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock);
356
357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret);
360
361 spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS;
363
364out_unlock:
365 spin_unlock_irq(&desc->lock);
366}
367EXPORT_SYMBOL_GPL(handle_nested_irq);
368
302/** 369/**
303 * handle_simple_irq - Simple and software-decoded IRQs. 370 * handle_simple_irq - Simple and software-decoded IRQs.
304 * @irq: the interrupt number 371 * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
382 449
383 spin_lock(&desc->lock); 450 spin_lock(&desc->lock);
384 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
385 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 452
453 if (unlikely(desc->status & IRQ_ONESHOT))
454 desc->status |= IRQ_MASKED;
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
386 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
387out_unlock: 457out_unlock:
388 spin_unlock(&desc->lock); 458 spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
572 desc->chip = &dummy_irq_chip; 642 desc->chip = &dummy_irq_chip;
573 } 643 }
574 644
645 chip_bus_lock(irq, desc);
575 spin_lock_irqsave(&desc->lock, flags); 646 spin_lock_irqsave(&desc->lock, flags);
576 647
577 /* Uninstall? */ 648 /* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
591 desc->chip->startup(irq); 662 desc->chip->startup(irq);
592 } 663 }
593 spin_unlock_irqrestore(&desc->lock, flags); 664 spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc);
594} 666}
595EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
596 668
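set_irq_nested_thread() and handle_nested_irq() let a demultiplexing interrupt, typically an expander behind a slow bus, run its child handlers inside the parent's irq thread instead of spawning one thread per child. A schematic driver fragment; everything prefixed demo_ is hypothetical, child descriptor setup (chip, flow handler) is omitted, and only the irq APIs shown in this series are real:

#include <linux/interrupt.h>
#include <linux/irq.h>

#define DEMO_NR_CHILD_IRQS	8

struct demo_expander {
	unsigned int	irq_base;				 /* first child irq */
	unsigned int	(*read_pending)(struct demo_expander *); /* may sleep */
};

/* Runs in the parent's irq thread, so sleeping bus accesses are fine. */
static irqreturn_t demo_parent_thread_fn(int irq, void *dev_id)
{
	struct demo_expander *exp = dev_id;
	unsigned int pending = exp->read_pending(exp);
	unsigned int bit;

	/* Child handlers run nested in this thread; no extra threads. */
	for (bit = 0; bit < DEMO_NR_CHILD_IRQS; bit++)
		if (pending & (1U << bit))
			handle_nested_irq(exp->irq_base + bit);

	return IRQ_HANDLED;
}

static int demo_expander_setup(struct demo_expander *exp, unsigned int parent_irq)
{
	unsigned int i;

	/* Mark the children so __setup_irq() won't create threads for them. */
	for (i = 0; i < DEMO_NR_CHILD_IRQS; i++)
		set_irq_nested_thread(exp->irq_base + i, 1);

	/* NULL primary handler + IRQF_ONESHOT, see kernel/irq/manage.c below. */
	return request_threaded_irq(parent_irq, NULL, demo_parent_thread_fn,
				    IRQF_ONESHOT, "demo-expander", exp);
}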
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
161 161
162 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
163 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node; 164 node = first_online_node;
165 165
166 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
172 172
173 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
174 desc[i].irq = i; 174 desc[i].irq = i;
175#ifdef CONFIG_SMP
176 desc[i].node = node;
177#endif
175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 178 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 179 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
177 alloc_desc_masks(&desc[i], node, true); 180 alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,20 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); 46
47/* Inline functions for support of irq chips on slow busses */
48static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
49{
50 if (unlikely(desc->chip->bus_lock))
51 desc->chip->bus_lock(irq);
52}
53
54static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
55{
56 if (unlikely(desc->chip->bus_sync_unlock))
57 desc->chip->bus_sync_unlock(irq);
58}
47 59
48/* 60/*
49 * Debugging printout: 61 * Debugging printout:
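chip_bus_lock()/chip_bus_sync_unlock() above wrap the new bus_lock/bus_sync_unlock irq_chip callbacks, which exist for controllers that sit behind a sleeping bus (I2C/SPI expanders): register updates are cached while desc->lock is held and flushed to the device once the bus lock is dropped. A schematic chip implementation; every demo_* name is hypothetical, only the callback names and signatures come from this series:

#include <linux/irq.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_buslock);	/* serializes the slow-bus accesses  */
static unsigned int demo_mask_cache;	/* register image, flushed on unlock */

static void demo_write_mask(unsigned int val)
{
	/* stand-in for a sleeping register write over I2C/SPI */
}

static void demo_bus_lock(unsigned int irq)
{
	mutex_lock(&demo_buslock);
}

static void demo_bus_sync_unlock(unsigned int irq)
{
	/* Push the cached mask out to the device, then release the bus. */
	demo_write_mask(demo_mask_cache);
	mutex_unlock(&demo_buslock);
}

static void demo_mask(unsigned int irq)
{
	/* Called under desc->lock: only touch the cache, never the bus. */
	demo_mask_cache |= 1U << (irq & 31);
}

static struct irq_chip demo_irq_chip = {
	.name			= "demo",
	.mask			= demo_mask,
	.bus_lock		= demo_bus_lock,
	.bus_sync_unlock	= demo_bus_sync_unlock,
};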
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aaf5c9d05770..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
85 * @desc: irq descriptor which has affinity changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We can not call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
112 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 if (!desc->chip->set_affinity(irq, cpumask)) { 121 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask); 122 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask); 123 irq_set_thread_affinity(desc);
116 } 124 }
117 } 125 }
118 else { 126 else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
122#else 130#else
123 if (!desc->chip->set_affinity(irq, cpumask)) { 131 if (!desc->chip->set_affinity(irq, cpumask)) {
124 cpumask_copy(desc->affinity, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask); 133 irq_set_thread_affinity(desc);
126 } 134 }
127#endif 135#endif
128 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
176 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
177 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
178 if (!ret) 186 if (!ret)
179 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
180 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
181 189
182 return ret; 190 return ret;
@@ -222,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
222 if (!desc) 230 if (!desc)
223 return; 231 return;
224 232
233 chip_bus_lock(irq, desc);
225 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
226 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
227 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
228} 238}
229EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
230 240
@@ -286,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
286 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
287 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
288 * 298 *
289 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL!
290 */ 301 */
291void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
292{ 303{
@@ -296,9 +307,11 @@ void enable_irq(unsigned int irq)
296 if (!desc) 307 if (!desc)
297 return; 308 return;
298 309
310 chip_bus_lock(irq, desc);
299 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
300 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
301 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
302} 315}
303EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
304 317
@@ -428,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
428 return ret; 441 return ret;
429} 442}
430 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
431static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
432{ 465{
433 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -444,6 +477,56 @@ static int irq_wait_for_interrupt(struct irqaction *action)
444} 477}
445 478
446/* 479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler has finished. Unmask if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
496#ifdef CONFIG_SMP
497/*
498 * Check whether we need to change the affinity of the interrupt thread.
499 */
500static void
501irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
502{
503 cpumask_var_t mask;
504
505 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
506 return;
507
508 /*
509 * In case we are out of memory we set IRQTF_AFFINITY again and
510 * retry next time
511 */
512 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
513 set_bit(IRQTF_AFFINITY, &action->thread_flags);
514 return;
515 }
516
517 spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock);
520
521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask);
523}
524#else
525static inline void
526irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
527#endif
528
529/*
447 * Interrupt handler thread 530 * Interrupt handler thread
448 */ 531 */
449static int irq_thread(void *data) 532static int irq_thread(void *data)
@@ -451,13 +534,15 @@ static int irq_thread(void *data)
451 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
452 struct irqaction *action = data; 535 struct irqaction *action = data;
453 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
454 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
455 538
456 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
457 current->irqaction = action; 540 current->irqaction = action;
458 541
459 while (!irq_wait_for_interrupt(action)) { 542 while (!irq_wait_for_interrupt(action)) {
460 543
544 irq_thread_check_affinity(desc, action);
545
461 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
462 547
463 spin_lock_irq(&desc->lock); 548 spin_lock_irq(&desc->lock);
@@ -475,6 +560,9 @@ static int irq_thread(void *data)
475 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
476 561
477 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
478 } 566 }
479 567
480 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -522,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
522 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
523 const char *old_name = NULL; 611 const char *old_name = NULL;
524 unsigned long flags; 612 unsigned long flags;
525 int shared = 0; 613 int nested, shared = 0;
526 int ret; 614 int ret;
527 615
528 if (!desc) 616 if (!desc)
@@ -547,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
547 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
548 } 636 }
549 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided from
652 * the driver for non nested interrupt handling by the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
550 /* 658 /*
551 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
552 */ 662 */
553 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
554 struct task_struct *t; 664 struct task_struct *t;
555 665
556 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -564,7 +674,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
564 */ 674 */
565 get_task_struct(t); 675 get_task_struct(t);
566 new->thread = t; 676 new->thread = t;
567 wake_up_process(t);
568 } 677 }
569 678
570 /* 679 /*
@@ -620,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
620 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
621#endif 730#endif
622 731
623 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
624 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
625 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
626 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
627 desc->depth = 0; 739 desc->depth = 0;
628 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -647,6 +759,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
647 (int)(new->flags & IRQF_TRIGGER_MASK)); 759 (int)(new->flags & IRQF_TRIGGER_MASK));
648 } 760 }
649 761
762 new->irq = irq;
650 *old_ptr = new; 763 *old_ptr = new;
651 764
652 /* Reset broken irq detection when installing new handler */ 765 /* Reset broken irq detection when installing new handler */
@@ -664,7 +777,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
664 777
665 spin_unlock_irqrestore(&desc->lock, flags); 778 spin_unlock_irqrestore(&desc->lock, flags);
666 779
667 new->irq = irq; 780 /*
781 * Strictly no need to wake it up, but hung_task complains
782 * when no hard interrupt wakes the thread up.
783 */
784 if (new->thread)
785 wake_up_process(new->thread);
786
668 register_irq_proc(irq, desc); 787 register_irq_proc(irq, desc);
669 new->dir = NULL; 788 new->dir = NULL;
670 register_handler_proc(irq, new); 789 register_handler_proc(irq, new);
@@ -718,7 +837,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
718{ 837{
719 struct irq_desc *desc = irq_to_desc(irq); 838 struct irq_desc *desc = irq_to_desc(irq);
720 struct irqaction *action, **action_ptr; 839 struct irqaction *action, **action_ptr;
721 struct task_struct *irqthread;
722 unsigned long flags; 840 unsigned long flags;
723 841
724 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 842 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -766,9 +884,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
766 desc->chip->disable(irq); 884 desc->chip->disable(irq);
767 } 885 }
768 886
769 irqthread = action->thread;
770 action->thread = NULL;
771
772 spin_unlock_irqrestore(&desc->lock, flags); 887 spin_unlock_irqrestore(&desc->lock, flags);
773 888
774 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
@@ -776,12 +891,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
776 /* Make sure it's not being used on another CPU: */ 891 /* Make sure it's not being used on another CPU: */
777 synchronize_irq(irq); 892 synchronize_irq(irq);
778 893
779 if (irqthread) {
780 if (!test_bit(IRQTF_DIED, &action->thread_flags))
781 kthread_stop(irqthread);
782 put_task_struct(irqthread);
783 }
784
785#ifdef CONFIG_DEBUG_SHIRQ 894#ifdef CONFIG_DEBUG_SHIRQ
786 /* 895 /*
787 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 896 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -797,6 +906,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
797 local_irq_restore(flags); 906 local_irq_restore(flags);
798 } 907 }
799#endif 908#endif
909
910 if (action->thread) {
911 if (!test_bit(IRQTF_DIED, &action->thread_flags))
912 kthread_stop(action->thread);
913 put_task_struct(action->thread);
914 }
915
800 return action; 916 return action;
801} 917}
802 918
@@ -829,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
829 */ 945 */
830void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
831{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
832 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
833} 956}
834EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
835 958
@@ -838,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
838 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
839 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
840 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
841 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
842 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
843 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -856,7 +981,7 @@ EXPORT_SYMBOL(free_irq);
856 * still called in hard interrupt context and has to check 981 * still called in hard interrupt context and has to check
857 * whether the interrupt originates from the device. If yes it 982 * whether the interrupt originates from the device. If yes it
858 * needs to disable the interrupt on the device and return 983 * needs to disable the interrupt on the device and return
859 * IRQ_THREAD_WAKE which will wake up the handler thread and run 984 * IRQ_WAKE_THREAD which will wake up the handler thread and run
860 * @thread_fn. This split handler design is necessary to support 985 * @thread_fn. This split handler design is necessary to support
861 * shared interrupts. 986 * shared interrupts.
862 * 987 *
@@ -917,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
917 1042
918 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
919 return -EINVAL; 1044 return -EINVAL;
920 if (!handler) 1045
921 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
922 1051
923 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
924 if (!action) 1053 if (!action)
@@ -930,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
930 action->name = devname; 1059 action->name = devname;
931 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
932 1061
1062 chip_bus_lock(irq, desc);
933 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
934 if (retval) 1066 if (retval)
935 kfree(action); 1067 kfree(action);
936 1068
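Two conveniences land in request_threaded_irq() here: a NULL primary handler is replaced by irq_default_primary_handler(), and IRQF_ONESHOT keeps the line masked until the thread function returns, which is what a level-triggered device behind a slow bus needs. A hedged usage sketch with invented demo_* names:

#include <linux/interrupt.h>

struct demo_dev {
	int	irq;
	void	(*clear_irq)(struct demo_dev *);  /* may sleep: talks to the bus */
};

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	struct demo_dev *dev = dev_id;

	/*
	 * IRQF_ONESHOT keeps the line masked until we return, so a
	 * level-triggered interrupt cannot storm while we sleep on the bus.
	 */
	dev->clear_irq(dev);
	return IRQ_HANDLED;
}

static int demo_request(struct demo_dev *dev)
{
	/* NULL primary handler: the default one just returns IRQ_WAKE_THREAD. */
	return request_threaded_irq(dev->irq, NULL, demo_thread_fn,
				    IRQF_ONESHOT, "demo-dev", dev);
}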
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
45 < nr_cpu_ids)) 45 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
47 cpumask_copy(desc->affinity, desc->pending_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc, desc->pending_mask); 48 irq_set_thread_affinity(desc);
49 } 49 }
50 50
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
107 107
108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
109{ 109{
110 /* those all static, do move them */ 110 /* those are static or the target node is -1, do not move them */
111 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
112 return desc; 112 return desc;
113 113
114 if (desc->node != node) 114 if (desc->node != node)
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
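The kfifo hunk above replaces the open-coded power-of-two test with is_power_of_2(); the power-of-two requirement exists because the read/write indices wrap freely and are reduced with a cheap mask rather than a modulo. A tiny user-space sketch of that index-wrap technique, not the kernel implementation:

#include <stdio.h>

static unsigned int demo_roundup_pow_of_two(unsigned int size)
{
	unsigned int n = 1;

	while (n < size)
		n <<= 1;
	return n;
}

int main(void)
{
	unsigned char buf[128];
	unsigned int size = demo_roundup_pow_of_two(100);	/* -> 128 */
	unsigned int in = 0, out = 0;
	unsigned char a, b, c;

	/* produce three bytes, consume two; the indices only ever grow */
	for (c = 'a'; c <= 'c'; c++)
		buf[in++ & (size - 1)] = c;

	a = buf[out++ & (size - 1)];
	b = buf[out++ & (size - 1)];
	printf("%c%c, %u byte(s) still queued\n", a, b, in - out);
	return 0;
}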
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
@@ -38,6 +37,8 @@
38#include <linux/suspend.h> 37#include <linux/suspend.h>
39#include <asm/uaccess.h> 38#include <asm/uaccess.h>
40 39
40#include <trace/events/module.h>
41
41extern int max_threads; 42extern int max_threads;
42 43
43static struct workqueue_struct *khelper_wq; 44static struct workqueue_struct *khelper_wq;
@@ -79,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
79#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
80 static int kmod_loop_msg; 81 static int kmod_loop_msg;
81 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
82 va_start(args, fmt); 87 va_start(args, fmt);
83 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
84 va_end(args); 89 va_end(args);
@@ -109,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
109 return -ENOMEM; 114 return -ENOMEM;
110 } 115 }
111 116
117 trace_module_request(module_name, wait, _RET_IP_);
118
112 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
114 atomic_dec(&kmod_concurrent); 121 atomic_dec(&kmod_concurrent);
@@ -463,6 +470,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
463 int retval = 0; 470 int retval = 0;
464 471
465 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
466 474
467 helper_lock(); 475 helper_lock();
468 if (sub_info->path[0] == '\0') 476 if (sub_info->path[0] == '\0')
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,18 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 int safety;
241 234
242 /* Ensure no-one is preepmted on the garbages */ 235 /* Ensure no-one is preepmted on the garbages */
243 mutex_unlock(&kprobe_insn_mutex); 236 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 237 return -EAGAIN;
248 238
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
250 int i; 240 int i;
251 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
252 continue; 242 continue;
@@ -264,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
264void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
265{ 255{
266 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
267 struct hlist_node *pos;
268 257
269 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
270 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
271 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
272 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
273 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
274 if (dirty) { 263 if (dirty) {
275 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
276 kip->ngarbage++; 265 kip->ngarbage++;
277 } else { 266 } else
278 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
279 }
280 break; 268 break;
281 } 269 }
282 } 270 }
@@ -698,7 +686,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 686 p->addr = addr;
699 687
700 preempt_disable(); 688 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 689 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 690 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 691 preempt_enable();
704 return -EINVAL; 692 return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,14 +9,13 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
16#include <trace/events/sched.h> 17#include <trace/events/sched.h>
17 18
18#define KTHREAD_NICE_LEVEL (-5)
19
20static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -26,7 +25,6 @@ struct kthread_create_info
26 /* Information passed to kthread() from kthreadd. */ 25 /* Information passed to kthread() from kthreadd. */
27 int (*threadfn)(void *data); 26 int (*threadfn)(void *data);
28 void *data; 27 void *data;
29 struct completion started;
30 28
31 /* Result passed back to kthread_create() from kthreadd. */ 29 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 30 struct task_struct *result;
@@ -35,17 +33,13 @@ struct kthread_create_info
35 struct list_head list; 33 struct list_head list;
36}; 34};
37 35
38struct kthread_stop_info 36struct kthread {
39{ 37 int should_stop;
40 struct task_struct *k; 38 struct completion exited;
41 int err;
42 struct completion done;
43}; 39};
44 40
45/* Thread stopping is done by setthing this var: lock serializes 41#define to_kthread(tsk) \
46 * multiple kthread_stop calls. */ 42 container_of((tsk)->vfork_done, struct kthread, exited)
47static DEFINE_MUTEX(kthread_stop_lock);
48static struct kthread_stop_info kthread_stop_info;
49 43
50/** 44/**
51 * kthread_should_stop - should this kthread return now? 45 * kthread_should_stop - should this kthread return now?
@@ -56,36 +50,35 @@ static struct kthread_stop_info kthread_stop_info;
56 */ 50 */
57int kthread_should_stop(void) 51int kthread_should_stop(void)
58{ 52{
59 return (kthread_stop_info.k == current); 53 return to_kthread(current)->should_stop;
60} 54}
61EXPORT_SYMBOL(kthread_should_stop); 55EXPORT_SYMBOL(kthread_should_stop);
62 56
63static int kthread(void *_create) 57static int kthread(void *_create)
64{ 58{
59 /* Copy data: it's on kthread's stack */
65 struct kthread_create_info *create = _create; 60 struct kthread_create_info *create = _create;
66 int (*threadfn)(void *data); 61 int (*threadfn)(void *data) = create->threadfn;
67 void *data; 62 void *data = create->data;
68 int ret = -EINTR; 63 struct kthread self;
64 int ret;
69 65
70 /* Copy data: it's on kthread's stack */ 66 self.should_stop = 0;
71 threadfn = create->threadfn; 67 init_completion(&self.exited);
72 data = create->data; 68 current->vfork_done = &self.exited;
73 69
74 /* OK, tell user we're spawned, wait for stop or wakeup */ 70 /* OK, tell user we're spawned, wait for stop or wakeup */
75 __set_current_state(TASK_UNINTERRUPTIBLE); 71 __set_current_state(TASK_UNINTERRUPTIBLE);
76 create->result = current; 72 create->result = current;
77 complete(&create->started); 73 complete(&create->done);
78 schedule(); 74 schedule();
79 75
80 if (!kthread_should_stop()) 76 ret = -EINTR;
77 if (!self.should_stop)
81 ret = threadfn(data); 78 ret = threadfn(data);
82 79
83 /* It might have exited on its own, w/o kthread_stop. Check. */ 80 /* we can't just return, we must preserve "self" on stack */
84 if (kthread_should_stop()) { 81 do_exit(ret);
85 kthread_stop_info.err = ret;
86 complete(&kthread_stop_info.done);
87 }
88 return 0;
89} 82}
90 83
91static void create_kthread(struct kthread_create_info *create) 84static void create_kthread(struct kthread_create_info *create)
@@ -94,11 +87,10 @@ static void create_kthread(struct kthread_create_info *create)
94 87
95 /* We want our own signal handler (we take no signals by default). */ 88 /* We want our own signal handler (we take no signals by default). */
96 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 89 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
97 if (pid < 0) 90 if (pid < 0) {
98 create->result = ERR_PTR(pid); 91 create->result = ERR_PTR(pid);
99 else 92 complete(&create->done);
100 wait_for_completion(&create->started); 93 }
101 complete(&create->done);
102} 94}
103 95
104/** 96/**
@@ -129,7 +121,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
129 121
130 create.threadfn = threadfn; 122 create.threadfn = threadfn;
131 create.data = data; 123 create.data = data;
132 init_completion(&create.started);
133 init_completion(&create.done); 124 init_completion(&create.done);
134 125
135 spin_lock(&kthread_create_lock); 126 spin_lock(&kthread_create_lock);
@@ -152,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
152 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
153 */ 144 */
154 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
155 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
156 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
157 } 147 }
158 return create.result; 148 return create.result;
@@ -187,40 +177,34 @@ EXPORT_SYMBOL(kthread_bind);
187 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
188 * 178 *
189 * Sets kthread_should_stop() for @k to return true, wakes it, and 179 * Sets kthread_should_stop() for @k to return true, wakes it, and
190 * waits for it to exit. Your threadfn() must not call do_exit() 180 * waits for it to exit. This can also be called after kthread_create()
191 * itself if you use this function! This can also be called after 181 * instead of calling wake_up_process(): the thread will exit without
192 * kthread_create() instead of calling wake_up_process(): the thread 182 * calling threadfn().
193 * will exit without calling threadfn(). 183 *
184 * If threadfn() may call do_exit() itself, the caller must ensure
185 * task_struct can't go away.
194 * 186 *
195 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 187 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
196 * was never called. 188 * was never called.
197 */ 189 */
198int kthread_stop(struct task_struct *k) 190int kthread_stop(struct task_struct *k)
199{ 191{
192 struct kthread *kthread;
200 int ret; 193 int ret;
201 194
202 mutex_lock(&kthread_stop_lock);
203
204 /* It could exit after stop_info.k set, but before wake_up_process. */
205 get_task_struct(k);
206
207 trace_sched_kthread_stop(k); 195 trace_sched_kthread_stop(k);
196 get_task_struct(k);
208 197
209 /* Must init completion *before* thread sees kthread_stop_info.k */ 198 kthread = to_kthread(k);
210 init_completion(&kthread_stop_info.done); 199 barrier(); /* it might have exited */
211 smp_wmb(); 200 if (k->vfork_done != NULL) {
201 kthread->should_stop = 1;
202 wake_up_process(k);
203 wait_for_completion(&kthread->exited);
204 }
205 ret = k->exit_code;
212 206
213 /* Now set kthread_should_stop() to true, and wake it up. */
214 kthread_stop_info.k = k;
215 wake_up_process(k);
216 put_task_struct(k); 207 put_task_struct(k);
217
218 /* Once it dies, reset stop ptr, gather result and we're done. */
219 wait_for_completion(&kthread_stop_info.done);
220 kthread_stop_info.k = NULL;
221 ret = kthread_stop_info.err;
222 mutex_unlock(&kthread_stop_lock);
223
224 trace_sched_kthread_stop_ret(ret); 208 trace_sched_kthread_stop_ret(ret);
225 209
226 return ret; 210 return ret;
@@ -234,8 +218,8 @@ int kthreadd(void *unused)
234 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
235 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
236 ignore_signals(tsk); 220 ignore_signals(tsk);
237 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
238 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map);
239 223
240 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
241 225
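
The kthread.c hunks above replace the global kthread_stop_info/kthread_stop_lock pair with a per-thread struct kthread that lives on the kthread's own stack: its "exited" completion is published through current->vfork_done, kthread_stop() sets should_stop, wakes the task, waits on that completion and reads the result back from k->exit_code. From a caller's point of view the API is unchanged; below is a minimal caller-side sketch under that assumption (the "poller" worker, its 100 ms sleep and the pr_info message are illustrative, not part of the patch):

#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *poller;

static int poller_fn(void *data)
{
	/* Loop until kthread_stop() flips should_stop for this thread. */
	while (!kthread_should_stop()) {
		/* ... do one unit of work on 'data' ... */
		msleep_interruptible(100);
	}
	/* This return value is what kthread_stop() reports via exit_code. */
	return 0;
}

static int poller_start(void *cookie)
{
	poller = kthread_create(poller_fn, cookie, "poller");
	if (IS_ERR(poller))
		return PTR_ERR(poller);
	wake_up_process(poller);
	return 0;
}

static void poller_stop(void)
{
	/* Blocks on the on-stack 'exited' completion published via vfork_done. */
	int ret = kthread_stop(poller);

	pr_info("poller exited with %d\n", ret);
}

Note the updated kerneldoc above: if threadfn() can call do_exit() on its own, the caller now has to make sure the task_struct cannot go away underneath kthread_stop().
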
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..f74d2d7aa605 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
 371 * Some daft arches put -1 at the end to indicate it's a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
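
The save_trace() hunk above trims the trailing ULONG_MAX entry that some architectures append to mark a complete trace, and relaxes the pool-exhaustion check to MAX_STACK_TRACE_ENTRIES-1 accordingly. A tiny sketch of the trimming step (the extra "nr" guard here is defensive and not in the hunk itself):

#include <limits.h>

/* Drop a trailing ULONG_MAX terminator, as save_trace() now does. */
static unsigned int trim_sentinel(unsigned long *entries, unsigned int nr)
{
	if (nr && entries[nr - 1] == ULONG_MAX)
		nr--;
	return nr;
}
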
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 840}
899 841
900/* 842/*
 843 * For good efficiency of the modulo operation, we use a power of 2
844 */
845#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
846#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
847
848/*
 849 * The circular_queue and helpers are used to implement the
 850 * breadth-first search (BFS) algorithm, by which we can build
 851 * the shortest path from the next lock to be acquired to a
 852 * previously held lock, if there is a circular dependency between them.
853 */
854struct circular_queue {
855 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
856 unsigned int front, rear;
857};
858
859static struct circular_queue lock_cq;
860
861unsigned int max_bfs_queue_depth;
862
863static unsigned int lockdep_dependency_gen_id;
864
865static inline void __cq_init(struct circular_queue *cq)
866{
867 cq->front = cq->rear = 0;
868 lockdep_dependency_gen_id++;
869}
870
871static inline int __cq_empty(struct circular_queue *cq)
872{
873 return (cq->front == cq->rear);
874}
875
876static inline int __cq_full(struct circular_queue *cq)
877{
878 return ((cq->rear + 1) & CQ_MASK) == cq->front;
879}
880
881static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
882{
883 if (__cq_full(cq))
884 return -1;
885
886 cq->element[cq->rear] = elem;
887 cq->rear = (cq->rear + 1) & CQ_MASK;
888 return 0;
889}
890
891static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
892{
893 if (__cq_empty(cq))
894 return -1;
895
896 *elem = cq->element[cq->front];
897 cq->front = (cq->front + 1) & CQ_MASK;
898 return 0;
899}
900
901static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
902{
903 return (cq->rear - cq->front) & CQ_MASK;
904}
905
906static inline void mark_lock_accessed(struct lock_list *lock,
907 struct lock_list *parent)
908{
909 unsigned long nr;
910
911 nr = lock - list_entries;
912 WARN_ON(nr >= nr_list_entries);
913 lock->parent = parent;
914 lock->class->dep_gen_id = lockdep_dependency_gen_id;
915}
916
917static inline unsigned long lock_accessed(struct lock_list *lock)
918{
919 unsigned long nr;
920
921 nr = lock - list_entries;
922 WARN_ON(nr >= nr_list_entries);
923 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
924}
925
926static inline struct lock_list *get_lock_parent(struct lock_list *child)
927{
928 return child->parent;
929}
930
931static inline int get_lock_depth(struct lock_list *child)
932{
933 int depth = 0;
934 struct lock_list *parent;
935
936 while ((parent = get_lock_parent(child))) {
937 child = parent;
938 depth++;
939 }
940 return depth;
941}
942
943static int __bfs(struct lock_list *source_entry,
944 void *data,
945 int (*match)(struct lock_list *entry, void *data),
946 struct lock_list **target_entry,
947 int forward)
948{
949 struct lock_list *entry;
950 struct list_head *head;
951 struct circular_queue *cq = &lock_cq;
952 int ret = 1;
953
954 if (match(source_entry, data)) {
955 *target_entry = source_entry;
956 ret = 0;
957 goto exit;
958 }
959
960 if (forward)
961 head = &source_entry->class->locks_after;
962 else
963 head = &source_entry->class->locks_before;
964
965 if (list_empty(head))
966 goto exit;
967
968 __cq_init(cq);
969 __cq_enqueue(cq, (unsigned long)source_entry);
970
971 while (!__cq_empty(cq)) {
972 struct lock_list *lock;
973
974 __cq_dequeue(cq, (unsigned long *)&lock);
975
976 if (!lock->class) {
977 ret = -2;
978 goto exit;
979 }
980
981 if (forward)
982 head = &lock->class->locks_after;
983 else
984 head = &lock->class->locks_before;
985
986 list_for_each_entry(entry, head, entry) {
987 if (!lock_accessed(entry)) {
988 unsigned int cq_depth;
989 mark_lock_accessed(entry, lock);
990 if (match(entry, data)) {
991 *target_entry = entry;
992 ret = 0;
993 goto exit;
994 }
995
996 if (__cq_enqueue(cq, (unsigned long)entry)) {
997 ret = -1;
998 goto exit;
999 }
1000 cq_depth = __cq_get_elem_count(cq);
1001 if (max_bfs_queue_depth < cq_depth)
1002 max_bfs_queue_depth = cq_depth;
1003 }
1004 }
1005 }
1006exit:
1007 return ret;
1008}
1009
1010static inline int __bfs_forwards(struct lock_list *src_entry,
1011 void *data,
1012 int (*match)(struct lock_list *entry, void *data),
1013 struct lock_list **target_entry)
1014{
1015 return __bfs(src_entry, data, match, target_entry, 1);
1016
1017}
1018
1019static inline int __bfs_backwards(struct lock_list *src_entry,
1020 void *data,
1021 int (*match)(struct lock_list *entry, void *data),
1022 struct lock_list **target_entry)
1023{
1024 return __bfs(src_entry, data, match, target_entry, 0);
1025
1026}
1027
1028/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1029 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1030 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1031 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1032 */
909static struct held_lock *check_source, *check_target;
910 1033
911/* 1034/*
912 * Print a dependency chain entry (this is only done when a deadlock 1035 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1036 * has been detected):
914 */ 1037 */
915static noinline int 1038static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1039print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1040{
918 if (debug_locks_silent) 1041 if (debug_locks_silent)
919 return 0; 1042 return 0;
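
The large hunk above is the heart of the rework: the recursive graph walks are replaced by an iterative breadth-first search. Pending nodes sit in a fixed, power-of-two ring buffer (lock_cq), visited classes are stamped with the current lockdep_dependency_gen_id instead of being tracked with per-call state, and every visited lock_list records its parent so the shortest offending path can be rebuilt once a match is found. A self-contained user-space sketch of the same shape (node names, array sizes and the three-node example graph are made up; error handling is reduced to the queue-overflow case):

#include <stdio.h>

#define QSZ   16u                 /* power of two, like MAX_CIRCULAR_QUEUE_SIZE */
#define QMASK (QSZ - 1)

struct node {
	const char *name;
	struct node *out[4];      /* forward edges ("locks_after") */
	int nr_out;
	struct node *parent;      /* BFS tree parent, for path reconstruction */
	unsigned int gen;         /* visited stamp, like dep_gen_id */
};

static struct node *queue[QSZ];
static unsigned int front, rear, cur_gen;

static int cq_enqueue(struct node *n)
{
	if (((rear + 1) & QMASK) == front)
		return -1;                 /* full */
	queue[rear] = n;
	rear = (rear + 1) & QMASK;
	return 0;
}

static struct node *cq_dequeue(void)
{
	struct node *n;

	if (front == rear)
		return NULL;               /* empty */
	n = queue[front];
	front = (front + 1) & QMASK;
	return n;
}

/* Find @target reachable from @src; returns the target with parents filled in. */
static struct node *bfs(struct node *src, struct node *target)
{
	struct node *n;
	int i;

	front = rear = 0;
	cur_gen++;                         /* new generation == everything unvisited */
	src->parent = NULL;
	src->gen = cur_gen;
	cq_enqueue(src);

	while ((n = cq_dequeue())) {
		if (n == target)
			return n;
		for (i = 0; i < n->nr_out; i++) {
			struct node *next = n->out[i];

			if (next->gen == cur_gen)
				continue;          /* already accessed in this search */
			next->gen = cur_gen;
			next->parent = n;      /* mark_lock_accessed() analogue */
			if (cq_enqueue(next))
				return NULL;       /* queue overflow: give up, like ret = -1 */
		}
	}
	return NULL;
}

int main(void)
{
	struct node a = { .name = "A" }, b = { .name = "B" }, c = { .name = "C" };
	struct node *hit;

	a.out[a.nr_out++] = &b;
	b.out[b.nr_out++] = &c;
	c.out[c.nr_out++] = &a;            /* A -> B -> C -> A: a cycle */

	hit = bfs(&b, &a);                 /* can we reach A from B? */
	for (; hit; hit = hit->parent)     /* walk parents, like print_circular_bug() */
		printf("%s%s", hit->name, hit->parent ? " <- " : "\n");
	return 0;
}

The generation-counter trick is what lets __cq_init() "clear" the visited marks in O(1): bumping lockdep_dependency_gen_id invalidates every earlier stamp without touching the classes themselves.
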
@@ -930,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1053 * header first:
931 */ 1054 */
932static noinline int 1055static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1056print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1057 struct held_lock *check_src,
1058 struct held_lock *check_tgt)
934{ 1059{
935 struct task_struct *curr = current; 1060 struct task_struct *curr = current;
936 1061
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1062 if (debug_locks_silent)
938 return 0; 1063 return 0;
939 1064
940 printk("\n=======================================================\n"); 1065 printk("\n=======================================================\n");
@@ -943,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1068 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1069 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1070 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1071 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1072 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1073 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1074 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1075 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1076
@@ -954,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1079 return 0;
955} 1080}
956 1081
957static noinline int print_circular_bug_tail(void) 1082static inline int class_equal(struct lock_list *entry, void *data)
1083{
1084 return entry->class == data;
1085}
1086
1087static noinline int print_circular_bug(struct lock_list *this,
1088 struct lock_list *target,
1089 struct held_lock *check_src,
1090 struct held_lock *check_tgt)
958{ 1091{
959 struct task_struct *curr = current; 1092 struct task_struct *curr = current;
960 struct lock_list this; 1093 struct lock_list *parent;
1094 int depth;
961 1095
962 if (debug_locks_silent) 1096 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1097 return 0;
964 1098
965 this.class = hlock_class(check_source); 1099 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1100 return 0;
968 1101
969 print_circular_bug_entry(&this, 0); 1102 depth = get_lock_depth(target);
1103
1104 print_circular_bug_header(target, depth, check_src, check_tgt);
1105
1106 parent = get_lock_parent(target);
1107
1108 while (parent) {
1109 print_circular_bug_entry(parent, --depth);
1110 parent = get_lock_parent(parent);
1111 }
970 1112
971 printk("\nother info that might help us debug this:\n\n"); 1113 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1114 lockdep_print_held_locks(curr);
@@ -977,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1119 return 0;
978} 1120}
979 1121
980#define RECURSION_LIMIT 40 1122static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1123{
984 if (!debug_locks_off_graph_unlock()) 1124 if (!debug_locks_off_graph_unlock())
985 return 0; 1125 return 0;
986 1126
987 WARN_ON(1); 1127 WARN(1, "lockdep bfs error:%d\n", ret);
988 1128
989 return 0; 1129 return 0;
990} 1130}
991 1131
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1132static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1133{
995 struct lock_list *entry; 1134 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1135 return 0;
1136}
997 1137
998 if (lockdep_dependency_visit(class, depth)) 1138unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1139{
1140 unsigned long count = 0;
1141 struct lock_list *uninitialized_var(target_entry);
1000 1142
1001 /* 1143 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1144
1007 return ret; 1145 return count;
1008} 1146}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1147unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1148{
1012 unsigned long ret, flags; 1149 unsigned long ret, flags;
1150 struct lock_list this;
1151
1152 this.parent = NULL;
1153 this.class = class;
1013 1154
1014 local_irq_save(flags); 1155 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1156 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1157 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1158 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1159 local_irq_restore(flags);
1019 1160
1020 return ret; 1161 return ret;
1021} 1162}
1022 1163
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1164unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1165{
1026 struct lock_list *entry; 1166 unsigned long count = 0;
1027 unsigned long ret = 1; 1167 struct lock_list *uninitialized_var(target_entry);
1028 1168
1029 if (lockdep_dependency_visit(class, depth)) 1169 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1170
1037 return ret; 1171 return count;
1038} 1172}
1039 1173
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1174unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1175{
1042 unsigned long ret, flags; 1176 unsigned long ret, flags;
1177 struct lock_list this;
1178
1179 this.parent = NULL;
1180 this.class = class;
1043 1181
1044 local_irq_save(flags); 1182 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1183 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1184 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1185 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1186 local_irq_restore(flags);
1049 1187
@@ -1055,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1193 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1194 */
1057static noinline int 1195static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1196check_noncircular(struct lock_list *root, struct lock_class *target,
1197 struct lock_list **target_entry)
1059{ 1198{
1060 struct lock_list *entry; 1199 int result;
1061 1200
1062 if (lockdep_dependency_visit(source, depth)) 1201 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1202
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1203 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1204
1067 max_recursion_depth = depth; 1205 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1206}
1082 1207
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1208#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1211 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1212 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1213 */
1089static enum lock_usage_bit find_usage_bit; 1214
1090static struct lock_class *forwards_match, *backwards_match; 1215static inline int usage_match(struct lock_list *entry, void *bit)
1216{
1217 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1218}
1219
1220
1091 1221
1092/* 1222/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1223 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1224 * at @root->class that matches @bit.
1095 * 1225 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1226 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1227 * into *@target_entry.
1098 * 1228 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1229 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1230 * Return <0 on error.
1101 */ 1231 */
1102static noinline int 1232static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1233find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1234 struct lock_list **target_entry)
1104{ 1235{
1105 struct lock_list *entry; 1236 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1237
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1238 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1239
1122 /* 1240 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1241
1124 */ 1242 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1243}
1133 1244
1134/* 1245/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1246 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1247 * at @root->class that matches @bit.
1137 * 1248 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1249 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1250 * into *@target_entry.
1140 * 1251 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1252 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1253 * Return <0 on error.
1143 */ 1254 */
1144static noinline int 1255static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1256find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1257 struct lock_list **target_entry)
1146{ 1258{
1147 struct lock_list *entry; 1259 int result;
1148 int ret;
1149 1260
1150 if (lockdep_dependency_visit(source, depth)) 1261 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1262
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1263 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1264
1156 if (depth > max_recursion_depth) 1265 return result;
1157 max_recursion_depth = depth; 1266}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1267
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1268static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1269{
1163 backwards_match = source; 1270 int bit;
1164 return 2;
1165 }
1166 1271
1167 if (!source && debug_locks_off_graph_unlock()) { 1272 printk("%*s->", depth, "");
1168 WARN_ON(1); 1273 print_lock_name(class);
1169 return 0; 1274 printk(" ops: %lu", class->ops);
1170 } 1275 printk(" {\n");
1171 1276
1172 /* 1277 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1278 if (class->usage_mask & (1 << bit)) {
1174 */ 1279 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1280
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1281 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1282 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1283 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1284 }
1180 } 1285 }
1181 return 1; 1286 printk("%*s }\n", depth, "");
1287
1288 printk("%*s ... key at: ",depth,"");
1289 print_ip_sym((unsigned long)class->key);
1290}
1291
1292/*
 1293 * printk the shortest lock dependencies from @leaf back to @root in reverse order:
1294 */
1295static void __used
1296print_shortest_lock_dependencies(struct lock_list *leaf,
1297 struct lock_list *root)
1298{
1299 struct lock_list *entry = leaf;
1300 int depth;
1301
 1302	/* compute depth from the tree generated by BFS */
1303 depth = get_lock_depth(leaf);
1304
1305 do {
1306 print_lock_class_header(entry->class, depth);
1307 printk("%*s ... acquired at:\n", depth, "");
1308 print_stack_trace(&entry->trace, 2);
1309 printk("\n");
1310
1311 if (depth == 0 && (entry != root)) {
1312 printk("lockdep:%s bad BFS generated tree\n", __func__);
1313 break;
1314 }
1315
1316 entry = get_lock_parent(entry);
1317 depth--;
1318 } while (entry && (depth >= 0));
1319
1320 return;
1182} 1321}
1183 1322
1184static int 1323static int
1185print_bad_irq_dependency(struct task_struct *curr, 1324print_bad_irq_dependency(struct task_struct *curr,
1325 struct lock_list *prev_root,
1326 struct lock_list *next_root,
1327 struct lock_list *backwards_entry,
1328 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1329 struct held_lock *prev,
1187 struct held_lock *next, 1330 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1331 enum lock_usage_bit bit1,
@@ -1215,26 +1358,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1358
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1359 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1360 irqclass);
1218 print_lock_name(backwards_match); 1361 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1362 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1363
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1364 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1365
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1366 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1367 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1368 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1369 printk("...");
1227 1370
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1371 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1372
1230 printk("\nother info that might help us debug this:\n\n"); 1373 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1374 lockdep_print_held_locks(curr);
1232 1375
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1376 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1377 printk(" and the holding lock:\n");
1378 if (!save_trace(&prev_root->trace))
1379 return 0;
1380 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1381
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1382 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1383 printk(" and %s-irq-unsafe lock:\n", irqclass);
1384 if (!save_trace(&next_root->trace))
1385 return 0;
1386 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1387
1239 printk("\nstack backtrace:\n"); 1388 printk("\nstack backtrace:\n");
1240 dump_stack(); 1389 dump_stack();
@@ -1248,19 +1397,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1397 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1398{
1250 int ret; 1399 int ret;
1400 struct lock_list this, that;
1401 struct lock_list *uninitialized_var(target_entry);
1402 struct lock_list *uninitialized_var(target_entry1);
1251 1403
1252 find_usage_bit = bit_backwards; 1404 this.parent = NULL;
1253 /* fills in <backwards_match> */ 1405
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1406 this.class = hlock_class(prev);
1255 if (!ret || ret == 1) 1407 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1408 if (ret < 0)
1409 return print_bfs_bug(ret);
1410 if (ret == 1)
1256 return ret; 1411 return ret;
1257 1412
1258 find_usage_bit = bit_forwards; 1413 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1414 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1415 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1416 if (ret < 0)
1417 return print_bfs_bug(ret);
1418 if (ret == 1)
1261 return ret; 1419 return ret;
1262 /* ret == 2 */ 1420
1263 return print_bad_irq_dependency(curr, prev, next, 1421 return print_bad_irq_dependency(curr, &this, &that,
1422 target_entry, target_entry1,
1423 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1424 bit_backwards, bit_forwards, irqclass);
1265} 1425}
1266 1426
@@ -1472,6 +1632,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1632{
1473 struct lock_list *entry; 1633 struct lock_list *entry;
1474 int ret; 1634 int ret;
1635 struct lock_list this;
1636 struct lock_list *uninitialized_var(target_entry);
1475 1637
1476 /* 1638 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1639 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1644,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1644 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1645 * keep the stackframe size of the recursive functions low:
1484 */ 1646 */
1485 check_source = next; 1647 this.class = hlock_class(next);
1486 check_target = prev; 1648 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1649 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1650 if (unlikely(!ret))
1651 return print_circular_bug(&this, target_entry, next, prev);
1652 else if (unlikely(ret < 0))
1653 return print_bfs_bug(ret);
1489 1654
1490 if (!check_prev_add_irq(curr, prev, next)) 1655 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1656 return 0;
@@ -1884,7 +2049,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2049 * print irq inversion bug:
1885 */ 2050 */
1886static int 2051static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2052print_irq_inversion_bug(struct task_struct *curr,
2053 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2054 struct held_lock *this, int forwards,
1889 const char *irqclass) 2055 const char *irqclass)
1890{ 2056{
@@ -1902,17 +2068,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2068 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2069 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2070 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2071 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2072 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2073
1908 printk("\nother info that might help us debug this:\n"); 2074 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2075 lockdep_print_held_locks(curr);
1910 2076
1911 printk("\nthe first lock's dependencies:\n"); 2077 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2078 if (!save_trace(&root->trace))
1913 2079 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2080 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2081
1917 printk("\nstack backtrace:\n"); 2082 printk("\nstack backtrace:\n");
1918 dump_stack(); 2083 dump_stack();
@@ -1929,14 +2094,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2094 enum lock_usage_bit bit, const char *irqclass)
1930{ 2095{
1931 int ret; 2096 int ret;
1932 2097 struct lock_list root;
1933 find_usage_bit = bit; 2098 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2099
1935 ret = find_usage_forwards(hlock_class(this), 0); 2100 root.parent = NULL;
1936 if (!ret || ret == 1) 2101 root.class = hlock_class(this);
2102 ret = find_usage_forwards(&root, bit, &target_entry);
2103 if (ret < 0)
2104 return print_bfs_bug(ret);
2105 if (ret == 1)
1937 return ret; 2106 return ret;
1938 2107
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2108 return print_irq_inversion_bug(curr, &root, target_entry,
2109 this, 1, irqclass);
1940} 2110}
1941 2111
1942/* 2112/*
@@ -1948,14 +2118,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2118 enum lock_usage_bit bit, const char *irqclass)
1949{ 2119{
1950 int ret; 2120 int ret;
1951 2121 struct lock_list root;
1952 find_usage_bit = bit; 2122 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2123
1954 ret = find_usage_backwards(hlock_class(this), 0); 2124 root.parent = NULL;
1955 if (!ret || ret == 1) 2125 root.class = hlock_class(this);
2126 ret = find_usage_backwards(&root, bit, &target_entry);
2127 if (ret < 0)
2128 return print_bfs_bug(ret);
2129 if (ret == 1)
1956 return ret; 2130 return ret;
1957 2131
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2132 return print_irq_inversion_bug(curr, &root, target_entry,
 2133 this, 0, irqclass);
1959} 2134}
1960 2135
1961void print_irqtrace_events(struct task_struct *curr) 2136void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2705,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2705 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2706static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2707 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2708 struct lockdep_map *nest_lock, unsigned long ip,
2709 int references)
2534{ 2710{
2535 struct task_struct *curr = current; 2711 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2712 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2713 struct held_lock *hlock;
2538 unsigned int depth, id; 2714 unsigned int depth, id;
2539 int chain_head = 0; 2715 int chain_head = 0;
2716 int class_idx;
2540 u64 chain_key; 2717 u64 chain_key;
2541 2718
2542 if (!prove_locking) 2719 if (!prove_locking)
@@ -2584,10 +2761,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2761 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2762 return 0;
2586 2763
2764 class_idx = class - lock_classes + 1;
2765
2766 if (depth) {
2767 hlock = curr->held_locks + depth - 1;
2768 if (hlock->class_idx == class_idx && nest_lock) {
2769 if (hlock->references)
2770 hlock->references++;
2771 else
2772 hlock->references = 2;
2773
2774 return 1;
2775 }
2776 }
2777
2587 hlock = curr->held_locks + depth; 2778 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2779 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2780 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2781 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2782 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2783 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2784 hlock->nest_lock = nest_lock;
@@ -2595,6 +2786,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2786 hlock->read = read;
2596 hlock->check = check; 2787 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2788 hlock->hardirqs_off = !!hardirqs_off;
2789 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2790#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2791 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2792 hlock->holdtime_stamp = sched_clock();
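
The __lock_acquire() hunks above add the "references" mechanism: when the held lock on top of the stack is of the same class and a nest_lock was supplied, the existing held_lock entry just has its reference count bumped (starting at 2 on the first fold) instead of a new entry being pushed, so deep same-class nesting no longer runs into MAX_LOCK_DEPTH. The matching decrement lives in the lock_release_non_nested() hunk further down. A reduced, user-space model of the bookkeeping (the types and the class index 7 are illustrative):

#include <stdio.h>

#define MAX_DEPTH 48

struct held {
	int class_idx;
	int references;     /* 0: plain entry, >=2 then counting down: folded re-acquisitions */
};

static struct held stack[MAX_DEPTH];
static int depth;

/* Acquire: fold a repeated class into the top entry when nesting is declared. */
static int acquire(int class_idx, int nested)
{
	if (depth && nested && stack[depth - 1].class_idx == class_idx) {
		struct held *top = &stack[depth - 1];

		top->references = top->references ? top->references + 1 : 2;
		return 0;
	}
	if (depth >= MAX_DEPTH)
		return -1;
	stack[depth].class_idx = class_idx;
	stack[depth].references = 0;
	depth++;
	return 0;
}

/* Release: drop a reference first; only pop once the last one is gone. */
static void release(int class_idx)
{
	struct held *top;

	if (!depth)
		return;
	top = &stack[depth - 1];
	if (top->class_idx == class_idx && top->references) {
		if (--top->references)
			return;          /* still referenced: stack stays valid */
	}
	depth--;                         /* real pop (stack re-validation elided) */
}

int main(void)
{
	acquire(7, 0);                   /* first instance of class 7 */
	acquire(7, 1);                   /* nested: folded, references = 2 */
	acquire(7, 1);                   /* folded again, references = 3 */
	release(7);                      /* references = 2, nothing popped */
	release(7);                      /* references = 1, nothing popped */
	printf("depth=%d refs=%d\n", depth, stack[0].references);
	return 0;
}
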
@@ -2703,6 +2895,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2895 return 1;
2704} 2896}
2705 2897
2898static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2899{
2900 if (hlock->instance == lock)
2901 return 1;
2902
2903 if (hlock->references) {
2904 struct lock_class *class = lock->class_cache;
2905
2906 if (!class)
2907 class = look_up_lock_class(lock, 0);
2908
2909 if (DEBUG_LOCKS_WARN_ON(!class))
2910 return 0;
2911
2912 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2913 return 0;
2914
2915 if (hlock->class_idx == class - lock_classes + 1)
2916 return 1;
2917 }
2918
2919 return 0;
2920}
2921
2706static int 2922static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2923__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2924 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2942,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2942 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2943 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2944 break;
2729 if (hlock->instance == lock) 2945 if (match_held_lock(hlock, lock))
2730 goto found_it; 2946 goto found_it;
2731 prev_hlock = hlock; 2947 prev_hlock = hlock;
2732 } 2948 }
@@ -2745,7 +2961,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2961 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2962 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2963 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2964 hlock->nest_lock, hlock->acquire_ip,
2965 hlock->references))
2749 return 0; 2966 return 0;
2750 } 2967 }
2751 2968
@@ -2784,20 +3001,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3001 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3002 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3003 break;
2787 if (hlock->instance == lock) 3004 if (match_held_lock(hlock, lock))
2788 goto found_it; 3005 goto found_it;
2789 prev_hlock = hlock; 3006 prev_hlock = hlock;
2790 } 3007 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3008 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3009
2793found_it: 3010found_it:
2794 lock_release_holdtime(hlock); 3011 if (hlock->instance == lock)
3012 lock_release_holdtime(hlock);
3013
3014 if (hlock->references) {
3015 hlock->references--;
3016 if (hlock->references) {
3017 /*
3018 * We had, and after removing one, still have
3019 * references, the current lock stack is still
3020 * valid. We're done!
3021 */
3022 return 1;
3023 }
3024 }
2795 3025
2796 /* 3026 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3027 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3028 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3029 * entries (if any), recalculating the hash along the way:
2800 */ 3030 */
3031
2801 curr->lockdep_depth = i; 3032 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3033 curr->curr_chain_key = hlock->prev_chain_key;
2803 3034
@@ -2806,7 +3037,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3037 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3038 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3039 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3040 hlock->nest_lock, hlock->acquire_ip,
3041 hlock->references))
2810 return 0; 3042 return 0;
2811 } 3043 }
2812 3044
@@ -2836,7 +3068,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3068 /*
2837 * Is the unlock non-nested: 3069 * Is the unlock non-nested:
2838 */ 3070 */
2839 if (hlock->instance != lock) 3071 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3072 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3073 curr->lockdep_depth--;
2842 3074
@@ -2881,6 +3113,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3113 check_chain_key(curr);
2882} 3114}
2883 3115
3116static int __lock_is_held(struct lockdep_map *lock)
3117{
3118 struct task_struct *curr = current;
3119 int i;
3120
3121 for (i = 0; i < curr->lockdep_depth; i++) {
3122 struct held_lock *hlock = curr->held_locks + i;
3123
3124 if (match_held_lock(hlock, lock))
3125 return 1;
3126 }
3127
3128 return 0;
3129}
3130
2884/* 3131/*
2885 * Check whether we follow the irq-flags state precisely: 3132 * Check whether we follow the irq-flags state precisely:
2886 */ 3133 */
@@ -2957,7 +3204,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3204
2958 current->lockdep_recursion = 1; 3205 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3206 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3207 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3208 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3209 raw_local_irq_restore(flags);
2963} 3210}
@@ -2982,6 +3229,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3229}
2983EXPORT_SYMBOL_GPL(lock_release); 3230EXPORT_SYMBOL_GPL(lock_release);
2984 3231
3232int lock_is_held(struct lockdep_map *lock)
3233{
3234 unsigned long flags;
3235 int ret = 0;
3236
3237 if (unlikely(current->lockdep_recursion))
3238 return ret;
3239
3240 raw_local_irq_save(flags);
3241 check_flags(flags);
3242
3243 current->lockdep_recursion = 1;
3244 ret = __lock_is_held(lock);
3245 current->lockdep_recursion = 0;
3246 raw_local_irq_restore(flags);
3247
3248 return ret;
3249}
3250EXPORT_SYMBOL_GPL(lock_is_held);
3251
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3252void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3253{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3254 current->lockdep_reclaim_gfp = gfp_mask;
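
lock_is_held() above exposes the held-lock scan (via match_held_lock(), so folded references are honoured too) to the rest of the kernel. A hedged usage sketch: the my_dev structure is invented, and it assumes lock_is_held() is declared in <linux/lockdep.h> and that dep_map is present because CONFIG_PROVE_LOCKING selects CONFIG_DEBUG_LOCK_ALLOC.

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/debug_locks.h>

struct my_dev {
	struct mutex lock;
	int state;
};

/* Must be called with dev->lock held. */
static void my_dev_update(struct my_dev *dev, int state)
{
#ifdef CONFIG_PROVE_LOCKING
	WARN_ON_ONCE(debug_locks && !lock_is_held(&dev->lock.dep_map));
#endif
	dev->state = state;
}
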
@@ -3041,7 +3308,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3308 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3309 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3310 break;
3044 if (hlock->instance == lock) 3311 if (match_held_lock(hlock, lock))
3045 goto found_it; 3312 goto found_it;
3046 prev_hlock = hlock; 3313 prev_hlock = hlock;
3047 } 3314 }
@@ -3049,6 +3316,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3316 return;
3050 3317
3051found_it: 3318found_it:
3319 if (hlock->instance != lock)
3320 return;
3321
3052 hlock->waittime_stamp = sched_clock(); 3322 hlock->waittime_stamp = sched_clock();
3053 3323
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3358,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3358 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3359 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3360 break;
3091 if (hlock->instance == lock) 3361 if (match_held_lock(hlock, lock))
3092 goto found_it; 3362 goto found_it;
3093 prev_hlock = hlock; 3363 prev_hlock = hlock;
3094 } 3364 }
@@ -3096,6 +3366,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3366 return;
3097 3367
3098found_it: 3368found_it:
3369 if (hlock->instance != lock)
3370 return;
3371
3099 cpu = smp_processor_id(); 3372 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3373 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3374 now = sched_clock();
@@ -3326,7 +3599,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3599 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3600 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3601 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3602 sizeof(struct list_head) * CHAINHASH_SIZE
3603#ifdef CONFIG_PROVE_LOCKING
3604 + sizeof(struct circular_queue)
3605#endif
3606 ) / 1024
3607 );
3330 3608
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3609 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3610 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..d4b3dbc79fdb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
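
The lockdep_proc.c hunks above drop the hand-rolled iterators in favour of the stock seq_file list helpers: seq_list_start_head() returns &all_lock_classes itself at position 0, which l_show() uses as the header token, and seq_list_next() does the stepping, so lockdep_open() no longer needs to stash anything in m->private. A minimal sketch of the same pattern for an invented my_list (struct my_item and the banner text are illustrative):

#include <linux/list.h>
#include <linux/seq_file.h>

struct my_item {
	struct list_head node;
	const char *name;
};

static LIST_HEAD(my_list);

static void *my_start(struct seq_file *m, loff_t *pos)
{
	/* Position 0 yields &my_list itself, which serves as the header token. */
	return seq_list_start_head(&my_list, *pos);
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &my_list, pos);
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
	struct my_item *item;

	if (v == &my_list) {
		seq_printf(m, "all items:\n");
		return 0;
	}
	item = list_entry(v, struct my_item, node);
	seq_printf(m, "%s\n", item->name);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start = my_start,
	.next  = my_next,
	.stop  = my_stop,
	.show  = my_show,
};
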
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
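
For the array-backed files the same simplification is done by hand: lc_start() returns SEQ_START_TOKEN at position 0 and lock_chains + (*pos - 1) afterwards, and lc_next() merely bumps the position and calls lc_start() again, so no per-open private iterator state is needed (ls_start()/ls_next() below get the identical treatment). A small sketch of that stateless, off-by-one indexing scheme (the my_values array and the banner are invented):

#include <linux/seq_file.h>

static int my_values[8];
static unsigned long nr_values = 8;    /* stands in for nr_lock_chains */

static void *val_start(struct seq_file *m, loff_t *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;            /* header row */
	if (*pos - 1 < nr_values)
		return my_values + (*pos - 1);     /* element i lives at position i + 1 */
	return NULL;
}

static void *val_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return val_start(m, pos);                  /* stateless stepping */
}

static void val_stop(struct seq_file *m, void *v)
{
}

static int val_show(struct seq_file *m, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(m, "values:\n");
	else
		seq_printf(m, "%d\n", *(int *)v);
	return 0;
}

static const struct seq_operations val_seq_ops = {
	.start = val_start,
	.next  = val_next,
	.stop  = val_stop,
	.show  = val_show,
};
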
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
@@ -758,7 +680,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 680 &proc_lockdep_stats_operations);
759 681
760#ifdef CONFIG_LOCK_STAT 682#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 683 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
684 &proc_lock_stat_operations);
762#endif 685#endif
763 686
764 return 0; 687 return 0;
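
The lockdep_proc.c changes above convert both /proc iterators to the stateless seq_file idiom: ->next just bumps *pos and re-runs ->start, so no cursor has to be parked in m->private or in struct lock_stat_seq. A minimal sketch of that idiom over a static array (hypothetical names, assuming <linux/seq_file.h> and <linux/kernel.h>; not code from the patch):

struct item {
	const char *name;
};

static struct item items[] = { { "first" }, { "second" } };

static void *arr_start(struct seq_file *m, loff_t *pos)
{
	if (*pos >= ARRAY_SIZE(items))
		return NULL;
	return &items[*pos];
}

static void *arr_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;			/* advance, then recompute from *pos */
	return arr_start(m, pos);
}

static void arr_stop(struct seq_file *m, void *v)
{
}

static int arr_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", ((struct item *)v)->name);
	return 0;
}

static const struct seq_operations arr_ops = {
	.start	= arr_start,
	.next	= arr_next,
	.stop	= arr_stop,
	.show	= arr_show,
};

The open() method then needs nothing beyond seq_open(file, &arr_ops), which is exactly the shape lockdep_chains_open() and ls_next() take after this patch (ls_start() additionally reserves *pos == 0 for a SEQ_START_TOKEN header row).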
diff --git a/kernel/module.c b/kernel/module.c
index 215aaab09e91..05ce49ced8f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,11 @@
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h> 56#include <linux/kmemleak.h>
57 57
58#define CREATE_TRACE_POINTS
59#include <trace/events/module.h>
60
61EXPORT_TRACEPOINT_SYMBOL(module_get);
62
58#if 0 63#if 0
59#define DEBUGP printk 64#define DEBUGP printk
60#else 65#else
@@ -364,7 +369,7 @@ EXPORT_SYMBOL_GPL(find_module);
364 369
365#ifdef CONFIG_SMP 370#ifdef CONFIG_SMP
366 371
367#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 372#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
368 373
369static void *percpu_modalloc(unsigned long size, unsigned long align, 374static void *percpu_modalloc(unsigned long size, unsigned long align,
370 const char *name) 375 const char *name)
@@ -389,7 +394,7 @@ static void percpu_modfree(void *freeme)
389 free_percpu(freeme); 394 free_percpu(freeme);
390} 395}
391 396
392#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 397#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
393 398
394/* Number of blocks used and allocated. */ 399/* Number of blocks used and allocated. */
395static unsigned int pcpu_num_used, pcpu_num_allocated; 400static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +540,7 @@ static int percpu_modinit(void)
535} 540}
536__initcall(percpu_modinit); 541__initcall(percpu_modinit);
537 542
538#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 543#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
539 544
540static unsigned int find_pcpusec(Elf_Ehdr *hdr, 545static unsigned int find_pcpusec(Elf_Ehdr *hdr,
541 Elf_Shdr *sechdrs, 546 Elf_Shdr *sechdrs,
@@ -909,16 +914,18 @@ void __symbol_put(const char *symbol)
909} 914}
910EXPORT_SYMBOL(__symbol_put); 915EXPORT_SYMBOL(__symbol_put);
911 916
917/* Note this assumes addr is a function, which it currently always is. */
912void symbol_put_addr(void *addr) 918void symbol_put_addr(void *addr)
913{ 919{
914 struct module *modaddr; 920 struct module *modaddr;
921 unsigned long a = (unsigned long)dereference_function_descriptor(addr);
915 922
916 if (core_kernel_text((unsigned long)addr)) 923 if (core_kernel_text(a))
917 return; 924 return;
918 925
919 /* module_text_address is safe here: we're supposed to have reference 926 /* module_text_address is safe here: we're supposed to have reference
920 * to module from symbol_get, so it can't go away. */ 927 * to module from symbol_get, so it can't go away. */
921 modaddr = __module_text_address((unsigned long)addr); 928 modaddr = __module_text_address(a);
922 BUG_ON(!modaddr); 929 BUG_ON(!modaddr);
923 module_put(modaddr); 930 module_put(modaddr);
924} 931}
@@ -940,6 +947,8 @@ void module_put(struct module *module)
940 if (module) { 947 if (module) {
941 unsigned int cpu = get_cpu(); 948 unsigned int cpu = get_cpu();
942 local_dec(__module_ref_addr(module, cpu)); 949 local_dec(__module_ref_addr(module, cpu));
950 trace_module_put(module, _RET_IP_,
951 local_read(__module_ref_addr(module, cpu)));
943 /* Maybe they're waiting for us to drop reference? */ 952 /* Maybe they're waiting for us to drop reference? */
944 if (unlikely(!module_is_live(module))) 953 if (unlikely(!module_is_live(module)))
945 wake_up_process(module->waiter); 954 wake_up_process(module->waiter);
@@ -1068,7 +1077,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1068{ 1077{
1069 const unsigned long *crc; 1078 const unsigned long *crc;
1070 1079
1071 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1080 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1081 &crc, true, false))
1072 BUG(); 1082 BUG();
1073 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1083 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1074} 1084}
@@ -1271,6 +1281,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1271 struct module_notes_attrs *notes_attrs; 1281 struct module_notes_attrs *notes_attrs;
1272 struct bin_attribute *nattr; 1282 struct bin_attribute *nattr;
1273 1283
1284 /* failed to create section attributes, so can't create notes */
1285 if (!mod->sect_attrs)
1286 return;
1287
1274 /* Count notes sections and allocate structures. */ 1288 /* Count notes sections and allocate structures. */
1275 notes = 0; 1289 notes = 0;
1276 for (i = 0; i < nsect; i++) 1290 for (i = 0; i < nsect; i++)
@@ -1490,6 +1504,8 @@ static int __unlink_module(void *_mod)
1490/* Free a module, remove from lists, etc (must hold module_mutex). */ 1504/* Free a module, remove from lists, etc (must hold module_mutex). */
1491static void free_module(struct module *mod) 1505static void free_module(struct module *mod)
1492{ 1506{
1507 trace_module_free(mod);
1508
1493 /* Delete from various lists */ 1509 /* Delete from various lists */
1494 stop_machine(__unlink_module, mod, NULL); 1510 stop_machine(__unlink_module, mod, NULL);
1495 remove_notes_attrs(mod); 1511 remove_notes_attrs(mod);
@@ -2216,6 +2232,10 @@ static noinline struct module *load_module(void __user *umod,
2216 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, 2232 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2217 "__kcrctab_unused_gpl"); 2233 "__kcrctab_unused_gpl");
2218#endif 2234#endif
2235#ifdef CONFIG_CONSTRUCTORS
2236 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2237 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif
2219 2239
2220#ifdef CONFIG_MARKERS 2240#ifdef CONFIG_MARKERS
2221 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", 2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2353,6 +2373,8 @@ static noinline struct module *load_module(void __user *umod,
2353 /* Get rid of temporary copy */ 2373 /* Get rid of temporary copy */
2354 vfree(hdr); 2374 vfree(hdr);
2355 2375
2376 trace_module_load(mod);
2377
2356 /* Done! */ 2378 /* Done! */
2357 return mod; 2379 return mod;
2358 2380
@@ -2389,6 +2411,17 @@ static noinline struct module *load_module(void __user *umod,
2389 goto free_hdr; 2411 goto free_hdr;
2390} 2412}
2391 2413
2414/* Call module constructors. */
2415static void do_mod_ctors(struct module *mod)
2416{
2417#ifdef CONFIG_CONSTRUCTORS
2418 unsigned long i;
2419
2420 for (i = 0; i < mod->num_ctors; i++)
2421 mod->ctors[i]();
2422#endif
2423}
2424
2392/* This is where the real work happens */ 2425/* This is where the real work happens */
2393SYSCALL_DEFINE3(init_module, void __user *, umod, 2426SYSCALL_DEFINE3(init_module, void __user *, umod,
2394 unsigned long, len, const char __user *, uargs) 2427 unsigned long, len, const char __user *, uargs)
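
do_mod_ctors() gives modules the constructor semantics the core kernel gained with CONFIG_CONSTRUCTORS: load_module() collects the .ctors section via section_objs() (see the hunk above), and each entry is invoked right before mod->init runs, which is what compiler-generated instrumentation such as gcov profiling relies on. A rough, hypothetical illustration of the ordering (not part of the patch):

static int ctor_ran;

/* the compiler places constructor functions in .ctors */
static void __attribute__((constructor)) example_ctor(void)
{
	ctor_ran = 1;		/* runs from do_mod_ctors() */
}

static int __init example_init(void)
{
	WARN_ON(!ctor_ran);	/* every .ctors entry has already run */
	return 0;
}
module_init(example_init);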
@@ -2417,6 +2450,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2417 blocking_notifier_call_chain(&module_notify_list, 2450 blocking_notifier_call_chain(&module_notify_list,
2418 MODULE_STATE_COMING, mod); 2451 MODULE_STATE_COMING, mod);
2419 2452
2453 do_mod_ctors(mod);
2420 /* Start the module */ 2454 /* Start the module */
2421 if (mod->init != NULL) 2455 if (mod->init != NULL)
2422 ret = do_one_initcall(mod->init); 2456 ret = do_one_initcall(mod->init);
@@ -2435,9 +2469,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2435 return ret; 2469 return ret;
2436 } 2470 }
2437 if (ret > 0) { 2471 if (ret > 0) {
2438 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2472 printk(KERN_WARNING
2439 "it should follow 0/-E convention\n" 2473"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2440 KERN_WARNING "%s: loading module anyway...\n", 2474"%s: loading module anyway...\n",
2441 __func__, mod->name, ret, 2475 __func__, mod->name, ret,
2442 __func__); 2476 __func__);
2443 dump_stack(); 2477 dump_stack();
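
The other thread running through these module.c hunks is tracing: defining CREATE_TRACE_POINTS before including trace/events/module.h instantiates the module events in this one translation unit, EXPORT_TRACEPOINT_SYMBOL(module_get) lets modules fire that tracepoint themselves, and trace_module_load()/trace_module_free()/trace_module_put() are the new call sites. As a hedged sketch of the shape such an event definition takes (the real ones live in include/trace/events/module.h and record more than just the name):

TRACE_EVENT(module_load,

	TP_PROTO(struct module *mod),

	TP_ARGS(mod),

	TP_STRUCT__entry(
		__string(name, mod->name)
	),

	TP_fast_assign(
		__assign_str(name, mod->name);
	),

	TP_printk("%s", __get_str(name))
);

Once registered, the events show up under the usual ftrace control files (events/module/ in the tracing debugfs directory) and stream through the ordinary trace buffer.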
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0c..09b4ff9711b2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29/* 29static inline struct nsproxy *create_nsproxy(void)
30 * creates a copy of "orig" with refcount 1.
31 */
32static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
33{ 30{
34 struct nsproxy *ns; 31 struct nsproxy *nsproxy;
35 32
36 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); 33 nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
37 if (ns) { 34 if (nsproxy)
38 memcpy(ns, orig, sizeof(struct nsproxy)); 35 atomic_set(&nsproxy->count, 1);
39 atomic_set(&ns->count, 1); 36 return nsproxy;
40 }
41 return ns;
42} 37}
43 38
44/* 39/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
52 struct nsproxy *new_nsp; 47 struct nsproxy *new_nsp;
53 int err; 48 int err;
54 49
55 new_nsp = clone_nsproxy(tsk->nsproxy); 50 new_nsp = create_nsproxy();
56 if (!new_nsp) 51 if (!new_nsp)
57 return ERR_PTR(-ENOMEM); 52 return ERR_PTR(-ENOMEM);
58 53
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..512ab73b0ca3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..8cb94a52d1bb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,14 +42,21 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
48 * 0 - not paranoid 49 * -1 - not paranoid at all
49 * 1 - disallow cpu counters to unpriv 50 * 0 - disallow raw tracepoint access for unpriv
50 * 2 - disallow kernel profiling to unpriv 51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
51 */ 53 */
52int sysctl_perf_counter_paranoid __read_mostly; 54int sysctl_perf_counter_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
53 60
54static inline bool perf_paranoid_cpu(void) 61static inline bool perf_paranoid_cpu(void)
55{ 62{
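
The hunk above cuts off in the middle of perf_paranoid_cpu(); the checks behind the new -1..2 scale are simple threshold comparisons along these lines (sketch written to match the levels listed in the comment, not quoted from the file):

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_counter_paranoid > 0;	/* levels 1 and 2 */
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_counter_paranoid > 1;	/* level 2 only */
}

With the default raised from 0 to 1, raw tracepoint data and CPU-wide counters stay privileged out of the box while ordinary per-task counters remain available to unprivileged users.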
@@ -87,6 +94,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 94void __weak hw_perf_enable(void) { barrier(); }
88 95
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 96void __weak hw_perf_counter_setup(int cpu) { barrier(); }
97void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 98
91int __weak 99int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 100hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -98,16 +106,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
98 106
99void __weak perf_counter_print_debug(void) { } 107void __weak perf_counter_print_debug(void) { }
100 108
101static DEFINE_PER_CPU(int, disable_count); 109static DEFINE_PER_CPU(int, perf_disable_count);
102 110
103void __perf_disable(void) 111void __perf_disable(void)
104{ 112{
105 __get_cpu_var(disable_count)++; 113 __get_cpu_var(perf_disable_count)++;
106} 114}
107 115
108bool __perf_enable(void) 116bool __perf_enable(void)
109{ 117{
110 return !--__get_cpu_var(disable_count); 118 return !--__get_cpu_var(perf_disable_count);
111} 119}
112 120
113void perf_disable(void) 121void perf_disable(void)
@@ -124,7 +132,7 @@ void perf_enable(void)
124 132
125static void get_ctx(struct perf_counter_context *ctx) 133static void get_ctx(struct perf_counter_context *ctx)
126{ 134{
127 atomic_inc(&ctx->refcount); 135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128} 136}
129 137
130static void free_ctx(struct rcu_head *head) 138static void free_ctx(struct rcu_head *head)
@@ -146,6 +154,28 @@ static void put_ctx(struct perf_counter_context *ctx)
146 } 154 }
147} 155}
148 156
157static void unclone_ctx(struct perf_counter_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit counters we want to return the parent counter id
167 * to userspace.
168 */
169static u64 primary_counter_id(struct perf_counter *counter)
170{
171 u64 id = counter->id;
172
173 if (counter->parent)
174 id = counter->parent->id;
175
176 return id;
177}
178
149/* 179/*
150 * Get the perf_counter_context for a task and lock it. 180 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with with the fact that until it is locked, 181 * This has to cope with with the fact that until it is locked,
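
get_ctx() now refuses (with a WARN) to resurrect a context whose refcount already reached zero, and perf_lock_task_context() takes its reference with atomic_inc_not_zero() while still under rcu_read_lock(), instead of leaving that to callers such as perf_pin_task_context(). That is the standard RCU lookup discipline; stripped to its bones it looks like this (generic sketch, hypothetical names):

struct thing {
	atomic_t refcount;
};

static struct thing *slot;

static struct thing *thing_get(void)
{
	struct thing *t;

	rcu_read_lock();
	t = rcu_dereference(slot);
	if (t && !atomic_inc_not_zero(&t->refcount))
		t = NULL;	/* lookup raced with the final put */
	rcu_read_unlock();

	return t;		/* non-NULL means we now hold a reference */
}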
@@ -175,6 +205,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
175 spin_unlock_irqrestore(&ctx->lock, *flags); 205 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry; 206 goto retry;
177 } 207 }
208
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
178 } 213 }
179 rcu_read_unlock(); 214 rcu_read_unlock();
180 return ctx; 215 return ctx;
@@ -193,7 +228,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
193 ctx = perf_lock_task_context(task, &flags); 228 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) { 229 if (ctx) {
195 ++ctx->pin_count; 230 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags); 231 spin_unlock_irqrestore(&ctx->lock, flags);
198 } 232 }
199 return ctx; 233 return ctx;
@@ -232,6 +266,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
232 266
233 list_add_rcu(&counter->event_entry, &ctx->event_list); 267 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++; 268 ctx->nr_counters++;
269 if (counter->attr.inherit_stat)
270 ctx->nr_stat++;
235} 271}
236 272
237/* 273/*
@@ -246,6 +282,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246 if (list_empty(&counter->list_entry)) 282 if (list_empty(&counter->list_entry))
247 return; 283 return;
248 ctx->nr_counters--; 284 ctx->nr_counters--;
285 if (counter->attr.inherit_stat)
286 ctx->nr_stat--;
249 287
250 list_del_init(&counter->list_entry); 288 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry); 289 list_del_rcu(&counter->event_entry);
@@ -275,6 +313,10 @@ counter_sched_out(struct perf_counter *counter,
275 return; 313 return;
276 314
277 counter->state = PERF_COUNTER_STATE_INACTIVE; 315 counter->state = PERF_COUNTER_STATE_INACTIVE;
316 if (counter->pending_disable) {
317 counter->pending_disable = 0;
318 counter->state = PERF_COUNTER_STATE_OFF;
319 }
278 counter->tstamp_stopped = ctx->time; 320 counter->tstamp_stopped = ctx->time;
279 counter->pmu->disable(counter); 321 counter->pmu->disable(counter);
280 counter->oncpu = -1; 322 counter->oncpu = -1;
@@ -433,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
433 struct perf_counter_context *ctx = counter->ctx; 475 struct perf_counter_context *ctx = counter->ctx;
434 u64 run_end; 476 u64 run_end;
435 477
436 if (counter->state < PERF_COUNTER_STATE_INACTIVE) 478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
437 return; 480 return;
438 481
439 counter->total_time_enabled = ctx->time - counter->tstamp_enabled; 482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -482,7 +525,7 @@ static void __perf_counter_disable(void *info)
482 */ 525 */
483 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { 526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
484 update_context_time(ctx); 527 update_context_time(ctx);
485 update_counter_times(counter); 528 update_group_times(counter);
486 if (counter == counter->group_leader) 529 if (counter == counter->group_leader)
487 group_sched_out(counter, cpuctx, ctx); 530 group_sched_out(counter, cpuctx, ctx);
488 else 531 else
@@ -537,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
537 * in, so we can change the state safely. 580 * in, so we can change the state safely.
538 */ 581 */
539 if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
540 update_counter_times(counter); 583 update_group_times(counter);
541 counter->state = PERF_COUNTER_STATE_OFF; 584 counter->state = PERF_COUNTER_STATE_OFF;
542 } 585 }
543 586
@@ -815,6 +858,27 @@ retry:
815} 858}
816 859
817/* 860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
818 * Cross CPU call to enable a performance counter 882 * Cross CPU call to enable a performance counter
819 */ 883 */
820static void __perf_counter_enable(void *info) 884static void __perf_counter_enable(void *info)
@@ -841,8 +905,7 @@ static void __perf_counter_enable(void *info)
841 905
842 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
843 goto unlock; 907 goto unlock;
844 counter->state = PERF_COUNTER_STATE_INACTIVE; 908 __perf_counter_mark_enabled(counter, ctx);
845 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
846 909
847 /* 910 /*
848 * If the counter is in a group and isn't the group leader, 911 * If the counter is in a group and isn't the group leader,
@@ -935,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
935 * Since we have the lock this context can't be scheduled 998 * Since we have the lock this context can't be scheduled
936 * in, so we can change the state safely. 999 * in, so we can change the state safely.
937 */ 1000 */
938 if (counter->state == PERF_COUNTER_STATE_OFF) { 1001 if (counter->state == PERF_COUNTER_STATE_OFF)
939 counter->state = PERF_COUNTER_STATE_INACTIVE; 1002 __perf_counter_mark_enabled(counter, ctx);
940 counter->tstamp_enabled = 1003
941 ctx->time - counter->total_time_enabled;
942 }
943 out: 1004 out:
944 spin_unlock_irq(&ctx->lock); 1005 spin_unlock_irq(&ctx->lock);
945} 1006}
@@ -1002,6 +1063,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1002 && !ctx1->pin_count && !ctx2->pin_count; 1063 && !ctx1->pin_count && !ctx2->pin_count;
1003} 1064}
1004 1065
1066static void __perf_counter_read(void *counter);
1067
1068static void __perf_counter_sync_stat(struct perf_counter *counter,
1069 struct perf_counter *next_counter)
1070{
1071 u64 value;
1072
1073 if (!counter->attr.inherit_stat)
1074 return;
1075
1076 /*
1077 * Update the counter value, we cannot use perf_counter_read()
1078 * because we're in the middle of a context switch and have IRQs
1079 * disabled, which upsets smp_call_function_single(), however
1080 * we know the counter must be on the current CPU, therefore we
1081 * don't need to use it.
1082 */
1083 switch (counter->state) {
1084 case PERF_COUNTER_STATE_ACTIVE:
1085 __perf_counter_read(counter);
1086 break;
1087
1088 case PERF_COUNTER_STATE_INACTIVE:
1089 update_counter_times(counter);
1090 break;
1091
1092 default:
1093 break;
1094 }
1095
1096 /*
1097 * In order to keep per-task stats reliable we need to flip the counter
1098 * values when we flip the contexts.
1099 */
1100 value = atomic64_read(&next_counter->count);
1101 value = atomic64_xchg(&counter->count, value);
1102 atomic64_set(&next_counter->count, value);
1103
1104 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1105 swap(counter->total_time_running, next_counter->total_time_running);
1106
1107 /*
1108 * Since we swizzled the values, update the user visible data too.
1109 */
1110 perf_counter_update_userpage(counter);
1111 perf_counter_update_userpage(next_counter);
1112}
1113
1114#define list_next_entry(pos, member) \
1115 list_entry(pos->member.next, typeof(*pos), member)
1116
1117static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1118 struct perf_counter_context *next_ctx)
1119{
1120 struct perf_counter *counter, *next_counter;
1121
1122 if (!ctx->nr_stat)
1123 return;
1124
1125 counter = list_first_entry(&ctx->event_list,
1126 struct perf_counter, event_entry);
1127
1128 next_counter = list_first_entry(&next_ctx->event_list,
1129 struct perf_counter, event_entry);
1130
1131 while (&counter->event_entry != &ctx->event_list &&
1132 &next_counter->event_entry != &next_ctx->event_list) {
1133
1134 __perf_counter_sync_stat(counter, next_counter);
1135
1136 counter = list_next_entry(counter, event_entry);
1137 next_counter = list_next_entry(next_counter, event_entry);
1138 }
1139}
1140
1005/* 1141/*
1006 * Called from scheduler to remove the counters of the current task, 1142 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled. 1143 * with interrupts disabled.
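
perf_counter_sync_stat() services the new inherit_stat attribute bit: when two cloned contexts are simply flipped on a context switch, the per-counter values and times are exchanged as well, so the totals each task reads stay attached to that task rather than to whichever context it ends up owning. From userspace the feature is one extra bit in the attribute (sketch; field names as in the perf_counter_attr of this series):

struct perf_counter_attr attr = {
	.type         = PERF_TYPE_HARDWARE,
	.config       = PERF_COUNT_HW_CPU_CYCLES,
	.inherit      = 1,	/* follow the task's children */
	.inherit_stat = 1,	/* keep per-task totals coherent across switches */
};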
@@ -1057,6 +1193,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1057 ctx->task = next; 1193 ctx->task = next;
1058 next_ctx->task = task; 1194 next_ctx->task = task;
1059 do_switch = 0; 1195 do_switch = 0;
1196
1197 perf_counter_sync_stat(ctx, next_ctx);
1060 } 1198 }
1061 spin_unlock(&next_ctx->lock); 1199 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock); 1200 spin_unlock(&ctx->lock);
@@ -1203,7 +1341,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1203#define MAX_INTERRUPTS (~0ULL) 1341#define MAX_INTERRUPTS (~0ULL)
1204 1342
1205static void perf_log_throttle(struct perf_counter *counter, int enable); 1343static void perf_log_throttle(struct perf_counter *counter, int enable);
1206static void perf_log_period(struct perf_counter *counter, u64 period);
1207 1344
1208static void perf_adjust_period(struct perf_counter *counter, u64 events) 1345static void perf_adjust_period(struct perf_counter *counter, u64 events)
1209{ 1346{
@@ -1222,8 +1359,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
1222 if (!sample_period) 1359 if (!sample_period)
1223 sample_period = 1; 1360 sample_period = 1;
1224 1361
1225 perf_log_period(counter, sample_period);
1226
1227 hwc->sample_period = sample_period; 1362 hwc->sample_period = sample_period;
1228} 1363}
1229 1364
@@ -1283,7 +1418,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1283 if (!interrupts) { 1418 if (!interrupts) {
1284 perf_disable(); 1419 perf_disable();
1285 counter->pmu->disable(counter); 1420 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0); 1421 atomic64_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter); 1422 counter->pmu->enable(counter);
1288 perf_enable(); 1423 perf_enable();
1289 } 1424 }
@@ -1344,14 +1479,68 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1344} 1479}
1345 1480
1346/* 1481/*
1482 * Enable all of a task's counters that have been marked enable-on-exec.
1483 * This expects task == current.
1484 */
1485static void perf_counter_enable_on_exec(struct task_struct *task)
1486{
1487 struct perf_counter_context *ctx;
1488 struct perf_counter *counter;
1489 unsigned long flags;
1490 int enabled = 0;
1491
1492 local_irq_save(flags);
1493 ctx = task->perf_counter_ctxp;
1494 if (!ctx || !ctx->nr_counters)
1495 goto out;
1496
1497 __perf_counter_task_sched_out(ctx);
1498
1499 spin_lock(&ctx->lock);
1500
1501 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1502 if (!counter->attr.enable_on_exec)
1503 continue;
1504 counter->attr.enable_on_exec = 0;
1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1506 continue;
1507 __perf_counter_mark_enabled(counter, ctx);
1508 enabled = 1;
1509 }
1510
1511 /*
1512 * Unclone this context if we enabled any counter.
1513 */
1514 if (enabled)
1515 unclone_ctx(ctx);
1516
1517 spin_unlock(&ctx->lock);
1518
1519 perf_counter_task_sched_in(task, smp_processor_id());
1520 out:
1521 local_irq_restore(flags);
1522}
1523
1524/*
1347 * Cross CPU call to read the hardware counter 1525 * Cross CPU call to read the hardware counter
1348 */ 1526 */
1349static void __read(void *info) 1527static void __perf_counter_read(void *info)
1350{ 1528{
1529 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1351 struct perf_counter *counter = info; 1530 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx; 1531 struct perf_counter_context *ctx = counter->ctx;
1353 unsigned long flags; 1532 unsigned long flags;
1354 1533
1534 /*
1535 * If this is a task context, we need to check whether it is
1536 * the current task context of this cpu. If not it has been
1537 * scheduled out before the smp call arrived. In that case
1538 * counter->count would have been updated to a recent sample
1539 * when the counter was scheduled out.
1540 */
1541 if (ctx->task && cpuctx->task_ctx != ctx)
1542 return;
1543
1355 local_irq_save(flags); 1544 local_irq_save(flags);
1356 if (ctx->is_active) 1545 if (ctx->is_active)
1357 update_context_time(ctx); 1546 update_context_time(ctx);
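
perf_counter_enable_on_exec() is the kernel half of the enable_on_exec attribute bit, and the same hunk renames __read() to __perf_counter_read() and makes it bail out if the context has already been scheduled away from the target CPU. The attribute lets a tool open counters on itself disabled and have them switch on at exec(), so the measured program is counted from its first instruction without an enable ioctl racing it. Roughly, on the userspace side (sketch; perf_counter_open() stands in for a wrapper around the raw syscall):

struct perf_counter_attr attr = {
	.type           = PERF_TYPE_HARDWARE,
	.config         = PERF_COUNT_HW_INSTRUCTIONS,
	.disabled       = 1,	/* stay off while we set things up */
	.enable_on_exec = 1,	/* the kernel flips it on at exec() */
};

int fd = perf_counter_open(&attr, getpid(), -1 /* any cpu */, -1, 0);
/* ... record fd somewhere, then: */
execvp(argv[1], &argv[1]);	/* counting starts here */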
@@ -1368,7 +1557,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1368 */ 1557 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1558 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu, 1559 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1); 1560 __perf_counter_read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1561 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter); 1562 update_counter_times(counter);
1374 } 1563 }
@@ -1394,7 +1583,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1394 1583
1395static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1584static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1396{ 1585{
1397 struct perf_counter_context *parent_ctx;
1398 struct perf_counter_context *ctx; 1586 struct perf_counter_context *ctx;
1399 struct perf_cpu_context *cpuctx; 1587 struct perf_cpu_context *cpuctx;
1400 struct task_struct *task; 1588 struct task_struct *task;
@@ -1454,16 +1642,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1454 retry: 1642 retry:
1455 ctx = perf_lock_task_context(task, &flags); 1643 ctx = perf_lock_task_context(task, &flags);
1456 if (ctx) { 1644 if (ctx) {
1457 parent_ctx = ctx->parent_ctx; 1645 unclone_ctx(ctx);
1458 if (parent_ctx) {
1459 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */
1461 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags); 1646 spin_unlock_irqrestore(&ctx->lock, flags);
1468 } 1647 }
1469 1648
@@ -1509,11 +1688,20 @@ static void free_counter(struct perf_counter *counter)
1509{ 1688{
1510 perf_pending_sync(counter); 1689 perf_pending_sync(counter);
1511 1690
1512 atomic_dec(&nr_counters); 1691 if (!counter->parent) {
1513 if (counter->attr.mmap) 1692 atomic_dec(&nr_counters);
1514 atomic_dec(&nr_mmap_counters); 1693 if (counter->attr.mmap)
1515 if (counter->attr.comm) 1694 atomic_dec(&nr_mmap_counters);
1516 atomic_dec(&nr_comm_counters); 1695 if (counter->attr.comm)
1696 atomic_dec(&nr_comm_counters);
1697 if (counter->attr.task)
1698 atomic_dec(&nr_task_counters);
1699 }
1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1517 1705
1518 if (counter->destroy) 1706 if (counter->destroy)
1519 counter->destroy(counter); 1707 counter->destroy(counter);
@@ -1547,14 +1735,133 @@ static int perf_release(struct inode *inode, struct file *file)
1547 return 0; 1735 return 0;
1548} 1736}
1549 1737
1738static int perf_counter_read_size(struct perf_counter *counter)
1739{
1740 int entry = sizeof(u64); /* value */
1741 int size = 0;
1742 int nr = 1;
1743
1744 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1745 size += sizeof(u64);
1746
1747 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1748 size += sizeof(u64);
1749
1750 if (counter->attr.read_format & PERF_FORMAT_ID)
1751 entry += sizeof(u64);
1752
1753 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1754 nr += counter->group_leader->nr_siblings;
1755 size += sizeof(u64);
1756 }
1757
1758 size += entry * nr;
1759
1760 return size;
1761}
1762
1763static u64 perf_counter_read_value(struct perf_counter *counter)
1764{
1765 struct perf_counter *child;
1766 u64 total = 0;
1767
1768 total += perf_counter_read(counter);
1769 list_for_each_entry(child, &counter->child_list, child_list)
1770 total += perf_counter_read(child);
1771
1772 return total;
1773}
1774
1775static int perf_counter_read_entry(struct perf_counter *counter,
1776 u64 read_format, char __user *buf)
1777{
1778 int n = 0, count = 0;
1779 u64 values[2];
1780
1781 values[n++] = perf_counter_read_value(counter);
1782 if (read_format & PERF_FORMAT_ID)
1783 values[n++] = primary_counter_id(counter);
1784
1785 count = n * sizeof(u64);
1786
1787 if (copy_to_user(buf, values, count))
1788 return -EFAULT;
1789
1790 return count;
1791}
1792
1793static int perf_counter_read_group(struct perf_counter *counter,
1794 u64 read_format, char __user *buf)
1795{
1796 struct perf_counter *leader = counter->group_leader, *sub;
1797 int n = 0, size = 0, err = -EFAULT;
1798 u64 values[3];
1799
1800 values[n++] = 1 + leader->nr_siblings;
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = leader->total_time_enabled +
1803 atomic64_read(&leader->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = leader->total_time_running +
1807 atomic64_read(&leader->child_total_time_running);
1808 }
1809
1810 size = n * sizeof(u64);
1811
1812 if (copy_to_user(buf, values, size))
1813 return -EFAULT;
1814
1815 err = perf_counter_read_entry(leader, read_format, buf + size);
1816 if (err < 0)
1817 return err;
1818
1819 size += err;
1820
1821 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1822 err = perf_counter_read_entry(sub, read_format,
1823 buf + size);
1824 if (err < 0)
1825 return err;
1826
1827 size += err;
1828 }
1829
1830 return size;
1831}
1832
1833static int perf_counter_read_one(struct perf_counter *counter,
1834 u64 read_format, char __user *buf)
1835{
1836 u64 values[4];
1837 int n = 0;
1838
1839 values[n++] = perf_counter_read_value(counter);
1840 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1841 values[n++] = counter->total_time_enabled +
1842 atomic64_read(&counter->child_total_time_enabled);
1843 }
1844 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1845 values[n++] = counter->total_time_running +
1846 atomic64_read(&counter->child_total_time_running);
1847 }
1848 if (read_format & PERF_FORMAT_ID)
1849 values[n++] = primary_counter_id(counter);
1850
1851 if (copy_to_user(buf, values, n * sizeof(u64)))
1852 return -EFAULT;
1853
1854 return n * sizeof(u64);
1855}
1856
1550/* 1857/*
1551 * Read the performance counter - simple non blocking version for now 1858 * Read the performance counter - simple non blocking version for now
1552 */ 1859 */
1553static ssize_t 1860static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1861perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{ 1862{
1556 u64 values[3]; 1863 u64 read_format = counter->attr.read_format;
1557 int n; 1864 int ret;
1558 1865
1559 /* 1866 /*
1560 * Return end-of-file for a read on a counter that is in 1867 * Return end-of-file for a read on a counter that is in
@@ -1564,28 +1871,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1564 if (counter->state == PERF_COUNTER_STATE_ERROR) 1871 if (counter->state == PERF_COUNTER_STATE_ERROR)
1565 return 0; 1872 return 0;
1566 1873
1874 if (count < perf_counter_read_size(counter))
1875 return -ENOSPC;
1876
1567 WARN_ON_ONCE(counter->ctx->parent_ctx); 1877 WARN_ON_ONCE(counter->ctx->parent_ctx);
1568 mutex_lock(&counter->child_mutex); 1878 mutex_lock(&counter->child_mutex);
1569 values[0] = perf_counter_read(counter); 1879 if (read_format & PERF_FORMAT_GROUP)
1570 n = 1; 1880 ret = perf_counter_read_group(counter, read_format, buf);
1571 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1881 else
1572 values[n++] = counter->total_time_enabled + 1882 ret = perf_counter_read_one(counter, read_format, buf);
1573 atomic64_read(&counter->child_total_time_enabled);
1574 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1575 values[n++] = counter->total_time_running +
1576 atomic64_read(&counter->child_total_time_running);
1577 if (counter->attr.read_format & PERF_FORMAT_ID)
1578 values[n++] = counter->id;
1579 mutex_unlock(&counter->child_mutex); 1883 mutex_unlock(&counter->child_mutex);
1580 1884
1581 if (count < n * sizeof(u64)) 1885 return ret;
1582 return -EINVAL;
1583 count = n * sizeof(u64);
1584
1585 if (copy_to_user(buf, values, count))
1586 return -EFAULT;
1587
1588 return count;
1589} 1886}
1590 1887
1591static ssize_t 1888static ssize_t
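
The rewritten read path checks the required size up front, returning -ENOSPC when the user buffer is too small, and then emits one of two self-describing layouts selected by read_format. For the non-group case with every optional bit set, the bytes produced by perf_counter_read_one() line up as (userspace sketch, fd being an open counter):

struct read_format_one {
	__u64 value;		/* counter value, children summed in */
	__u64 time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	__u64 time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	__u64 id;		/* PERF_FORMAT_ID: parent id if inherited */
};

struct read_format_one rf;
ssize_t n = read(fd, &rf, sizeof(rf));

if (n < 0)
	perror("read");		/* e.g. ENOSPC when the buffer is too small */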
@@ -1620,22 +1917,6 @@ static void perf_counter_reset(struct perf_counter *counter)
1620 perf_counter_update_userpage(counter); 1917 perf_counter_update_userpage(counter);
1621} 1918}
1622 1919
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/* 1920/*
1640 * Holding the top-level counter's child_mutex means that any 1921 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block 1922 * descendant process that has inherited this counter will block
@@ -1658,14 +1939,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
1658static void perf_counter_for_each(struct perf_counter *counter, 1939static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *)) 1940 void (*func)(struct perf_counter *))
1660{ 1941{
1661 struct perf_counter *child; 1942 struct perf_counter_context *ctx = counter->ctx;
1943 struct perf_counter *sibling;
1662 1944
1663 WARN_ON_ONCE(counter->ctx->parent_ctx); 1945 WARN_ON_ONCE(ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex); 1946 mutex_lock(&ctx->mutex);
1665 perf_counter_for_each_sibling(counter, func); 1947 counter = counter->group_leader;
1666 list_for_each_entry(child, &counter->child_list, child_list) 1948
1667 perf_counter_for_each_sibling(child, func); 1949 perf_counter_for_each_child(counter, func);
1668 mutex_unlock(&counter->child_mutex); 1950 func(counter);
1951 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1952 perf_counter_for_each_child(counter, func);
1953 mutex_unlock(&ctx->mutex);
1669} 1954}
1670 1955
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 1956static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1694,8 +1979,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1694 1979
1695 counter->attr.sample_freq = value; 1980 counter->attr.sample_freq = value;
1696 } else { 1981 } else {
1697 perf_log_period(counter, value);
1698
1699 counter->attr.sample_period = value; 1982 counter->attr.sample_period = value;
1700 counter->hw.sample_period = value; 1983 counter->hw.sample_period = value;
1701 } 1984 }
@@ -1705,6 +1988,8 @@ unlock:
1705 return ret; 1988 return ret;
1706} 1989}
1707 1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1708static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1709{ 1994{
1710 struct perf_counter *counter = file->private_data; 1995 struct perf_counter *counter = file->private_data;
@@ -1728,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1728 case PERF_COUNTER_IOC_PERIOD: 2013 case PERF_COUNTER_IOC_PERIOD:
1729 return perf_counter_period(counter, (u64 __user *)arg); 2014 return perf_counter_period(counter, (u64 __user *)arg);
1730 2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
1731 default: 2019 default:
1732 return -ENOTTY; 2020 return -ENOTTY;
1733 } 2021 }
@@ -1764,6 +2052,18 @@ int perf_counter_task_disable(void)
1764 return 0; 2052 return 0;
1765} 2053}
1766 2054
2055#ifndef PERF_COUNTER_INDEX_OFFSET
2056# define PERF_COUNTER_INDEX_OFFSET 0
2057#endif
2058
2059static int perf_counter_index(struct perf_counter *counter)
2060{
2061 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2062 return 0;
2063
2064 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2065}
2066
1767/* 2067/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise 2068 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic goes bad. We can not serialize this because the arch 2069 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1788,11 +2088,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1788 preempt_disable(); 2088 preempt_disable();
1789 ++userpg->lock; 2089 ++userpg->lock;
1790 barrier(); 2090 barrier();
1791 userpg->index = counter->hw.idx; 2091 userpg->index = perf_counter_index(counter);
1792 userpg->offset = atomic64_read(&counter->count); 2092 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 2093 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count); 2094 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795 2095
2096 userpg->time_enabled = counter->total_time_enabled +
2097 atomic64_read(&counter->child_total_time_enabled);
2098
2099 userpg->time_running = counter->total_time_running +
2100 atomic64_read(&counter->child_total_time_running);
2101
1796 barrier(); 2102 barrier();
1797 ++userpg->lock; 2103 ++userpg->lock;
1798 preempt_enable(); 2104 preempt_enable();
@@ -1806,6 +2112,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1806 struct perf_mmap_data *data; 2112 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS; 2113 int ret = VM_FAULT_SIGBUS;
1808 2114
2115 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2116 if (vmf->pgoff == 0)
2117 ret = 0;
2118 return ret;
2119 }
2120
1809 rcu_read_lock(); 2121 rcu_read_lock();
1810 data = rcu_dereference(counter->data); 2122 data = rcu_dereference(counter->data);
1811 if (!data) 2123 if (!data)
@@ -1819,9 +2131,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1819 if ((unsigned)nr > data->nr_pages) 2131 if ((unsigned)nr > data->nr_pages)
1820 goto unlock; 2132 goto unlock;
1821 2133
2134 if (vmf->flags & FAULT_FLAG_WRITE)
2135 goto unlock;
2136
1822 vmf->page = virt_to_page(data->data_pages[nr]); 2137 vmf->page = virt_to_page(data->data_pages[nr]);
1823 } 2138 }
2139
1824 get_page(vmf->page); 2140 get_page(vmf->page);
2141 vmf->page->mapping = vma->vm_file->f_mapping;
2142 vmf->page->index = vmf->pgoff;
2143
1825 ret = 0; 2144 ret = 0;
1826unlock: 2145unlock:
1827 rcu_read_unlock(); 2146 rcu_read_unlock();
@@ -1874,6 +2193,14 @@ fail:
1874 return -ENOMEM; 2193 return -ENOMEM;
1875} 2194}
1876 2195
2196static void perf_mmap_free_page(unsigned long addr)
2197{
2198 struct page *page = virt_to_page((void *)addr);
2199
2200 page->mapping = NULL;
2201 __free_page(page);
2202}
2203
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2204static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{ 2205{
1879 struct perf_mmap_data *data; 2206 struct perf_mmap_data *data;
@@ -1881,9 +2208,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1881 2208
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2209 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883 2210
1884 free_page((unsigned long)data->user_page); 2211 perf_mmap_free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++) 2212 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]); 2213 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2214
1887 kfree(data); 2215 kfree(data);
1888} 2216}
1889 2217
@@ -1920,9 +2248,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
1920} 2248}
1921 2249
1922static struct vm_operations_struct perf_mmap_vmops = { 2250static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open, 2251 .open = perf_mmap_open,
1924 .close = perf_mmap_close, 2252 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault, 2253 .fault = perf_mmap_fault,
2254 .page_mkwrite = perf_mmap_fault,
1926}; 2255};
1927 2256
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma) 2257static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +2265,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1936 long user_extra, extra; 2265 long user_extra, extra;
1937 int ret = 0; 2266 int ret = 0;
1938 2267
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) 2268 if (!(vma->vm_flags & VM_SHARED))
1940 return -EINVAL; 2269 return -EINVAL;
1941 2270
1942 vma_size = vma->vm_end - vma->vm_start; 2271 vma_size = vma->vm_end - vma->vm_start;
@@ -1957,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1957 2286
1958 WARN_ON_ONCE(counter->ctx->parent_ctx); 2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
1959 mutex_lock(&counter->mmap_mutex); 2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
1960 if (atomic_inc_not_zero(&counter->mmap_count)) { 2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
1961 if (nr_pages != counter->data->nr_pages) 2295 if (nr_pages != counter->data->nr_pages)
1962 ret = -EINVAL; 2296 ret = -EINVAL;
@@ -1995,10 +2329,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1995 atomic_long_add(user_extra, &user->locked_vm); 2329 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra; 2330 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra; 2331 counter->data->nr_locked = extra;
2332 if (vma->vm_flags & VM_WRITE)
2333 counter->data->writable = 1;
2334
1998unlock: 2335unlock:
1999 mutex_unlock(&counter->mmap_mutex); 2336 mutex_unlock(&counter->mmap_mutex);
2000 2337
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED; 2338 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops; 2339 vma->vm_ops = &perf_mmap_vmops;
2004 2340
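
Dropping the VM_WRITE rejection and the ~VM_MAYWRITE fixup is what permits a read-write mapping of the buffer; when the consumer maps it that way, data->writable is set and the output path starts honouring the reader's position. The intended userspace pattern is roughly (sketch; fd, page_size, NR_DATA_PAGES and tail are the consumer's own bookkeeping):

struct perf_counter_mmap_page *pg;
size_t len = (1 + NR_DATA_PAGES) * page_size;	/* header page + data pages */

pg = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

/* consume records between pg->data_tail and pg->data_head ... */

__sync_synchronize();		/* finish reading before publishing the tail */
pg->data_tail = tail;		/* tells the kernel how far we have read */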
@@ -2064,7 +2400,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2064 2400
2065 if (counter->pending_disable) { 2401 if (counter->pending_disable) {
2066 counter->pending_disable = 0; 2402 counter->pending_disable = 0;
2067 perf_counter_disable(counter); 2403 __perf_counter_disable(counter);
2068 } 2404 }
2069 2405
2070 if (counter->pending_wakeup) { 2406 if (counter->pending_wakeup) {
@@ -2175,11 +2511,38 @@ struct perf_output_handle {
2175 unsigned long head; 2511 unsigned long head;
2176 unsigned long offset; 2512 unsigned long offset;
2177 int nmi; 2513 int nmi;
2178 int overflow; 2514 int sample;
2179 int locked; 2515 int locked;
2180 unsigned long flags; 2516 unsigned long flags;
2181}; 2517};
2182 2518
2519static bool perf_output_space(struct perf_mmap_data *data,
2520 unsigned int offset, unsigned int head)
2521{
2522 unsigned long tail;
2523 unsigned long mask;
2524
2525 if (!data->writable)
2526 return true;
2527
2528 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2529 /*
2530 * Userspace could choose to issue a mb() before updating the tail
2531 * pointer. So that all reads will be completed before the write is
2532 * issued.
2533 */
2534 tail = ACCESS_ONCE(data->user_page->data_tail);
2535 smp_rmb();
2536
2537 offset = (offset - tail) & mask;
2538 head = (head - tail) & mask;
2539
2540 if ((int)(head - offset) < 0)
2541 return false;
2542
2543 return true;
2544}
2545
2183static void perf_output_wakeup(struct perf_output_handle *handle) 2546static void perf_output_wakeup(struct perf_output_handle *handle)
2184{ 2547{
2185 atomic_set(&handle->data->poll, POLL_IN); 2548 atomic_set(&handle->data->poll, POLL_IN);
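
perf_output_space() is what makes the writable mapping safe: the write window and the user-supplied tail are reduced modulo the power-of-two buffer size, so the comparison keeps working after the free-running counters wrap and a bogus data_tail cannot push the result out of range. With the perf types stripped away, the arithmetic is (restated sketch of the function above):

/* offset/head delimit the proposed write; size is a power of two */
static int ring_has_room(unsigned long offset, unsigned long head,
			 unsigned long tail, unsigned long size)
{
	unsigned long mask = size - 1;

	offset = (offset - tail) & mask;	/* write start, relative to reader */
	head   = (head   - tail) & mask;	/* write end, relative to reader */

	return (long)(head - offset) >= 0;	/* false once we would pass the reader */
}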
@@ -2270,37 +2633,93 @@ out:
2270 local_irq_restore(handle->flags); 2633 local_irq_restore(handle->flags);
2271} 2634}
2272 2635
2636static void perf_output_copy(struct perf_output_handle *handle,
2637 const void *buf, unsigned int len)
2638{
2639 unsigned int pages_mask;
2640 unsigned int offset;
2641 unsigned int size;
2642 void **pages;
2643
2644 offset = handle->offset;
2645 pages_mask = handle->data->nr_pages - 1;
2646 pages = handle->data->data_pages;
2647
2648 do {
2649 unsigned int page_offset;
2650 int nr;
2651
2652 nr = (offset >> PAGE_SHIFT) & pages_mask;
2653 page_offset = offset & (PAGE_SIZE - 1);
2654 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2655
2656 memcpy(pages[nr] + page_offset, buf, size);
2657
2658 len -= size;
2659 buf += size;
2660 offset += size;
2661 } while (len);
2662
2663 handle->offset = offset;
2664
2665 /*
2666 * Check we didn't copy past our reservation window, taking the
2667 * possible unsigned int wrap into account.
2668 */
2669 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2670}
2671
2672#define perf_output_put(handle, x) \
2673 perf_output_copy((handle), &(x), sizeof(x))
2674
2273static int perf_output_begin(struct perf_output_handle *handle, 2675static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size, 2676 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow) 2677 int nmi, int sample)
2276{ 2678{
2679 struct perf_counter *output_counter;
2277 struct perf_mmap_data *data; 2680 struct perf_mmap_data *data;
2278 unsigned int offset, head; 2681 unsigned int offset, head;
2682 int have_lost;
2683 struct {
2684 struct perf_event_header header;
2685 u64 id;
2686 u64 lost;
2687 } lost_event;
2279 2688
2689 rcu_read_lock();
2280 /* 2690 /*
2281 * For inherited counters we send all the output towards the parent. 2691 * For inherited counters we send all the output towards the parent.
2282 */ 2692 */
2283 if (counter->parent) 2693 if (counter->parent)
2284 counter = counter->parent; 2694 counter = counter->parent;
2285 2695
2286 rcu_read_lock(); 2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2287 data = rcu_dereference(counter->data); 2700 data = rcu_dereference(counter->data);
2288 if (!data) 2701 if (!data)
2289 goto out; 2702 goto out;
2290 2703
2291 handle->data = data; 2704 handle->data = data;
2292 handle->counter = counter; 2705 handle->counter = counter;
2293 handle->nmi = nmi; 2706 handle->nmi = nmi;
2294 handle->overflow = overflow; 2707 handle->sample = sample;
2295 2708
2296 if (!data->nr_pages) 2709 if (!data->nr_pages)
2297 goto fail; 2710 goto fail;
2298 2711
2712 have_lost = atomic_read(&data->lost);
2713 if (have_lost)
2714 size += sizeof(lost_event);
2715
2299 perf_output_lock(handle); 2716 perf_output_lock(handle);
2300 2717
2301 do { 2718 do {
2302 offset = head = atomic_long_read(&data->head); 2719 offset = head = atomic_long_read(&data->head);
2303 head += size; 2720 head += size;
2721 if (unlikely(!perf_output_space(data, offset, head)))
2722 goto fail;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2723 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305 2724
2306 handle->offset = offset; 2725 handle->offset = offset;
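
When the reservation loop above finds no room, the record is dropped and data->lost is bumped; the next writer that does get space prepends a PERF_EVENT_LOST record carrying the accumulated count (and sizes its reservation accordingly). A consumer of the mmap stream therefore has to expect records of this shape (userspace sketch; hdr and total_lost belong to the consumer's parsing loop):

struct lost_event {
	struct perf_event_header header;	/* header.type == PERF_EVENT_LOST */
	__u64 id;				/* counter that dropped records */
	__u64 lost;				/* how many records went missing */
};

if (hdr->type == PERF_EVENT_LOST) {
	const struct lost_event *e = (const void *)hdr;

	total_lost += e->lost;
}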
@@ -2309,55 +2728,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2728 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1); 2729 atomic_set(&data->wakeup, 1);
2311 2730
2731 if (have_lost) {
2732 lost_event.header.type = PERF_EVENT_LOST;
2733 lost_event.header.misc = 0;
2734 lost_event.header.size = sizeof(lost_event);
2735 lost_event.id = counter->id;
2736 lost_event.lost = atomic_xchg(&data->lost, 0);
2737
2738 perf_output_put(handle, lost_event);
2739 }
2740
2312 return 0; 2741 return 0;
2313 2742
2314fail: 2743fail:
2315 perf_output_wakeup(handle); 2744 atomic_inc(&data->lost);
2745 perf_output_unlock(handle);
2316out: 2746out:
2317 rcu_read_unlock(); 2747 rcu_read_unlock();
2318 2748
2319 return -ENOSPC; 2749 return -ENOSPC;
2320} 2750}
2321 2751
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle) 2752static void perf_output_end(struct perf_output_handle *handle)
2362{ 2753{
2363 struct perf_counter *counter = handle->counter; 2754 struct perf_counter *counter = handle->counter;
@@ -2365,7 +2756,7 @@ static void perf_output_end(struct perf_output_handle *handle)
2365 2756
2366 int wakeup_events = counter->attr.wakeup_events; 2757 int wakeup_events = counter->attr.wakeup_events;
2367 2758
2368 if (handle->overflow && wakeup_events) { 2759 if (handle->sample && wakeup_events) {
2369 int events = atomic_inc_return(&data->events); 2760 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) { 2761 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events); 2762 atomic_sub(wakeup_events, &data->events);
@@ -2399,7 +2790,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2399 return task_pid_nr_ns(p, counter->ns); 2790 return task_pid_nr_ns(p, counter->ns);
2400} 2791}
2401 2792
2402static void perf_counter_output(struct perf_counter *counter, int nmi, 2793static void perf_output_read_one(struct perf_output_handle *handle,
2794 struct perf_counter *counter)
2795{
2796 u64 read_format = counter->attr.read_format;
2797 u64 values[4];
2798 int n = 0;
2799
2800 values[n++] = atomic64_read(&counter->count);
2801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2802 values[n++] = counter->total_time_enabled +
2803 atomic64_read(&counter->child_total_time_enabled);
2804 }
2805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2806 values[n++] = counter->total_time_running +
2807 atomic64_read(&counter->child_total_time_running);
2808 }
2809 if (read_format & PERF_FORMAT_ID)
2810 values[n++] = primary_counter_id(counter);
2811
2812 perf_output_copy(handle, values, n * sizeof(u64));
2813}
2814
2815/*
2816 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2817 */
2818static void perf_output_read_group(struct perf_output_handle *handle,
2819 struct perf_counter *counter)
2820{
2821 struct perf_counter *leader = counter->group_leader, *sub;
2822 u64 read_format = counter->attr.read_format;
2823 u64 values[5];
2824 int n = 0;
2825
2826 values[n++] = 1 + leader->nr_siblings;
2827
2828 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2829 values[n++] = leader->total_time_enabled;
2830
2831 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2832 values[n++] = leader->total_time_running;
2833
2834 if (leader != counter)
2835 leader->pmu->read(leader);
2836
2837 values[n++] = atomic64_read(&leader->count);
2838 if (read_format & PERF_FORMAT_ID)
2839 values[n++] = primary_counter_id(leader);
2840
2841 perf_output_copy(handle, values, n * sizeof(u64));
2842
2843 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2844 n = 0;
2845
2846 if (sub != counter)
2847 sub->pmu->read(sub);
2848
2849 values[n++] = atomic64_read(&sub->count);
2850 if (read_format & PERF_FORMAT_ID)
2851 values[n++] = primary_counter_id(sub);
2852
2853 perf_output_copy(handle, values, n * sizeof(u64));
2854 }
2855}
2856
2857static void perf_output_read(struct perf_output_handle *handle,
2858 struct perf_counter *counter)
2859{
2860 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2861 perf_output_read_group(handle, counter);
2862 else
2863 perf_output_read_one(handle, counter);
2864}
2865
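The two read helpers above serialize counter values as a flat array of u64 words whose layout is selected by attr.read_format, so a consumer has to walk the buffer in exactly the order the kernel wrote it. Below is a minimal decoder sketch for the single-counter layout produced by perf_output_read_one(); the FMT_* constants mirror the PERF_FORMAT_* bits of the perf_counter ABI, while the struct and function names are illustrative only.

    #include <stdint.h>

    /* Mirror of the PERF_FORMAT_* bits used above (illustrative names). */
    #define FMT_TOTAL_TIME_ENABLED  (1ULL << 0)
    #define FMT_TOTAL_TIME_RUNNING  (1ULL << 1)
    #define FMT_ID                  (1ULL << 2)

    struct decoded_read {
        uint64_t value;
        uint64_t time_enabled;   /* valid iff FMT_TOTAL_TIME_ENABLED */
        uint64_t time_running;   /* valid iff FMT_TOTAL_TIME_RUNNING */
        uint64_t id;             /* valid iff FMT_ID                 */
    };

    /* Returns the number of u64 words consumed from buf. */
    int decode_read_one(const uint64_t *buf, uint64_t read_format,
                        struct decoded_read *out)
    {
        int n = 0;

        out->value = buf[n++];
        if (read_format & FMT_TOTAL_TIME_ENABLED)
            out->time_enabled = buf[n++];
        if (read_format & FMT_TOTAL_TIME_RUNNING)
            out->time_running = buf[n++];
        if (read_format & FMT_ID)
            out->id = buf[n++];

        return n;
    }

The group layout from perf_output_read_group() is the same idea repeated: a leading member count, optional enabled/running times for the group, then one value (plus optional id) per member.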
2866void perf_counter_output(struct perf_counter *counter, int nmi,
2403 struct perf_sample_data *data) 2867 struct perf_sample_data *data)
2404{ 2868{
2405 int ret; 2869 int ret;
@@ -2410,10 +2874,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2410 struct { 2874 struct {
2411 u32 pid, tid; 2875 u32 pid, tid;
2412 } tid_entry; 2876 } tid_entry;
2413 struct {
2414 u64 id;
2415 u64 counter;
2416 } group_entry;
2417 struct perf_callchain_entry *callchain = NULL; 2877 struct perf_callchain_entry *callchain = NULL;
2418 int callchain_size = 0; 2878 int callchain_size = 0;
2419 u64 time; 2879 u64 time;
@@ -2421,15 +2881,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2421 u32 cpu, reserved; 2881 u32 cpu, reserved;
2422 } cpu_entry; 2882 } cpu_entry;
2423 2883
2424 header.type = 0; 2884 header.type = PERF_EVENT_SAMPLE;
2425 header.size = sizeof(header); 2885 header.size = sizeof(header);
2426 2886
2427 header.misc = PERF_EVENT_MISC_OVERFLOW; 2887 header.misc = 0;
2428 header.misc |= perf_misc_flags(data->regs); 2888 header.misc |= perf_misc_flags(data->regs);
2429 2889
2430 if (sample_type & PERF_SAMPLE_IP) { 2890 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs); 2891 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip); 2892 header.size += sizeof(ip);
2434 } 2893 }
2435 2894
@@ -2438,7 +2897,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2438 tid_entry.pid = perf_counter_pid(counter, current); 2897 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current); 2898 tid_entry.tid = perf_counter_tid(counter, current);
2440 2899
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry); 2900 header.size += sizeof(tid_entry);
2443 } 2901 }
2444 2902
@@ -2448,47 +2906,51 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2448 */ 2906 */
2449 time = sched_clock(); 2907 time = sched_clock();
2450 2908
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64); 2909 header.size += sizeof(u64);
2453 } 2910 }
2454 2911
2455 if (sample_type & PERF_SAMPLE_ADDR) { 2912 if (sample_type & PERF_SAMPLE_ADDR)
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64); 2913 header.size += sizeof(u64);
2458 }
2459 2914
2460 if (sample_type & PERF_SAMPLE_ID) { 2915 if (sample_type & PERF_SAMPLE_ID)
2461 header.type |= PERF_SAMPLE_ID; 2916 header.size += sizeof(u64);
2917
2918 if (sample_type & PERF_SAMPLE_STREAM_ID)
2462 header.size += sizeof(u64); 2919 header.size += sizeof(u64);
2463 }
2464 2920
2465 if (sample_type & PERF_SAMPLE_CPU) { 2921 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry); 2922 header.size += sizeof(cpu_entry);
2468 2923
2469 cpu_entry.cpu = raw_smp_processor_id(); 2924 cpu_entry.cpu = raw_smp_processor_id();
2925 cpu_entry.reserved = 0;
2470 } 2926 }
2471 2927
2472 if (sample_type & PERF_SAMPLE_PERIOD) { 2928 if (sample_type & PERF_SAMPLE_PERIOD)
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64); 2929 header.size += sizeof(u64);
2475 }
2476 2930
2477 if (sample_type & PERF_SAMPLE_GROUP) { 2931 if (sample_type & PERF_SAMPLE_READ)
2478 header.type |= PERF_SAMPLE_GROUP; 2932 header.size += perf_counter_read_size(counter);
2479 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry);
2481 }
2482 2933
2483 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2484 callchain = perf_callchain(data->regs); 2935 callchain = perf_callchain(data->regs);
2485 2936
2486 if (callchain) { 2937 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64); 2938 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size; 2939 header.size += callchain_size;
2491 } 2940 } else
2941 header.size += sizeof(u64);
2942 }
2943
2944 if (sample_type & PERF_SAMPLE_RAW) {
2945 int size = sizeof(u32);
2946
2947 if (data->raw)
2948 size += data->raw->size;
2949 else
2950 size += sizeof(u32);
2951
2952 WARN_ON_ONCE(size & (sizeof(u64)-1));
2953 header.size += size;
2492 } 2954 }
2493 2955
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2956 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2509,7 +2971,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2509 if (sample_type & PERF_SAMPLE_ADDR) 2971 if (sample_type & PERF_SAMPLE_ADDR)
2510 perf_output_put(&handle, data->addr); 2972 perf_output_put(&handle, data->addr);
2511 2973
2512 if (sample_type & PERF_SAMPLE_ID) 2974 if (sample_type & PERF_SAMPLE_ID) {
2975 u64 id = primary_counter_id(counter);
2976
2977 perf_output_put(&handle, id);
2978 }
2979
2980 if (sample_type & PERF_SAMPLE_STREAM_ID)
2513 perf_output_put(&handle, counter->id); 2981 perf_output_put(&handle, counter->id);
2514 2982
2515 if (sample_type & PERF_SAMPLE_CPU) 2983 if (sample_type & PERF_SAMPLE_CPU)
@@ -2518,76 +2986,125 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2518 if (sample_type & PERF_SAMPLE_PERIOD) 2986 if (sample_type & PERF_SAMPLE_PERIOD)
2519 perf_output_put(&handle, data->period); 2987 perf_output_put(&handle, data->period);
2520 2988
2521 /* 2989 if (sample_type & PERF_SAMPLE_READ)
2522 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2990 perf_output_read(&handle, counter);
2523 */
2524 if (sample_type & PERF_SAMPLE_GROUP) {
2525 struct perf_counter *leader, *sub;
2526 u64 nr = counter->nr_siblings;
2527 2991
2528 perf_output_put(&handle, nr); 2992 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2529 2993 if (callchain)
2530 leader = counter->group_leader; 2994 perf_output_copy(&handle, callchain, callchain_size);
2531 list_for_each_entry(sub, &leader->sibling_list, list_entry) { 2995 else {
2532 if (sub != counter) 2996 u64 nr = 0;
2533 sub->pmu->read(sub); 2997 perf_output_put(&handle, nr);
2534 2998 }
2535 group_entry.id = sub->id; 2999 }
2536 group_entry.counter = atomic64_read(&sub->count);
2537 3000
2538 perf_output_put(&handle, group_entry); 3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 if (data->raw) {
3003 perf_output_put(&handle, data->raw->size);
3004 perf_output_copy(&handle, data->raw->data, data->raw->size);
3005 } else {
3006 struct {
3007 u32 size;
3008 u32 data;
3009 } raw = {
3010 .size = sizeof(u32),
3011 .data = 0,
3012 };
3013 perf_output_put(&handle, raw);
2539 } 3014 }
2540 } 3015 }
2541 3016
2542 if (callchain) 3017 perf_output_end(&handle);
2543 perf_output_copy(&handle, callchain, callchain_size); 3018}
3019
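With the per-bit header.type flags gone, the record is identified by PERF_EVENT_SAMPLE alone and consumers recover the body purely from attr.sample_type: every optional field is written in the fixed order used by the size accounting and the perf_output_put() calls above. The sketch below is an illustrative consumer-side skeleton, not kernel code; the SAMPLE_* values restate the PERF_SAMPLE_* bits assumed from the ABI of this series.

    #include <stdint.h>

    /* Restated PERF_SAMPLE_* bits (assumed from the ABI of this series). */
    #define SAMPLE_IP         (1U << 0)
    #define SAMPLE_TID        (1U << 1)
    #define SAMPLE_TIME       (1U << 2)
    #define SAMPLE_ADDR       (1U << 3)
    #define SAMPLE_READ       (1U << 4)
    #define SAMPLE_CALLCHAIN  (1U << 5)
    #define SAMPLE_ID         (1U << 6)
    #define SAMPLE_CPU        (1U << 7)
    #define SAMPLE_PERIOD     (1U << 8)
    #define SAMPLE_STREAM_ID  (1U << 9)
    #define SAMPLE_RAW        (1U << 10)

    /*
     * Skip the fixed-size part of a PERF_EVENT_SAMPLE body; 'p' points
     * just past the perf_event_header.  Field order matches the output
     * sequence in perf_counter_output() above.
     */
    const uint64_t *skip_fixed_sample_fields(const uint64_t *p,
                                             uint64_t sample_type)
    {
        if (sample_type & SAMPLE_IP)        p++;  /* instruction pointer   */
        if (sample_type & SAMPLE_TID)       p++;  /* u32 pid, u32 tid      */
        if (sample_type & SAMPLE_TIME)      p++;  /* timestamp             */
        if (sample_type & SAMPLE_ADDR)      p++;  /* data address          */
        if (sample_type & SAMPLE_ID)        p++;  /* primary counter id    */
        if (sample_type & SAMPLE_STREAM_ID) p++;  /* this counter's id     */
        if (sample_type & SAMPLE_CPU)       p++;  /* u32 cpu, u32 reserved */
        if (sample_type & SAMPLE_PERIOD)    p++;  /* sample period         */
        /*
         * Variable-length tails follow, still in emission order:
         *   SAMPLE_READ       - the perf_output_read() payload
         *   SAMPLE_CALLCHAIN  - u64 nr, then nr u64 entries
         *   SAMPLE_RAW        - u32 size, then size bytes, u64 padded
         */
        return p;
    }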
3020/*
3021 * read event
3022 */
3023
3024struct perf_read_event {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 tid;
3029};
3030
3031static void
3032perf_counter_read_event(struct perf_counter *counter,
3033 struct task_struct *task)
3034{
3035 struct perf_output_handle handle;
3036 struct perf_read_event event = {
3037 .header = {
3038 .type = PERF_EVENT_READ,
3039 .misc = 0,
3040 .size = sizeof(event) + perf_counter_read_size(counter),
3041 },
3042 .pid = perf_counter_pid(counter, task),
3043 .tid = perf_counter_tid(counter, task),
3044 };
3045 int ret;
3046
3047 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3048 if (ret)
3049 return;
3050
3051 perf_output_put(&handle, event);
3052 perf_output_read(&handle, counter);
2544 3053
2545 perf_output_end(&handle); 3054 perf_output_end(&handle);
2546} 3055}
2547 3056
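Note that event.header.size already accounts for the variable-length read payload via perf_counter_read_size(), so a reader that does not care about PERF_EVENT_READ (or any other record type) can still advance through the ring buffer using nothing but the header. A minimal sketch, with an illustrative struct mirroring struct perf_event_header:

    #include <stdint.h>

    struct record_header {        /* mirrors struct perf_event_header   */
        uint32_t type;
        uint16_t misc;
        uint16_t size;            /* total record size, header included */
    };

    /* Step to the next record, whatever the current one contains. */
    const void *next_record(const void *rec)
    {
        const struct record_header *h = rec;
        return (const char *)rec + h->size;
    }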
2548/* 3057/*
2549 * fork tracking 3058 * task tracking -- fork/exit
3059 *
3060 * enabled by: attr.comm | attr.mmap | attr.task
2550 */ 3061 */
2551 3062
2552struct perf_fork_event { 3063struct perf_task_event {
2553 struct task_struct *task; 3064 struct task_struct *task;
3065 struct perf_counter_context *task_ctx;
2554 3066
2555 struct { 3067 struct {
2556 struct perf_event_header header; 3068 struct perf_event_header header;
2557 3069
2558 u32 pid; 3070 u32 pid;
2559 u32 ppid; 3071 u32 ppid;
3072 u32 tid;
3073 u32 ptid;
2560 } event; 3074 } event;
2561}; 3075};
2562 3076
2563static void perf_counter_fork_output(struct perf_counter *counter, 3077static void perf_counter_task_output(struct perf_counter *counter,
2564 struct perf_fork_event *fork_event) 3078 struct perf_task_event *task_event)
2565{ 3079{
2566 struct perf_output_handle handle; 3080 struct perf_output_handle handle;
2567 int size = fork_event->event.header.size; 3081 int size = task_event->event.header.size;
2568 struct task_struct *task = fork_event->task; 3082 struct task_struct *task = task_event->task;
2569 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3083 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2570 3084
2571 if (ret) 3085 if (ret)
2572 return; 3086 return;
2573 3087
2574 fork_event->event.pid = perf_counter_pid(counter, task); 3088 task_event->event.pid = perf_counter_pid(counter, task);
2575 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3089 task_event->event.ppid = perf_counter_pid(counter, current);
2576 3090
2577 perf_output_put(&handle, fork_event->event); 3091 task_event->event.tid = perf_counter_tid(counter, task);
3092 task_event->event.ptid = perf_counter_tid(counter, current);
3093
3094 perf_output_put(&handle, task_event->event);
2578 perf_output_end(&handle); 3095 perf_output_end(&handle);
2579} 3096}
2580 3097
2581static int perf_counter_fork_match(struct perf_counter *counter) 3098static int perf_counter_task_match(struct perf_counter *counter)
2582{ 3099{
2583 if (counter->attr.comm || counter->attr.mmap) 3100 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2584 return 1; 3101 return 1;
2585 3102
2586 return 0; 3103 return 0;
2587} 3104}
2588 3105
2589static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3106static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2590 struct perf_fork_event *fork_event) 3107 struct perf_task_event *task_event)
2591{ 3108{
2592 struct perf_counter *counter; 3109 struct perf_counter *counter;
2593 3110
@@ -2596,51 +3113,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2596 3113
2597 rcu_read_lock(); 3114 rcu_read_lock();
2598 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3115 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2599 if (perf_counter_fork_match(counter)) 3116 if (perf_counter_task_match(counter))
2600 perf_counter_fork_output(counter, fork_event); 3117 perf_counter_task_output(counter, task_event);
2601 } 3118 }
2602 rcu_read_unlock(); 3119 rcu_read_unlock();
2603} 3120}
2604 3121
2605static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3122static void perf_counter_task_event(struct perf_task_event *task_event)
2606{ 3123{
2607 struct perf_cpu_context *cpuctx; 3124 struct perf_cpu_context *cpuctx;
2608 struct perf_counter_context *ctx; 3125 struct perf_counter_context *ctx = task_event->task_ctx;
2609 3126
2610 cpuctx = &get_cpu_var(perf_cpu_context); 3127 cpuctx = &get_cpu_var(perf_cpu_context);
2611 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3128 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2612 put_cpu_var(perf_cpu_context); 3129 put_cpu_var(perf_cpu_context);
2613 3130
2614 rcu_read_lock(); 3131 rcu_read_lock();
2615 /* 3132 if (!ctx)
2616 * doesn't really matter which of the child contexts the 3133 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2617 * events ends up in.
2618 */
2619 ctx = rcu_dereference(current->perf_counter_ctxp);
2620 if (ctx) 3134 if (ctx)
2621 perf_counter_fork_ctx(ctx, fork_event); 3135 perf_counter_task_ctx(ctx, task_event);
2622 rcu_read_unlock(); 3136 rcu_read_unlock();
2623} 3137}
2624 3138
2625void perf_counter_fork(struct task_struct *task) 3139static void perf_counter_task(struct task_struct *task,
3140 struct perf_counter_context *task_ctx,
3141 int new)
2626{ 3142{
2627 struct perf_fork_event fork_event; 3143 struct perf_task_event task_event;
2628 3144
2629 if (!atomic_read(&nr_comm_counters) && 3145 if (!atomic_read(&nr_comm_counters) &&
2630 !atomic_read(&nr_mmap_counters)) 3146 !atomic_read(&nr_mmap_counters) &&
3147 !atomic_read(&nr_task_counters))
2631 return; 3148 return;
2632 3149
2633 fork_event = (struct perf_fork_event){ 3150 task_event = (struct perf_task_event){
2634 .task = task, 3151 .task = task,
2635 .event = { 3152 .task_ctx = task_ctx,
3153 .event = {
2636 .header = { 3154 .header = {
2637 .type = PERF_EVENT_FORK, 3155 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2638 .size = sizeof(fork_event.event), 3156 .misc = 0,
3157 .size = sizeof(task_event.event),
2639 }, 3158 },
3159 /* .pid */
3160 /* .ppid */
3161 /* .tid */
3162 /* .ptid */
2640 }, 3163 },
2641 }; 3164 };
2642 3165
2643 perf_counter_fork_event(&fork_event); 3166 perf_counter_task_event(&task_event);
3167}
3168
3169void perf_counter_fork(struct task_struct *task)
3170{
3171 perf_counter_task(task, NULL, 1);
2644} 3172}
2645 3173
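perf_counter_task() now backs both lifetime edges: new selects PERF_EVENT_FORK, otherwise PERF_EVENT_EXIT, and both record types share the same pid/ppid/tid/ptid body (the exit path passes its already-detached context explicitly, see perf_counter_exit_task() further down). A consumer can therefore use one handler for both; the sketch below is illustrative and takes the two type values as parameters rather than hard-coding ABI constants.

    #include <stdint.h>
    #include <stdio.h>

    struct task_event_body {      /* shared by FORK and EXIT records */
        uint32_t pid, ppid;
        uint32_t tid, ptid;
    };

    void handle_task_event(uint32_t type, uint32_t fork_type,
                           const struct task_event_body *e)
    {
        printf("%s: pid=%u ppid=%u tid=%u ptid=%u\n",
               type == fork_type ? "fork" : "exit",
               e->pid, e->ppid, e->tid, e->ptid);
    }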
2646/* 3174/*
@@ -2708,8 +3236,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2708 struct perf_cpu_context *cpuctx; 3236 struct perf_cpu_context *cpuctx;
2709 struct perf_counter_context *ctx; 3237 struct perf_counter_context *ctx;
2710 unsigned int size; 3238 unsigned int size;
2711 char *comm = comm_event->task->comm; 3239 char comm[TASK_COMM_LEN];
2712 3240
3241 memset(comm, 0, sizeof(comm));
3242 strncpy(comm, comm_event->task->comm, sizeof(comm));
2713 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3243 size = ALIGN(strlen(comm)+1, sizeof(u64));
2714 3244
2715 comm_event->comm = comm; 3245 comm_event->comm = comm;
@@ -2736,13 +3266,24 @@ void perf_counter_comm(struct task_struct *task)
2736{ 3266{
2737 struct perf_comm_event comm_event; 3267 struct perf_comm_event comm_event;
2738 3268
3269 if (task->perf_counter_ctxp)
3270 perf_counter_enable_on_exec(task);
3271
2739 if (!atomic_read(&nr_comm_counters)) 3272 if (!atomic_read(&nr_comm_counters))
2740 return; 3273 return;
2741 3274
2742 comm_event = (struct perf_comm_event){ 3275 comm_event = (struct perf_comm_event){
2743 .task = task, 3276 .task = task,
3277 /* .comm */
3278 /* .comm_size */
2744 .event = { 3279 .event = {
2745 .header = { .type = PERF_EVENT_COMM, }, 3280 .header = {
3281 .type = PERF_EVENT_COMM,
3282 .misc = 0,
3283 /* .size */
3284 },
3285 /* .pid */
3286 /* .tid */
2746 }, 3287 },
2747 }; 3288 };
2748 3289
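Copying task->comm into a zeroed TASK_COMM_LEN buffer first means the record can carry ALIGN(strlen(comm)+1, sizeof(u64)) bytes with deterministic zero padding instead of whatever happened to follow the string. The same rounding in plain, runnable C (the macro and sizes here are illustrative):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        char comm[16];                        /* TASK_COMM_LEN          */
        uint64_t size;

        memset(comm, 0, sizeof(comm));        /* zero padding, as above */
        strncpy(comm, "my-daemon", sizeof(comm) - 1);

        /* strlen+1 rounded up to a u64 multiple: 10 -> 16 on the wire */
        size = ALIGN_UP(strlen(comm) + 1, sizeof(uint64_t));
        printf("comm occupies %llu bytes in the COMM record\n",
               (unsigned long long)size);
        return 0;
    }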
@@ -2825,8 +3366,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2825 char *buf = NULL; 3366 char *buf = NULL;
2826 const char *name; 3367 const char *name;
2827 3368
3369 memset(tmp, 0, sizeof(tmp));
3370
2828 if (file) { 3371 if (file) {
2829 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3372 /*
3373 * d_path works from the end of the buffer backwards, so we
3374 * need to add enough zero bytes after the string to handle
3375 * the 64bit alignment we do later.
3376 */
3377 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
2830 if (!buf) { 3378 if (!buf) {
2831 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3379 name = strncpy(tmp, "//enomem", sizeof(tmp));
2832 goto got_name; 3380 goto got_name;
@@ -2837,9 +3385,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2837 goto got_name; 3385 goto got_name;
2838 } 3386 }
2839 } else { 3387 } else {
2840 name = arch_vma_name(mmap_event->vma); 3388 if (arch_vma_name(mmap_event->vma)) {
2841 if (name) 3389 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3390 sizeof(tmp));
2842 goto got_name; 3391 goto got_name;
3392 }
2843 3393
2844 if (!vma->vm_mm) { 3394 if (!vma->vm_mm) {
2845 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3395 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -2884,8 +3434,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2884 3434
2885 mmap_event = (struct perf_mmap_event){ 3435 mmap_event = (struct perf_mmap_event){
2886 .vma = vma, 3436 .vma = vma,
3437 /* .file_name */
3438 /* .file_size */
2887 .event = { 3439 .event = {
2888 .header = { .type = PERF_EVENT_MMAP, }, 3440 .header = {
3441 .type = PERF_EVENT_MMAP,
3442 .misc = 0,
3443 /* .size */
3444 },
3445 /* .pid */
3446 /* .tid */
2889 .start = vma->vm_start, 3447 .start = vma->vm_start,
2890 .len = vma->vm_end - vma->vm_start, 3448 .len = vma->vm_end - vma->vm_start,
2891 .pgoff = vma->vm_pgoff, 3449 .pgoff = vma->vm_pgoff,
@@ -2896,49 +3454,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2896} 3454}
2897 3455
2898/* 3456/*
2899 * Log sample_period changes so that analyzing tools can re-normalize the
2900 * event flow.
2901 */
2902
2903struct freq_event {
2904 struct perf_event_header header;
2905 u64 time;
2906 u64 id;
2907 u64 period;
2908};
2909
2910static void perf_log_period(struct perf_counter *counter, u64 period)
2911{
2912 struct perf_output_handle handle;
2913 struct freq_event event;
2914 int ret;
2915
2916 if (counter->hw.sample_period == period)
2917 return;
2918
2919 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2920 return;
2921
2922 event = (struct freq_event) {
2923 .header = {
2924 .type = PERF_EVENT_PERIOD,
2925 .misc = 0,
2926 .size = sizeof(event),
2927 },
2928 .time = sched_clock(),
2929 .id = counter->id,
2930 .period = period,
2931 };
2932
2933 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2934 if (ret)
2935 return;
2936
2937 perf_output_put(&handle, event);
2938 perf_output_end(&handle);
2939}
2940
2941/*
2942 * IRQ throttle logging 3457 * IRQ throttle logging
2943 */ 3458 */
2944 3459
@@ -2951,16 +3466,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2951 struct perf_event_header header; 3466 struct perf_event_header header;
2952 u64 time; 3467 u64 time;
2953 u64 id; 3468 u64 id;
3469 u64 stream_id;
2954 } throttle_event = { 3470 } throttle_event = {
2955 .header = { 3471 .header = {
2956 .type = PERF_EVENT_THROTTLE + 1, 3472 .type = PERF_EVENT_THROTTLE,
2957 .misc = 0, 3473 .misc = 0,
2958 .size = sizeof(throttle_event), 3474 .size = sizeof(throttle_event),
2959 }, 3475 },
2960 .time = sched_clock(), 3476 .time = sched_clock(),
2961 .id = counter->id, 3477 .id = primary_counter_id(counter),
3478 .stream_id = counter->id,
2962 }; 3479 };
2963 3480
3481 if (enable)
3482 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3483
2964 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3484 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
2965 if (ret) 3485 if (ret)
2966 return; 3486 return;
@@ -2970,7 +3490,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2970} 3490}
2971 3491
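The throttle record now uses the real type directly (PERF_EVENT_THROTTLE when throttling kicks in, PERF_EVENT_UNTHROTTLE when enable is set) and reports two identifiers: id comes from primary_counter_id(), defined outside this hunk and presumably resolving inherited per-task clones to their parent, while stream_id keeps the concrete counter instance. An illustrative consumer-side view of the body:

    #include <stdint.h>

    struct throttle_body {        /* follows the perf_event_header         */
        uint64_t time;            /* sched_clock() at the throttle point   */
        uint64_t id;              /* primary_counter_id(); assumed to be
                                     the inherited parent's id             */
        uint64_t stream_id;       /* counter->id, this specific instance   */
    };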
2972/* 3492/*
2973 * Generic counter overflow handling. 3493 * Generic counter overflow handling, sampling.
2974 */ 3494 */
2975 3495
2976int perf_counter_overflow(struct perf_counter *counter, int nmi, 3496int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3037,130 +3557,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3037 * Generic software counter infrastructure 3557 * Generic software counter infrastructure
3038 */ 3558 */
3039 3559
3040static void perf_swcounter_update(struct perf_counter *counter) 3560/*
3561 * We directly increment counter->count and keep a second value in
3562 * counter->hw.period_left to count intervals. This period counter
3563 * is kept in the range [-sample_period, 0] so that we can use the
3564 * sign as trigger.
3565 */
3566
3567static u64 perf_swcounter_set_period(struct perf_counter *counter)
3041{ 3568{
3042 struct hw_perf_counter *hwc = &counter->hw; 3569 struct hw_perf_counter *hwc = &counter->hw;
3043 u64 prev, now; 3570 u64 period = hwc->last_period;
3044 s64 delta; 3571 u64 nr, offset;
3572 s64 old, val;
3573
3574 hwc->last_period = hwc->sample_period;
3045 3575
3046again: 3576again:
3047 prev = atomic64_read(&hwc->prev_count); 3577 old = val = atomic64_read(&hwc->period_left);
3048 now = atomic64_read(&hwc->count); 3578 if (val < 0)
3049 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3579 return 0;
3050 goto again;
3051 3580
3052 delta = now - prev; 3581 nr = div64_u64(period + val, period);
3582 offset = nr * period;
3583 val -= offset;
3584 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3585 goto again;
3053 3586
3054 atomic64_add(delta, &counter->count); 3587 return nr;
3055 atomic64_sub(delta, &hwc->period_left);
3056} 3588}
3057 3589
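The rewritten perf_swcounter_set_period() keeps hwc->period_left in the range [-sample_period, 0] and, once the value has gone non-negative, computes how many whole periods were consumed since the last reset; the cmpxchg loop makes that safe against concurrent increments from NMI context. The arithmetic on its own, as a runnable (non-atomic) sketch:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Non-atomic restatement of the period bookkeeping above: given the
     * current period_left and the period length, return how many whole
     * periods elapsed and move period_left back into [-period, 0].
     */
    uint64_t set_period(int64_t *period_left, uint64_t period)
    {
        int64_t val = *period_left;
        uint64_t nr, offset;

        if (val < 0)                     /* still inside this period  */
            return 0;

        nr = (period + val) / period;    /* div64_u64() in the kernel */
        offset = nr * period;
        *period_left = val - (int64_t)offset;
        return nr;
    }

    int main(void)
    {
        int64_t left = 250;              /* overshot the boundary by 250 */
        uint64_t nr = set_period(&left, 100);

        printf("overflows=%llu, new period_left=%lld\n",
               (unsigned long long)nr, (long long)left);  /* 3 and -50 */
        return 0;
    }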
3058static void perf_swcounter_set_period(struct perf_counter *counter) 3590static void perf_swcounter_overflow(struct perf_counter *counter,
3591 int nmi, struct perf_sample_data *data)
3059{ 3592{
3060 struct hw_perf_counter *hwc = &counter->hw; 3593 struct hw_perf_counter *hwc = &counter->hw;
3061 s64 left = atomic64_read(&hwc->period_left); 3594 u64 overflow;
3062 s64 period = hwc->sample_period;
3063 3595
3064 if (unlikely(left <= -period)) { 3596 data->period = counter->hw.last_period;
3065 left = period; 3597 overflow = perf_swcounter_set_period(counter);
3066 atomic64_set(&hwc->period_left, left);
3067 hwc->last_period = period;
3068 }
3069 3598
3070 if (unlikely(left <= 0)) { 3599 if (hwc->interrupts == MAX_INTERRUPTS)
3071 left += period; 3600 return;
3072 atomic64_add(period, &hwc->period_left);
3073 hwc->last_period = period;
3074 }
3075 3601
3076 atomic64_set(&hwc->prev_count, -left); 3602 for (; overflow; overflow--) {
3077 atomic64_set(&hwc->count, -left); 3603 if (perf_counter_overflow(counter, nmi, data)) {
3604 /*
3605 * We inhibit the overflow from happening when
3606 * hwc->interrupts == MAX_INTERRUPTS.
3607 */
3608 break;
3609 }
3610 }
3078} 3611}
3079 3612
3080static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3613static void perf_swcounter_unthrottle(struct perf_counter *counter)
3081{ 3614{
3082 enum hrtimer_restart ret = HRTIMER_RESTART;
3083 struct perf_sample_data data;
3084 struct perf_counter *counter;
3085 u64 period;
3086
3087 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3088 counter->pmu->read(counter);
3089
3090 data.addr = 0;
3091 data.regs = get_irq_regs();
3092 /* 3615 /*
3093 * In case we exclude kernel IPs or are somehow not in interrupt 3616 * Nothing to do, we already reset hwc->interrupts.
3094 * context, provide the next best thing, the user IP.
3095 */ 3617 */
3096 if ((counter->attr.exclude_kernel || !data.regs) &&
3097 !counter->attr.exclude_user)
3098 data.regs = task_pt_regs(current);
3099
3100 if (data.regs) {
3101 if (perf_counter_overflow(counter, 0, &data))
3102 ret = HRTIMER_NORESTART;
3103 }
3104
3105 period = max_t(u64, 10000, counter->hw.sample_period);
3106 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3107
3108 return ret;
3109} 3618}
3110 3619
3111static void perf_swcounter_overflow(struct perf_counter *counter, 3620static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3112 int nmi, struct pt_regs *regs, u64 addr) 3621 int nmi, struct perf_sample_data *data)
3113{ 3622{
3114 struct perf_sample_data data = { 3623 struct hw_perf_counter *hwc = &counter->hw;
3115 .regs = regs, 3624
3116 .addr = addr, 3625 atomic64_add(nr, &counter->count);
3117 .period = counter->hw.last_period,
3118 };
3119 3626
3120 perf_swcounter_update(counter); 3627 if (!hwc->sample_period)
3121 perf_swcounter_set_period(counter); 3628 return;
3122 if (perf_counter_overflow(counter, nmi, &data)) 3629
3123 /* soft-disable the counter */ 3630 if (!data->regs)
3124 ; 3631 return;
3125 3632
3633 if (!atomic64_add_negative(nr, &hwc->period_left))
3634 perf_swcounter_overflow(counter, nmi, data);
3126} 3635}
3127 3636
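perf_swcounter_add() always accumulates into counter->count; the sampling side only engages when a sample_period is set and registers are available, and the overflow path runs exactly on the addition that drives period_left from negative to non-negative, which is what the !atomic64_add_negative() test expresses. A short, non-atomic restatement of that trigger:

    #include <stdint.h>

    /* Non-atomic equivalent of !atomic64_add_negative(nr, &period_left). */
    int crossed_period_boundary(int64_t *period_left, uint64_t nr)
    {
        *period_left += (int64_t)nr;
        return *period_left >= 0;
    }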
3128static int perf_swcounter_is_counting(struct perf_counter *counter) 3637static int perf_swcounter_is_counting(struct perf_counter *counter)
3129{ 3638{
3130 struct perf_counter_context *ctx; 3639 /*
3131 unsigned long flags; 3640 * The counter is active, we're good!
3132 int count; 3641 */
3133
3134 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3642 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3135 return 1; 3643 return 1;
3136 3644
3645 /*
3646 * The counter is off/error, not counting.
3647 */
3137 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3648 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3138 return 0; 3649 return 0;
3139 3650
3140 /* 3651 /*
3141 * If the counter is inactive, it could be just because 3652 * The counter is inactive, if the context is active
3142 * its task is scheduled out, or because it's in a group 3653 * we're part of a group that didn't make it on the 'pmu',
3143 * which could not go on the PMU. We want to count in 3654 * not counting.
3144 * the first case but not the second. If the context is
3145 * currently active then an inactive software counter must
3146 * be the second case. If it's not currently active then
3147 * we need to know whether the counter was active when the
3148 * context was last active, which we can determine by
3149 * comparing counter->tstamp_stopped with ctx->time.
3150 *
3151 * We are within an RCU read-side critical section,
3152 * which protects the existence of *ctx.
3153 */ 3655 */
3154 ctx = counter->ctx; 3656 if (counter->ctx->is_active)
3155 spin_lock_irqsave(&ctx->lock, flags); 3657 return 0;
3156 count = 1; 3658
3157 /* Re-check state now we have the lock */ 3659 /*
3158 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3660 * We're inactive and the context is too, this means the
3159 counter->ctx->is_active || 3661 * task is scheduled out, we're counting events that happen
3160 counter->tstamp_stopped < ctx->time) 3662 * to us, like migration events.
3161 count = 0; 3663 */
3162 spin_unlock_irqrestore(&ctx->lock, flags); 3664 return 1;
3163 return count;
3164} 3665}
3165 3666
3166static int perf_swcounter_match(struct perf_counter *counter, 3667static int perf_swcounter_match(struct perf_counter *counter,
@@ -3186,19 +3687,10 @@ static int perf_swcounter_match(struct perf_counter *counter,
3186 return 1; 3687 return 1;
3187} 3688}
3188 3689
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr)
3191{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193
3194 if (counter->hw.sample_period && !neg && regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr);
3196}
3197
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3690static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event, 3691 enum perf_type_id type,
3200 u64 nr, int nmi, struct pt_regs *regs, 3692 u32 event, u64 nr, int nmi,
3201 u64 addr) 3693 struct perf_sample_data *data)
3202{ 3694{
3203 struct perf_counter *counter; 3695 struct perf_counter *counter;
3204 3696
@@ -3207,8 +3699,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3207 3699
3208 rcu_read_lock(); 3700 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3701 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs)) 3702 if (perf_swcounter_match(counter, type, event, data->regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr); 3703 perf_swcounter_add(counter, nr, nmi, data);
3212 } 3704 }
3213 rcu_read_unlock(); 3705 rcu_read_unlock();
3214} 3706}
@@ -3227,9 +3719,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3227 return &cpuctx->recursion[0]; 3719 return &cpuctx->recursion[0];
3228} 3720}
3229 3721
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event, 3722static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs, 3723 u64 nr, int nmi,
3232 u64 addr) 3724 struct perf_sample_data *data)
3233{ 3725{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3726 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx); 3727 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3734,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3242 barrier(); 3734 barrier();
3243 3735
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3736 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr); 3737 nr, nmi, data);
3246 rcu_read_lock(); 3738 rcu_read_lock();
3247 /* 3739 /*
3248 * doesn't really matter which of the child contexts the 3740 * doesn't really matter which of the child contexts the
@@ -3250,7 +3742,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3250 */ 3742 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp); 3743 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx) 3744 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); 3745 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3254 rcu_read_unlock(); 3746 rcu_read_unlock();
3255 3747
3256 barrier(); 3748 barrier();
@@ -3260,35 +3752,79 @@ out:
3260 put_cpu_var(perf_cpu_context); 3752 put_cpu_var(perf_cpu_context);
3261} 3753}
3262 3754
3263void 3755void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3756 struct pt_regs *regs, u64 addr)
3265{ 3757{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); 3758 struct perf_sample_data data = {
3759 .regs = regs,
3760 .addr = addr,
3761 };
3762
3763 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3267} 3764}
3268 3765
3269static void perf_swcounter_read(struct perf_counter *counter) 3766static void perf_swcounter_read(struct perf_counter *counter)
3270{ 3767{
3271 perf_swcounter_update(counter);
3272} 3768}
3273 3769
3274static int perf_swcounter_enable(struct perf_counter *counter) 3770static int perf_swcounter_enable(struct perf_counter *counter)
3275{ 3771{
3276 perf_swcounter_set_period(counter); 3772 struct hw_perf_counter *hwc = &counter->hw;
3773
3774 if (hwc->sample_period) {
3775 hwc->last_period = hwc->sample_period;
3776 perf_swcounter_set_period(counter);
3777 }
3277 return 0; 3778 return 0;
3278} 3779}
3279 3780
3280static void perf_swcounter_disable(struct perf_counter *counter) 3781static void perf_swcounter_disable(struct perf_counter *counter)
3281{ 3782{
3282 perf_swcounter_update(counter);
3283} 3783}
3284 3784
3285static const struct pmu perf_ops_generic = { 3785static const struct pmu perf_ops_generic = {
3286 .enable = perf_swcounter_enable, 3786 .enable = perf_swcounter_enable,
3287 .disable = perf_swcounter_disable, 3787 .disable = perf_swcounter_disable,
3288 .read = perf_swcounter_read, 3788 .read = perf_swcounter_read,
3789 .unthrottle = perf_swcounter_unthrottle,
3289}; 3790};
3290 3791
3291/* 3792/*
3793 * hrtimer based swcounter callback
3794 */
3795
3796static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3797{
3798 enum hrtimer_restart ret = HRTIMER_RESTART;
3799 struct perf_sample_data data;
3800 struct perf_counter *counter;
3801 u64 period;
3802
3803 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3804 counter->pmu->read(counter);
3805
3806 data.addr = 0;
3807 data.regs = get_irq_regs();
3808 /*
3809 * In case we exclude kernel IPs or are somehow not in interrupt
3810 * context, provide the next best thing, the user IP.
3811 */
3812 if ((counter->attr.exclude_kernel || !data.regs) &&
3813 !counter->attr.exclude_user)
3814 data.regs = task_pt_regs(current);
3815
3816 if (data.regs) {
3817 if (perf_counter_overflow(counter, 0, &data))
3818 ret = HRTIMER_NORESTART;
3819 }
3820
3821 period = max_t(u64, 10000, counter->hw.sample_period);
3822 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3823
3824 return ret;
3825}
3826
3827/*
3292 * Software counter: cpu wall time clock 3828 * Software counter: cpu wall time clock
3293 */ 3829 */
3294 3830
@@ -3404,36 +3940,25 @@ static const struct pmu perf_ops_task_clock = {
3404 .read = task_clock_perf_counter_read, 3940 .read = task_clock_perf_counter_read,
3405}; 3941};
3406 3942
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE 3943#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id) 3944void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3945 int entry_size)
3430{ 3946{
3431 struct pt_regs *regs = get_irq_regs(); 3947 struct perf_raw_record raw = {
3948 .size = entry_size,
3949 .data = record,
3950 };
3432 3951
3433 if (!regs) 3952 struct perf_sample_data data = {
3434 regs = task_pt_regs(current); 3953 .regs = get_irq_regs(),
3954 .addr = addr,
3955 .raw = &raw,
3956 };
3435 3957
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); 3958 if (!data.regs)
3959 data.regs = task_pt_regs(current);
3960
3961 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3437} 3962}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3963EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439 3964
@@ -3442,16 +3967,21 @@ extern void ftrace_profile_disable(int);
3442 3967
3443static void tp_perf_counter_destroy(struct perf_counter *counter) 3968static void tp_perf_counter_destroy(struct perf_counter *counter)
3444{ 3969{
3445 ftrace_profile_disable(perf_event_id(&counter->attr)); 3970 ftrace_profile_disable(counter->attr.config);
3446} 3971}
3447 3972
3448static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3973static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3449{ 3974{
3450 int event_id = perf_event_id(&counter->attr); 3975 /*
3451 int ret; 3976 * Raw tracepoint data is a severe data leak, only allow root to
3977 * have these.
3978 */
3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3981 !capable(CAP_SYS_ADMIN))
3982 return ERR_PTR(-EPERM);
3452 3983
3453 ret = ftrace_profile_enable(event_id); 3984 if (ftrace_profile_enable(counter->attr.config))
3454 if (ret)
3455 return NULL; 3985 return NULL;
3456 3986
3457 counter->destroy = tp_perf_counter_destroy; 3987 counter->destroy = tp_perf_counter_destroy;
@@ -3465,9 +3995,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3465} 3995}
3466#endif 3996#endif
3467 3997
3998atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3999
4000static void sw_perf_counter_destroy(struct perf_counter *counter)
4001{
4002 u64 event = counter->attr.config;
4003
4004 WARN_ON(counter->parent);
4005
4006 atomic_dec(&perf_swcounter_enabled[event]);
4007}
4008
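perf_swcounter_enabled[] is a per-event-type reference count, bumped only for counters that are not inherited clones and dropped again in sw_perf_counter_destroy(); the corresponding fast-path checks live outside this hunk. The general pattern, sketched with placeholder names:

    #include <stdatomic.h>

    #define EV_MAX 16
    _Atomic int sw_enabled[EV_MAX];   /* one refcount per event type */

    static inline void sw_event(int ev /* , payload ... */)
    {
        /* Fast path: nobody opened a counter of this type. */
        if (!atomic_load_explicit(&sw_enabled[ev], memory_order_relaxed))
            return;
        /* ... otherwise dispatch to the interested counters ... */
    }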
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 4009static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{ 4010{
3470 const struct pmu *pmu = NULL; 4011 const struct pmu *pmu = NULL;
4012 u64 event = counter->attr.config;
3471 4013
3472 /* 4014 /*
3473 * Software counters (currently) can't in general distinguish 4015 * Software counters (currently) can't in general distinguish
@@ -3476,7 +4018,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3476 * to be kernel events, and page faults are never hypervisor 4018 * to be kernel events, and page faults are never hypervisor
3477 * events. 4019 * events.
3478 */ 4020 */
3479 switch (counter->attr.config) { 4021 switch (event) {
3480 case PERF_COUNT_SW_CPU_CLOCK: 4022 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock; 4023 pmu = &perf_ops_cpu_clock;
3482 4024
@@ -3497,6 +4039,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4039 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4040 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS: 4041 case PERF_COUNT_SW_CPU_MIGRATIONS:
4042 if (!counter->parent) {
4043 atomic_inc(&perf_swcounter_enabled[event]);
4044 counter->destroy = sw_perf_counter_destroy;
4045 }
3500 pmu = &perf_ops_generic; 4046 pmu = &perf_ops_generic;
3501 break; 4047 break;
3502 } 4048 }
@@ -3512,6 +4058,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu, 4058 int cpu,
3513 struct perf_counter_context *ctx, 4059 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader, 4060 struct perf_counter *group_leader,
4061 struct perf_counter *parent_counter,
3515 gfp_t gfpflags) 4062 gfp_t gfpflags)
3516{ 4063{
3517 const struct pmu *pmu; 4064 const struct pmu *pmu;
@@ -3547,6 +4094,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3547 counter->ctx = ctx; 4094 counter->ctx = ctx;
3548 counter->oncpu = -1; 4095 counter->oncpu = -1;
3549 4096
4097 counter->parent = parent_counter;
4098
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 4099 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id); 4100 counter->id = atomic64_inc_return(&perf_counter_id);
3552 4101
@@ -3561,13 +4110,14 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3561 hwc->sample_period = attr->sample_period; 4110 hwc->sample_period = attr->sample_period;
3562 if (attr->freq && attr->sample_freq) 4111 if (attr->freq && attr->sample_freq)
3563 hwc->sample_period = 1; 4112 hwc->sample_period = 1;
4113 hwc->last_period = hwc->sample_period;
3564 4114
3565 atomic64_set(&hwc->period_left, hwc->sample_period); 4115 atomic64_set(&hwc->period_left, hwc->sample_period);
3566 4116
3567 /* 4117 /*
3568 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4118 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3569 */ 4119 */
3570 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4120 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3571 goto done; 4121 goto done;
3572 4122
3573 switch (attr->type) { 4123 switch (attr->type) {
@@ -3604,11 +4154,15 @@ done:
3604 4154
3605 counter->pmu = pmu; 4155 counter->pmu = pmu;
3606 4156
3607 atomic_inc(&nr_counters); 4157 if (!counter->parent) {
3608 if (counter->attr.mmap) 4158 atomic_inc(&nr_counters);
3609 atomic_inc(&nr_mmap_counters); 4159 if (counter->attr.mmap)
3610 if (counter->attr.comm) 4160 atomic_inc(&nr_mmap_counters);
3611 atomic_inc(&nr_comm_counters); 4161 if (counter->attr.comm)
4162 atomic_inc(&nr_comm_counters);
4163 if (counter->attr.task)
4164 atomic_inc(&nr_task_counters);
4165 }
3612 4166
3613 return counter; 4167 return counter;
3614} 4168}
@@ -3661,6 +4215,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
3661 if (val) 4215 if (val)
3662 goto err_size; 4216 goto err_size;
3663 } 4217 }
4218 size = sizeof(*attr);
3664 } 4219 }
3665 4220
3666 ret = copy_from_user(attr, uattr, size); 4221 ret = copy_from_user(attr, uattr, size);
@@ -3692,6 +4247,57 @@ err_size:
3692 goto out; 4247 goto out;
3693} 4248}
3694 4249
4250int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4251{
4252 struct perf_counter *output_counter = NULL;
4253 struct file *output_file = NULL;
4254 struct perf_counter *old_output;
4255 int fput_needed = 0;
4256 int ret = -EINVAL;
4257
4258 if (!output_fd)
4259 goto set;
4260
4261 output_file = fget_light(output_fd, &fput_needed);
4262 if (!output_file)
4263 return -EBADF;
4264
4265 if (output_file->f_op != &perf_fops)
4266 goto out;
4267
4268 output_counter = output_file->private_data;
4269
4270 /* Don't chain output fds */
4271 if (output_counter->output)
4272 goto out;
4273
4274 /* Don't set an output fd when we already have an output channel */
4275 if (counter->data)
4276 goto out;
4277
4278 atomic_long_inc(&output_file->f_count);
4279
4280set:
4281 mutex_lock(&counter->mmap_mutex);
4282 old_output = counter->output;
4283 rcu_assign_pointer(counter->output, output_counter);
4284 mutex_unlock(&counter->mmap_mutex);
4285
4286 if (old_output) {
4287 /*
4288 * we need to make sure no existing perf_output_*()
4289 * is still referencing this counter.
4290 */
4291 synchronize_rcu();
4292 fput(old_output->filp);
4293 }
4294
4295 ret = 0;
4296out:
4297 fput_light(output_file, fput_needed);
4298 return ret;
4299}
4300
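perf_counter_set_output() lets a counter redirect its records into another counter's mmap buffer: the target must itself be a perf counter fd, must not redirect further (no chains), and the caller must not already own a buffer; the old target is only released after synchronize_rcu(), so any perf_output_begin()/perf_output_end() pair still in flight keeps a valid reference. Together with the PERF_FLAG_FD_OUTPUT handling in sys_perf_counter_open() below, this lets several counters share one ring buffer. The three validation rules, restated as a standalone predicate (struct and field names are stand-ins for struct perf_counter):

    #include <stdbool.h>

    struct ctr {
        struct ctr *output;    /* counter we already redirect into     */
        void *data;            /* our own mmap buffer, if any          */
        bool is_perf_fd;       /* the target file is a perf counter fd */
    };

    bool may_redirect(const struct ctr *self, const struct ctr *target)
    {
        if (!target->is_perf_fd)   /* f_op != &perf_fops               */
            return false;
        if (target->output)        /* don't chain output fds           */
            return false;
        if (self->data)            /* we already have our own buffer   */
            return false;
        return true;
    }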
3695/** 4301/**
3696 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu 4302 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3697 * 4303 *
@@ -3711,15 +4317,15 @@ SYSCALL_DEFINE5(perf_counter_open,
3711 struct file *group_file = NULL; 4317 struct file *group_file = NULL;
3712 int fput_needed = 0; 4318 int fput_needed = 0;
3713 int fput_needed2 = 0; 4319 int fput_needed2 = 0;
3714 int ret; 4320 int err;
3715 4321
3716 /* for future expandability... */ 4322 /* for future expandability... */
3717 if (flags) 4323 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
3718 return -EINVAL; 4324 return -EINVAL;
3719 4325
3720 ret = perf_copy_attr(attr_uptr, &attr); 4326 err = perf_copy_attr(attr_uptr, &attr);
3721 if (ret) 4327 if (err)
3722 return ret; 4328 return err;
3723 4329
3724 if (!attr.exclude_kernel) { 4330 if (!attr.exclude_kernel) {
3725 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 4331 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -3742,8 +4348,8 @@ SYSCALL_DEFINE5(perf_counter_open,
3742 * Look up the group leader (we will attach this counter to it): 4348 * Look up the group leader (we will attach this counter to it):
3743 */ 4349 */
3744 group_leader = NULL; 4350 group_leader = NULL;
3745 if (group_fd != -1) { 4351 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
3746 ret = -EINVAL; 4352 err = -EINVAL;
3747 group_file = fget_light(group_fd, &fput_needed); 4353 group_file = fget_light(group_fd, &fput_needed);
3748 if (!group_file) 4354 if (!group_file)
3749 goto err_put_context; 4355 goto err_put_context;
@@ -3771,19 +4377,25 @@ SYSCALL_DEFINE5(perf_counter_open,
3771 } 4377 }
3772 4378
3773 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4379 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3774 GFP_KERNEL); 4380 NULL, GFP_KERNEL);
3775 ret = PTR_ERR(counter); 4381 err = PTR_ERR(counter);
3776 if (IS_ERR(counter)) 4382 if (IS_ERR(counter))
3777 goto err_put_context; 4383 goto err_put_context;
3778 4384
3779 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); 4385 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3780 if (ret < 0) 4386 if (err < 0)
3781 goto err_free_put_context; 4387 goto err_free_put_context;
3782 4388
3783 counter_file = fget_light(ret, &fput_needed2); 4389 counter_file = fget_light(err, &fput_needed2);
3784 if (!counter_file) 4390 if (!counter_file)
3785 goto err_free_put_context; 4391 goto err_free_put_context;
3786 4392
4393 if (flags & PERF_FLAG_FD_OUTPUT) {
4394 err = perf_counter_set_output(counter, group_fd);
4395 if (err)
4396 goto err_fput_free_put_context;
4397 }
4398
3787 counter->filp = counter_file; 4399 counter->filp = counter_file;
3788 WARN_ON_ONCE(ctx->parent_ctx); 4400 WARN_ON_ONCE(ctx->parent_ctx);
3789 mutex_lock(&ctx->mutex); 4401 mutex_lock(&ctx->mutex);
@@ -3797,20 +4409,20 @@ SYSCALL_DEFINE5(perf_counter_open,
3797 list_add_tail(&counter->owner_entry, &current->perf_counter_list); 4409 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
3798 mutex_unlock(&current->perf_counter_mutex); 4410 mutex_unlock(&current->perf_counter_mutex);
3799 4411
4412err_fput_free_put_context:
3800 fput_light(counter_file, fput_needed2); 4413 fput_light(counter_file, fput_needed2);
3801 4414
3802out_fput:
3803 fput_light(group_file, fput_needed);
3804
3805 return ret;
3806
3807err_free_put_context: 4415err_free_put_context:
3808 kfree(counter); 4416 if (err < 0)
4417 kfree(counter);
3809 4418
3810err_put_context: 4419err_put_context:
3811 put_ctx(ctx); 4420 if (err < 0)
4421 put_ctx(ctx);
3812 4422
3813 goto out_fput; 4423 fput_light(group_file, fput_needed);
4424
4425 return err;
3814} 4426}
3815 4427
3816/* 4428/*
@@ -3837,7 +4449,8 @@ inherit_counter(struct perf_counter *parent_counter,
3837 4449
3838 child_counter = perf_counter_alloc(&parent_counter->attr, 4450 child_counter = perf_counter_alloc(&parent_counter->attr,
3839 parent_counter->cpu, child_ctx, 4451 parent_counter->cpu, child_ctx,
3840 group_leader, GFP_KERNEL); 4452 group_leader, parent_counter,
4453 GFP_KERNEL);
3841 if (IS_ERR(child_counter)) 4454 if (IS_ERR(child_counter))
3842 return child_counter; 4455 return child_counter;
3843 get_ctx(child_ctx); 4456 get_ctx(child_ctx);
@@ -3860,12 +4473,6 @@ inherit_counter(struct perf_counter *parent_counter,
3860 */ 4473 */
3861 add_counter_to_ctx(child_counter, child_ctx); 4474 add_counter_to_ctx(child_counter, child_ctx);
3862 4475
3863 child_counter->parent = parent_counter;
3864 /*
3865 * inherit into child's child as well:
3866 */
3867 child_counter->attr.inherit = 1;
3868
3869 /* 4476 /*
3870 * Get a reference to the parent filp - we will fput it 4477 * Get a reference to the parent filp - we will fput it
3871 * when the child counter exits. This is safe to do because 4478 * when the child counter exits. This is safe to do because
@@ -3909,10 +4516,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3909} 4516}
3910 4517
3911static void sync_child_counter(struct perf_counter *child_counter, 4518static void sync_child_counter(struct perf_counter *child_counter,
3912 struct perf_counter *parent_counter) 4519 struct task_struct *child)
3913{ 4520{
4521 struct perf_counter *parent_counter = child_counter->parent;
3914 u64 child_val; 4522 u64 child_val;
3915 4523
4524 if (child_counter->attr.inherit_stat)
4525 perf_counter_read_event(child_counter, child);
4526
3916 child_val = atomic64_read(&child_counter->count); 4527 child_val = atomic64_read(&child_counter->count);
3917 4528
3918 /* 4529 /*
@@ -3941,7 +4552,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3941 4552
3942static void 4553static void
3943__perf_counter_exit_task(struct perf_counter *child_counter, 4554__perf_counter_exit_task(struct perf_counter *child_counter,
3944 struct perf_counter_context *child_ctx) 4555 struct perf_counter_context *child_ctx,
4556 struct task_struct *child)
3945{ 4557{
3946 struct perf_counter *parent_counter; 4558 struct perf_counter *parent_counter;
3947 4559
@@ -3955,7 +4567,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3955 * counters need to be zapped - but otherwise linger. 4567 * counters need to be zapped - but otherwise linger.
3956 */ 4568 */
3957 if (parent_counter) { 4569 if (parent_counter) {
3958 sync_child_counter(child_counter, parent_counter); 4570 sync_child_counter(child_counter, child);
3959 free_counter(child_counter); 4571 free_counter(child_counter);
3960 } 4572 }
3961} 4573}
@@ -3969,8 +4581,10 @@ void perf_counter_exit_task(struct task_struct *child)
3969 struct perf_counter_context *child_ctx; 4581 struct perf_counter_context *child_ctx;
3970 unsigned long flags; 4582 unsigned long flags;
3971 4583
3972 if (likely(!child->perf_counter_ctxp)) 4584 if (likely(!child->perf_counter_ctxp)) {
4585 perf_counter_task(child, NULL, 0);
3973 return; 4586 return;
4587 }
3974 4588
3975 local_irq_save(flags); 4589 local_irq_save(flags);
3976 /* 4590 /*
@@ -3989,17 +4603,20 @@ void perf_counter_exit_task(struct task_struct *child)
3989 */ 4603 */
3990 spin_lock(&child_ctx->lock); 4604 spin_lock(&child_ctx->lock);
3991 child->perf_counter_ctxp = NULL; 4605 child->perf_counter_ctxp = NULL;
3992 if (child_ctx->parent_ctx) { 4606 /*
3993 /* 4607 * If this context is a clone; unclone it so it can't get
3994 * This context is a clone; unclone it so it can't get 4608 * swapped to another process while we're removing all
3995 * swapped to another process while we're removing all 4609 * the counters from it.
3996 * the counters from it. 4610 */
3997 */ 4611 unclone_ctx(child_ctx);
3998 put_ctx(child_ctx->parent_ctx); 4612 spin_unlock_irqrestore(&child_ctx->lock, flags);
3999 child_ctx->parent_ctx = NULL; 4613
4000 } 4614 /*
4001 spin_unlock(&child_ctx->lock); 4615 * Report the task dead after unscheduling the counters so that we
4002 local_irq_restore(flags); 4616 * won't get any samples after PERF_EVENT_EXIT. We can however still
4617 * get a few PERF_EVENT_READ events.
4618 */
4619 perf_counter_task(child, child_ctx, 0);
4003 4620
4004 /* 4621 /*
4005 * We can recurse on the same lock type through: 4622 * We can recurse on the same lock type through:
@@ -4017,7 +4634,7 @@ void perf_counter_exit_task(struct task_struct *child)
4017again: 4634again:
4018 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4635 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4019 list_entry) 4636 list_entry)
4020 __perf_counter_exit_task(child_counter, child_ctx); 4637 __perf_counter_exit_task(child_counter, child_ctx, child);
4021 4638
4022 /* 4639 /*
4023 * If the last counter was a group counter, it will have appended all 4640 * If the last counter was a group counter, it will have appended all
@@ -4220,6 +4837,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4220 perf_counter_init_cpu(cpu); 4837 perf_counter_init_cpu(cpu);
4221 break; 4838 break;
4222 4839
4840 case CPU_ONLINE:
4841 case CPU_ONLINE_FROZEN:
4842 hw_perf_counter_setup_online(cpu);
4843 break;
4844
4223 case CPU_DOWN_PREPARE: 4845 case CPU_DOWN_PREPARE:
4224 case CPU_DOWN_PREPARE_FROZEN: 4846 case CPU_DOWN_PREPARE_FROZEN:
4225 perf_counter_exit_cpu(cpu); 4847 perf_counter_exit_cpu(cpu);
@@ -4244,6 +4866,8 @@ void __init perf_counter_init(void)
4244{ 4866{
4245 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4867 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4246 (void *)(long)smp_processor_id()); 4868 (void *)(long)smp_processor_id());
4869 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4870 (void *)(long)smp_processor_id());
4247 register_cpu_notifier(&perf_cpu_nb); 4871 register_cpu_notifier(&perf_cpu_nb);
4248} 4872}
4249 4873
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd281..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */ 380 */
381struct task_struct *find_task_by_pid_type_ns(int type, int nr, 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 struct pid_namespace *ns)
383{ 382{
384 return pid_task(find_pid_ns(nr, ns), type); 383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
385} 384}
386 385
387EXPORT_SYMBOL(find_task_by_pid_type_ns);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 386struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 387{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399} 389}
400EXPORT_SYMBOL(find_task_by_pid_ns);
401 390
402struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 391struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 392{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858d..821722ae58a7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
67 return NULL; 67 return NULL;
68} 68}
69 69
70static struct pid_namespace *create_pid_namespace(unsigned int level) 70static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
71{ 71{
72 struct pid_namespace *ns; 72 struct pid_namespace *ns;
73 unsigned int level = parent_pid_ns->level + 1;
73 int i; 74 int i;
74 75
75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 76 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
86 87
87 kref_init(&ns->kref); 88 kref_init(&ns->kref);
88 ns->level = level; 89 ns->level = level;
90 ns->parent = get_pid_ns(parent_pid_ns);
89 91
90 set_bit(0, ns->pidmap[0].page); 92 set_bit(0, ns->pidmap[0].page);
91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 93 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
114 116
115struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 117struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
116{ 118{
117 struct pid_namespace *new_ns;
118
119 BUG_ON(!old_ns);
120 new_ns = get_pid_ns(old_ns);
121 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
122 goto out; 120 return get_pid_ns(old_ns);
123
124 new_ns = ERR_PTR(-EINVAL);
125 if (flags & CLONE_THREAD) 121 if (flags & CLONE_THREAD)
126 goto out_put; 122 return ERR_PTR(-EINVAL);
127 123 return create_pid_namespace(old_ns);
128 new_ns = create_pid_namespace(old_ns->level + 1);
129 if (!IS_ERR(new_ns))
130 new_ns->parent = get_pid_ns(old_ns);
131
132out_put:
133 put_pid_ns(old_ns);
134out:
135 return new_ns;
136} 124}
137 125
138void free_pid_ns(struct kref *kref) 126void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..d089d052c4a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 260 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 261 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 262 .timer_create = no_timer_create,
263 .nsleep = no_nsleep,
257 }; 264 };
258 265
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 266 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04b3a83d686f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
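With the change above, an error from hibernation_snapshot() jumps straight to Thaw, and the swsusp_free() call disappears from the else branch because freeing the image on failure is now handled on hibernation_snapshot()'s own resume path. A self-contained C sketch of that goto-based unwinding shape, with invented step names standing in for the real phases:

#include <stdio.h>

static int prepare(void)  { return 0; }
static int snapshot(void) { return -1; }        /* pretend the snapshot fails */

int main(void)
{
        int error = prepare();
        if (error)
                goto Finish;

        error = snapshot();
        if (error)
                goto Thaw;                      /* failure shares the thaw path */

        puts("write image / power down");
Thaw:
        puts("thaw processes");
Finish:
        puts("finish");
        return error ? 1 : 0;
}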
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
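pm_wq is created with create_freezeable_workqueue() so that work queued on it is frozen together with user space across suspend and resume. A kernel-style sketch of how a driver might use such a queue; the driver function, the work item and the assumption that pm_wq is visible to the caller are illustrative, not part of this patch:

#include <linux/workqueue.h>

extern struct workqueue_struct *pm_wq;          /* defined in kernel/power/main.c above;
                                                   assumed to be visible here */

static void my_autosuspend_fn(struct work_struct *work)
{
        /* put the device into a low-power state (driver-specific, omitted) */
}

static DECLARE_WORK(my_autosuspend_work, my_autosuspend_fn);

static void my_device_went_idle(void)
{
        /* runs later in process context and is frozen along with user space */
        queue_work(pm_wq, &my_autosuspend_work);
}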
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..97955b0e44f4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid attempting to free too much memory too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
 1219 * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
 1219 * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
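The comment block in hibernate_preallocate_memory() states the sizing formula in prose; a small self-contained C program makes the arithmetic concrete. All page counts below are invented round numbers, and PAGES_FOR_IO and SPARE_PAGES are placeholders rather than the kernel's constants; the point is only how max_size, the requested size and the forced preallocation relate:

#include <stdio.h>

#define PAGES_FOR_IO  1024UL            /* placeholder value */
#define SPARE_PAGES    256UL            /* placeholder value */
#define PAGE_SIZE     4096UL

int main(void)
{
        unsigned long count    = 250000;        /* usable page frames (invented) */
        unsigned long saveable = 180000;        /* saveable pages (invented) */
        unsigned long meta     =   2000;        /* image metadata pages (invented) */
        unsigned long image_size = 500UL * 1024 * 1024; /* requested image size in bytes */

        /* Maximum image size: leave roughly half of memory for the copies. */
        unsigned long max_size = (count - (meta + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;

        /* Requested size in pages, clamped to the maximum. */
        unsigned long size = (image_size + PAGE_SIZE - 1) / PAGE_SIZE;
        if (size > max_size)
                size = max_size;

        /* If everything saveable already fits, just preallocate 'saveable' pages;
         * otherwise about (count - max_size) pages are allocated to squeeze the
         * number of saveable pages down towards 'size'. */
        printf("max_size=%lu size=%lu preallocate=%lu\n", max_size, size,
               saveable <= size ? saveable : count - max_size);
        return 0;
}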
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..602033acd6c7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
40 * for_each_console() allows you to iterate on each console
41 */
42#define for_each_console(con) \
43 for (con = console_drivers; con != NULL; con = con->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
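for_each_console() is a thin wrapper around a walk of the NULL-terminated singly linked console list. A self-contained C sketch of the same macro shape, using an illustrative structure rather than the kernel's struct console:

#include <stdio.h>

struct console {                        /* illustrative stand-in */
        const char *name;
        struct console *next;
};

#define for_each_console(con) \
        for (con = console_drivers; con != NULL; con = con->next)

static struct console ttyS0 = { "ttyS0", NULL };
static struct console tty0  = { "tty0", &ttyS0 };
static struct console *console_drivers = &tty0;         /* head of the list */

int main(void)
{
        struct console *con;

        for_each_console(con)
                printf("registered console: %s\n", con->name);
        return 0;
}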
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -372,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 380 logged_chars = 0;
373 break; 381 break;
374 case 6: /* Disable logging to console */ 382 case 6: /* Disable logging to console */
383 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 385 console_loglevel = minimum_console_loglevel;
376 break; 386 break;
377 case 7: /* Enable logging to console */ 387 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 388 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1;
391 }
379 break; 392 break;
380 case 8: /* Set level of messages printed to console */ 393 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 394 error = -EINVAL;
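saved_console_loglevel uses -1 as a "nothing saved" sentinel, so repeated disable requests do not overwrite the value saved first and an enable request restores exactly the level in effect before logging was disabled. A self-contained C demonstration of the same save/restore idiom (the level values are arbitrary):

#include <stdio.h>

static int console_loglevel = 7;                /* arbitrary starting level */
static int minimum_console_loglevel = 1;
static int saved_console_loglevel = -1;         /* -1 means "nothing saved" */

static void disable_console_logging(void)
{
        if (saved_console_loglevel == -1)       /* save only on the first call */
                saved_console_loglevel = console_loglevel;
        console_loglevel = minimum_console_loglevel;
}

static void enable_console_logging(void)
{
        if (saved_console_loglevel != -1) {     /* restore only if something was saved */
                console_loglevel = saved_console_loglevel;
                saved_console_loglevel = -1;
        }
}

int main(void)
{
        disable_console_logging();
        disable_console_logging();              /* must not clobber the saved 7 */
        enable_console_logging();
        printf("loglevel restored to %d\n", console_loglevel); /* prints 7 */
        return 0;
}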
@@ -384,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 397 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 398 len = minimum_console_loglevel;
386 console_loglevel = len; 399 console_loglevel = len;
400 /* Implicitly re-enable logging to console */
401 saved_console_loglevel = -1;
387 error = 0; 402 error = 0;
388 break; 403 break;
389 case 9: /* Number of chars in the log buffer */ 404 case 9: /* Number of chars in the log buffer */
@@ -412,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 427{
413 struct console *con; 428 struct console *con;
414 429
415 for (con = console_drivers; con; con = con->next) { 430 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 431 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 432 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 433 (con->flags & CON_ANYTIME)))
@@ -544,7 +559,7 @@ static int have_callable_console(void)
544{ 559{
545 struct console *con; 560 struct console *con;
546 561
547 for (con = console_drivers; con; con = con->next) 562 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 563 if (con->flags & CON_ANYTIME)
549 return 1; 564 return 1;
550 565
@@ -1060,12 +1075,6 @@ void __sched console_conditional_schedule(void)
1060} 1075}
1061EXPORT_SYMBOL(console_conditional_schedule); 1076EXPORT_SYMBOL(console_conditional_schedule);
1062 1077
1063void console_print(const char *s)
1064{
1065 printk(KERN_EMERG "%s", s);
1066}
1067EXPORT_SYMBOL(console_print);
1068
1069void console_unblank(void) 1078void console_unblank(void)
1070{ 1079{
1071 struct console *c; 1080 struct console *c;
@@ -1082,7 +1091,7 @@ void console_unblank(void)
1082 1091
1083 console_locked = 1; 1092 console_locked = 1;
1084 console_may_schedule = 0; 1093 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1094 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1095 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1096 c->unblank();
1088 release_console_sem(); 1097 release_console_sem();
@@ -1097,7 +1106,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1106 struct tty_driver *driver = NULL;
1098 1107
1099 acquire_console_sem(); 1108 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1109 for_each_console(c) {
1101 if (!c->device) 1110 if (!c->device)
1102 continue; 1111 continue;
1103 driver = c->device(c, index); 1112 driver = c->device(c, index);
@@ -1134,25 +1143,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1143 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1144 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1145 * console driver was initialized.
1146 *
1147 * This can happen pretty early during the boot process (because of
1148 * early_printk) - sometimes before setup_arch() completes - be careful
1149 * of what kernel features are used - they may not be initialised yet.
1150 *
1151 * There are two types of consoles - bootconsoles (early_printk) and
1152 * "real" consoles (everything which is not a bootconsole) which are
1153 * handled differently.
1154 * - Any number of bootconsoles can be registered at any time.
1155 * - As soon as a "real" console is registered, all bootconsoles
1156 * will be unregistered automatically.
1157 * - Once a "real" console is registered, any attempt to register a
 1158 * bootconsole will be rejected
1137 */ 1159 */
1138void register_console(struct console *console) 1160void register_console(struct console *newcon)
1139{ 1161{
1140 int i; 1162 int i;
1141 unsigned long flags; 1163 unsigned long flags;
1142 struct console *bootconsole = NULL; 1164 struct console *bcon = NULL;
1143 1165
1144 if (console_drivers) { 1166 /*
1145 if (console->flags & CON_BOOT) 1167 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1168 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1169 */
1148 bootconsole = console_drivers; 1170 if (console_drivers && newcon->flags & CON_BOOT) {
1171 /* find the last or real console */
1172 for_each_console(bcon) {
1173 if (!(bcon->flags & CON_BOOT)) {
1174 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1175 newcon->name, newcon->index);
1176 return;
1177 }
1178 }
1149 } 1179 }
1150 1180
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1181 if (console_drivers && console_drivers->flags & CON_BOOT)
1182 bcon = console_drivers;
1183
1184 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1185 preferred_console = selected_console;
1153 1186
1154 if (console->early_setup) 1187 if (newcon->early_setup)
1155 console->early_setup(); 1188 newcon->early_setup();
1156 1189
1157 /* 1190 /*
1158 * See if we want to use this console driver. If we 1191 * See if we want to use this console driver. If we
@@ -1160,13 +1193,13 @@ void register_console(struct console *console)
1160 * that registers here. 1193 * that registers here.
1161 */ 1194 */
1162 if (preferred_console < 0) { 1195 if (preferred_console < 0) {
1163 if (console->index < 0) 1196 if (newcon->index < 0)
1164 console->index = 0; 1197 newcon->index = 0;
1165 if (console->setup == NULL || 1198 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1199 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1200 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1201 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1202 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1203 preferred_console = 0;
1171 } 1204 }
1172 } 1205 }
@@ -1178,64 +1211,62 @@ void register_console(struct console *console)
1178 */ 1211 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1212 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1213 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1214 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1215 continue;
1183 if (console->index >= 0 && 1216 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1217 newcon->index != console_cmdline[i].index)
1185 continue; 1218 continue;
1186 if (console->index < 0) 1219 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1220 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1221#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1222 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1223 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1224 braille_register_console(newcon,
1192 console_cmdline[i].index, 1225 console_cmdline[i].index,
1193 console_cmdline[i].options, 1226 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1227 console_cmdline[i].brl_options);
1195 return; 1228 return;
1196 } 1229 }
1197#endif 1230#endif
1198 if (console->setup && 1231 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1232 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1233 break;
1201 console->flags |= CON_ENABLED; 1234 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1235 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1236 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1237 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1238 preferred_console = selected_console;
1206 } 1239 }
1207 break; 1240 break;
1208 } 1241 }
1209 1242
1210 if (!(console->flags & CON_ENABLED)) 1243 if (!(newcon->flags & CON_ENABLED))
1211 return; 1244 return;
1212 1245
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1246 /*
1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1247 * If we have a bootconsole, and are switching to a real console,
1215 bootconsole->name, bootconsole->index, 1248 * don't print everything out again, since when the boot console, and
1216 console->name, console->index); 1249 * the real console are the same physical device, it's annoying to
1217 unregister_console(bootconsole); 1250 * see the beginning boot messages twice
1218 console->flags &= ~CON_PRINTBUFFER; 1251 */
1219 } else { 1252 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1253 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1254
1224 /* 1255 /*
1225 * Put this console in the list - keep the 1256 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1257 * preferred driver at the head of the list.
1227 */ 1258 */
1228 acquire_console_sem(); 1259 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1260 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1261 newcon->next = console_drivers;
1231 console_drivers = console; 1262 console_drivers = newcon;
1232 if (console->next) 1263 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1264 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1265 } else {
1235 console->next = console_drivers->next; 1266 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1267 console_drivers->next = newcon;
1237 } 1268 }
1238 if (console->flags & CON_PRINTBUFFER) { 1269 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1270 /*
1240 * release_console_sem() will print out the buffered messages 1271 * release_console_sem() will print out the buffered messages
1241 * for us. 1272 * for us.
@@ -1245,6 +1276,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1276 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1277 }
1247 release_console_sem(); 1278 release_console_sem();
1279
1280 /*
1281 * By unregistering the bootconsoles after we enable the real console
1282 * we get the "console xxx enabled" message on all the consoles -
1283 * boot consoles, real consoles, etc - this is to ensure that end
1284 * users know there might be something in the kernel's log buffer that
1285 * went to the bootconsole (that they do not see on the real console)
1286 */
1287 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1288 /* we need to iterate through twice, to make sure we print
1289 * everything out, before we unregister the console(s)
1290 */
1291 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1292 newcon->name, newcon->index);
1293 for_each_console(bcon)
1294 if (bcon->flags & CON_BOOT)
1295 unregister_console(bcon);
1296 } else {
1297 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1298 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1299 newcon->name, newcon->index);
1300 }
1248} 1301}
1249EXPORT_SYMBOL(register_console); 1302EXPORT_SYMBOL(register_console);
1250 1303
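In the rewritten register_console(), the expression (newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV picks out a preferred console that is not itself a bootconsole; only in that case is the buffered replay suppressed and are the bootconsoles unregistered afterwards. A self-contained C sketch of that flag test with illustrative bit values (not the kernel's):

#include <stdio.h>

#define CON_CONSDEV  (1u << 0)          /* illustrative bit assignments */
#define CON_BOOT     (1u << 1)

/* True only for a preferred ("real") console that is not itself a bootconsole. */
static int takes_over_from_bootconsole(unsigned int flags)
{
        return (flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV;
}

int main(void)
{
        printf("%d\n", takes_over_from_bootconsole(CON_CONSDEV));              /* 1 */
        printf("%d\n", takes_over_from_bootconsole(CON_CONSDEV | CON_BOOT));   /* 0 */
        printf("%d\n", takes_over_from_bootconsole(CON_BOOT));                 /* 0 */
        return 0;
}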
@@ -1287,11 +1340,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1340
1288static int __init disable_boot_consoles(void) 1341static int __init disable_boot_consoles(void)
1289{ 1342{
1290 if (console_drivers != NULL) { 1343 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1344
1345 for_each_console(con) {
1346 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1347 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1348 con->name, con->index);
1294 return unregister_console(console_drivers); 1349 unregister_console(con);
1295 } 1350 }
1296 } 1351 }
1297 return 0; 1352 return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 28cf26ad2d24..419250ebec4d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
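profile_init() now tries the cheap allocator first and falls back to a larger-grained one, adding __GFP_NOWARN so the intermediate failures that are expected for big buffers stay silent; only the final failure matters. A self-contained user-space analogue of that try-then-fall-back pattern (the allocators are stand-ins, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the kernel allocators: each one may legitimately fail. */
static void *try_small_alloc(size_t n) { return n <= 4096 ? calloc(1, n) : NULL; }
static void *try_exact_alloc(size_t n) { return n <= (1UL << 20) ? calloc(1, n) : NULL; }
static void *try_last_resort(size_t n) { return calloc(1, n); }

static void *profile_buffer_alloc(size_t n)
{
        void *p;

        if ((p = try_small_alloc(n)))   /* cheapest attempt first */
                return p;
        if ((p = try_exact_alloc(n)))   /* expected to fail sometimes: stay quiet */
                return p;
        return try_last_resort(n);      /* NULL here is the real error */
}

int main(void)
{
        void *buf = profile_buffer_alloc(64 * 1024);

        printf("allocation %s\n", buf ? "succeeded" : "failed");
        free(buf);
        return 0;
}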
@@ -365,7 +366,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
365 node = cpu_to_node(cpu); 366 node = cpu_to_node(cpu);
366 per_cpu(cpu_profile_flip, cpu) = 0; 367 per_cpu(cpu_profile_flip, cpu) = 0;
367 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 368 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
368 page = alloc_pages_node(node, 369 page = alloc_pages_exact_node(node,
369 GFP_KERNEL | __GFP_ZERO, 370 GFP_KERNEL | __GFP_ZERO,
370 0); 371 0);
371 if (!page) 372 if (!page)
@@ -373,7 +374,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 374 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
374 } 375 }
375 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 376 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
376 page = alloc_pages_node(node, 377 page = alloc_pages_exact_node(node,
377 GFP_KERNEL | __GFP_ZERO, 378 GFP_KERNEL | __GFP_ZERO,
378 0); 379 0);
379 if (!page) 380 if (!page)
@@ -564,14 +565,14 @@ static int create_hash_tables(void)
564 int node = cpu_to_node(cpu); 565 int node = cpu_to_node(cpu);
565 struct page *page; 566 struct page *page;
566 567
567 page = alloc_pages_node(node, 568 page = alloc_pages_exact_node(node,
568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 569 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
569 0); 570 0);
570 if (!page) 571 if (!page)
571 goto out_cleanup; 572 goto out_cleanup;
572 per_cpu(cpu_profile_hits, cpu)[1] 573 per_cpu(cpu_profile_hits, cpu)[1]
573 = (struct profile_hit *)page_address(page); 574 = (struct profile_hit *)page_address(page);
574 page = alloc_pages_node(node, 575 page = alloc_pages_exact_node(node,
575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 576 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
576 0); 577 0);
577 if (!page) 578 if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f6d8b8cb5e34..307c285af59e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
152 if (!dumpable && !capable(CAP_SYS_PTRACE)) 152 if (!dumpable && !capable(CAP_SYS_PTRACE))
153 return -EPERM; 153 return -EPERM;
154 154
155 return security_ptrace_may_access(task, mode); 155 return security_ptrace_access_check(task, mode);
156} 156}
157 157
158bool ptrace_may_access(struct task_struct *task, unsigned int mode) 158bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -167,67 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
167int ptrace_attach(struct task_struct *task) 167int ptrace_attach(struct task_struct *task)
168{ 168{
169 int retval; 169 int retval;
170 unsigned long flags;
171 170
172 audit_ptrace(task); 171 audit_ptrace(task);
173 172
174 retval = -EPERM; 173 retval = -EPERM;
174 if (unlikely(task->flags & PF_KTHREAD))
175 goto out;
175 if (same_thread_group(task, current)) 176 if (same_thread_group(task, current))
176 goto out; 177 goto out;
177 178
178 /* Protect the target's credential calculations against our 179 /*
180 * Protect exec's credential calculations against our interference;
179 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
180 * under ptrace. 182 * under ptrace.
181 */ 183 */
182 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
183 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
184 goto out; 186 goto out;
185 187
186 retval = -EPERM;
187repeat:
188 /*
189 * Nasty, nasty.
190 *
191 * We want to hold both the task-lock and the
192 * tasklist_lock for writing at the same time.
193 * But that's against the rules (tasklist_lock
194 * is taken for reading by interrupts on other
195 * cpu's that may have task_lock).
196 */
197 task_lock(task); 188 task_lock(task);
198 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
199 task_unlock(task);
200 do {
201 cpu_relax();
202 } while (!write_can_lock(&tasklist_lock));
203 goto repeat;
204 }
205
206 if (!task->mm)
207 goto bad;
208 /* the same process cannot be attached many times */
209 if (task->ptrace & PT_PTRACED)
210 goto bad;
211 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); 189 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
190 task_unlock(task);
212 if (retval) 191 if (retval)
213 goto bad; 192 goto unlock_creds;
214 193
215 /* Go */ 194 write_lock_irq(&tasklist_lock);
216 task->ptrace |= PT_PTRACED; 195 retval = -EPERM;
196 if (unlikely(task->exit_state))
197 goto unlock_tasklist;
198 if (task->ptrace)
199 goto unlock_tasklist;
200
201 task->ptrace = PT_PTRACED;
217 if (capable(CAP_SYS_PTRACE)) 202 if (capable(CAP_SYS_PTRACE))
218 task->ptrace |= PT_PTRACE_CAP; 203 task->ptrace |= PT_PTRACE_CAP;
219 204
220 __ptrace_link(task, current); 205 __ptrace_link(task, current);
221
222 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 206 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
223bad: 207
224 write_unlock_irqrestore(&tasklist_lock, flags); 208 retval = 0;
225 task_unlock(task); 209unlock_tasklist:
210 write_unlock_irq(&tasklist_lock);
211unlock_creds:
226 mutex_unlock(&task->cred_guard_mutex); 212 mutex_unlock(&task->cred_guard_mutex);
227out: 213out:
228 return retval; 214 return retval;
229} 215}
230 216
217/**
218 * ptrace_traceme -- helper for PTRACE_TRACEME
219 *
220 * Performs checks and sets PT_PTRACED.
221 * Should be used by all ptrace implementations for PTRACE_TRACEME.
222 */
223int ptrace_traceme(void)
224{
225 int ret = -EPERM;
226
227 write_lock_irq(&tasklist_lock);
228 /* Are we already being traced? */
229 if (!current->ptrace) {
230 ret = security_ptrace_traceme(current->parent);
231 /*
232 * Check PF_EXITING to ensure ->real_parent has not passed
233 * exit_ptrace(). Otherwise we don't report the error but
234 * pretend ->real_parent untraces us right after return.
235 */
236 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
237 current->ptrace = PT_PTRACED;
238 __ptrace_link(current, current->real_parent);
239 }
240 }
241 write_unlock_irq(&tasklist_lock);
242
243 return ret;
244}
245
231/* 246/*
232 * Called with irqs disabled, returns true if childs should reap themselves. 247 * Called with irqs disabled, returns true if childs should reap themselves.
233 */ 248 */
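The rewritten ptrace_attach() above drops the write_trylock_irqsave()/cpu_relax() retry loop and instead takes its locks in one fixed order: cred_guard_mutex first, the permission check under task_lock() which is released again, then tasklist_lock for writing. A self-contained pthread sketch of that discipline, offered as an analogy rather than as kernel locking:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cred_guard = PTHREAD_MUTEX_INITIALIZER;  /* cf. cred_guard_mutex */
static pthread_mutex_t task_lock  = PTHREAD_MUTEX_INITIALIZER;  /* cf. task_lock() */
static pthread_rwlock_t tasklist  = PTHREAD_RWLOCK_INITIALIZER; /* cf. tasklist_lock */

static int attach(void)
{
        int err = 0;

        pthread_mutex_lock(&cred_guard);

        pthread_mutex_lock(&task_lock);         /* permission check under the per-task lock */
        /* err = may_access(...); */
        pthread_mutex_unlock(&task_lock);       /* released before the next lock is taken */
        if (err)
                goto unlock_creds;

        pthread_rwlock_wrlock(&tasklist);       /* plain write lock, no trylock retry loop */
        /* link tracer and tracee here */
        pthread_rwlock_unlock(&tasklist);

unlock_creds:
        pthread_mutex_unlock(&cred_guard);
        return err;
}

int main(void)
{
        printf("attach() -> %d\n", attach());
        return 0;
}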
@@ -409,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
409 424
410static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 425static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
411{ 426{
427 unsigned long flags;
412 int error = -ESRCH; 428 int error = -ESRCH;
413 429
414 read_lock(&tasklist_lock); 430 if (lock_task_sighand(child, &flags)) {
415 if (likely(child->sighand != NULL)) {
416 error = -EINVAL; 431 error = -EINVAL;
417 spin_lock_irq(&child->sighand->siglock);
418 if (likely(child->last_siginfo != NULL)) { 432 if (likely(child->last_siginfo != NULL)) {
419 *info = *child->last_siginfo; 433 *info = *child->last_siginfo;
420 error = 0; 434 error = 0;
421 } 435 }
422 spin_unlock_irq(&child->sighand->siglock); 436 unlock_task_sighand(child, &flags);
423 } 437 }
424 read_unlock(&tasklist_lock);
425 return error; 438 return error;
426} 439}
427 440
428static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) 441static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
429{ 442{
443 unsigned long flags;
430 int error = -ESRCH; 444 int error = -ESRCH;
431 445
432 read_lock(&tasklist_lock); 446 if (lock_task_sighand(child, &flags)) {
433 if (likely(child->sighand != NULL)) {
434 error = -EINVAL; 447 error = -EINVAL;
435 spin_lock_irq(&child->sighand->siglock);
436 if (likely(child->last_siginfo != NULL)) { 448 if (likely(child->last_siginfo != NULL)) {
437 *child->last_siginfo = *info; 449 *child->last_siginfo = *info;
438 error = 0; 450 error = 0;
439 } 451 }
440 spin_unlock_irq(&child->sighand->siglock); 452 unlock_task_sighand(child, &flags);
441 } 453 }
442 read_unlock(&tasklist_lock);
443 return error; 454 return error;
444} 455}
445 456
@@ -566,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
566 return ret; 577 return ret;
567} 578}
568 579
569/** 580static struct task_struct *ptrace_get_task_struct(pid_t pid)
570 * ptrace_traceme -- helper for PTRACE_TRACEME
571 *
572 * Performs checks and sets PT_PTRACED.
573 * Should be used by all ptrace implementations for PTRACE_TRACEME.
574 */
575int ptrace_traceme(void)
576{
577 int ret = -EPERM;
578
579 /*
580 * Are we already being traced?
581 */
582repeat:
583 task_lock(current);
584 if (!(current->ptrace & PT_PTRACED)) {
585 /*
586 * See ptrace_attach() comments about the locking here.
587 */
588 unsigned long flags;
589 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
590 task_unlock(current);
591 do {
592 cpu_relax();
593 } while (!write_can_lock(&tasklist_lock));
594 goto repeat;
595 }
596
597 ret = security_ptrace_traceme(current->parent);
598
599 /*
600 * Check PF_EXITING to ensure ->real_parent has not passed
601 * exit_ptrace(). Otherwise we don't report the error but
602 * pretend ->real_parent untraces us right after return.
603 */
604 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
605 current->ptrace |= PT_PTRACED;
606 __ptrace_link(current, current->real_parent);
607 }
608
609 write_unlock_irqrestore(&tasklist_lock, flags);
610 }
611 task_unlock(current);
612 return ret;
613}
614
615/**
616 * ptrace_get_task_struct -- grab a task struct reference for ptrace
617 * @pid: process id to grab a task_struct reference of
618 *
619 * This function is a helper for ptrace implementations. It checks
620 * permissions and then grabs a task struct for use of the actual
621 * ptrace implementation.
622 *
623 * Returns the task_struct for @pid or an ERR_PTR() on failure.
624 */
625struct task_struct *ptrace_get_task_struct(pid_t pid)
626{ 581{
627 struct task_struct *child; 582 struct task_struct *child;
628 583
629 read_lock(&tasklist_lock); 584 rcu_read_lock();
630 child = find_task_by_vpid(pid); 585 child = find_task_by_vpid(pid);
631 if (child) 586 if (child)
632 get_task_struct(child); 587 get_task_struct(child);
588 rcu_read_unlock();
633 589
634 read_unlock(&tasklist_lock);
635 if (!child) 590 if (!child)
636 return ERR_PTR(-ESRCH); 591 return ERR_PTR(-ESRCH);
637 return child; 592 return child;
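ptrace_get_task_struct() now performs the pid lookup under rcu_read_lock() and pins the result with get_task_struct() before leaving the read-side section, instead of holding tasklist_lock. A self-contained sketch of the general "look up under a read-side lock, take a reference before dropping it" pattern, with a plain rwlock and a toy refcount standing in for RCU and the task reference:

#include <pthread.h>
#include <stdio.h>

struct task {                           /* illustrative stand-in for task_struct */
        int pid;
        int refcount;
};

static pthread_rwlock_t lookup_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct task the_task = { .pid = 42, .refcount = 1 };

static struct task *find_task(int pid)
{
        return pid == the_task.pid ? &the_task : NULL;
}

static struct task *get_task(int pid)
{
        struct task *t;

        pthread_rwlock_rdlock(&lookup_lock);    /* stand-in for rcu_read_lock() */
        t = find_task(pid);
        if (t)
                t->refcount++;                  /* pin before the read section ends */
        pthread_rwlock_unlock(&lookup_lock);    /* stand-in for rcu_read_unlock() */

        return t;                               /* caller must drop the reference */
}

int main(void)
{
        struct task *t = get_task(42);

        printf("%s, refcount now %d\n", t ? "found" : "not found", t ? t->refcount : 0);
        return 0;
}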
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in an irqs-disabled
119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_send_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcp->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * the local variable "batch" and emits code like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2) rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * whose batch# == rdp->batch; see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
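/*
 * Illustrative sketch, not part of this file: the usual call_rcu()
 * pattern defers freeing a structure until a grace period has elapsed.
 * "struct foo", foo_reclaim() and foo_delete() are hypothetical names;
 * call_rcu(), container_of(), list_del_rcu() and kfree() are the real
 * primitives being shown.
 */
#if 0	/* example only */
struct foo {
	struct list_head list;
	int data;
	struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *rp)
{
	struct foo *fp = container_of(rp, struct foo, rcu);

	kfree(fp);	/* safe: all pre-existing readers have finished */
}

static void foo_delete(struct foo *fp)
{
	list_del_rcu(&fp->list);		/* unlink under the update-side lock */
	call_rcu(&fp->rcu, foo_reclaim);	/* free after a grace period */
}
#endif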
290
291/**
292 * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
304 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
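/*
 * Illustrative sketch, not part of this file: the reader side that pairs
 * with call_rcu_bh(). The foo list walked here and bar() are hypothetical;
 * rcu_read_lock_bh(), rcu_read_unlock_bh() and list_for_each_entry_rcu()
 * are the real primitives being shown.
 */
#if 0	/* example only */
static void foo_scan(struct list_head *head)
{
	struct foo *fp;

	rcu_read_lock_bh();	/* holds off the _bh grace period */
	list_for_each_entry_rcu(fp, head, list)
		bar(fp->data);	/* must not sleep inside the _bh section */
	rcu_read_unlock_bh();
}
#endif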
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU_BH batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
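/*
 * Illustrative sketch, not part of this file: one way these counters are
 * used (for example by torture tests) is to snapshot the value and later
 * check that at least one full batch has elapsed. wait_a_while() is a
 * hypothetical stand-in for whatever delay the caller performs.
 */
#if 0	/* example only */
static int saw_a_grace_period(void)
{
	long snap = rcu_batches_completed();

	wait_a_while();
	return rcu_batches_completed() - snap >= 2;	/* conservative check */
}
#endif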
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcast to
383 * all cpus; they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcast, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, they clear the cpu's bit in rcu_ctrlblk.cpumask.
392 * If the bitmap becomes empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch() to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
411 * barrier. Otherwise it can cause tickless idle CPUs to be
412 * included in rcp->cpumask, which will extend grace periods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * A cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending.
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
440 * switch). If so, and if it hasn't already done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* Warning! Helper for rcu_offline_cpu; do not use elsewhere without reviewing
486 * the locking requirements: the list it is pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * If the cpu going offline owns the grace period,
510 * we can block indefinitely waiting for it, so flush
511 * it here.
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
570 * move these entries to donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
609 * executed by the interrupted code must be seen before any RCU
610 * grace-period manipulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
620 * executed by the interrupted code must be seen after any RCU
621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for them hasn't been started nor scheduled to start.
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes rcu mechanism. Assumed to be called early.
790 * That is, before the local timer (SMP) or jiffy timer (uniprocessor) is set up.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..bd5d5c8e5140 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
98}
99EXPORT_SYMBOL_GPL(synchronize_rcu);
100
101/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 *
104 * Control will return to the caller some time after a full rcu_bh grace
105 * period has elapsed, in other words after all currently executing rcu_bh
106 * read-side critical sections have completed. RCU read-side critical
107 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
108 * and may be nested.
109 */
110void synchronize_rcu_bh(void)
111{
112 struct rcu_synchronize rcu;
113
114 if (rcu_blocking_is_gp())
115 return;
116
117 init_completion(&rcu.completion);
118 /* Will wake me after RCU finished. */
119 call_rcu_bh(&rcu.head, wakeme_after_rcu);
120 /* Wait for it. */
121 wait_for_completion(&rcu.completion);
122}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
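/*
 * Illustrative sketch, not part of this diff: synchronize_rcu_bh() as a
 * blocking alternative to call_rcu_bh(). The foo list, foo_lock and
 * foo_remove() are hypothetical names.
 */
#if 0	/* example only */
static void foo_remove(struct foo *fp)
{
	spin_lock(&foo_lock);
	list_del_rcu(&fp->list);
	spin_unlock(&foo_lock);

	synchronize_rcu_bh();	/* wait for all _bh readers to finish */
	kfree(fp);		/* no reader can still hold a reference */
}
#endif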
124
125static void rcu_barrier_callback(struct rcu_head *notused)
126{
127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
153static inline void wait_migrated_callbacks(void)
154{
155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
157}
158
159/*
@@ -192,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
217 wake_up(&rcu_migrate_wq);
218}
219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
224 unsigned long action, void *hcpu)
225{
226 rcu_cpu_notify(self, action, hcpu);
227 if (action == CPU_DYING) {
228 /*
229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) {
241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
244 wait_migrated_callbacks();
245 }
@@ -219,8 +249,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
249
250void __init rcu_init(void)
251{
252 int i;
253
254 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
255 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
256
257 /*
258 * We don't need protection against CPU-hotplug here because
259 * this is called early in boot, before either interrupts
260 * or the scheduler are operational.
261 */
262 for_each_online_cpu(i)
263 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
264}
265
266void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
298 * - If we get scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
303 * we pick the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
334 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * It is now safe to decrement this task's nesting count.
403 * NMIs that occur after this statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
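/*
 * Illustrative sketch, not part of this file: how nesting interacts with
 * the counters above. Only the outermost rcu_read_lock()/rcu_read_unlock()
 * touch the per-CPU rcu_flipctr; inner pairs only adjust the per-task
 * nesting count. do_something() and do_more() are hypothetical.
 */
#if 0	/* example only */
static void nested_reader(void)
{
	rcu_read_lock();	/* outermost: increments rcu_flipctr */
	do_something();
	rcu_read_lock();	/* nested: only bumps rcu_read_lock_nesting */
	do_more();
	rcu_read_unlock();	/* nested: only drops the nesting count */
	rcu_read_unlock();	/* outermost: decrements rcu_flipctr */
}
#endif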
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI which is the most important
506 * part.
507 *
508 * The only thing is that we would bring back the counter
509 * to a position that it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513 * On return from the IRQ, the counter may have the zero
514 * bit be 0 and the counter the same as the return from
515 * the NMI/SMI. If the state machine was so unlucky to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
546
547/**
548 * rcu_irq_exit - Called from exiting Hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563 * because an NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return 1 once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969 * this CPU has to have exited all prior preempt-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042 * Otherwise rcu_barrier() will fail.
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090	local_irq_save(flags); /* disable preemption until we know which lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
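For context, the usual call_rcu() caller pattern embeds the rcu_head inside the protected object and reclaims it from the callback once a grace period has elapsed; a brief sketch with made-up names (foo, foo_reclaim, foo_retire are hypothetical, not part of this file):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {                            /* hypothetical RCU-protected object */
	int data;
	struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *head)
{
	struct foo *fp = container_of(head, struct foo, rcu);

	kfree(fp);                      /* safe: all pre-existing readers are done */
}

static void foo_retire(struct foo *fp)
{
	call_rcu(&fp->rcu, foo_reclaim); /* defer the free past a grace period */
}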
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225		return; /* blocking is a grace period if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
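The wakeme_after_rcu() helper used above lives in rcupdate.c; it completes the embedded completion, roughly as follows (a sketch, assuming struct rcu_synchronize pairs an rcu_head with a completion as used here):

static void wakeme_after_rcu(struct rcu_head *head)
{
	struct rcu_synchronize *rcu;

	rcu = container_of(head, struct rcu_synchronize, head);
	complete(&rcu->completion);     /* releases the waiter in __synchronize_sched() */
}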
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The current CPU might have already gone
1300		 * offline (between the for_each_online_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
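The sleep/wake handshake between this kthread and call_rcu_sched() boils down to a state variable guarded by a lock: the waker flips the state before issuing the wakeup, and the sleeper re-checks it as its wait condition, so no wakeup can be lost. A stripped-down sketch of that pattern (names are illustrative):

#include <linux/wait.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static DEFINE_SPINLOCK(example_lock);
static int example_sleeping;            /* 1 while the worker intends to sleep */

static void example_post_work(void)     /* producer side (cf. call_rcu_sched) */
{
	unsigned long flags;
	int do_wake = 0;

	spin_lock_irqsave(&example_lock, flags);
	if (example_sleeping) {
		example_sleeping = 0;   /* cancel the sleep before waking */
		do_wake = 1;
	}
	spin_unlock_irqrestore(&example_lock, flags);
	if (do_wake)
		wake_up_interruptible(&example_wq);
}

static void example_wait_for_work(void) /* worker side (cf. the kthread above) */
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	example_sleeping = 1;
	spin_unlock_irqrestore(&example_lock, flags);
	wait_event_interruptible(example_wq, example_sleeping == 0);
}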
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's mask in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
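With debugfs mounted (conventionally at /sys/kernel/debug), the files created above appear under rcu/ and can be read like ordinary text files. For comparison, a single counter can be exported without a custom read routine using the same directory/file pattern; a minimal sketch with illustrative names, assuming the debugfs API of this kernel era (dentry-returning creators checked against NULL, as in the code above):

#include <linux/debugfs.h>

static u32 example_hits;                /* hypothetical counter to expose */
static struct dentry *example_dir;

static int __init example_debugfs_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		return -ENODEV;
	if (!debugfs_create_u32("hits", 0444, example_dir, &example_hits)) {
		debugfs_remove(example_dir);    /* roll back on failure */
		return -ENODEV;
	}
	return 0;       /* readable as /sys/kernel/debug/example/hits */
}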
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..b33db539a8ad 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 320 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 321 rcu_torture_free(rp);
322 } else 322 } else
323 cur_ops->deferredfree(rp); 323 cur_ops->deferred_free(rp);
324} 324}
325 325
326static void rcu_torture_deferred_free(struct rcu_torture *p) 326static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 329}
330 330
331static struct rcu_torture_ops rcu_ops = { 331static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 332 .init = NULL,
333 .cleanup = NULL, 333 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 334 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 335 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 336 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 337 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 338 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 339 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 340 .cb_barrier = rcu_barrier,
341 .stats = NULL, 341 .stats = NULL,
342 .irqcapable = 1, 342 .irq_capable = 1,
343 .name = "rcu" 343 .name = "rcu"
344}; 344};
345 345
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 346static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
370} 370}
371 371
372static struct rcu_torture_ops rcu_sync_ops = { 372static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 373 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 374 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 375 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 376 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 377 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 378 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 379 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 380 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 381 .cb_barrier = NULL,
382 .stats = NULL, 382 .stats = NULL,
383 .irqcapable = 1, 383 .irq_capable = 1,
384 .name = "rcu_sync" 384 .name = "rcu_sync"
385}; 385};
386 386
387/* 387/*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
432} 432}
433 433
434static struct rcu_torture_ops rcu_bh_ops = { 434static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 435 .init = NULL,
436 .cleanup = NULL, 436 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 437 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 438 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 439 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 440 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 441 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 442 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 443 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 444 .stats = NULL,
445 .irqcapable = 1, 445 .irq_capable = 1,
446 .name = "rcu_bh" 446 .name = "rcu_bh"
447}; 447};
448 448
449static struct rcu_torture_ops rcu_bh_sync_ops = { 449static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 450 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 451 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 452 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 453 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 454 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 455 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 456 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 457 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 458 .cb_barrier = NULL,
459 .stats = NULL, 459 .stats = NULL,
460 .irqcapable = 1, 460 .irq_capable = 1,
461 .name = "rcu_bh_sync" 461 .name = "rcu_bh_sync"
462}; 462};
463 463
464/* 464/*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
530} 530}
531 531
532static struct rcu_torture_ops srcu_ops = { 532static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 533 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 534 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 535 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 536 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 537 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 538 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 539 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 540 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 541 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 542 .stats = srcu_torture_stats,
543 .name = "srcu" 543 .name = "srcu"
544}; 544};
545 545
546/* 546/*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
574} 574}
575 575
576static struct rcu_torture_ops sched_ops = { 576static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 577 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 578 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 579 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 580 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 581 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 582 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 583 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 584 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 585 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 586 .stats = NULL,
587 .irqcapable = 1, 587 .irq_capable = 1,
588 .name = "sched" 588 .name = "sched"
589}; 589};
590 590
591static struct rcu_torture_ops sched_ops_sync = { 591static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 592 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 593 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 594 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 595 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 596 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 597 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 598 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 599 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 600 .cb_barrier = NULL,
601 .stats = NULL, 601 .stats = NULL,
602 .name = "sched_sync" 602 .name = "sched_sync"
603};
604
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init,
609 .cleanup = NULL,
610 .readlock = sched_torture_read_lock,
611 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
612 .readunlock = sched_torture_read_unlock,
613 .completed = sched_torture_completed,
614 .deferred_free = rcu_sync_torture_deferred_free,
615 .sync = synchronize_sched_expedited,
616 .cb_barrier = NULL,
617 .stats = rcu_expedited_torture_stats,
618 .irq_capable = 1,
619 .name = "sched_expedited"
603}; 620};
604 621
605/* 622/*
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 652 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 653 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 654 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 655 cur_ops->deferred_free(old_rp);
639 } 656 }
640 rcu_torture_current_version++; 657 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 658 oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 717 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 718 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 719 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 720 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 721 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 722 spin_unlock(&rand_lock);
706 preempt_disable(); 723 preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
738 755
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 756 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 757 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 759 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 760
744 do { 761 do {
745 if (irqreader && cur_ops->irqcapable) { 762 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 763 if (!timer_pending(&t))
747 mod_timer(&t, 1); 764 mod_timer(&t, 1);
748 } 765 }
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
757 } 774 }
758 if (p->rtort_mbtest == 0) 775 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 776 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 777 cur_ops->read_delay(&rand);
761 preempt_disable(); 778 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 779 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 780 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 795 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 796 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 797 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 798 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 799 del_timer_sync(&t);
783 while (!kthread_should_stop()) 800 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 801 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1095 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1096 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1097 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1098 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1099 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1100
1083 mutex_lock(&fullstop_mutex); 1101 mutex_lock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..6b11b07cfe7f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,59 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 unsigned long flags;
111 struct rcu_data *rdp;
112
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu);
90 rdp->passed_quiesc = 1; 115 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed; 116 rdp->passed_quiesc_completed = rdp->completed;
117 rcu_preempt_qs(cpu);
118 local_irq_restore(flags);
92} 119}
93 120
94void rcu_bh_qsctr_inc(int cpu) 121void rcu_bh_qs(int cpu)
95{ 122{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 123 unsigned long flags;
124 struct rcu_data *rdp;
125
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed; 129 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags);
99} 131}
100 132
101#ifdef CONFIG_NO_HZ 133#ifdef CONFIG_NO_HZ
@@ -110,15 +142,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 142static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 143
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 144static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu);
113 146
114/* 147/*
115 * Return the number of RCU batches processed thus far for debug & stats. 148 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 149 */
117long rcu_batches_completed(void) 150long rcu_batches_completed_sched(void)
118{ 151{
119 return rcu_state.completed; 152 return rcu_sched_state.completed;
120} 153}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 154EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 155
123/* 156/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 157 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +214,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 214 return 1;
182 } 215 }
183 216
217 /* If preemptable RCU, no point in sending reschedule IPI. */
218 if (rdp->preemptable)
219 return 0;
220
184 /* The CPU is online, so send it a reschedule IPI. */ 221 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 222 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 223 smp_send_reschedule(rdp->cpu);
@@ -193,7 +230,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 230#endif /* #ifdef CONFIG_SMP */
194 231
195#ifdef CONFIG_NO_HZ 232#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 233
198/** 234/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 235 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +249,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 249 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 250 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 251 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 252 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 253 local_irq_restore(flags);
218} 254}
219 255
@@ -232,7 +268,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 268 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 269 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 270 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 271 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 272 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 273 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 274}
@@ -251,7 +287,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 287 if (rdtp->dynticks & 0x1)
252 return; 288 return;
253 rdtp->dynticks_nmi++; 289 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 290 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 291 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 292}
257 293
@@ -270,7 +306,7 @@ void rcu_nmi_exit(void)
270 return; 306 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 307 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 308 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 309 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 310}
275 311
276/** 312/**
@@ -286,7 +322,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 322 if (rdtp->dynticks_nesting++)
287 return; 323 return;
288 rdtp->dynticks++; 324 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 325 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 326 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 327}
292 328
@@ -305,10 +341,10 @@ void rcu_irq_exit(void)
305 return; 341 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 342 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 343 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 344 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 345
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 346 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 347 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 348 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 349 set_need_resched();
314} 350}
@@ -461,6 +497,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 497
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 498 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 499 for (; rnp_cur < rnp_end; rnp_cur++) {
500 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 501 if (rnp_cur->qsmask == 0)
465 continue; 502 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +506,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 506 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 507 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 508 smp_processor_id(), (long)(jiffies - rsp->gp_start));
509 trigger_all_cpu_backtrace();
510
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 511 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 512}
474 513
@@ -479,12 +518,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 518
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 519 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 520 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 521 trigger_all_cpu_backtrace();
522
483 spin_lock_irqsave(&rnp->lock, flags); 523 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 524 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 525 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 526 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 527 spin_unlock_irqrestore(&rnp->lock, flags);
528
488 set_need_resched(); /* kick ourselves to get things going. */ 529 set_need_resched(); /* kick ourselves to get things going. */
489} 530}
490 531
@@ -674,6 +715,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 715}
675 716
676/* 717/*
718 * Clean up after the prior grace period and let rcu_start_gp() start up
719 * the next grace period if one is needed. Note that the caller must
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock)
724{
725 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
728}
729
730/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 731 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 732 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 733 * group must be represented by the same leaf rcu_node structure.
@@ -694,7 +748,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 748 return;
695 } 749 }
696 rnp->qsmask &= ~mask; 750 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 751 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 752
699 /* Other bits still set at this level, so done. */ 753 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 754 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -714,14 +768,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
714 768
715 /* 769 /*
716 * Get here if we are the last CPU to pass through a quiescent 770 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 771 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 772 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 773 */
722 rsp->completed = rsp->gpnum; 774 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 775}
726 776
727/* 777/*
@@ -828,11 +878,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 878 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 879 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 880 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 881 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 882 break;
833 } 883 }
884 rcu_preempt_offline_tasks(rsp, rnp);
834 mask = rnp->grpmask; 885 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 886 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 887 rnp = rnp->parent;
837 } while (rnp != NULL); 888 } while (rnp != NULL);
838 lastcomp = rsp->completed; 889 lastcomp = rsp->completed;
@@ -845,7 +896,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
845 /* 896 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 897 * Move callbacks from the outgoing CPU to the running CPU.
847 * Note that the outgoing CPU is now quiscent, so it is now 898 * Note that the outgoing CPU is now quiscent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 899 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 900 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 901 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 902 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +927,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 927 */
877static void rcu_offline_cpu(int cpu) 928static void rcu_offline_cpu(int cpu)
878{ 929{
879 __rcu_offline_cpu(cpu, &rcu_state); 930 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 931 __rcu_offline_cpu(cpu, &rcu_bh_state);
932 rcu_preempt_offline_cpu(cpu);
881} 933}
882 934
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 935#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +1015,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 1015 */
964void rcu_check_callbacks(int cpu, int user) 1016void rcu_check_callbacks(int cpu, int user)
965{ 1017{
1018 if (!rcu_pending(cpu))
1019 return; /* if nothing for RCU to do. */
966 if (user || 1020 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1021 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1022 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1025,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1025 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1026 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1027 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1028 * a quiescent state, so note it.
975 * 1029 *
976 * No memory barrier is required here because both 1030 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1031 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1032 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1033 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1034 */
982 1035
983 rcu_qsctr_inc(cpu); 1036 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1037 rcu_bh_qs(cpu);
985 1038
986 } else if (!in_softirq()) { 1039 } else if (!in_softirq()) {
987 1040
@@ -989,11 +1042,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1042 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1043 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1044 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1045 * critical section, so note it.
993 */ 1046 */
994 1047
995 rcu_bh_qsctr_inc(cpu); 1048 rcu_bh_qs(cpu);
996 } 1049 }
1050 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1051 raise_softirq(RCU_SOFTIRQ);
998} 1052}
999 1053
@@ -1132,6 +1186,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1186{
1133 unsigned long flags; 1187 unsigned long flags;
1134 1188
1189 WARN_ON_ONCE(rdp->beenonline == 0);
1190
1135 /* 1191 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1192 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1193 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1226,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1226 */
1171 smp_mb(); /* See above block comment. */ 1227 smp_mb(); /* See above block comment. */
1172 1228
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1229 __rcu_process_callbacks(&rcu_sched_state,
1230 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1231 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1232 rcu_preempt_process_callbacks();
1175 1233
1176 /* 1234 /*
1177 * Memory references from any later RCU read-side critical sections 1235 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1285,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1285}
1228 1286
1229/* 1287/*
1230 * Queue an RCU callback for invocation after a grace period. 1288 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1289 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1290void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1291{
1234 __call_rcu(head, func, &rcu_state); 1292 __call_rcu(head, func, &rcu_sched_state);
1235} 1293}
1236EXPORT_SYMBOL_GPL(call_rcu); 1294EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1295
1238/* 1296/*
1239 * Queue an RCU for invocation after a quicker grace period. 1297 * Queue an RCU for invocation after a quicker grace period.
@@ -1305,10 +1363,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1363 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1364 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1365 */
1308int rcu_pending(int cpu) 1366static int rcu_pending(int cpu)
1309{ 1367{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1368 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1369 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1370 rcu_preempt_pending(cpu);
1312} 1371}
1313 1372
1314/* 1373/*
@@ -1320,27 +1379,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1379int rcu_needs_cpu(int cpu)
1321{ 1380{
1322 /* RCU callbacks either ready or pending? */ 1381 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1382 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1383 per_cpu(rcu_bh_data, cpu).nxtlist ||
1384 rcu_preempt_needs_cpu(cpu);
1325} 1385}
1326 1386
1327/* 1387/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1388 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1389 */
1339static void __cpuinit 1390static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1391rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1392{
1342 unsigned long flags; 1393 unsigned long flags;
1343 int i; 1394 int i;
1395 struct rcu_data *rdp = rsp->rda[cpu];
1396 struct rcu_node *rnp = rcu_get_root(rsp);
1397
1398 /* Set up local state, ensuring consistent view of global state. */
1399 spin_lock_irqsave(&rnp->lock, flags);
1400 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1401 rdp->nxtlist = NULL;
1402 for (i = 0; i < RCU_NEXT_SIZE; i++)
1403 rdp->nxttail[i] = &rdp->nxtlist;
1404 rdp->qlen = 0;
1405#ifdef CONFIG_NO_HZ
1406 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1407#endif /* #ifdef CONFIG_NO_HZ */
1408 rdp->cpu = cpu;
1409 spin_unlock_irqrestore(&rnp->lock, flags);
1410}
1411
1412/*
1413 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1414 * offline event can be happening at a given time. Note also that we
1415 * can accept some slop in the rsp->completed access due to the fact
1416 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1417 */
1418static void __cpuinit
1419rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1420{
1421 unsigned long flags;
1344 long lastcomp; 1422 long lastcomp;
1345 unsigned long mask; 1423 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1424 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1432,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1432 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1433 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1434 rdp->beenonline = 1; /* We have now been online. */
1435 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1436 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1437 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1438 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1439
1370 /* 1440 /*
@@ -1405,16 +1475,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1405 1475
1406static void __cpuinit rcu_online_cpu(int cpu) 1476static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1477{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1478 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1479 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1480 rcu_preempt_init_percpu_data(cpu);
1411} 1481}
1412 1482
1413/* 1483/*
1414 * Handle CPU online/offline notifcation events. 1484 * Handle CPU online/offline notification events.
1415 */ 1485 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1486int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1487 unsigned long action, void *hcpu)
1418{ 1488{
1419 long cpu = (long)hcpu; 1489 long cpu = (long)hcpu;
1420 1490
@@ -1486,6 +1556,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1556 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1558 spin_lock_init(&rnp->lock);
1559 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1560 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1561 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1562 rnp->grplo = j * cpustride;
@@ -1503,16 +1574,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1574 j / rsp->levelspread[i - 1];
1504 } 1575 }
1505 rnp->level = i; 1576 rnp->level = i;
1577 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1579 }
1507 } 1580 }
1508} 1581}
1509 1582
1510/* 1583/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1584 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1585 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1586 * structure.
1513 */ 1587 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1589do { \
1590 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1592 j = 0; \
1518 for_each_possible_cpu(i) { \ 1593 for_each_possible_cpu(i) { \
@@ -1520,33 +1595,43 @@ do { \
1520 j++; \ 1595 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1596 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1597 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1598 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1599 } \
1524} while (0) 1600} while (0)
1525 1601
1526static struct notifier_block __cpuinitdata rcu_nb = { 1602#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1603
1528}; 1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{
1617}
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1620
1530void __init __rcu_init(void) 1621void __init __rcu_init(void)
1531{ 1622{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1624 int j;
1534 struct rcu_node *rnp; 1625 struct rcu_node *rnp;
1535 1626
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1627 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1630#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1631 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1632 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1633 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1635}
1551 1636
1552module_param(blimit, int, 0); 1637module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..bf8a6f9f134d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
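To make the geometry concrete, here is a hypothetical user-space computation of the same macros for an assumed NR_CPUS of 64 and CONFIG_RCU_FANOUT of 32; the tree then has a root plus two leaf groups, i.e. three rcu_node structures.

        #include <stdio.h>

        #define NR_CPUS         64      /* assumed for this example */
        #define RCU_FANOUT      32      /* assumed CONFIG_RCU_FANOUT */
        #define NUM_RCU_LVL_0   1       /* the root */
        #define NUM_RCU_LVL_1   (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
        #define NUM_RCU_LVL_2   (NR_CPUS)       /* per-CPU leaf slots */
        #define RCU_SUM         (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
        #define NUM_RCU_NODES   (RCU_SUM - NR_CPUS)

        int main(void)
        {
                printf("2 levels, %d leaf groups, %d rcu_node structures\n",
                       NUM_RCU_LVL_1, NUM_RCU_NODES);   /* 2 leaf groups, 3 nodes */
                return 0;
        }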
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
77
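A small sketch of the even/odd convention documented above, assuming the counter starts odd (non-idle); the helper names are invented for illustration only.

        #include <stdio.h>

        struct dyn { int dynticks; };   /* even: dynticks-idle, odd: not idle */

        static void enter_idle(struct dyn *d) { d->dynticks++; }       /* odd -> even */
        static void exit_idle(struct dyn *d)  { d->dynticks++; }       /* even -> odd */
        static int  cpu_is_dynticks_idle(const struct dyn *d)
        {
                return !(d->dynticks & 1);
        }

        int main(void)
        {
                struct dyn d = { .dynticks = 1 };       /* assume CPU starts non-idle */

                enter_idle(&d);
                printf("idle=%d\n", cpu_is_dynticks_idle(&d));  /* 1 */
                exit_idle(&d);
                printf("idle=%d\n", cpu_is_dynticks_idle(&d));  /* 0 */
                return 0;
        }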
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
174
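The nxttail[] layout described in the comment above can be illustrated with this stand-alone sketch (assumed names, not kernel code): one singly linked list plus an array of tail pointers marking segment boundaries, where an empty list has every tail pointing at the head pointer and new callbacks are appended through the NEXT tail.

        #include <stdio.h>

        struct cb { struct cb *next; int id; };

        #define SEG_DONE        0
        #define SEG_WAIT        1
        #define SEG_NEXT_READY  2
        #define SEG_NEXT        3
        #define NSEG            4

        struct cblist {
                struct cb *head;
                struct cb **tail[NSEG];         /* plays the role of nxttail[] */
        };

        static void cblist_init(struct cblist *l)
        {
                l->head = NULL;
                for (int i = 0; i < NSEG; i++)
                        l->tail[i] = &l->head;  /* empty: all segments empty */
        }

        static void cblist_enqueue(struct cblist *l, struct cb *c)
        {
                c->next = NULL;
                *l->tail[SEG_NEXT] = c;         /* new callbacks join the NEXT segment */
                l->tail[SEG_NEXT] = &c->next;
        }

        int main(void)
        {
                struct cblist l;
                struct cb a = { .id = 1 }, b = { .id = 2 };

                cblist_init(&l);
                cblist_enqueue(&l, &a);
                cblist_enqueue(&l, &b);
                for (struct cb *p = l.head; p; p = p->next)
                        printf("callback %d queued\n", p->id);
                return 0;
        }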
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..47789369ea59
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs_record(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed;
72}
73
74/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd.
76 * If we are in an RCU read-side critical section, we need to reflect
77 * that in the state of the rcu_node structure corresponding to this CPU.
78 * Caller must disable hardirqs.
79 */
80static void rcu_preempt_qs(int cpu)
81{
82 struct task_struct *t = current;
83 int phase;
84 struct rcu_data *rdp;
85 struct rcu_node *rnp;
86
87 if (t->rcu_read_lock_nesting &&
88 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
89
90 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode;
93 spin_lock(&rnp->lock);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp;
96
97 /*
98 * If this CPU has already checked in, then this task
99 * will hold up the next grace period rather than the
100 * current grace period. Queue the task accordingly.
101 * If the task is queued for the current grace period
102 * (i.e., this CPU has not yet passed through a quiescent
103 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period
105 * cannot end.
106 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */
110 spin_unlock(&rnp->lock);
111 }
112
113 /*
114 * Either we were not in an RCU read-side critical section to
115 * begin with, or we have now recorded that critical section
116 * globally. Either way, we can now note a quiescent state
117 * for this CPU. Again, if we were in an RCU read-side critical
118 * section, and if that critical section was blocking the current
119 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period.
121 */
122 rcu_preempt_qs_record(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
124 RCU_READ_UNLOCK_GOT_QS);
125}
126
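A hypothetical illustration of the phase computation used above: a blocked task is queued on blocked_tasks[gpnum & 1] when this CPU still owes the current grace period a quiescent state, and on the other list when it has already checked in.

        #include <stdio.h>

        static int blocked_list_index(unsigned long qsmask, unsigned long grpmask,
                                      long gpnum)
        {
                int already_checked_in = !(qsmask & grpmask);

                return already_checked_in ^ (int)(gpnum & 0x1);
        }

        int main(void)
        {
                /* CPU's bit still set in qsmask, odd GP: block the current GP. */
                printf("index=%d\n", blocked_list_index(0x2, 0x2, 7));  /* 1 */
                /* CPU already reported its QS: task only blocks the next GP. */
                printf("index=%d\n", blocked_list_index(0x0, 0x2, 7));  /* 0 */
                return 0;
        }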
127/*
128 * Tree-preemptable RCU implementation for rcu_read_lock().
129 * Just increment ->rcu_read_lock_nesting, shared state will be updated
130 * if we block.
131 */
132void __rcu_read_lock(void)
133{
134 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
135 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
136}
137EXPORT_SYMBOL_GPL(__rcu_read_lock);
138
139static void rcu_read_unlock_special(struct task_struct *t)
140{
141 int empty;
142 unsigned long flags;
143 unsigned long mask;
144 struct rcu_node *rnp;
145 int special;
146
147 /* NMI handlers cannot block and cannot safely manipulate state. */
148 if (in_nmi())
149 return;
150
151 local_irq_save(flags);
152
153 /*
154 * If RCU core is waiting for this CPU to exit critical section,
155 * let it know that we have done so.
156 */
157 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
161 }
162
163 /* Hardware IRQ handlers cannot block. */
164 if (in_irq()) {
165 local_irq_restore(flags);
166 return;
167 }
168
169 /* Clean up if blocked during RCU read-side critical section. */
170 if (special & RCU_READ_UNLOCK_BLOCKED) {
171 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
172
173 /*
174 * Remove this task from the list it blocked on. The
175 * task can migrate while we acquire the lock, but at
176 * most one time. So at most two passes through loop.
177 */
178 for (;;) {
179 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock);
181 if (rnp == t->rcu_blocked_node)
182 break;
183 spin_unlock(&rnp->lock);
184 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
186 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL;
188
189 /*
190 * If this was the last task on the current list, and if
191 * we aren't waiting on any CPUs, report the quiescent state.
192 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
193 * drop rnp->lock and restore irq.
194 */
195 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197 t->rcu_read_unlock_special &=
198 ~(RCU_READ_UNLOCK_NEED_QS |
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
203 return;
204 }
205 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
211 return;
212 }
213 spin_unlock(&rnp->lock);
214 }
215 local_irq_restore(flags);
216}
217
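The two-pass locking loop above can be sketched in user space as follows, with pthread mutexes standing in for rnp->lock (an assumption, not kernel code): re-read the pointer after acquiring the lock and retry if the task migrated in the meantime; since migration can happen at most once here, the loop runs at most twice.

        #include <pthread.h>
        #include <stdio.h>

        struct node { pthread_mutex_t lock; };
        struct task { struct node *blocked_node; };

        static struct node *lock_blocked_node(struct task *t)
        {
                struct node *n;

                for (;;) {
                        n = t->blocked_node;
                        pthread_mutex_lock(&n->lock);
                        if (n == t->blocked_node)
                                return n;               /* still on this node */
                        pthread_mutex_unlock(&n->lock); /* migrated: retry */
                }
        }

        int main(void)
        {
                struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER };
                struct task t = { .blocked_node = &n };

                pthread_mutex_unlock(&lock_blocked_node(&t)->lock);
                printf("locked and released the right node\n");
                return 0;
        }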
218/*
219 * Tree-preemptable RCU implementation for rcu_read_unlock().
220 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
221 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
222 * invoke rcu_read_unlock_special() to clean up after a context switch
223 * in an RCU read-side critical section and other special cases.
224 */
225void __rcu_read_unlock(void)
226{
227 struct task_struct *t = current;
228
229 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
230 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
231 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
232 rcu_read_unlock_special(t);
233}
234EXPORT_SYMBOL_GPL(__rcu_read_unlock);
235
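A minimal sketch, with invented field and function names, of the nesting-counter contract implemented above: only the outermost rcu_read_unlock() examines the special-state word and performs cleanup.

        #include <stdio.h>

        struct task { int nesting; int special; };      /* mirrors the two task fields */

        static void read_lock(struct task *t)  { t->nesting++; }
        static void read_unlock(struct task *t)
        {
                if (--t->nesting == 0 && t->special) {
                        printf("outermost unlock: run special cleanup\n");
                        t->special = 0;
                }
        }

        int main(void)
        {
                struct task t = { 0, 0 };

                read_lock(&t);
                read_lock(&t);          /* nested read-side section */
                t.special = 1;          /* pretend we blocked while inside */
                read_unlock(&t);        /* inner unlock: nothing happens */
                read_unlock(&t);        /* outermost unlock: cleanup runs */
                return 0;
        }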
236#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
237
238/*
239 * Scan the current list of tasks blocked within RCU read-side critical
240 * sections, printing out the tid of each.
241 */
242static void rcu_print_task_stall(struct rcu_node *rnp)
243{
244 unsigned long flags;
245 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1;
247 struct task_struct *t;
248
249 if (!list_empty(&rnp->blocked_tasks[phase])) {
250 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */
252 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid);
255 spin_unlock_irqrestore(&rnp->lock, flags);
256 }
257}
258
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260
261/*
262 * Check for preempted RCU readers for the specified rcu_node structure.
263 * If the caller needs a reliable answer, it must hold the rcu_node's
 264 * ->lock.
265 */
266static int rcu_preempted_readers(struct rcu_node *rnp)
267{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
269}
270
271#ifdef CONFIG_HOTPLUG_CPU
272
273/*
274 * Handle tasklist migration for case in which all CPUs covered by the
275 * specified rcu_node have gone offline. Move them up to the root
276 * rcu_node. The reason for not just moving them to the immediate
277 * parent is to remove the need for rcu_read_unlock_special() to
278 * make more than two attempts to acquire the target rcu_node's lock.
279 *
280 * The caller must hold rnp->lock with irqs disabled.
281 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp)
284{
285 int i;
286 struct list_head *lp;
287 struct list_head *lp_root;
288 struct rcu_node *rnp_root = rcu_get_root(rsp);
289 struct task_struct *tp;
290
291 if (rnp == rnp_root) {
292 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */
294 }
295
296 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the
298 * root rcu_node can be at most one ahead of the rest of the
299 * rcu_nodes in terms of gp_num value. This fact allows us to
300 * move the blocked_tasks[] array directly, element by element.
301 */
302 for (i = 0; i < 2; i++) {
303 lp = &rnp->blocked_tasks[i];
304 lp_root = &rnp_root->blocked_tasks[i];
305 while (!list_empty(lp)) {
306 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
307 spin_lock(&rnp_root->lock); /* irqs already disabled */
308 list_del(&tp->rcu_node_entry);
309 tp->rcu_blocked_node = rnp_root;
310 list_add(&tp->rcu_node_entry, lp_root);
311 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
312 }
313 }
314}
315
316/*
317 * Do CPU-offline processing for preemptable RCU.
318 */
319static void rcu_preempt_offline_cpu(int cpu)
320{
321 __rcu_offline_cpu(cpu, &rcu_preempt_state);
322}
323
324#endif /* #ifdef CONFIG_HOTPLUG_CPU */
325
326/*
327 * Check for a quiescent state from the current CPU. When a task blocks,
328 * the task is recorded in the corresponding CPU's rcu_node structure,
329 * which is checked elsewhere.
330 *
331 * Caller must disable hard irqs.
332 */
333static void rcu_preempt_check_callbacks(int cpu)
334{
335 struct task_struct *t = current;
336
337 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &=
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
340 rcu_preempt_qs_record(cpu);
341 return;
342 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352}
353
354/*
355 * Process callbacks for preemptable RCU.
356 */
357static void rcu_preempt_process_callbacks(void)
358{
359 __rcu_process_callbacks(&rcu_preempt_state,
360 &__get_cpu_var(rcu_preempt_data));
361}
362
363/*
364 * Queue a preemptable-RCU callback for invocation after a grace period.
365 */
366void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
367{
368 __call_rcu(head, func, &rcu_preempt_state);
369}
370EXPORT_SYMBOL_GPL(call_rcu);
371
372/*
373 * Check to see if there is any immediate preemptable-RCU-related work
374 * to be done.
375 */
376static int rcu_preempt_pending(int cpu)
377{
378 return __rcu_pending(&rcu_preempt_state,
379 &per_cpu(rcu_preempt_data, cpu));
380}
381
382/*
383 * Does preemptable RCU need the CPU to stay out of dynticks mode?
384 */
385static int rcu_preempt_needs_cpu(int cpu)
386{
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388}
389
390/*
391 * Initialize preemptable RCU's per-CPU data.
392 */
393static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
394{
395 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
396}
397
398/*
399 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep
402 * is enabled.
403 */
404void exit_rcu(void)
405{
406 struct task_struct *t = current;
407
408 if (t->rcu_read_lock_nesting == 0)
409 return;
410 t->rcu_read_lock_nesting = 1;
411 rcu_read_unlock();
412}
413
414#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
415
416/*
417 * Tell them what RCU they are running.
418 */
419static inline void rcu_bootup_announce(void)
420{
421 printk(KERN_INFO "Hierarchical RCU implementation.\n");
422}
423
424/*
425 * Return the number of RCU batches processed thus far for debug & stats.
426 */
427long rcu_batches_completed(void)
428{
429 return rcu_batches_completed_sched();
430}
431EXPORT_SYMBOL_GPL(rcu_batches_completed);
432
433/*
434 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states.
436 */
437static void rcu_preempt_qs(int cpu)
438{
439}
440
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
442
443/*
444 * Because preemptable RCU does not exist, we never have to check for
445 * tasks blocked within RCU read-side critical sections.
446 */
447static void rcu_print_task_stall(struct rcu_node *rnp)
448{
449}
450
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452
453/*
454 * Because preemptable RCU does not exist, there are never any preempted
455 * RCU readers.
456 */
457static int rcu_preempted_readers(struct rcu_node *rnp)
458{
459 return 0;
460}
461
462#ifdef CONFIG_HOTPLUG_CPU
463
464/*
465 * Because preemptable RCU does not exist, it never needs to migrate
466 * tasks that were blocked within RCU read-side critical sections.
467 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp)
470{
471}
472
473/*
474 * Because preemptable RCU does not exist, it never needs CPU-offline
475 * processing.
476 */
477static void rcu_preempt_offline_cpu(int cpu)
478{
479}
480
481#endif /* #ifdef CONFIG_HOTPLUG_CPU */
482
483/*
484 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check.
486 */
487void rcu_preempt_check_callbacks(int cpu)
488{
489}
490
491/*
492 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process.
494 */
495void rcu_preempt_process_callbacks(void)
496{
497}
498
499/*
500 * In classic RCU, call_rcu() is just call_rcu_sched().
501 */
502void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
503{
504 call_rcu_sched(head, func);
505}
506EXPORT_SYMBOL_GPL(call_rcu);
507
508/*
509 * Because preemptable RCU does not exist, it never has any work to do.
510 */
511static int rcu_preempt_pending(int cpu)
512{
513 return 0;
514}
515
516/*
517 * Because preemptable RCU does not exist, it never needs any CPU.
518 */
519static int rcu_preempt_needs_cpu(int cpu)
520{
521 return 0;
522}
523
524/*
525 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize.
527 */
528static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{
530}
531
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..0ea1bff69727 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c803..e1338f074314 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
18void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
133 unsigned long long *res) 133 unsigned long long *res)
134{ 134{
135 char *end; 135 char *end;
136
137 /* return RESOURCE_MAX(unlimited) if "-1" is specified */
138 if (*buf == '-') {
139 *res = simple_strtoull(buf + 1, &end, 10);
140 if (*res != 1 || *end != '\0')
141 return -EINVAL;
142 *res = RESOURCE_MAX;
143 return 0;
144 }
145
136 /* FIXME - make memparse() take const char* args */ 146 /* FIXME - make memparse() take const char* args */
137 *res = memparse((char *)buf, &end); 147 *res = memparse((char *)buf, &end);
138 if (*end != '\0') 148 if (*end != '\0')
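A user-space sketch of the new "-1 means unlimited" convention added above, with strtoull() standing in for memparse() and RESOURCE_MAX assumed to be ULLONG_MAX.

        #include <stdio.h>
        #include <stdlib.h>
        #include <errno.h>
        #include <limits.h>

        #define RESOURCE_MAX ULLONG_MAX         /* assumed value for this sketch */

        static int parse_limit(const char *buf, unsigned long long *res)
        {
                char *end;

                if (*buf == '-') {
                        *res = strtoull(buf + 1, &end, 10);
                        if (*res != 1 || *end != '\0')
                                return -EINVAL;         /* only "-1" is accepted */
                        *res = RESOURCE_MAX;
                        return 0;
                }
                *res = strtoull(buf, &end, 10);         /* memparse() would also take K/M/G */
                return *end == '\0' ? 0 : -EINVAL;
        }

        int main(void)
        {
                unsigned long long v;

                if (parse_limit("-1", &v) == 0)
                        printf("unlimited: %llu\n", v);
                return 0;
        }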
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5a..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { 1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1044 1043 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1; 1045 return 1;
1047 } 1046 }
1048 1047
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050 1049
1051
1052 if (ret && !waiter->task) { 1050 if (ret && !waiter->task) {
1053 /* 1051 /*
1054 * Reset the return value. We might have 1052 * Reset the return value. We might have
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fb88a906aaa..faf4d463bbff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 376
402#else 377#else
403 378
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
413{ 381{
@@ -493,6 +461,7 @@ struct rt_rq {
493#endif 461#endif
494#ifdef CONFIG_SMP 462#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 463 unsigned long rt_nr_migratory;
464 unsigned long rt_nr_total;
496 int overloaded; 465 int overloaded;
497 struct plist_head pushable_tasks; 466 struct plist_head pushable_tasks;
498#endif 467#endif
@@ -536,14 +505,6 @@ struct root_domain {
536#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
537 struct cpupri cpupri; 506 struct cpupri cpupri;
538#endif 507#endif
539#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
540 /*
541 * Preferred wake up cpu nominated by sched_mc balance that will be
542 * used when most cpus are idle in the system indicating overall very
543 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
544 */
545 unsigned int sched_mc_preferred_wakeup_cpu;
546#endif
547}; 508};
548 509
549/* 510/*
@@ -615,6 +576,7 @@ struct rq {
615 576
616 unsigned char idle_at_tick; 577 unsigned char idle_at_tick;
617 /* For active balancing */ 578 /* For active balancing */
579 int post_schedule;
618 int active_balance; 580 int active_balance;
619 int push_cpu; 581 int push_cpu;
620 /* cpu of this runqueue: */ 582 /* cpu of this runqueue: */
@@ -625,6 +587,9 @@ struct rq {
625 587
626 struct task_struct *migration_thread; 588 struct task_struct *migration_thread;
627 struct list_head migration_queue; 589 struct list_head migration_queue;
590
591 u64 rt_avg;
592 u64 age_stamp;
628#endif 593#endif
629 594
630 /* calc_load related fields */ 595 /* calc_load related fields */
@@ -664,9 +629,10 @@ struct rq {
664 629
665static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
666 631
667static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
668{ 634{
669 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
670} 636}
671 637
672static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -692,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
692#define this_rq() (&__get_cpu_var(runqueues)) 658#define this_rq() (&__get_cpu_var(runqueues))
693#define task_rq(p) cpu_rq(task_cpu(p)) 659#define task_rq(p) cpu_rq(task_cpu(p))
694#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 660#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
661#define raw_rq() (&__raw_get_cpu_var(runqueues))
695 662
696inline void update_rq_clock(struct rq *rq) 663inline void update_rq_clock(struct rq *rq)
697{ 664{
@@ -860,6 +827,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
860unsigned int sysctl_sched_shares_thresh = 4; 827unsigned int sysctl_sched_shares_thresh = 4;
861 828
862/* 829/*
830 * period over which we average the RT time consumption, measured
831 * in ms.
832 *
833 * default: 1s
834 */
835const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
836
837/*
863 * period over which we measure -rt task cpu usage in us. 838 * period over which we measure -rt task cpu usage in us.
864 * default: 1s 839 * default: 1s
865 */ 840 */
@@ -1277,12 +1252,37 @@ void wake_up_idle_cpu(int cpu)
1277} 1252}
1278#endif /* CONFIG_NO_HZ */ 1253#endif /* CONFIG_NO_HZ */
1279 1254
1255static u64 sched_avg_period(void)
1256{
1257 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1258}
1259
1260static void sched_avg_update(struct rq *rq)
1261{
1262 s64 period = sched_avg_period();
1263
1264 while ((s64)(rq->clock - rq->age_stamp) > period) {
1265 rq->age_stamp += period;
1266 rq->rt_avg /= 2;
1267 }
1268}
1269
1270static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1271{
1272 rq->rt_avg += rt_delta;
1273 sched_avg_update(rq);
1274}
1275
1280#else /* !CONFIG_SMP */ 1276#else /* !CONFIG_SMP */
1281static void resched_task(struct task_struct *p) 1277static void resched_task(struct task_struct *p)
1282{ 1278{
1283 assert_spin_locked(&task_rq(p)->lock); 1279 assert_spin_locked(&task_rq(p)->lock);
1284 set_tsk_need_resched(p); 1280 set_tsk_need_resched(p);
1285} 1281}
1282
1283static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1284{
1285}
1286#endif /* CONFIG_SMP */ 1286#endif /* CONFIG_SMP */
1287 1287
1288#if BITS_PER_LONG == 32 1288#if BITS_PER_LONG == 32
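A stand-alone sketch, with assumed field names and the default 1 s averaging window, of the rt_avg bookkeeping added in the hunk above: RT runtime accumulates in rt_avg and is halved each time the clock advances past another sched_avg_period(), approximating an exponential average of RT CPU consumption.

        #include <stdio.h>
        #include <stdint.h>

        #define NSEC_PER_MSEC 1000000ULL
        static const unsigned int sysctl_sched_time_avg = 1000;        /* ms, default 1s */

        struct rq { uint64_t clock, age_stamp, rt_avg; };

        static uint64_t sched_avg_period(void)
        {
                return (uint64_t)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
        }

        static void sched_rt_avg_update(struct rq *rq, uint64_t rt_delta)
        {
                rq->rt_avg += rt_delta;
                while ((int64_t)(rq->clock - rq->age_stamp) > (int64_t)sched_avg_period()) {
                        rq->age_stamp += sched_avg_period();
                        rq->rt_avg /= 2;
                }
        }

        int main(void)
        {
                struct rq rq = { .clock = 3 * sched_avg_period() };

                sched_rt_avg_update(&rq, 600 * NSEC_PER_MSEC);
                printf("decayed rt_avg = %llu ms\n",
                       (unsigned long long)(rq.rt_avg / NSEC_PER_MSEC));  /* 600/4 = 150 */
                return 0;
        }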
@@ -1493,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)
1493#endif 1493#endif
1494 1494
1495#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
1496static unsigned long source_load(int cpu, int type); 1496/* Used instead of source_load when we know the type == 0 */
1497static unsigned long target_load(int cpu, int type); 1497static unsigned long weighted_cpuload(const int cpu)
1498{
1499 return cpu_rq(cpu)->load.weight;
1500}
1501
1502/*
1503 * Return a low guess at the load of a migration-source cpu weighted
1504 * according to the scheduling class and "nice" value.
1505 *
1506 * We want to under-estimate the load of migration sources, to
1507 * balance conservatively.
1508 */
1509static unsigned long source_load(int cpu, int type)
1510{
1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long total = weighted_cpuload(cpu);
1513
1514 if (type == 0 || !sched_feat(LB_BIAS))
1515 return total;
1516
1517 return min(rq->cpu_load[type-1], total);
1518}
1519
1520/*
1521 * Return a high guess at the load of a migration-target cpu weighted
1522 * according to the scheduling class and "nice" value.
1523 */
1524static unsigned long target_load(int cpu, int type)
1525{
1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long total = weighted_cpuload(cpu);
1528
1529 if (type == 0 || !sched_feat(LB_BIAS))
1530 return total;
1531
1532 return max(rq->cpu_load[type-1], total);
1533}
1534
1535static struct sched_group *group_of(int cpu)
1536{
1537 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1538
1539 if (!sd)
1540 return NULL;
1541
1542 return sd->groups;
1543}
1544
1545static unsigned long power_of(int cpu)
1546{
1547 struct sched_group *group = group_of(cpu);
1548
1549 if (!group)
1550 return SCHED_LOAD_SCALE;
1551
1552 return group->cpu_power;
1553}
1554
1498static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1499 1556
1500static unsigned long cpu_avg_load_per_task(int cpu) 1557static unsigned long cpu_avg_load_per_task(int cpu)
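/*
 * Illustrative sketch (not part of this commit): with LB_BIAS enabled,
 * source_load() deliberately under-estimates and target_load() deliberately
 * over-estimates a cpu's load by taking min()/max() against the decayed
 * cpu_load[] history, so the balancer moves tasks conservatively.  A
 * stand-alone model of that min/max biasing with made-up numbers:
 */
#include <stdio.h>

static unsigned long minl(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long maxl(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	unsigned long instant = 2048;	/* current weighted_cpuload() */
	unsigned long history = 1536;	/* rq->cpu_load[type-1]       */

	printf("source (low guess):  %lu\n", minl(history, instant));	/* 1536 */
	printf("target (high guess): %lu\n", maxl(history, instant));	/* 2048 */
	return 0;
}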
@@ -1512,28 +1569,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1512 1569
1513#ifdef CONFIG_FAIR_GROUP_SCHED 1570#ifdef CONFIG_FAIR_GROUP_SCHED
1514 1571
1572struct update_shares_data {
1573 unsigned long rq_weight[NR_CPUS];
1574};
1575
1576static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1577
1515static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1578static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1516 1579
1517/* 1580/*
1518 * Calculate and set the cpu's group shares. 1581 * Calculate and set the cpu's group shares.
1519 */ 1582 */
1520static void 1583static void update_group_shares_cpu(struct task_group *tg, int cpu,
1521update_group_shares_cpu(struct task_group *tg, int cpu, 1584 unsigned long sd_shares,
1522 unsigned long sd_shares, unsigned long sd_rq_weight) 1585 unsigned long sd_rq_weight,
1586 struct update_shares_data *usd)
1523{ 1587{
1524 unsigned long shares; 1588 unsigned long shares, rq_weight;
1525 unsigned long rq_weight; 1589 int boost = 0;
1526
1527 if (!tg->se[cpu])
1528 return;
1529 1590
1530 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1591 rq_weight = usd->rq_weight[cpu];
1592 if (!rq_weight) {
1593 boost = 1;
1594 rq_weight = NICE_0_LOAD;
1595 }
1531 1596
1532 /* 1597 /*
1533 * \Sum shares * rq_weight 1598 * \Sum_j shares_j * rq_weight_i
1534 * shares = ----------------------- 1599 * shares_i = -----------------------------
1535 * \Sum rq_weight 1600 * \Sum_j rq_weight_j
1536 *
1537 */ 1601 */
1538 shares = (sd_shares * rq_weight) / sd_rq_weight; 1602 shares = (sd_shares * rq_weight) / sd_rq_weight;
1539 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1603 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1544,8 +1608,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1544 unsigned long flags; 1608 unsigned long flags;
1545 1609
1546 spin_lock_irqsave(&rq->lock, flags); 1610 spin_lock_irqsave(&rq->lock, flags);
1547 tg->cfs_rq[cpu]->shares = shares; 1611 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1548 1612 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1549 __set_se_shares(tg->se[cpu], shares); 1613 __set_se_shares(tg->se[cpu], shares);
1550 spin_unlock_irqrestore(&rq->lock, flags); 1614 spin_unlock_irqrestore(&rq->lock, flags);
1551 } 1615 }
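/*
 * Illustrative sketch (not part of this commit): the per-cpu share computed
 * by update_group_shares_cpu() above is the group's total shares split in
 * proportion to each cpu's runqueue weight, clamped to a fixed range.  The
 * MIN/MAX constants below are stand-ins rather than the kernel's macros.
 */
#include <stdio.h>

#define MIN_SHARES	2UL
#define MAX_SHARES	(1UL << 18)

static unsigned long cpu_share(unsigned long sd_shares,
			       unsigned long rq_weight_i,
			       unsigned long sum_rq_weight)
{
	unsigned long shares = sd_shares * rq_weight_i / sum_rq_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > MAX_SHARES)
		shares = MAX_SHARES;
	return shares;
}

int main(void)
{
	/* group with 1024 shares; cpu0 carries 3/4 of the runnable weight */
	printf("cpu0: %lu\n", cpu_share(1024, 3072, 4096));	/* 768 */
	printf("cpu1: %lu\n", cpu_share(1024, 1024, 4096));	/* 256 */
	return 0;
}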
@@ -1558,22 +1622,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1558 */ 1622 */
1559static int tg_shares_up(struct task_group *tg, void *data) 1623static int tg_shares_up(struct task_group *tg, void *data)
1560{ 1624{
1561 unsigned long weight, rq_weight = 0; 1625 unsigned long weight, rq_weight = 0, shares = 0;
1562 unsigned long shares = 0; 1626 struct update_shares_data *usd;
1563 struct sched_domain *sd = data; 1627 struct sched_domain *sd = data;
1628 unsigned long flags;
1564 int i; 1629 int i;
1565 1630
1631 if (!tg->se[0])
1632 return 0;
1633
1634 local_irq_save(flags);
1635 usd = &__get_cpu_var(update_shares_data);
1636
1566 for_each_cpu(i, sched_domain_span(sd)) { 1637 for_each_cpu(i, sched_domain_span(sd)) {
1638 weight = tg->cfs_rq[i]->load.weight;
1639 usd->rq_weight[i] = weight;
1640
1567 /* 1641 /*
1568 * If there are currently no tasks on the cpu pretend there 1642 * If there are currently no tasks on the cpu pretend there
1569 * is one of average load so that when a new task gets to 1643 * is one of average load so that when a new task gets to
1570 * run here it will not get delayed by group starvation. 1644 * run here it will not get delayed by group starvation.
1571 */ 1645 */
1572 weight = tg->cfs_rq[i]->load.weight;
1573 if (!weight) 1646 if (!weight)
1574 weight = NICE_0_LOAD; 1647 weight = NICE_0_LOAD;
1575 1648
1576 tg->cfs_rq[i]->rq_weight = weight;
1577 rq_weight += weight; 1649 rq_weight += weight;
1578 shares += tg->cfs_rq[i]->shares; 1650 shares += tg->cfs_rq[i]->shares;
1579 } 1651 }
@@ -1585,7 +1657,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1585 shares = tg->shares; 1657 shares = tg->shares;
1586 1658
1587 for_each_cpu(i, sched_domain_span(sd)) 1659 for_each_cpu(i, sched_domain_span(sd))
1588 update_group_shares_cpu(tg, i, shares, rq_weight); 1660 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1661
1662 local_irq_restore(flags);
1589 1663
1590 return 0; 1664 return 0;
1591} 1665}
@@ -1615,8 +1689,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1615 1689
1616static void update_shares(struct sched_domain *sd) 1690static void update_shares(struct sched_domain *sd)
1617{ 1691{
1618 u64 now = cpu_clock(raw_smp_processor_id()); 1692 s64 elapsed;
1619 s64 elapsed = now - sd->last_update; 1693 u64 now;
1694
1695 if (root_task_group_empty())
1696 return;
1697
1698 now = cpu_clock(raw_smp_processor_id());
1699 elapsed = now - sd->last_update;
1620 1700
1621 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1701 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1622 sd->last_update = now; 1702 sd->last_update = now;
@@ -1626,6 +1706,9 @@ static void update_shares(struct sched_domain *sd)
1626 1706
1627static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1707static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1628{ 1708{
1709 if (root_task_group_empty())
1710 return;
1711
1629 spin_unlock(&rq->lock); 1712 spin_unlock(&rq->lock);
1630 update_shares(sd); 1713 update_shares(sd);
1631 spin_lock(&rq->lock); 1714 spin_lock(&rq->lock);
@@ -1633,6 +1716,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1633 1716
1634static void update_h_load(long cpu) 1717static void update_h_load(long cpu)
1635{ 1718{
1719 if (root_task_group_empty())
1720 return;
1721
1636 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1722 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1637} 1723}
1638 1724
@@ -1650,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1650 1736
1651#ifdef CONFIG_PREEMPT 1737#ifdef CONFIG_PREEMPT
1652 1738
1739static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1740
1653/* 1741/*
1654 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1742 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1655 * way at the expense of forcing extra atomic operations in all 1743 * way at the expense of forcing extra atomic operations in all
@@ -1914,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1914} 2002}
1915 2003
1916#ifdef CONFIG_SMP 2004#ifdef CONFIG_SMP
1917
1918/* Used instead of source_load when we know the type == 0 */
1919static unsigned long weighted_cpuload(const int cpu)
1920{
1921 return cpu_rq(cpu)->load.weight;
1922}
1923
1924/* 2005/*
1925 * Is this task likely cache-hot: 2006 * Is this task likely cache-hot:
1926 */ 2007 */
@@ -1978,7 +2059,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1978 if (task_hot(p, old_rq->clock, NULL)) 2059 if (task_hot(p, old_rq->clock, NULL))
1979 schedstat_inc(p, se.nr_forced2_migrations); 2060 schedstat_inc(p, se.nr_forced2_migrations);
1980#endif 2061#endif
1981 perf_counter_task_migration(p, new_cpu); 2062 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2063 1, 1, NULL, 0);
1982 } 2064 }
1983 p->se.vruntime -= old_cfsrq->min_vruntime - 2065 p->se.vruntime -= old_cfsrq->min_vruntime -
1984 new_cfsrq->min_vruntime; 2066 new_cfsrq->min_vruntime;
@@ -2193,186 +2275,6 @@ void kick_process(struct task_struct *p)
2193 preempt_enable(); 2275 preempt_enable();
2194} 2276}
2195EXPORT_SYMBOL_GPL(kick_process); 2277EXPORT_SYMBOL_GPL(kick_process);
2196
2197/*
2198 * Return a low guess at the load of a migration-source cpu weighted
2199 * according to the scheduling class and "nice" value.
2200 *
2201 * We want to under-estimate the load of migration sources, to
2202 * balance conservatively.
2203 */
2204static unsigned long source_load(int cpu, int type)
2205{
2206 struct rq *rq = cpu_rq(cpu);
2207 unsigned long total = weighted_cpuload(cpu);
2208
2209 if (type == 0 || !sched_feat(LB_BIAS))
2210 return total;
2211
2212 return min(rq->cpu_load[type-1], total);
2213}
2214
2215/*
2216 * Return a high guess at the load of a migration-target cpu weighted
2217 * according to the scheduling class and "nice" value.
2218 */
2219static unsigned long target_load(int cpu, int type)
2220{
2221 struct rq *rq = cpu_rq(cpu);
2222 unsigned long total = weighted_cpuload(cpu);
2223
2224 if (type == 0 || !sched_feat(LB_BIAS))
2225 return total;
2226
2227 return max(rq->cpu_load[type-1], total);
2228}
2229
2230/*
2231 * find_idlest_group finds and returns the least busy CPU group within the
2232 * domain.
2233 */
2234static struct sched_group *
2235find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2236{
2237 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2238 unsigned long min_load = ULONG_MAX, this_load = 0;
2239 int load_idx = sd->forkexec_idx;
2240 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2241
2242 do {
2243 unsigned long load, avg_load;
2244 int local_group;
2245 int i;
2246
2247 /* Skip over this group if it has no CPUs allowed */
2248 if (!cpumask_intersects(sched_group_cpus(group),
2249 &p->cpus_allowed))
2250 continue;
2251
2252 local_group = cpumask_test_cpu(this_cpu,
2253 sched_group_cpus(group));
2254
2255 /* Tally up the load of all CPUs in the group */
2256 avg_load = 0;
2257
2258 for_each_cpu(i, sched_group_cpus(group)) {
2259 /* Bias balancing toward cpus of our domain */
2260 if (local_group)
2261 load = source_load(i, load_idx);
2262 else
2263 load = target_load(i, load_idx);
2264
2265 avg_load += load;
2266 }
2267
2268 /* Adjust by relative CPU power of the group */
2269 avg_load = sg_div_cpu_power(group,
2270 avg_load * SCHED_LOAD_SCALE);
2271
2272 if (local_group) {
2273 this_load = avg_load;
2274 this = group;
2275 } else if (avg_load < min_load) {
2276 min_load = avg_load;
2277 idlest = group;
2278 }
2279 } while (group = group->next, group != sd->groups);
2280
2281 if (!idlest || 100*this_load < imbalance*min_load)
2282 return NULL;
2283 return idlest;
2284}
2285
2286/*
2287 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2288 */
2289static int
2290find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2291{
2292 unsigned long load, min_load = ULONG_MAX;
2293 int idlest = -1;
2294 int i;
2295
2296 /* Traverse only the allowed CPUs */
2297 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2298 load = weighted_cpuload(i);
2299
2300 if (load < min_load || (load == min_load && i == this_cpu)) {
2301 min_load = load;
2302 idlest = i;
2303 }
2304 }
2305
2306 return idlest;
2307}
2308
2309/*
2310 * sched_balance_self: balance the current task (running on cpu) in domains
2311 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2312 * SD_BALANCE_EXEC.
2313 *
2314 * Balance, ie. select the least loaded group.
2315 *
2316 * Returns the target CPU number, or the same CPU if no balancing is needed.
2317 *
2318 * preempt must be disabled.
2319 */
2320static int sched_balance_self(int cpu, int flag)
2321{
2322 struct task_struct *t = current;
2323 struct sched_domain *tmp, *sd = NULL;
2324
2325 for_each_domain(cpu, tmp) {
2326 /*
2327 * If power savings logic is enabled for a domain, stop there.
2328 */
2329 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2330 break;
2331 if (tmp->flags & flag)
2332 sd = tmp;
2333 }
2334
2335 if (sd)
2336 update_shares(sd);
2337
2338 while (sd) {
2339 struct sched_group *group;
2340 int new_cpu, weight;
2341
2342 if (!(sd->flags & flag)) {
2343 sd = sd->child;
2344 continue;
2345 }
2346
2347 group = find_idlest_group(sd, t, cpu);
2348 if (!group) {
2349 sd = sd->child;
2350 continue;
2351 }
2352
2353 new_cpu = find_idlest_cpu(group, t, cpu);
2354 if (new_cpu == -1 || new_cpu == cpu) {
2355 /* Now try balancing at a lower domain level of cpu */
2356 sd = sd->child;
2357 continue;
2358 }
2359
2360 /* Now try balancing at a lower domain level of new_cpu */
2361 cpu = new_cpu;
2362 weight = cpumask_weight(sched_domain_span(sd));
2363 sd = NULL;
2364 for_each_domain(cpu, tmp) {
2365 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2366 break;
2367 if (tmp->flags & flag)
2368 sd = tmp;
2369 }
2370 /* while loop will break here if sd == NULL */
2371 }
2372
2373 return cpu;
2374}
2375
2376#endif /* CONFIG_SMP */ 2278#endif /* CONFIG_SMP */
2377 2279
2378/** 2280/**
@@ -2410,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p,
2410 * 2312 *
2411 * returns failure only if the task is already active. 2313 * returns failure only if the task is already active.
2412 */ 2314 */
2413static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2315static int try_to_wake_up(struct task_struct *p, unsigned int state,
2316 int wake_flags)
2414{ 2317{
2415 int cpu, orig_cpu, this_cpu, success = 0; 2318 int cpu, orig_cpu, this_cpu, success = 0;
2416 unsigned long flags; 2319 unsigned long flags;
2417 long old_state;
2418 struct rq *rq; 2320 struct rq *rq;
2419 2321
2420 if (!sched_feat(SYNC_WAKEUPS)) 2322 if (!sched_feat(SYNC_WAKEUPS))
2421 sync = 0; 2323 wake_flags &= ~WF_SYNC;
2422 2324
2423#ifdef CONFIG_SMP 2325 this_cpu = get_cpu();
2424 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2425 struct sched_domain *sd;
2426
2427 this_cpu = raw_smp_processor_id();
2428 cpu = task_cpu(p);
2429
2430 for_each_domain(this_cpu, sd) {
2431 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2432 update_shares(sd);
2433 break;
2434 }
2435 }
2436 }
2437#endif
2438 2326
2439 smp_wmb(); 2327 smp_wmb();
2440 rq = task_rq_lock(p, &flags); 2328 rq = task_rq_lock(p, &flags);
2441 update_rq_clock(rq); 2329 update_rq_clock(rq);
2442 old_state = p->state; 2330 if (!(p->state & state))
2443 if (!(old_state & state))
2444 goto out; 2331 goto out;
2445 2332
2446 if (p->se.on_rq) 2333 if (p->se.on_rq)
@@ -2448,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2448 2335
2449 cpu = task_cpu(p); 2336 cpu = task_cpu(p);
2450 orig_cpu = cpu; 2337 orig_cpu = cpu;
2451 this_cpu = smp_processor_id();
2452 2338
2453#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2454 if (unlikely(task_running(rq, p))) 2340 if (unlikely(task_running(rq, p)))
2455 goto out_activate; 2341 goto out_activate;
2456 2342
2457 cpu = p->sched_class->select_task_rq(p, sync); 2343 /*
2458 if (cpu != orig_cpu) { 2344 * In order to handle concurrent wakeups and release the rq->lock
2345 * we put the task in TASK_WAKING state.
2346 *
2347 * First fix up the nr_uninterruptible count:
2348 */
2349 if (task_contributes_to_load(p))
2350 rq->nr_uninterruptible--;
2351 p->state = TASK_WAKING;
2352 task_rq_unlock(rq, &flags);
2353
2354 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2355 if (cpu != orig_cpu)
2459 set_task_cpu(p, cpu); 2356 set_task_cpu(p, cpu);
2460 task_rq_unlock(rq, &flags);
2461 /* might preempt at this point */
2462 rq = task_rq_lock(p, &flags);
2463 old_state = p->state;
2464 if (!(old_state & state))
2465 goto out;
2466 if (p->se.on_rq)
2467 goto out_running;
2468 2357
2469 this_cpu = smp_processor_id(); 2358 rq = task_rq_lock(p, &flags);
2470 cpu = task_cpu(p); 2359 WARN_ON(p->state != TASK_WAKING);
2471 } 2360 cpu = task_cpu(p);
2472 2361
2473#ifdef CONFIG_SCHEDSTATS 2362#ifdef CONFIG_SCHEDSTATS
2474 schedstat_inc(rq, ttwu_count); 2363 schedstat_inc(rq, ttwu_count);
@@ -2488,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2488out_activate: 2377out_activate:
2489#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2490 schedstat_inc(p, se.nr_wakeups); 2379 schedstat_inc(p, se.nr_wakeups);
2491 if (sync) 2380 if (wake_flags & WF_SYNC)
2492 schedstat_inc(p, se.nr_wakeups_sync); 2381 schedstat_inc(p, se.nr_wakeups_sync);
2493 if (orig_cpu != cpu) 2382 if (orig_cpu != cpu)
2494 schedstat_inc(p, se.nr_wakeups_migrate); 2383 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2517,7 +2406,7 @@ out_activate:
2517 2406
2518out_running: 2407out_running:
2519 trace_sched_wakeup(rq, p, success); 2408 trace_sched_wakeup(rq, p, success);
2520 check_preempt_curr(rq, p, sync); 2409 check_preempt_curr(rq, p, wake_flags);
2521 2410
2522 p->state = TASK_RUNNING; 2411 p->state = TASK_RUNNING;
2523#ifdef CONFIG_SMP 2412#ifdef CONFIG_SMP
@@ -2526,6 +2415,7 @@ out_running:
2526#endif 2415#endif
2527out: 2416out:
2528 task_rq_unlock(rq, &flags); 2417 task_rq_unlock(rq, &flags);
2418 put_cpu();
2529 2419
2530 return success; 2420 return success;
2531} 2421}
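/*
 * Illustrative sketch (not part of this commit): the reworked
 * try_to_wake_up() above drops rq->lock while it picks a target cpu, and
 * parks the task in TASK_WAKING for the duration so no other waker can race
 * in.  A minimal user-space model of just the state transitions (locking,
 * migration and accounting elided; all names are stand-ins):
 */
#include <assert.h>
#include <stdio.h>

enum model_state { M_SLEEPING, M_WAKING, M_RUNNING };

static enum model_state model_wake(enum model_state s)
{
	if (s != M_SLEEPING)		/* !(p->state & state): nothing to do */
		return s;
	s = M_WAKING;			/* rq->lock is dropped here...        */
	/* ...select_task_rq() and set_task_cpu() run without the lock...    */
	assert(s == M_WAKING);		/* WARN_ON(p->state != TASK_WAKING)   */
	return M_RUNNING;		/* activated under the (new) rq->lock */
}

int main(void)
{
	printf("%d\n", model_wake(M_SLEEPING));	/* 2 == M_RUNNING     */
	printf("%d\n", model_wake(M_RUNNING));	/* already awake: 2   */
	return 0;
}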
@@ -2568,17 +2458,40 @@ static void __sched_fork(struct task_struct *p)
2568 p->se.avg_overlap = 0; 2458 p->se.avg_overlap = 0;
2569 p->se.start_runtime = 0; 2459 p->se.start_runtime = 0;
2570 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2460 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2461 p->se.avg_running = 0;
2571 2462
2572#ifdef CONFIG_SCHEDSTATS 2463#ifdef CONFIG_SCHEDSTATS
2573 p->se.wait_start = 0; 2464 p->se.wait_start = 0;
2574 p->se.sum_sleep_runtime = 0; 2465 p->se.wait_max = 0;
2575 p->se.sleep_start = 0; 2466 p->se.wait_count = 0;
2576 p->se.block_start = 0; 2467 p->se.wait_sum = 0;
2577 p->se.sleep_max = 0; 2468
2578 p->se.block_max = 0; 2469 p->se.sleep_start = 0;
2579 p->se.exec_max = 0; 2470 p->se.sleep_max = 0;
2580 p->se.slice_max = 0; 2471 p->se.sum_sleep_runtime = 0;
2581 p->se.wait_max = 0; 2472
2473 p->se.block_start = 0;
2474 p->se.block_max = 0;
2475 p->se.exec_max = 0;
2476 p->se.slice_max = 0;
2477
2478 p->se.nr_migrations_cold = 0;
2479 p->se.nr_failed_migrations_affine = 0;
2480 p->se.nr_failed_migrations_running = 0;
2481 p->se.nr_failed_migrations_hot = 0;
2482 p->se.nr_forced_migrations = 0;
2483 p->se.nr_forced2_migrations = 0;
2484
2485 p->se.nr_wakeups = 0;
2486 p->se.nr_wakeups_sync = 0;
2487 p->se.nr_wakeups_migrate = 0;
2488 p->se.nr_wakeups_local = 0;
2489 p->se.nr_wakeups_remote = 0;
2490 p->se.nr_wakeups_affine = 0;
2491 p->se.nr_wakeups_affine_attempts = 0;
2492 p->se.nr_wakeups_passive = 0;
2493 p->se.nr_wakeups_idle = 0;
2494
2582#endif 2495#endif
2583 2496
2584 INIT_LIST_HEAD(&p->rt.run_list); 2497 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2607,18 +2520,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
2607 2520
2608 __sched_fork(p); 2521 __sched_fork(p);
2609 2522
2610#ifdef CONFIG_SMP
2611 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2612#endif
2613 set_task_cpu(p, cpu);
2614
2615 /* 2523 /*
2616 * Make sure we do not leak PI boosting priority to the child: 2524 * Make sure we do not leak PI boosting priority to the child.
2617 */ 2525 */
2618 p->prio = current->normal_prio; 2526 p->prio = current->normal_prio;
2527
2528 /*
2529 * Revert to default priority/policy on fork if requested.
2530 */
2531 if (unlikely(p->sched_reset_on_fork)) {
2532 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2533 p->policy = SCHED_NORMAL;
2534
2535 if (p->normal_prio < DEFAULT_PRIO)
2536 p->prio = DEFAULT_PRIO;
2537
2538 if (PRIO_TO_NICE(p->static_prio) < 0) {
2539 p->static_prio = NICE_TO_PRIO(0);
2540 set_load_weight(p);
2541 }
2542
2543 /*
2544 * We don't need the reset flag anymore after the fork. It has
2545 * fulfilled its duty:
2546 */
2547 p->sched_reset_on_fork = 0;
2548 }
2549
2619 if (!rt_prio(p->prio)) 2550 if (!rt_prio(p->prio))
2620 p->sched_class = &fair_sched_class; 2551 p->sched_class = &fair_sched_class;
2621 2552
2553#ifdef CONFIG_SMP
2554 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2555#endif
2556 set_task_cpu(p, cpu);
2557
2622#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2558#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2623 if (likely(sched_info_on())) 2559 if (likely(sched_info_on()))
2624 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2560 memset(&p->sched_info, 0, sizeof(p->sched_info));
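/*
 * Illustrative sketch (not part of this commit): what the new
 * sched_reset_on_fork handling in sched_fork() boils down to.  RT policies
 * fall back to SCHED_NORMAL and a negative nice value is reset to 0, so a
 * child cannot inherit elevated scheduling settings once the parent marked
 * itself fork-reset.  Stand-alone model with local stand-in types:
 */
#include <stdio.h>

enum pol { NORMAL, FIFO, RR };

struct child { enum pol policy; int nice; int reset_on_fork; };

static void model_reset_on_fork(struct child *p)
{
	if (!p->reset_on_fork)
		return;
	if (p->policy == FIFO || p->policy == RR)
		p->policy = NORMAL;
	if (p->nice < 0)
		p->nice = 0;
	p->reset_on_fork = 0;		/* the flag has done its job */
}

int main(void)
{
	struct child c = { .policy = FIFO, .nice = -10, .reset_on_fork = 1 };

	model_reset_on_fork(&c);
	printf("policy=%d nice=%d\n", c.policy, c.nice);	/* 0 0 */
	return 0;
}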
@@ -2664,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2664 inc_nr_running(rq); 2600 inc_nr_running(rq);
2665 } 2601 }
2666 trace_sched_wakeup_new(rq, p, 1); 2602 trace_sched_wakeup_new(rq, p, 1);
2667 check_preempt_curr(rq, p, 0); 2603 check_preempt_curr(rq, p, WF_FORK);
2668#ifdef CONFIG_SMP 2604#ifdef CONFIG_SMP
2669 if (p->sched_class->task_wake_up) 2605 if (p->sched_class->task_wake_up)
2670 p->sched_class->task_wake_up(rq, p); 2606 p->sched_class->task_wake_up(rq, p);
@@ -2772,12 +2708,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2772{ 2708{
2773 struct mm_struct *mm = rq->prev_mm; 2709 struct mm_struct *mm = rq->prev_mm;
2774 long prev_state; 2710 long prev_state;
2775#ifdef CONFIG_SMP
2776 int post_schedule = 0;
2777
2778 if (current->sched_class->needs_post_schedule)
2779 post_schedule = current->sched_class->needs_post_schedule(rq);
2780#endif
2781 2711
2782 rq->prev_mm = NULL; 2712 rq->prev_mm = NULL;
2783 2713
@@ -2796,10 +2726,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796 finish_arch_switch(prev); 2726 finish_arch_switch(prev);
2797 perf_counter_task_sched_in(current, cpu_of(rq)); 2727 perf_counter_task_sched_in(current, cpu_of(rq));
2798 finish_lock_switch(rq, prev); 2728 finish_lock_switch(rq, prev);
2799#ifdef CONFIG_SMP
2800 if (post_schedule)
2801 current->sched_class->post_schedule(rq);
2802#endif
2803 2729
2804 fire_sched_in_preempt_notifiers(current); 2730 fire_sched_in_preempt_notifiers(current);
2805 if (mm) 2731 if (mm)
@@ -2814,6 +2740,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2814 } 2740 }
2815} 2741}
2816 2742
2743#ifdef CONFIG_SMP
2744
2745/* assumes rq->lock is held */
2746static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2747{
2748 if (prev->sched_class->pre_schedule)
2749 prev->sched_class->pre_schedule(rq, prev);
2750}
2751
2752/* rq->lock is NOT held, but preemption is disabled */
2753static inline void post_schedule(struct rq *rq)
2754{
2755 if (rq->post_schedule) {
2756 unsigned long flags;
2757
2758 spin_lock_irqsave(&rq->lock, flags);
2759 if (rq->curr->sched_class->post_schedule)
2760 rq->curr->sched_class->post_schedule(rq);
2761 spin_unlock_irqrestore(&rq->lock, flags);
2762
2763 rq->post_schedule = 0;
2764 }
2765}
2766
2767#else
2768
2769static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2770{
2771}
2772
2773static inline void post_schedule(struct rq *rq)
2774{
2775}
2776
2777#endif
2778
2817/** 2779/**
2818 * schedule_tail - first thing a freshly forked thread must call. 2780 * schedule_tail - first thing a freshly forked thread must call.
2819 * @prev: the thread we just switched away from. 2781 * @prev: the thread we just switched away from.
@@ -2824,6 +2786,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2824 struct rq *rq = this_rq(); 2786 struct rq *rq = this_rq();
2825 2787
2826 finish_task_switch(rq, prev); 2788 finish_task_switch(rq, prev);
2789
2790 /*
2791 * FIXME: do we need to worry about rq being invalidated by the
2792 * task_switch?
2793 */
2794 post_schedule(rq);
2795
2827#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2796#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2828 /* In this case, finish_task_switch does not reenable preemption */ 2797 /* In this case, finish_task_switch does not reenable preemption */
2829 preempt_enable(); 2798 preempt_enable();
@@ -3140,7 +3109,7 @@ out:
3140void sched_exec(void) 3109void sched_exec(void)
3141{ 3110{
3142 int new_cpu, this_cpu = get_cpu(); 3111 int new_cpu, this_cpu = get_cpu();
3143 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3112 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3144 put_cpu(); 3113 put_cpu();
3145 if (new_cpu != this_cpu) 3114 if (new_cpu != this_cpu)
3146 sched_migrate_task(current, new_cpu); 3115 sched_migrate_task(current, new_cpu);
@@ -3355,9 +3324,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3355{ 3324{
3356 const struct sched_class *class; 3325 const struct sched_class *class;
3357 3326
3358 for (class = sched_class_highest; class; class = class->next) 3327 for_each_class(class) {
3359 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3328 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3360 return 1; 3329 return 1;
3330 }
3361 3331
3362 return 0; 3332 return 0;
3363} 3333}
@@ -3520,7 +3490,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3520 * capacity but still has some space to pick up some load 3490 * capacity but still has some space to pick up some load
3521 * from other group and save more power 3491 * from other group and save more power
3522 */ 3492 */
3523 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3493 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3524 return; 3494 return;
3525 3495
3526 if (sgs->sum_nr_running > sds->leader_nr_running || 3496 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3559,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3559 *imbalance = sds->min_load_per_task; 3529 *imbalance = sds->min_load_per_task;
3560 sds->busiest = sds->group_min; 3530 sds->busiest = sds->group_min;
3561 3531
3562 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3563 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3564 group_first_cpu(sds->group_leader);
3565 }
3566
3567 return 1; 3532 return 1;
3568 3533
3569} 3534}
@@ -3588,6 +3553,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3588#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3553#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3589 3554
3590 3555
3556unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3557{
3558 return SCHED_LOAD_SCALE;
3559}
3560
3561unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3562{
3563 return default_scale_freq_power(sd, cpu);
3564}
3565
3566unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3567{
3568 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3569 unsigned long smt_gain = sd->smt_gain;
3570
3571 smt_gain /= weight;
3572
3573 return smt_gain;
3574}
3575
3576unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3577{
3578 return default_scale_smt_power(sd, cpu);
3579}
3580
3581unsigned long scale_rt_power(int cpu)
3582{
3583 struct rq *rq = cpu_rq(cpu);
3584 u64 total, available;
3585
3586 sched_avg_update(rq);
3587
3588 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3589 available = total - rq->rt_avg;
3590
3591 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3592 total = SCHED_LOAD_SCALE;
3593
3594 total >>= SCHED_LOAD_SHIFT;
3595
3596 return div_u64(available, total);
3597}
3598
3599static void update_cpu_power(struct sched_domain *sd, int cpu)
3600{
3601 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3602 unsigned long power = SCHED_LOAD_SCALE;
3603 struct sched_group *sdg = sd->groups;
3604
3605 if (sched_feat(ARCH_POWER))
3606 power *= arch_scale_freq_power(sd, cpu);
3607 else
3608 power *= default_scale_freq_power(sd, cpu);
3609
3610 power >>= SCHED_LOAD_SHIFT;
3611
3612 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3613 if (sched_feat(ARCH_POWER))
3614 power *= arch_scale_smt_power(sd, cpu);
3615 else
3616 power *= default_scale_smt_power(sd, cpu);
3617
3618 power >>= SCHED_LOAD_SHIFT;
3619 }
3620
3621 power *= scale_rt_power(cpu);
3622 power >>= SCHED_LOAD_SHIFT;
3623
3624 if (!power)
3625 power = 1;
3626
3627 sdg->cpu_power = power;
3628}
3629
3630static void update_group_power(struct sched_domain *sd, int cpu)
3631{
3632 struct sched_domain *child = sd->child;
3633 struct sched_group *group, *sdg = sd->groups;
3634 unsigned long power;
3635
3636 if (!child) {
3637 update_cpu_power(sd, cpu);
3638 return;
3639 }
3640
3641 power = 0;
3642
3643 group = child->groups;
3644 do {
3645 power += group->cpu_power;
3646 group = group->next;
3647 } while (group != child->groups);
3648
3649 sdg->cpu_power = power;
3650}
3651
3591/** 3652/**
3592 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3653 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3593 * @group: sched_group whose statistics are to be updated. 3654 * @group: sched_group whose statistics are to be updated.
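/*
 * Illustrative sketch (not part of this commit): update_cpu_power() above
 * starts from SCHED_LOAD_SCALE and successively scales it by the frequency
 * factor, the SMT factor and the fraction of time left over after RT
 * activity (scale_rt_power()).  A user-space model of that arithmetic with
 * assumed stand-in values (the real factors come from arch hooks and rq
 * state):
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

static unsigned long model_scale_rt(uint64_t total, uint64_t rt_avg)
{
	uint64_t available = total - rt_avg;

	if (total < SCHED_LOAD_SCALE)
		total = SCHED_LOAD_SCALE;
	total >>= SCHED_LOAD_SHIFT;
	return (unsigned long)(available / total);	/* ~1024 * idle fraction */
}

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;

	power = power * 1024 >> SCHED_LOAD_SHIFT;	/* freq factor: 1.0         */
	power = power * 589  >> SCHED_LOAD_SHIFT;	/* SMT factor: smt_gain / 2 */
	/* 25% of a 1s window consumed by RT tasks: */
	power = power * model_scale_rt(1000000000ULL, 250000000ULL)
			>> SCHED_LOAD_SHIFT;

	printf("cpu_power = %lu\n", power ? power : 1);
	return 0;
}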
@@ -3600,7 +3661,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3600 * @balance: Should we balance. 3661 * @balance: Should we balance.
3601 * @sgs: variable to hold the statistics for this group. 3662 * @sgs: variable to hold the statistics for this group.
3602 */ 3663 */
3603static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3664static inline void update_sg_lb_stats(struct sched_domain *sd,
3665 struct sched_group *group, int this_cpu,
3604 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3666 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3605 int local_group, const struct cpumask *cpus, 3667 int local_group, const struct cpumask *cpus,
3606 int *balance, struct sg_lb_stats *sgs) 3668 int *balance, struct sg_lb_stats *sgs)
@@ -3611,8 +3673,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3611 unsigned long sum_avg_load_per_task; 3673 unsigned long sum_avg_load_per_task;
3612 unsigned long avg_load_per_task; 3674 unsigned long avg_load_per_task;
3613 3675
3614 if (local_group) 3676 if (local_group) {
3615 balance_cpu = group_first_cpu(group); 3677 balance_cpu = group_first_cpu(group);
3678 if (balance_cpu == this_cpu)
3679 update_group_power(sd, this_cpu);
3680 }
3616 3681
3617 /* Tally up the load of all CPUs in the group */ 3682 /* Tally up the load of all CPUs in the group */
3618 sum_avg_load_per_task = avg_load_per_task = 0; 3683 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3661,8 +3726,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3661 } 3726 }
3662 3727
3663 /* Adjust by relative CPU power of the group */ 3728 /* Adjust by relative CPU power of the group */
3664 sgs->avg_load = sg_div_cpu_power(group, 3729 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3665 sgs->group_load * SCHED_LOAD_SCALE);
3666 3730
3667 3731
3668 /* 3732 /*
@@ -3674,14 +3738,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3674 * normalized nr_running number somewhere that negates 3738 * normalized nr_running number somewhere that negates
3675 * the hierarchy? 3739 * the hierarchy?
3676 */ 3740 */
3677 avg_load_per_task = sg_div_cpu_power(group, 3741 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3678 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3742 group->cpu_power;
3679 3743
3680 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3744 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3681 sgs->group_imb = 1; 3745 sgs->group_imb = 1;
3682 3746
3683 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3747 sgs->group_capacity =
3684 3748 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3685} 3749}
3686 3750
3687/** 3751/**
@@ -3699,9 +3763,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3699 const struct cpumask *cpus, int *balance, 3763 const struct cpumask *cpus, int *balance,
3700 struct sd_lb_stats *sds) 3764 struct sd_lb_stats *sds)
3701{ 3765{
3766 struct sched_domain *child = sd->child;
3702 struct sched_group *group = sd->groups; 3767 struct sched_group *group = sd->groups;
3703 struct sg_lb_stats sgs; 3768 struct sg_lb_stats sgs;
3704 int load_idx; 3769 int load_idx, prefer_sibling = 0;
3770
3771 if (child && child->flags & SD_PREFER_SIBLING)
3772 prefer_sibling = 1;
3705 3773
3706 init_sd_power_savings_stats(sd, sds, idle); 3774 init_sd_power_savings_stats(sd, sds, idle);
3707 load_idx = get_sd_load_idx(sd, idle); 3775 load_idx = get_sd_load_idx(sd, idle);
@@ -3712,14 +3780,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3712 local_group = cpumask_test_cpu(this_cpu, 3780 local_group = cpumask_test_cpu(this_cpu,
3713 sched_group_cpus(group)); 3781 sched_group_cpus(group));
3714 memset(&sgs, 0, sizeof(sgs)); 3782 memset(&sgs, 0, sizeof(sgs));
3715 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3783 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3716 local_group, cpus, balance, &sgs); 3784 local_group, cpus, balance, &sgs);
3717 3785
3718 if (local_group && balance && !(*balance)) 3786 if (local_group && balance && !(*balance))
3719 return; 3787 return;
3720 3788
3721 sds->total_load += sgs.group_load; 3789 sds->total_load += sgs.group_load;
3722 sds->total_pwr += group->__cpu_power; 3790 sds->total_pwr += group->cpu_power;
3791
3792 /*
3793 * In case the child domain prefers tasks go to siblings
3794 * first, lower the group capacity to one so that we'll try
3795 * and move all the excess tasks away.
3796 */
3797 if (prefer_sibling)
3798 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3723 3799
3724 if (local_group) { 3800 if (local_group) {
3725 sds->this_load = sgs.avg_load; 3801 sds->this_load = sgs.avg_load;
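/*
 * Illustrative sketch (not part of this commit): when the child domain sets
 * SD_PREFER_SIBLING, the parent clamps each group's capacity to a single
 * task so excess load gets pushed towards sibling groups.  The clamp in
 * isolation:
 */
#include <stdio.h>

static unsigned long model_clamp_capacity(unsigned long capacity,
					  int prefer_sibling)
{
	if (prefer_sibling && capacity > 1)
		capacity = 1;		/* min(sgs.group_capacity, 1UL) */
	return capacity;
}

int main(void)
{
	printf("%lu\n", model_clamp_capacity(4, 1));	/* 1 */
	printf("%lu\n", model_clamp_capacity(4, 0));	/* 4 */
	return 0;
}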
@@ -3739,7 +3815,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3739 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3815 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3740 group = group->next; 3816 group = group->next;
3741 } while (group != sd->groups); 3817 } while (group != sd->groups);
3742
3743} 3818}
3744 3819
3745/** 3820/**
@@ -3777,28 +3852,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3777 * moving them. 3852 * moving them.
3778 */ 3853 */
3779 3854
3780 pwr_now += sds->busiest->__cpu_power * 3855 pwr_now += sds->busiest->cpu_power *
3781 min(sds->busiest_load_per_task, sds->max_load); 3856 min(sds->busiest_load_per_task, sds->max_load);
3782 pwr_now += sds->this->__cpu_power * 3857 pwr_now += sds->this->cpu_power *
3783 min(sds->this_load_per_task, sds->this_load); 3858 min(sds->this_load_per_task, sds->this_load);
3784 pwr_now /= SCHED_LOAD_SCALE; 3859 pwr_now /= SCHED_LOAD_SCALE;
3785 3860
3786 /* Amount of load we'd subtract */ 3861 /* Amount of load we'd subtract */
3787 tmp = sg_div_cpu_power(sds->busiest, 3862 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3788 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3863 sds->busiest->cpu_power;
3789 if (sds->max_load > tmp) 3864 if (sds->max_load > tmp)
3790 pwr_move += sds->busiest->__cpu_power * 3865 pwr_move += sds->busiest->cpu_power *
3791 min(sds->busiest_load_per_task, sds->max_load - tmp); 3866 min(sds->busiest_load_per_task, sds->max_load - tmp);
3792 3867
3793 /* Amount of load we'd add */ 3868 /* Amount of load we'd add */
3794 if (sds->max_load * sds->busiest->__cpu_power < 3869 if (sds->max_load * sds->busiest->cpu_power <
3795 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3870 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3796 tmp = sg_div_cpu_power(sds->this, 3871 tmp = (sds->max_load * sds->busiest->cpu_power) /
3797 sds->max_load * sds->busiest->__cpu_power); 3872 sds->this->cpu_power;
3798 else 3873 else
3799 tmp = sg_div_cpu_power(sds->this, 3874 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3800 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3875 sds->this->cpu_power;
3801 pwr_move += sds->this->__cpu_power * 3876 pwr_move += sds->this->cpu_power *
3802 min(sds->this_load_per_task, sds->this_load + tmp); 3877 min(sds->this_load_per_task, sds->this_load + tmp);
3803 pwr_move /= SCHED_LOAD_SCALE; 3878 pwr_move /= SCHED_LOAD_SCALE;
3804 3879
@@ -3833,8 +3908,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3833 sds->max_load - sds->busiest_load_per_task); 3908 sds->max_load - sds->busiest_load_per_task);
3834 3909
3835 /* How much load to actually move to equalise the imbalance */ 3910 /* How much load to actually move to equalise the imbalance */
3836 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3911 *imbalance = min(max_pull * sds->busiest->cpu_power,
3837 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3912 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3838 / SCHED_LOAD_SCALE; 3913 / SCHED_LOAD_SCALE;
3839 3914
3840 /* 3915 /*
@@ -3964,15 +4039,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3964 int i; 4039 int i;
3965 4040
3966 for_each_cpu(i, sched_group_cpus(group)) { 4041 for_each_cpu(i, sched_group_cpus(group)) {
4042 unsigned long power = power_of(i);
4043 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3967 unsigned long wl; 4044 unsigned long wl;
3968 4045
3969 if (!cpumask_test_cpu(i, cpus)) 4046 if (!cpumask_test_cpu(i, cpus))
3970 continue; 4047 continue;
3971 4048
3972 rq = cpu_rq(i); 4049 rq = cpu_rq(i);
3973 wl = weighted_cpuload(i); 4050 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4051 wl /= power;
3974 4052
3975 if (rq->nr_running == 1 && wl > imbalance) 4053 if (capacity && rq->nr_running == 1 && wl > imbalance)
3976 continue; 4054 continue;
3977 4055
3978 if (wl > max_load) { 4056 if (wl > max_load) {
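/*
 * Illustrative sketch (not part of this commit): find_busiest_queue() now
 * compares runqueues by load normalized to cpu_power rather than by raw
 * weight, so a cpu whose capacity is reduced (for example by RT pressure)
 * looks proportionally busier.  Stand-alone arithmetic with made-up
 * numbers:
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

int main(void)
{
	unsigned long raw_load = 2048;		/* weighted_cpuload(i)  */
	unsigned long full_power = 1024;	/* unencumbered cpu     */
	unsigned long half_power = 512;		/* half eaten by RT     */

	printf("normalized on full-power cpu: %lu\n",
	       raw_load * SCHED_LOAD_SCALE / full_power);	/* 2048 */
	printf("normalized on half-power cpu: %lu\n",
	       raw_load * SCHED_LOAD_SCALE / half_power);	/* 4096 */
	return 0;
}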
@@ -5233,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
5233#endif 5311#endif
5234} 5312}
5235 5313
5236static void put_prev_task(struct rq *rq, struct task_struct *prev) 5314static void put_prev_task(struct rq *rq, struct task_struct *p)
5237{ 5315{
5238 if (prev->state == TASK_RUNNING) { 5316 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5239 u64 runtime = prev->se.sum_exec_runtime;
5240 5317
5241 runtime -= prev->se.prev_sum_exec_runtime; 5318 update_avg(&p->se.avg_running, runtime);
5242 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5243 5319
5320 if (p->state == TASK_RUNNING) {
5244 /* 5321 /*
5245 * In order to avoid avg_overlap growing stale when we are 5322 * In order to avoid avg_overlap growing stale when we are
5246 * indeed overlapping and hence not getting put to sleep, grow 5323 * indeed overlapping and hence not getting put to sleep, grow
@@ -5250,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5250 * correlates to the amount of cache footprint a task can 5327 * correlates to the amount of cache footprint a task can
5251 * build up. 5328 * build up.
5252 */ 5329 */
5253 update_avg(&prev->se.avg_overlap, runtime); 5330 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5331 update_avg(&p->se.avg_overlap, runtime);
5332 } else {
5333 update_avg(&p->se.avg_running, 0);
5254 } 5334 }
5255 prev->sched_class->put_prev_task(rq, prev); 5335 p->sched_class->put_prev_task(rq, p);
5256} 5336}
5257 5337
5258/* 5338/*
@@ -5301,7 +5381,7 @@ need_resched:
5301 preempt_disable(); 5381 preempt_disable();
5302 cpu = smp_processor_id(); 5382 cpu = smp_processor_id();
5303 rq = cpu_rq(cpu); 5383 rq = cpu_rq(cpu);
5304 rcu_qsctr_inc(cpu); 5384 rcu_sched_qs(cpu);
5305 prev = rq->curr; 5385 prev = rq->curr;
5306 switch_count = &prev->nivcsw; 5386 switch_count = &prev->nivcsw;
5307 5387
@@ -5325,10 +5405,7 @@ need_resched_nonpreemptible:
5325 switch_count = &prev->nvcsw; 5405 switch_count = &prev->nvcsw;
5326 } 5406 }
5327 5407
5328#ifdef CONFIG_SMP 5408 pre_schedule(rq, prev);
5329 if (prev->sched_class->pre_schedule)
5330 prev->sched_class->pre_schedule(rq, prev);
5331#endif
5332 5409
5333 if (unlikely(!rq->nr_running)) 5410 if (unlikely(!rq->nr_running))
5334 idle_balance(cpu, rq); 5411 idle_balance(cpu, rq);
@@ -5354,6 +5431,8 @@ need_resched_nonpreemptible:
5354 } else 5431 } else
5355 spin_unlock_irq(&rq->lock); 5432 spin_unlock_irq(&rq->lock);
5356 5433
5434 post_schedule(rq);
5435
5357 if (unlikely(reacquire_kernel_lock(current) < 0)) 5436 if (unlikely(reacquire_kernel_lock(current) < 0))
5358 goto need_resched_nonpreemptible; 5437 goto need_resched_nonpreemptible;
5359 5438
@@ -5485,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5485 5564
5486#endif /* CONFIG_PREEMPT */ 5565#endif /* CONFIG_PREEMPT */
5487 5566
5488int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5567int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5489 void *key) 5568 void *key)
5490{ 5569{
5491 return try_to_wake_up(curr->private, mode, sync); 5570 return try_to_wake_up(curr->private, mode, wake_flags);
5492} 5571}
5493EXPORT_SYMBOL(default_wake_function); 5572EXPORT_SYMBOL(default_wake_function);
5494 5573
@@ -5502,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
5502 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5581 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5503 */ 5582 */
5504static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5583static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5505 int nr_exclusive, int sync, void *key) 5584 int nr_exclusive, int wake_flags, void *key)
5506{ 5585{
5507 wait_queue_t *curr, *next; 5586 wait_queue_t *curr, *next;
5508 5587
5509 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5588 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5510 unsigned flags = curr->flags; 5589 unsigned flags = curr->flags;
5511 5590
5512 if (curr->func(curr, mode, sync, key) && 5591 if (curr->func(curr, mode, wake_flags, key) &&
5513 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5514 break; 5593 break;
5515 } 5594 }
@@ -5570,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5570 int nr_exclusive, void *key) 5649 int nr_exclusive, void *key)
5571{ 5650{
5572 unsigned long flags; 5651 unsigned long flags;
5573 int sync = 1; 5652 int wake_flags = WF_SYNC;
5574 5653
5575 if (unlikely(!q)) 5654 if (unlikely(!q))
5576 return; 5655 return;
5577 5656
5578 if (unlikely(!nr_exclusive)) 5657 if (unlikely(!nr_exclusive))
5579 sync = 0; 5658 wake_flags = 0;
5580 5659
5581 spin_lock_irqsave(&q->lock, flags); 5660 spin_lock_irqsave(&q->lock, flags);
5582 __wake_up_common(q, mode, nr_exclusive, sync, key); 5661 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5583 spin_unlock_irqrestore(&q->lock, flags); 5662 spin_unlock_irqrestore(&q->lock, flags);
5584} 5663}
5585EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5664EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6099,17 +6178,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6099 unsigned long flags; 6178 unsigned long flags;
6100 const struct sched_class *prev_class = p->sched_class; 6179 const struct sched_class *prev_class = p->sched_class;
6101 struct rq *rq; 6180 struct rq *rq;
6181 int reset_on_fork;
6102 6182
6103 /* may grab non-irq protected spin_locks */ 6183 /* may grab non-irq protected spin_locks */
6104 BUG_ON(in_interrupt()); 6184 BUG_ON(in_interrupt());
6105recheck: 6185recheck:
6106 /* double check policy once rq lock held */ 6186 /* double check policy once rq lock held */
6107 if (policy < 0) 6187 if (policy < 0) {
6188 reset_on_fork = p->sched_reset_on_fork;
6108 policy = oldpolicy = p->policy; 6189 policy = oldpolicy = p->policy;
6109 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6190 } else {
6110 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6191 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6111 policy != SCHED_IDLE) 6192 policy &= ~SCHED_RESET_ON_FORK;
6112 return -EINVAL; 6193
6194 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6195 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6196 policy != SCHED_IDLE)
6197 return -EINVAL;
6198 }
6199
6113 /* 6200 /*
6114 * Valid priorities for SCHED_FIFO and SCHED_RR are 6201 * Valid priorities for SCHED_FIFO and SCHED_RR are
6115 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6202 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
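/*
 * Illustrative sketch (not part of this commit): sched_setscheduler() now
 * accepts SCHED_RESET_ON_FORK or'ed into the policy argument and strips it
 * before validating the policy proper.  A minimal user-space model of that
 * flag handling (the M_ constants are local stand-ins for the uapi values):
 */
#include <stdio.h>

#define M_SCHED_FIFO		1
#define M_SCHED_RESET_ON_FORK	0x40000000

int main(void)
{
	int requested = M_SCHED_FIFO | M_SCHED_RESET_ON_FORK;

	int reset_on_fork = !!(requested & M_SCHED_RESET_ON_FORK);
	int policy = requested & ~M_SCHED_RESET_ON_FORK;

	printf("policy=%d reset_on_fork=%d\n", policy, reset_on_fork);	/* 1 1 */
	return 0;
}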
@@ -6153,6 +6240,10 @@ recheck:
6153 /* can't change other user's priorities */ 6240 /* can't change other user's priorities */
6154 if (!check_same_owner(p)) 6241 if (!check_same_owner(p))
6155 return -EPERM; 6242 return -EPERM;
6243
6244 /* Normal users shall not reset the sched_reset_on_fork flag */
6245 if (p->sched_reset_on_fork && !reset_on_fork)
6246 return -EPERM;
6156 } 6247 }
6157 6248
6158 if (user) { 6249 if (user) {
@@ -6196,6 +6287,8 @@ recheck:
6196 if (running) 6287 if (running)
6197 p->sched_class->put_prev_task(rq, p); 6288 p->sched_class->put_prev_task(rq, p);
6198 6289
6290 p->sched_reset_on_fork = reset_on_fork;
6291
6199 oldprio = p->prio; 6292 oldprio = p->prio;
6200 __setscheduler(rq, p, policy, param->sched_priority); 6293 __setscheduler(rq, p, policy, param->sched_priority);
6201 6294
@@ -6312,14 +6405,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6312 if (p) { 6405 if (p) {
6313 retval = security_task_getscheduler(p); 6406 retval = security_task_getscheduler(p);
6314 if (!retval) 6407 if (!retval)
6315 retval = p->policy; 6408 retval = p->policy
6409 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6316 } 6410 }
6317 read_unlock(&tasklist_lock); 6411 read_unlock(&tasklist_lock);
6318 return retval; 6412 return retval;
6319} 6413}
6320 6414
6321/** 6415/**
6322 * sys_sched_getscheduler - get the RT priority of a thread 6416 * sys_sched_getparam - get the RT priority of a thread
6323 * @pid: the pid in question. 6417 * @pid: the pid in question.
6324 * @param: structure containing the RT priority. 6418 * @param: structure containing the RT priority.
6325 */ 6419 */
@@ -6540,27 +6634,21 @@ SYSCALL_DEFINE0(sched_yield)
6540 return 0; 6634 return 0;
6541} 6635}
6542 6636
6637static inline int should_resched(void)
6638{
6639 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6640}
6641
6543static void __cond_resched(void) 6642static void __cond_resched(void)
6544{ 6643{
6545#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6644 add_preempt_count(PREEMPT_ACTIVE);
6546 __might_sleep(__FILE__, __LINE__); 6645 schedule();
6547#endif 6646 sub_preempt_count(PREEMPT_ACTIVE);
6548 /*
6549 * The BKS might be reacquired before we have dropped
6550 * PREEMPT_ACTIVE, which could trigger a second
6551 * cond_resched() call.
6552 */
6553 do {
6554 add_preempt_count(PREEMPT_ACTIVE);
6555 schedule();
6556 sub_preempt_count(PREEMPT_ACTIVE);
6557 } while (need_resched());
6558} 6647}
6559 6648
6560int __sched _cond_resched(void) 6649int __sched _cond_resched(void)
6561{ 6650{
6562 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6651 if (should_resched()) {
6563 system_state == SYSTEM_RUNNING) {
6564 __cond_resched(); 6652 __cond_resched();
6565 return 1; 6653 return 1;
6566 } 6654 }
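/*
 * Illustrative sketch (not part of this commit): the new should_resched()
 * helper bundles the two conditions every cond_resched variant shares, a
 * pending reschedule and not already being inside a PREEMPT_ACTIVE section;
 * __cond_resched() then brackets schedule() with PREEMPT_ACTIVE so the task
 * is treated as preempted rather than as voluntarily sleeping.  A user-space
 * model of just the predicate (the PREEMPT_ACTIVE value is arch-dependent,
 * the one below is only a stand-in):
 */
#include <stdio.h>

#define M_PREEMPT_ACTIVE	0x10000000

static int model_should_resched(int need_resched, int preempt_count)
{
	return need_resched && !(preempt_count & M_PREEMPT_ACTIVE);
}

int main(void)
{
	printf("%d\n", model_should_resched(1, 0));			/* 1 */
	printf("%d\n", model_should_resched(1, M_PREEMPT_ACTIVE));	/* 0 */
	return 0;
}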
@@ -6569,21 +6657,23 @@ int __sched _cond_resched(void)
6569EXPORT_SYMBOL(_cond_resched); 6657EXPORT_SYMBOL(_cond_resched);
6570 6658
6571/* 6659/*
6572 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6660 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6573 * call schedule, and on return reacquire the lock. 6661 * call schedule, and on return reacquire the lock.
6574 * 6662 *
6575 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6663 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6576 * operations here to prevent schedule() from being called twice (once via 6664 * operations here to prevent schedule() from being called twice (once via
6577 * spin_unlock(), once by hand). 6665 * spin_unlock(), once by hand).
6578 */ 6666 */
6579int cond_resched_lock(spinlock_t *lock) 6667int __cond_resched_lock(spinlock_t *lock)
6580{ 6668{
6581 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6669 int resched = should_resched();
6582 int ret = 0; 6670 int ret = 0;
6583 6671
6672 lockdep_assert_held(lock);
6673
6584 if (spin_needbreak(lock) || resched) { 6674 if (spin_needbreak(lock) || resched) {
6585 spin_unlock(lock); 6675 spin_unlock(lock);
6586 if (resched && need_resched()) 6676 if (resched)
6587 __cond_resched(); 6677 __cond_resched();
6588 else 6678 else
6589 cpu_relax(); 6679 cpu_relax();
@@ -6592,13 +6682,13 @@ int cond_resched_lock(spinlock_t *lock)
6592 } 6682 }
6593 return ret; 6683 return ret;
6594} 6684}
6595EXPORT_SYMBOL(cond_resched_lock); 6685EXPORT_SYMBOL(__cond_resched_lock);
6596 6686
6597int __sched cond_resched_softirq(void) 6687int __sched __cond_resched_softirq(void)
6598{ 6688{
6599 BUG_ON(!in_softirq()); 6689 BUG_ON(!in_softirq());
6600 6690
6601 if (need_resched() && system_state == SYSTEM_RUNNING) { 6691 if (should_resched()) {
6602 local_bh_enable(); 6692 local_bh_enable();
6603 __cond_resched(); 6693 __cond_resched();
6604 local_bh_disable(); 6694 local_bh_disable();
@@ -6606,7 +6696,7 @@ int __sched cond_resched_softirq(void)
6606 } 6696 }
6607 return 0; 6697 return 0;
6608} 6698}
6609EXPORT_SYMBOL(cond_resched_softirq); 6699EXPORT_SYMBOL(__cond_resched_softirq);
6610 6700
6611/** 6701/**
6612 * yield - yield the current processor to other threads. 6702 * yield - yield the current processor to other threads.
@@ -6630,11 +6720,13 @@ EXPORT_SYMBOL(yield);
6630 */ 6720 */
6631void __sched io_schedule(void) 6721void __sched io_schedule(void)
6632{ 6722{
6633 struct rq *rq = &__raw_get_cpu_var(runqueues); 6723 struct rq *rq = raw_rq();
6634 6724
6635 delayacct_blkio_start(); 6725 delayacct_blkio_start();
6636 atomic_inc(&rq->nr_iowait); 6726 atomic_inc(&rq->nr_iowait);
6727 current->in_iowait = 1;
6637 schedule(); 6728 schedule();
6729 current->in_iowait = 0;
6638 atomic_dec(&rq->nr_iowait); 6730 atomic_dec(&rq->nr_iowait);
6639 delayacct_blkio_end(); 6731 delayacct_blkio_end();
6640} 6732}
@@ -6642,12 +6734,14 @@ EXPORT_SYMBOL(io_schedule);
6642 6734
6643long __sched io_schedule_timeout(long timeout) 6735long __sched io_schedule_timeout(long timeout)
6644{ 6736{
6645 struct rq *rq = &__raw_get_cpu_var(runqueues); 6737 struct rq *rq = raw_rq();
6646 long ret; 6738 long ret;
6647 6739
6648 delayacct_blkio_start(); 6740 delayacct_blkio_start();
6649 atomic_inc(&rq->nr_iowait); 6741 atomic_inc(&rq->nr_iowait);
6742 current->in_iowait = 1;
6650 ret = schedule_timeout(timeout); 6743 ret = schedule_timeout(timeout);
6744 current->in_iowait = 0;
6651 atomic_dec(&rq->nr_iowait); 6745 atomic_dec(&rq->nr_iowait);
6652 delayacct_blkio_end(); 6746 delayacct_blkio_end();
6653 return ret; 6747 return ret;
@@ -6964,8 +7058,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6964 7058
6965 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7059 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6966 /* Need help from migration thread: drop lock and wait. */ 7060 /* Need help from migration thread: drop lock and wait. */
7061 struct task_struct *mt = rq->migration_thread;
7062
7063 get_task_struct(mt);
6967 task_rq_unlock(rq, &flags); 7064 task_rq_unlock(rq, &flags);
6968 wake_up_process(rq->migration_thread); 7065 wake_up_process(rq->migration_thread);
7066 put_task_struct(mt);
6969 wait_for_completion(&req.done); 7067 wait_for_completion(&req.done);
6970 tlb_migrate_finish(p->mm); 7068 tlb_migrate_finish(p->mm);
6971 return 0; 7069 return 0;
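/*
 * Illustrative sketch (not part of this commit): the change above pins the
 * migration thread with get_task_struct() before dropping the rq lock,
 * because once the lock is released CPU hotplug could stop and free that
 * kthread while we still intend to wake it.  Reduced to a generic
 * reference-count bracket (types and helpers are stand-ins, not the real
 * task_struct API):
 */
struct ref_obj { int refcount; };

static void ref_get(struct ref_obj *o) { o->refcount++; }
static void ref_put(struct ref_obj *o) { o->refcount--; /* free at zero */ }

static void wake_pinned(struct ref_obj *thread)
{
	ref_get(thread);	/* pin while still protected by the lock    */
	/* ...task_rq_unlock()...                                          */
	/* waking the thread here is safe even if it is being torn down    */
	ref_put(thread);	/* drop the pin once we stop touching it    */
}

int main(void)
{
	struct ref_obj migration_thread = { .refcount = 1 };

	wake_pinned(&migration_thread);
	return 0;
}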
@@ -7023,6 +7121,11 @@ fail:
7023 return ret; 7121 return ret;
7024} 7122}
7025 7123
7124#define RCU_MIGRATION_IDLE 0
7125#define RCU_MIGRATION_NEED_QS 1
7126#define RCU_MIGRATION_GOT_QS 2
7127#define RCU_MIGRATION_MUST_SYNC 3
7128
7026/* 7129/*
7027 * migration_thread - this is a highprio system thread that performs 7130 * migration_thread - this is a highprio system thread that performs
7028 * thread migration by bumping thread off CPU then 'pushing' onto 7131 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7030,6 +7133,7 @@ fail:
7030 */ 7133 */
7031static int migration_thread(void *data) 7134static int migration_thread(void *data)
7032{ 7135{
7136 int badcpu;
7033 int cpu = (long)data; 7137 int cpu = (long)data;
7034 struct rq *rq; 7138 struct rq *rq;
7035 7139
@@ -7045,7 +7149,7 @@ static int migration_thread(void *data)
7045 7149
7046 if (cpu_is_offline(cpu)) { 7150 if (cpu_is_offline(cpu)) {
7047 spin_unlock_irq(&rq->lock); 7151 spin_unlock_irq(&rq->lock);
7048 goto wait_to_die; 7152 break;
7049 } 7153 }
7050 7154
7051 if (rq->active_balance) { 7155 if (rq->active_balance) {
@@ -7064,23 +7168,23 @@ static int migration_thread(void *data)
7064 req = list_entry(head->next, struct migration_req, list); 7168 req = list_entry(head->next, struct migration_req, list);
7065 list_del_init(head->next); 7169 list_del_init(head->next);
7066 7170
7067 spin_unlock(&rq->lock); 7171 if (req->task != NULL) {
7068 __migrate_task(req->task, cpu, req->dest_cpu); 7172 spin_unlock(&rq->lock);
7173 __migrate_task(req->task, cpu, req->dest_cpu);
7174 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7175 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7176 spin_unlock(&rq->lock);
7177 } else {
7178 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7179 spin_unlock(&rq->lock);
7180 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7181 }
7069 local_irq_enable(); 7182 local_irq_enable();
7070 7183
7071 complete(&req->done); 7184 complete(&req->done);
7072 } 7185 }
7073 __set_current_state(TASK_RUNNING); 7186 __set_current_state(TASK_RUNNING);
7074 return 0;
7075 7187
7076wait_to_die:
7077 /* Wait for kthread_stop */
7078 set_current_state(TASK_INTERRUPTIBLE);
7079 while (!kthread_should_stop()) {
7080 schedule();
7081 set_current_state(TASK_INTERRUPTIBLE);
7082 }
7083 __set_current_state(TASK_RUNNING);
7084 return 0; 7188 return 0;
7085} 7189}
7086 7190
@@ -7270,6 +7374,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7270static void calc_global_load_remove(struct rq *rq) 7374static void calc_global_load_remove(struct rq *rq)
7271{ 7375{
7272 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7376 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7377 rq->calc_load_active = 0;
7273} 7378}
7274#endif /* CONFIG_HOTPLUG_CPU */ 7379#endif /* CONFIG_HOTPLUG_CPU */
7275 7380
@@ -7494,7 +7599,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7494 rq = task_rq_lock(p, &flags); 7599 rq = task_rq_lock(p, &flags);
7495 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7600 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7496 task_rq_unlock(rq, &flags); 7601 task_rq_unlock(rq, &flags);
7602 get_task_struct(p);
7497 cpu_rq(cpu)->migration_thread = p; 7603 cpu_rq(cpu)->migration_thread = p;
7604 rq->calc_load_update = calc_load_update;
7498 break; 7605 break;
7499 7606
7500 case CPU_ONLINE: 7607 case CPU_ONLINE:
@@ -7505,8 +7612,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7505 /* Update our root-domain */ 7612 /* Update our root-domain */
7506 rq = cpu_rq(cpu); 7613 rq = cpu_rq(cpu);
7507 spin_lock_irqsave(&rq->lock, flags); 7614 spin_lock_irqsave(&rq->lock, flags);
7508 rq->calc_load_update = calc_load_update;
7509 rq->calc_load_active = 0;
7510 if (rq->rd) { 7615 if (rq->rd) {
7511 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7616 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7512 7617
@@ -7524,6 +7629,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7524 kthread_bind(cpu_rq(cpu)->migration_thread, 7629 kthread_bind(cpu_rq(cpu)->migration_thread,
7525 cpumask_any(cpu_online_mask)); 7630 cpumask_any(cpu_online_mask));
7526 kthread_stop(cpu_rq(cpu)->migration_thread); 7631 kthread_stop(cpu_rq(cpu)->migration_thread);
7632 put_task_struct(cpu_rq(cpu)->migration_thread);
7527 cpu_rq(cpu)->migration_thread = NULL; 7633 cpu_rq(cpu)->migration_thread = NULL;
7528 break; 7634 break;
7529 7635
@@ -7533,6 +7639,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7533 migrate_live_tasks(cpu); 7639 migrate_live_tasks(cpu);
7534 rq = cpu_rq(cpu); 7640 rq = cpu_rq(cpu);
7535 kthread_stop(rq->migration_thread); 7641 kthread_stop(rq->migration_thread);
7642 put_task_struct(rq->migration_thread);
7536 rq->migration_thread = NULL; 7643 rq->migration_thread = NULL;
7537 /* Idle task back to normal (off runqueue, low prio) */ 7644 /* Idle task back to normal (off runqueue, low prio) */
7538 spin_lock_irq(&rq->lock); 7645 spin_lock_irq(&rq->lock);
@@ -7603,7 +7710,7 @@ static int __init migration_init(void)
7603 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7710 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7604 register_cpu_notifier(&migration_notifier); 7711 register_cpu_notifier(&migration_notifier);
7605 7712
7606 return err; 7713 return 0;
7607} 7714}
7608early_initcall(migration_init); 7715early_initcall(migration_init);
7609#endif 7716#endif
@@ -7650,7 +7757,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7650 break; 7757 break;
7651 } 7758 }
7652 7759
7653 if (!group->__cpu_power) { 7760 if (!group->cpu_power) {
7654 printk(KERN_CONT "\n"); 7761 printk(KERN_CONT "\n");
7655 printk(KERN_ERR "ERROR: domain->cpu_power not " 7762 printk(KERN_ERR "ERROR: domain->cpu_power not "
7656 "set\n"); 7763 "set\n");
@@ -7674,9 +7781,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7674 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7781 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7675 7782
7676 printk(KERN_CONT " %s", str); 7783 printk(KERN_CONT " %s", str);
7677 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7784 if (group->cpu_power != SCHED_LOAD_SCALE) {
7678 printk(KERN_CONT " (__cpu_power = %d)", 7785 printk(KERN_CONT " (cpu_power = %d)",
7679 group->__cpu_power); 7786 group->cpu_power);
7680 } 7787 }
7681 7788
7682 group = group->next; 7789 group = group->next;
@@ -7741,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd)
7741 } 7848 }
7742 7849
7743 /* Following flags don't use groups */ 7850 /* Following flags don't use groups */
7744 if (sd->flags & (SD_WAKE_IDLE | 7851 if (sd->flags & (SD_WAKE_AFFINE))
7745 SD_WAKE_AFFINE |
7746 SD_WAKE_BALANCE))
7747 return 0; 7852 return 0;
7748 7853
7749 return 1; 7854 return 1;
@@ -7760,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7760 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7865 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7761 return 0; 7866 return 0;
7762 7867
7763 /* Does parent contain flags not in child? */
7764 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7765 if (cflags & SD_WAKE_AFFINE)
7766 pflags &= ~SD_WAKE_BALANCE;
7767 /* Flags needing groups don't count if only 1 group in parent */ 7868 /* Flags needing groups don't count if only 1 group in parent */
7768 if (parent->groups == parent->groups->next) { 7869 if (parent->groups == parent->groups->next) {
7769 pflags &= ~(SD_LOAD_BALANCE | 7870 pflags &= ~(SD_LOAD_BALANCE |
@@ -7819,7 +7920,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7819 rq->rd = rd; 7920 rq->rd = rd;
7820 7921
7821 cpumask_set_cpu(rq->cpu, rd->span); 7922 cpumask_set_cpu(rq->cpu, rd->span);
7822 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7923 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7823 set_rq_online(rq); 7924 set_rq_online(rq);
7824 7925
7825 spin_unlock_irqrestore(&rq->lock, flags); 7926 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7828,7 +7929,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7828 free_rootdomain(old_rd); 7929 free_rootdomain(old_rd);
7829} 7930}
7830 7931
7831static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7932static int init_rootdomain(struct root_domain *rd, bool bootmem)
7832{ 7933{
7833 gfp_t gfp = GFP_KERNEL; 7934 gfp_t gfp = GFP_KERNEL;
7834 7935
@@ -7961,7 +8062,7 @@ init_sched_build_groups(const struct cpumask *span,
7961 continue; 8062 continue;
7962 8063
7963 cpumask_clear(sched_group_cpus(sg)); 8064 cpumask_clear(sched_group_cpus(sg));
7964 sg->__cpu_power = 0; 8065 sg->cpu_power = 0;
7965 8066
7966 for_each_cpu(j, span) { 8067 for_each_cpu(j, span) {
7967 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8068 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8069,6 +8170,39 @@ struct static_sched_domain {
8069 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8170 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8070}; 8171};
8071 8172
8173struct s_data {
8174#ifdef CONFIG_NUMA
8175 int sd_allnodes;
8176 cpumask_var_t domainspan;
8177 cpumask_var_t covered;
8178 cpumask_var_t notcovered;
8179#endif
8180 cpumask_var_t nodemask;
8181 cpumask_var_t this_sibling_map;
8182 cpumask_var_t this_core_map;
8183 cpumask_var_t send_covered;
8184 cpumask_var_t tmpmask;
8185 struct sched_group **sched_group_nodes;
8186 struct root_domain *rd;
8187};
8188
8189enum s_alloc {
8190 sa_sched_groups = 0,
8191 sa_rootdomain,
8192 sa_tmpmask,
8193 sa_send_covered,
8194 sa_this_core_map,
8195 sa_this_sibling_map,
8196 sa_nodemask,
8197 sa_sched_group_nodes,
8198#ifdef CONFIG_NUMA
8199 sa_notcovered,
8200 sa_covered,
8201 sa_domainspan,
8202#endif
8203 sa_none,
8204};
8205
8072/* 8206/*
8073 * SMT sched-domains: 8207 * SMT sched-domains:
8074 */ 8208 */
@@ -8186,11 +8320,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8186 continue; 8320 continue;
8187 } 8321 }
8188 8322
8189 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8323 sg->cpu_power += sd->groups->cpu_power;
8190 } 8324 }
8191 sg = sg->next; 8325 sg = sg->next;
8192 } while (sg != group_head); 8326 } while (sg != group_head);
8193} 8327}
8328
8329static int build_numa_sched_groups(struct s_data *d,
8330 const struct cpumask *cpu_map, int num)
8331{
8332 struct sched_domain *sd;
8333 struct sched_group *sg, *prev;
8334 int n, j;
8335
8336 cpumask_clear(d->covered);
8337 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8338 if (cpumask_empty(d->nodemask)) {
8339 d->sched_group_nodes[num] = NULL;
8340 goto out;
8341 }
8342
8343 sched_domain_node_span(num, d->domainspan);
8344 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8345
8346 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8347 GFP_KERNEL, num);
8348 if (!sg) {
8349 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8350 num);
8351 return -ENOMEM;
8352 }
8353 d->sched_group_nodes[num] = sg;
8354
8355 for_each_cpu(j, d->nodemask) {
8356 sd = &per_cpu(node_domains, j).sd;
8357 sd->groups = sg;
8358 }
8359
8360 sg->cpu_power = 0;
8361 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8362 sg->next = sg;
8363 cpumask_or(d->covered, d->covered, d->nodemask);
8364
8365 prev = sg;
8366 for (j = 0; j < nr_node_ids; j++) {
8367 n = (num + j) % nr_node_ids;
8368 cpumask_complement(d->notcovered, d->covered);
8369 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8370 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8371 if (cpumask_empty(d->tmpmask))
8372 break;
8373 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8374 if (cpumask_empty(d->tmpmask))
8375 continue;
8376 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8377 GFP_KERNEL, num);
8378 if (!sg) {
8379 printk(KERN_WARNING
8380 "Can not alloc domain group for node %d\n", j);
8381 return -ENOMEM;
8382 }
8383 sg->cpu_power = 0;
8384 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8385 sg->next = prev->next;
8386 cpumask_or(d->covered, d->covered, d->tmpmask);
8387 prev->next = sg;
8388 prev = sg;
8389 }
8390out:
8391 return 0;
8392}
8194#endif /* CONFIG_NUMA */ 8393#endif /* CONFIG_NUMA */
8195 8394
8196#ifdef CONFIG_NUMA 8395#ifdef CONFIG_NUMA
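build_numa_sched_groups() above links each node's groups into a circular list: the first group's next initially points at itself and every later group is spliced in with prev->next = sg, so the ring always closes. That is why walkers such as init_numa_sched_groups_power() use a do/while that stops once they are back at the head. A sketch of such a walk, for illustration only:

	struct sched_group *sg = group_head;
	unsigned long total = 0;

	do {
		total += sg->cpu_power;	/* this group's contribution */
		sg = sg->next;		/* last group points back at the head */
	} while (sg != group_head);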
@@ -8244,15 +8443,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8244 * there are asymmetries in the topology. If there are asymmetries, group 8443 * there are asymmetries in the topology. If there are asymmetries, group
8245 * having more cpu_power will pickup more load compared to the group having 8444 * having more cpu_power will pickup more load compared to the group having
8246 * less cpu_power. 8445 * less cpu_power.
8247 *
8248 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8249 * the maximum number of tasks a group can handle in the presence of other idle
8250 * or lightly loaded groups in the same sched domain.
8251 */ 8446 */
8252static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8447static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8253{ 8448{
8254 struct sched_domain *child; 8449 struct sched_domain *child;
8255 struct sched_group *group; 8450 struct sched_group *group;
8451 long power;
8452 int weight;
8256 8453
8257 WARN_ON(!sd || !sd->groups); 8454 WARN_ON(!sd || !sd->groups);
8258 8455
@@ -8261,28 +8458,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8261 8458
8262 child = sd->child; 8459 child = sd->child;
8263 8460
8264 sd->groups->__cpu_power = 0; 8461 sd->groups->cpu_power = 0;
8265 8462
8266 /* 8463 if (!child) {
8267 * For perf policy, if the groups in child domain share resources 8464 power = SCHED_LOAD_SCALE;
8268 * (for example cores sharing some portions of the cache hierarchy 8465 weight = cpumask_weight(sched_domain_span(sd));
8269 * or SMT), then set this domain groups cpu_power such that each group 8466 /*
8270 * can handle only one task, when there are other idle groups in the 8467 * SMT siblings share the power of a single core.
8271 * same sched domain. 8468 * Usually multiple threads get a better yield out of
8272 */ 8469 * that one core than a single thread would have,
8273 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8470 * reflect that in sd->smt_gain.
8274 (child->flags & 8471 */
8275 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8472 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8276 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8473 power *= sd->smt_gain;
8474 power /= weight;
8475 power >>= SCHED_LOAD_SHIFT;
8476 }
8477 sd->groups->cpu_power += power;
8277 return; 8478 return;
8278 } 8479 }
8279 8480
8280 /* 8481 /*
8281 * add cpu_power of each child group to this groups cpu_power 8482 * Add cpu_power of each child group to this groups cpu_power.
8282 */ 8483 */
8283 group = child->groups; 8484 group = child->groups;
8284 do { 8485 do {
8285 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8486 sd->groups->cpu_power += group->cpu_power;
8286 group = group->next; 8487 group = group->next;
8287 } while (group != child->groups); 8488 } while (group != child->groups);
8288} 8489}
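For the leaf (no child) case above, the numbers work out as follows, assuming SCHED_LOAD_SCALE = 1024 (SCHED_LOAD_SHIFT = 10), a 2-thread SMT sibling domain, and a default smt_gain of about 1178 (the exact default is an assumption, roughly 1.15 * SCHED_LOAD_SCALE):

	power  = 1024;		/* SCHED_LOAD_SCALE              */
	power *= 1178;		/* sd->smt_gain      -> 1206272  */
	power /= 2;		/* weight (siblings) ->  603136  */
	power >>= 10;		/* SCHED_LOAD_SHIFT  ->     589  */

Each sibling's group then advertises roughly 589 units of cpu_power, and the parent core-level group, which sums its children in the loop at the bottom of the function, ends up near 1178: one SMT core is rated at about 1.15 plain CPUs rather than 2.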
@@ -8349,287 +8550,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8349 request = attr->relax_domain_level; 8550 request = attr->relax_domain_level;
8350 if (request < sd->level) { 8551 if (request < sd->level) {
8351 /* turn off idle balance on this domain */ 8552 /* turn off idle balance on this domain */
8352 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8553 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8353 } else { 8554 } else {
8354 /* turn on idle balance on this domain */ 8555 /* turn on idle balance on this domain */
8355 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8556 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8557 }
8558}
8559
8560static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8561 const struct cpumask *cpu_map)
8562{
8563 switch (what) {
8564 case sa_sched_groups:
8565 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8566 d->sched_group_nodes = NULL;
8567 case sa_rootdomain:
8568 free_rootdomain(d->rd); /* fall through */
8569 case sa_tmpmask:
8570 free_cpumask_var(d->tmpmask); /* fall through */
8571 case sa_send_covered:
8572 free_cpumask_var(d->send_covered); /* fall through */
8573 case sa_this_core_map:
8574 free_cpumask_var(d->this_core_map); /* fall through */
8575 case sa_this_sibling_map:
8576 free_cpumask_var(d->this_sibling_map); /* fall through */
8577 case sa_nodemask:
8578 free_cpumask_var(d->nodemask); /* fall through */
8579 case sa_sched_group_nodes:
8580#ifdef CONFIG_NUMA
8581 kfree(d->sched_group_nodes); /* fall through */
8582 case sa_notcovered:
8583 free_cpumask_var(d->notcovered); /* fall through */
8584 case sa_covered:
8585 free_cpumask_var(d->covered); /* fall through */
8586 case sa_domainspan:
8587 free_cpumask_var(d->domainspan); /* fall through */
8588#endif
8589 case sa_none:
8590 break;
8356 } 8591 }
8357} 8592}
8358 8593
8359/* 8594static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8360 * Build sched domains for a given set of cpus and attach the sched domains 8595 const struct cpumask *cpu_map)
8361 * to the individual cpus
8362 */
8363static int __build_sched_domains(const struct cpumask *cpu_map,
8364 struct sched_domain_attr *attr)
8365{ 8596{
8366 int i, err = -ENOMEM;
8367 struct root_domain *rd;
8368 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8369 tmpmask;
8370#ifdef CONFIG_NUMA 8597#ifdef CONFIG_NUMA
8371 cpumask_var_t domainspan, covered, notcovered; 8598 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8372 struct sched_group **sched_group_nodes = NULL; 8599 return sa_none;
8373 int sd_allnodes = 0; 8600 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8374 8601 return sa_domainspan;
8375 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) 8602 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8376 goto out; 8603 return sa_covered;
8377 if (!alloc_cpumask_var(&covered, GFP_KERNEL)) 8604 /* Allocate the per-node list of sched groups */
8378 goto free_domainspan; 8605 d->sched_group_nodes = kcalloc(nr_node_ids,
8379 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) 8606 sizeof(struct sched_group *), GFP_KERNEL);
8380 goto free_covered; 8607 if (!d->sched_group_nodes) {
8381#endif
8382
8383 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8384 goto free_notcovered;
8385 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8386 goto free_nodemask;
8387 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8388 goto free_this_sibling_map;
8389 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8390 goto free_this_core_map;
8391 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8392 goto free_send_covered;
8393
8394#ifdef CONFIG_NUMA
8395 /*
8396 * Allocate the per-node list of sched groups
8397 */
8398 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
8399 GFP_KERNEL);
8400 if (!sched_group_nodes) {
8401 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8608 printk(KERN_WARNING "Can not alloc sched group node list\n");
8402 goto free_tmpmask; 8609 return sa_notcovered;
8403 } 8610 }
8404#endif 8611 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8405 8612#endif
8406 rd = alloc_rootdomain(); 8613 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8407 if (!rd) { 8614 return sa_sched_group_nodes;
8615 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8616 return sa_nodemask;
8617 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8618 return sa_this_sibling_map;
8619 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8620 return sa_this_core_map;
8621 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8622 return sa_send_covered;
8623 d->rd = alloc_rootdomain();
8624 if (!d->rd) {
8408 printk(KERN_WARNING "Cannot alloc root domain\n"); 8625 printk(KERN_WARNING "Cannot alloc root domain\n");
8409 goto free_sched_groups; 8626 return sa_tmpmask;
8410 } 8627 }
8628 return sa_rootdomain;
8629}
8411 8630
8631static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8632 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8633{
8634 struct sched_domain *sd = NULL;
8412#ifdef CONFIG_NUMA 8635#ifdef CONFIG_NUMA
8413 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8636 struct sched_domain *parent;
8414#endif
8415
8416 /*
8417 * Set up domains for cpus specified by the cpu_map.
8418 */
8419 for_each_cpu(i, cpu_map) {
8420 struct sched_domain *sd = NULL, *p;
8421
8422 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8423
8424#ifdef CONFIG_NUMA
8425 if (cpumask_weight(cpu_map) >
8426 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8427 sd = &per_cpu(allnodes_domains, i).sd;
8428 SD_INIT(sd, ALLNODES);
8429 set_domain_attribute(sd, attr);
8430 cpumask_copy(sched_domain_span(sd), cpu_map);
8431 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8432 p = sd;
8433 sd_allnodes = 1;
8434 } else
8435 p = NULL;
8436 8637
8437 sd = &per_cpu(node_domains, i).sd; 8638 d->sd_allnodes = 0;
8438 SD_INIT(sd, NODE); 8639 if (cpumask_weight(cpu_map) >
8640 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8641 sd = &per_cpu(allnodes_domains, i).sd;
8642 SD_INIT(sd, ALLNODES);
8439 set_domain_attribute(sd, attr); 8643 set_domain_attribute(sd, attr);
8440 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8644 cpumask_copy(sched_domain_span(sd), cpu_map);
8441 sd->parent = p; 8645 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8442 if (p) 8646 d->sd_allnodes = 1;
8443 p->child = sd; 8647 }
8444 cpumask_and(sched_domain_span(sd), 8648 parent = sd;
8445 sched_domain_span(sd), cpu_map); 8649
8650 sd = &per_cpu(node_domains, i).sd;
8651 SD_INIT(sd, NODE);
8652 set_domain_attribute(sd, attr);
8653 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8654 sd->parent = parent;
8655 if (parent)
8656 parent->child = sd;
8657 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8446#endif 8658#endif
8659 return sd;
8660}
8447 8661
8448 p = sd; 8662static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8449 sd = &per_cpu(phys_domains, i).sd; 8663 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8450 SD_INIT(sd, CPU); 8664 struct sched_domain *parent, int i)
8451 set_domain_attribute(sd, attr); 8665{
8452 cpumask_copy(sched_domain_span(sd), nodemask); 8666 struct sched_domain *sd;
8453 sd->parent = p; 8667 sd = &per_cpu(phys_domains, i).sd;
8454 if (p) 8668 SD_INIT(sd, CPU);
8455 p->child = sd; 8669 set_domain_attribute(sd, attr);
8456 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8670 cpumask_copy(sched_domain_span(sd), d->nodemask);
8671 sd->parent = parent;
8672 if (parent)
8673 parent->child = sd;
8674 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8675 return sd;
8676}
8457 8677
8678static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8679 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8680 struct sched_domain *parent, int i)
8681{
8682 struct sched_domain *sd = parent;
8458#ifdef CONFIG_SCHED_MC 8683#ifdef CONFIG_SCHED_MC
8459 p = sd; 8684 sd = &per_cpu(core_domains, i).sd;
8460 sd = &per_cpu(core_domains, i).sd; 8685 SD_INIT(sd, MC);
8461 SD_INIT(sd, MC); 8686 set_domain_attribute(sd, attr);
8462 set_domain_attribute(sd, attr); 8687 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8463 cpumask_and(sched_domain_span(sd), cpu_map, 8688 sd->parent = parent;
8464 cpu_coregroup_mask(i)); 8689 parent->child = sd;
8465 sd->parent = p; 8690 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8466 p->child = sd;
8467 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8468#endif 8691#endif
8692 return sd;
8693}
8469 8694
8695static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8696 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8697 struct sched_domain *parent, int i)
8698{
8699 struct sched_domain *sd = parent;
8470#ifdef CONFIG_SCHED_SMT 8700#ifdef CONFIG_SCHED_SMT
8471 p = sd; 8701 sd = &per_cpu(cpu_domains, i).sd;
8472 sd = &per_cpu(cpu_domains, i).sd; 8702 SD_INIT(sd, SIBLING);
8473 SD_INIT(sd, SIBLING); 8703 set_domain_attribute(sd, attr);
8474 set_domain_attribute(sd, attr); 8704 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8475 cpumask_and(sched_domain_span(sd), 8705 sd->parent = parent;
8476 topology_thread_cpumask(i), cpu_map); 8706 parent->child = sd;
8477 sd->parent = p; 8707 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8478 p->child = sd;
8479 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8480#endif 8708#endif
8481 } 8709 return sd;
8710}
8482 8711
8712static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8713 const struct cpumask *cpu_map, int cpu)
8714{
8715 switch (l) {
8483#ifdef CONFIG_SCHED_SMT 8716#ifdef CONFIG_SCHED_SMT
8484 /* Set up CPU (sibling) groups */ 8717 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8485 for_each_cpu(i, cpu_map) { 8718 cpumask_and(d->this_sibling_map, cpu_map,
8486 cpumask_and(this_sibling_map, 8719 topology_thread_cpumask(cpu));
8487 topology_thread_cpumask(i), cpu_map); 8720 if (cpu == cpumask_first(d->this_sibling_map))
8488 if (i != cpumask_first(this_sibling_map)) 8721 init_sched_build_groups(d->this_sibling_map, cpu_map,
8489 continue; 8722 &cpu_to_cpu_group,
8490 8723 d->send_covered, d->tmpmask);
8491 init_sched_build_groups(this_sibling_map, cpu_map, 8724 break;
8492 &cpu_to_cpu_group,
8493 send_covered, tmpmask);
8494 }
8495#endif 8725#endif
8496
8497#ifdef CONFIG_SCHED_MC 8726#ifdef CONFIG_SCHED_MC
8498 /* Set up multi-core groups */ 8727 case SD_LV_MC: /* set up multi-core groups */
8499 for_each_cpu(i, cpu_map) { 8728 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8500 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8729 if (cpu == cpumask_first(d->this_core_map))
8501 if (i != cpumask_first(this_core_map)) 8730 init_sched_build_groups(d->this_core_map, cpu_map,
8502 continue; 8731 &cpu_to_core_group,
8503 8732 d->send_covered, d->tmpmask);
8504 init_sched_build_groups(this_core_map, cpu_map, 8733 break;
8505 &cpu_to_core_group,
8506 send_covered, tmpmask);
8507 }
8508#endif 8734#endif
8509 8735 case SD_LV_CPU: /* set up physical groups */
8510 /* Set up physical groups */ 8736 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8511 for (i = 0; i < nr_node_ids; i++) { 8737 if (!cpumask_empty(d->nodemask))
8512 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8738 init_sched_build_groups(d->nodemask, cpu_map,
8513 if (cpumask_empty(nodemask)) 8739 &cpu_to_phys_group,
8514 continue; 8740 d->send_covered, d->tmpmask);
8515 8741 break;
8516 init_sched_build_groups(nodemask, cpu_map,
8517 &cpu_to_phys_group,
8518 send_covered, tmpmask);
8519 }
8520
8521#ifdef CONFIG_NUMA 8742#ifdef CONFIG_NUMA
8522 /* Set up node groups */ 8743 case SD_LV_ALLNODES:
8523 if (sd_allnodes) { 8744 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8524 init_sched_build_groups(cpu_map, cpu_map, 8745 d->send_covered, d->tmpmask);
8525 &cpu_to_allnodes_group, 8746 break;
8526 send_covered, tmpmask); 8747#endif
8748 default:
8749 break;
8527 } 8750 }
8751}
8528 8752
8529 for (i = 0; i < nr_node_ids; i++) { 8753/*
8530 /* Set up node groups */ 8754 * Build sched domains for a given set of cpus and attach the sched domains
8531 struct sched_group *sg, *prev; 8755 * to the individual cpus
8532 int j; 8756 */
8533 8757static int __build_sched_domains(const struct cpumask *cpu_map,
8534 cpumask_clear(covered); 8758 struct sched_domain_attr *attr)
8535 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8759{
8536 if (cpumask_empty(nodemask)) { 8760 enum s_alloc alloc_state = sa_none;
8537 sched_group_nodes[i] = NULL; 8761 struct s_data d;
8538 continue; 8762 struct sched_domain *sd;
8539 } 8763 int i;
8764#ifdef CONFIG_NUMA
8765 d.sd_allnodes = 0;
8766#endif
8540 8767
8541 sched_domain_node_span(i, domainspan); 8768 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8542 cpumask_and(domainspan, domainspan, cpu_map); 8769 if (alloc_state != sa_rootdomain)
8770 goto error;
8771 alloc_state = sa_sched_groups;
8543 8772
8544 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8773 /*
8545 GFP_KERNEL, i); 8774 * Set up domains for cpus specified by the cpu_map.
8546 if (!sg) { 8775 */
8547 printk(KERN_WARNING "Can not alloc domain group for " 8776 for_each_cpu(i, cpu_map) {
8548 "node %d\n", i); 8777 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8549 goto error; 8778 cpu_map);
8550 }
8551 sched_group_nodes[i] = sg;
8552 for_each_cpu(j, nodemask) {
8553 struct sched_domain *sd;
8554 8779
8555 sd = &per_cpu(node_domains, j).sd; 8780 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8556 sd->groups = sg; 8781 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8557 } 8782 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8558 sg->__cpu_power = 0; 8783 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8559 cpumask_copy(sched_group_cpus(sg), nodemask); 8784 }
8560 sg->next = sg;
8561 cpumask_or(covered, covered, nodemask);
8562 prev = sg;
8563 8785
8564 for (j = 0; j < nr_node_ids; j++) { 8786 for_each_cpu(i, cpu_map) {
8565 int n = (i + j) % nr_node_ids; 8787 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8788 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8789 }
8566 8790
8567 cpumask_complement(notcovered, covered); 8791 /* Set up physical groups */
8568 cpumask_and(tmpmask, notcovered, cpu_map); 8792 for (i = 0; i < nr_node_ids; i++)
8569 cpumask_and(tmpmask, tmpmask, domainspan); 8793 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8570 if (cpumask_empty(tmpmask))
8571 break;
8572 8794
8573 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8795#ifdef CONFIG_NUMA
8574 if (cpumask_empty(tmpmask)) 8796 /* Set up node groups */
8575 continue; 8797 if (d.sd_allnodes)
8798 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8576 8799
8577 sg = kmalloc_node(sizeof(struct sched_group) + 8800 for (i = 0; i < nr_node_ids; i++)
8578 cpumask_size(), 8801 if (build_numa_sched_groups(&d, cpu_map, i))
8579 GFP_KERNEL, i); 8802 goto error;
8580 if (!sg) {
8581 printk(KERN_WARNING
8582 "Can not alloc domain group for node %d\n", j);
8583 goto error;
8584 }
8585 sg->__cpu_power = 0;
8586 cpumask_copy(sched_group_cpus(sg), tmpmask);
8587 sg->next = prev->next;
8588 cpumask_or(covered, covered, tmpmask);
8589 prev->next = sg;
8590 prev = sg;
8591 }
8592 }
8593#endif 8803#endif
8594 8804
8595 /* Calculate CPU power for physical packages and nodes */ 8805 /* Calculate CPU power for physical packages and nodes */
8596#ifdef CONFIG_SCHED_SMT 8806#ifdef CONFIG_SCHED_SMT
8597 for_each_cpu(i, cpu_map) { 8807 for_each_cpu(i, cpu_map) {
8598 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8808 sd = &per_cpu(cpu_domains, i).sd;
8599
8600 init_sched_groups_power(i, sd); 8809 init_sched_groups_power(i, sd);
8601 } 8810 }
8602#endif 8811#endif
8603#ifdef CONFIG_SCHED_MC 8812#ifdef CONFIG_SCHED_MC
8604 for_each_cpu(i, cpu_map) { 8813 for_each_cpu(i, cpu_map) {
8605 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8814 sd = &per_cpu(core_domains, i).sd;
8606
8607 init_sched_groups_power(i, sd); 8815 init_sched_groups_power(i, sd);
8608 } 8816 }
8609#endif 8817#endif
8610 8818
8611 for_each_cpu(i, cpu_map) { 8819 for_each_cpu(i, cpu_map) {
8612 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8820 sd = &per_cpu(phys_domains, i).sd;
8613
8614 init_sched_groups_power(i, sd); 8821 init_sched_groups_power(i, sd);
8615 } 8822 }
8616 8823
8617#ifdef CONFIG_NUMA 8824#ifdef CONFIG_NUMA
8618 for (i = 0; i < nr_node_ids; i++) 8825 for (i = 0; i < nr_node_ids; i++)
8619 init_numa_sched_groups_power(sched_group_nodes[i]); 8826 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8620 8827
8621 if (sd_allnodes) { 8828 if (d.sd_allnodes) {
8622 struct sched_group *sg; 8829 struct sched_group *sg;
8623 8830
8624 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8831 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8625 tmpmask); 8832 d.tmpmask);
8626 init_numa_sched_groups_power(sg); 8833 init_numa_sched_groups_power(sg);
8627 } 8834 }
8628#endif 8835#endif
8629 8836
8630 /* Attach the domains */ 8837 /* Attach the domains */
8631 for_each_cpu(i, cpu_map) { 8838 for_each_cpu(i, cpu_map) {
8632 struct sched_domain *sd;
8633#ifdef CONFIG_SCHED_SMT 8839#ifdef CONFIG_SCHED_SMT
8634 sd = &per_cpu(cpu_domains, i).sd; 8840 sd = &per_cpu(cpu_domains, i).sd;
8635#elif defined(CONFIG_SCHED_MC) 8841#elif defined(CONFIG_SCHED_MC)
@@ -8637,44 +8843,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8637#else 8843#else
8638 sd = &per_cpu(phys_domains, i).sd; 8844 sd = &per_cpu(phys_domains, i).sd;
8639#endif 8845#endif
8640 cpu_attach_domain(sd, rd, i); 8846 cpu_attach_domain(sd, d.rd, i);
8641 } 8847 }
8642 8848
8643 err = 0; 8849 d.sched_group_nodes = NULL; /* don't free this we still need it */
8644 8850 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8645free_tmpmask: 8851 return 0;
8646 free_cpumask_var(tmpmask);
8647free_send_covered:
8648 free_cpumask_var(send_covered);
8649free_this_core_map:
8650 free_cpumask_var(this_core_map);
8651free_this_sibling_map:
8652 free_cpumask_var(this_sibling_map);
8653free_nodemask:
8654 free_cpumask_var(nodemask);
8655free_notcovered:
8656#ifdef CONFIG_NUMA
8657 free_cpumask_var(notcovered);
8658free_covered:
8659 free_cpumask_var(covered);
8660free_domainspan:
8661 free_cpumask_var(domainspan);
8662out:
8663#endif
8664 return err;
8665
8666free_sched_groups:
8667#ifdef CONFIG_NUMA
8668 kfree(sched_group_nodes);
8669#endif
8670 goto free_tmpmask;
8671 8852
8672#ifdef CONFIG_NUMA
8673error: 8853error:
8674 free_sched_groups(cpu_map, tmpmask); 8854 __free_domain_allocs(&d, alloc_state, cpu_map);
8675 free_rootdomain(rd); 8855 return -ENOMEM;
8676 goto free_tmpmask;
8677#endif
8678} 8856}
8679 8857
8680static int build_sched_domains(const struct cpumask *cpu_map) 8858static int build_sched_domains(const struct cpumask *cpu_map)
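Taken together with the s_alloc enum added earlier, the rewrite above replaces the old ladder of free_* labels with a single unwind point: __visit_domain_allocation_hell() reports how far allocation got, and __free_domain_allocs() frees from exactly that stage downward through its fall-through switch. The calling convention, condensed from __build_sched_domains() purely for illustration:

	enum s_alloc alloc_state = __visit_domain_allocation_hell(&d, cpu_map);

	if (alloc_state != sa_rootdomain) {
		__free_domain_allocs(&d, alloc_state, cpu_map);	/* partial undo */
		return -ENOMEM;
	}

	/* ... build domains and groups, attach them ... */

	d.sched_group_nodes = NULL;			/* still in use, keep it */
	__free_domain_allocs(&d, sa_tmpmask, cpu_map);	/* temporary masks only */
	return 0;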
@@ -9075,7 +9253,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9075#ifdef CONFIG_SMP 9253#ifdef CONFIG_SMP
9076 rt_rq->rt_nr_migratory = 0; 9254 rt_rq->rt_nr_migratory = 0;
9077 rt_rq->overloaded = 0; 9255 rt_rq->overloaded = 0;
9078 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9256 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9079#endif 9257#endif
9080 9258
9081 rt_rq->rt_time = 0; 9259 rt_rq->rt_time = 0;
@@ -9282,11 +9460,11 @@ void __init sched_init(void)
9282 * system cpu resource, based on the weight assigned to root 9460 * system cpu resource, based on the weight assigned to root
9283 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9461 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9284 * by letting tasks of init_task_group sit in a separate cfs_rq 9462 * by letting tasks of init_task_group sit in a separate cfs_rq
9285 * (init_cfs_rq) and having one entity represent this group of 9463 * (init_tg_cfs_rq) and having one entity represent this group of
9286 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9464 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9287 */ 9465 */
9288 init_tg_cfs_entry(&init_task_group, 9466 init_tg_cfs_entry(&init_task_group,
9289 &per_cpu(init_cfs_rq, i), 9467 &per_cpu(init_tg_cfs_rq, i),
9290 &per_cpu(init_sched_entity, i), i, 1, 9468 &per_cpu(init_sched_entity, i), i, 1,
9291 root_task_group.se[i]); 9469 root_task_group.se[i]);
9292 9470
@@ -9312,6 +9490,7 @@ void __init sched_init(void)
9312#ifdef CONFIG_SMP 9490#ifdef CONFIG_SMP
9313 rq->sd = NULL; 9491 rq->sd = NULL;
9314 rq->rd = NULL; 9492 rq->rd = NULL;
9493 rq->post_schedule = 0;
9315 rq->active_balance = 0; 9494 rq->active_balance = 0;
9316 rq->next_balance = jiffies; 9495 rq->next_balance = jiffies;
9317 rq->push_cpu = 0; 9496 rq->push_cpu = 0;
@@ -9376,13 +9555,20 @@ void __init sched_init(void)
9376} 9555}
9377 9556
9378#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9557#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9379void __might_sleep(char *file, int line) 9558static inline int preempt_count_equals(int preempt_offset)
9559{
9560 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9561
9562 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9563}
9564
9565void __might_sleep(char *file, int line, int preempt_offset)
9380{ 9566{
9381#ifdef in_atomic 9567#ifdef in_atomic
9382 static unsigned long prev_jiffy; /* ratelimiting */ 9568 static unsigned long prev_jiffy; /* ratelimiting */
9383 9569
9384 if ((!in_atomic() && !irqs_disabled()) || 9570 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9385 system_state != SYSTEM_RUNNING || oops_in_progress) 9571 system_state != SYSTEM_RUNNING || oops_in_progress)
9386 return; 9572 return;
9387 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9573 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9388 return; 9574 return;
@@ -10559,3 +10745,113 @@ struct cgroup_subsys cpuacct_subsys = {
10559 .subsys_id = cpuacct_subsys_id, 10745 .subsys_id = cpuacct_subsys_id,
10560}; 10746};
10561#endif /* CONFIG_CGROUP_CPUACCT */ 10747#endif /* CONFIG_CGROUP_CPUACCT */
10748
10749#ifndef CONFIG_SMP
10750
10751int rcu_expedited_torture_stats(char *page)
10752{
10753 return 0;
10754}
10755EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10756
10757void synchronize_sched_expedited(void)
10758{
10759}
10760EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10761
10762#else /* #ifndef CONFIG_SMP */
10763
10764static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10765static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10766
10767#define RCU_EXPEDITED_STATE_POST -2
10768#define RCU_EXPEDITED_STATE_IDLE -1
10769
10770static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10771
10772int rcu_expedited_torture_stats(char *page)
10773{
10774 int cnt = 0;
10775 int cpu;
10776
10777 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10778 for_each_online_cpu(cpu) {
10779 cnt += sprintf(&page[cnt], " %d:%d",
10780 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10781 }
10782 cnt += sprintf(&page[cnt], "\n");
10783 return cnt;
10784}
10785EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10786
10787static long synchronize_sched_expedited_count;
10788
10789/*
10790 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10791 * approach to force grace period to end quickly. This consumes
10792 * significant time on all CPUs, and is thus not recommended for
10793 * any sort of common-case code.
10794 *
10795 * Note that it is illegal to call this function while holding any
10796 * lock that is acquired by a CPU-hotplug notifier. Failing to
10797 * observe this restriction will result in deadlock.
10798 */
10799void synchronize_sched_expedited(void)
10800{
10801 int cpu;
10802 unsigned long flags;
10803 bool need_full_sync = 0;
10804 struct rq *rq;
10805 struct migration_req *req;
10806 long snap;
10807 int trycount = 0;
10808
10809 smp_mb(); /* ensure prior mod happens before capturing snap. */
10810 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10811 get_online_cpus();
10812 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10813 put_online_cpus();
10814 if (trycount++ < 10)
10815 udelay(trycount * num_online_cpus());
10816 else {
10817 synchronize_sched();
10818 return;
10819 }
10820 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10821 smp_mb(); /* ensure test happens before caller kfree */
10822 return;
10823 }
10824 get_online_cpus();
10825 }
10826 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10827 for_each_online_cpu(cpu) {
10828 rq = cpu_rq(cpu);
10829 req = &per_cpu(rcu_migration_req, cpu);
10830 init_completion(&req->done);
10831 req->task = NULL;
10832 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10833 spin_lock_irqsave(&rq->lock, flags);
10834 list_add(&req->list, &rq->migration_queue);
10835 spin_unlock_irqrestore(&rq->lock, flags);
10836 wake_up_process(rq->migration_thread);
10837 }
10838 for_each_online_cpu(cpu) {
10839 rcu_expedited_state = cpu;
10840 req = &per_cpu(rcu_migration_req, cpu);
10841 rq = cpu_rq(cpu);
10842 wait_for_completion(&req->done);
10843 spin_lock_irqsave(&rq->lock, flags);
10844 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10845 need_full_sync = 1;
10846 req->dest_cpu = RCU_MIGRATION_IDLE;
10847 spin_unlock_irqrestore(&rq->lock, flags);
10848 }
10849 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10850 mutex_unlock(&rcu_sched_expedited_mutex);
10851 put_online_cpus();
10852 if (need_full_sync)
10853 synchronize_sched();
10854}
10855EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10856
10857#endif /* #else #ifndef CONFIG_SMP */
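A minimal usage sketch for the new primitive (my_lock, my_entry and their list linkage are made-up names, not from this patch): it slots into the usual unpublish, wait, reclaim pattern for rcu_read_lock_sched() readers, with the caveat from the comment above that no lock taken by a CPU-hotplug notifier may be held across the call:

	spin_lock(&my_lock);
	list_del_rcu(&my_entry->list);		/* unpublish the entry */
	spin_unlock(&my_lock);

	synchronize_sched_expedited();		/* all rcu-sched readers done */
	kfree(my_entry);			/* now safe to reclaim */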
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7deffc9f0e5f..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
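For context, the extra emptiness check matters because callers treat a nonzero return as a promise that lowest_mask is usable; roughly how the RT scheduler's find_lowest_rq() consumes it (reconstructed from memory, not part of this hunk):

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1;			/* no suitable cpu */

	/* the re-check above guarantees lowest_mask is non-empty here */
	best_cpu = cpumask_first(lowest_mask);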
@@ -114,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
114 127
115 /* 128 /*
116 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
117 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
118 */ 134 */
119 if (likely(oldpri != CPUPRI_INVALID)) {
120 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
121
122 spin_lock_irqsave(&vec->lock, flags);
123
124 vec->count--;
125 if (!vec->count)
126 clear_bit(oldpri, cp->pri_active);
127 cpumask_clear_cpu(cpu, vec->mask);
128
129 spin_unlock_irqrestore(&vec->lock, flags);
130 }
131
132 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
134 137
@@ -141,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
141 144
142 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
143 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
144 159
145 *currpri = newpri; 160 *currpri = newpri;
146} 161}
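A sketch of the window the reordering closes (walkthrough only, not part of the patch): with the old remove-then-add order there was a moment when the cpu sat in no priority vector at all.

	old order (writer)                      concurrent cpupri_find()
	------------------                      ------------------------
	clear cpu from pri_to_cpu[oldpri]
	clear_bit(oldpri, pri_active)
	                                        cpu visible in no vector,
	                                        missed for a push or pull
	set cpu in pri_to_cpu[newpri]
	set_bit(newpri, pri_active)

	new order: the cpu is added to pri_to_cpu[newpri] before the old
	entry is torn down, so it stays visible in at least one vector.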
@@ -152,7 +167,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 167 *
153 * Returns: -ENOMEM if memory fails. 168 * Returns: -ENOMEM if memory fails.
154 */ 169 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 170int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 171{
157 gfp_t gfp = GFP_KERNEL; 172 gfp_t gfp = GFP_KERNEL;
158 int i; 173 int i;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
@@ -409,6 +410,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 410 PN(se.wait_max);
410 PN(se.wait_sum); 411 PN(se.wait_sum);
411 P(se.wait_count); 412 P(se.wait_count);
413 PN(se.iowait_sum);
414 P(se.iowait_count);
412 P(sched_info.bkl_count); 415 P(sched_info.bkl_count);
413 P(se.nr_migrations); 416 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 417 P(se.nr_migrations_cold);
@@ -479,6 +482,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 482 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 483 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 484 p->se.wait_count = 0;
485 p->se.iowait_sum = 0;
486 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 487 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 488 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 489 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f9650e8fe75..10d218ab69f2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
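The retuned defaults above keep the ratio that sched_nr_latency caches unchanged:

	old: 20000000 ns / 4000000 ns = 5
	new:  5000000 ns / 1000000 ns = 5	(matches sched_nr_latency = 5)

so the point at which the scheduling period starts stretching with nr_running is the same; only the absolute time scale shrinks by a factor of four.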
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -266,6 +274,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 274 return min_vruntime;
267} 275}
268 276
277static inline int entity_before(struct sched_entity *a,
278 struct sched_entity *b)
279{
280 return (s64)(a->vruntime - b->vruntime) < 0;
281}
282
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 283static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 284{
271 return se->vruntime - cfs_rq->min_vruntime; 285 return se->vruntime - cfs_rq->min_vruntime;
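entity_before() compares vruntimes through a signed 64-bit difference, so the ordering survives vruntime wrapping past 2^64. A quick worked case (values picked only for illustration):

	a->vruntime = 0xfffffffffffffff6	/* 2^64 - 10              */
	b->vruntime = 0x0000000000000005	/* wrapped, 15 ahead of a */

	a->vruntime - b->vruntime = 0xfffffffffffffff1 = (s64)-15 < 0
		=> entity_before(a, b) is true, as expected.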
@@ -430,12 +444,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 444
431 for_each_sched_entity(se) { 445 for_each_sched_entity(se) {
432 struct load_weight *load; 446 struct load_weight *load;
447 struct load_weight lw;
433 448
434 cfs_rq = cfs_rq_of(se); 449 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 450 load = &cfs_rq->load;
436 451
437 if (unlikely(!se->on_rq)) { 452 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 453 lw = cfs_rq->load;
439 454
440 update_load_add(&lw, se->load.weight); 455 update_load_add(&lw, se->load.weight);
441 load = &lw; 456 load = &lw;
@@ -530,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
530 schedstat_set(se->wait_count, se->wait_count + 1); 545 schedstat_set(se->wait_count, se->wait_count + 1);
531 schedstat_set(se->wait_sum, se->wait_sum + 546 schedstat_set(se->wait_sum, se->wait_sum +
532 rq_of(cfs_rq)->clock - se->wait_start); 547 rq_of(cfs_rq)->clock - se->wait_start);
548#ifdef CONFIG_SCHEDSTATS
549 if (entity_is_task(se)) {
550 trace_sched_stat_wait(task_of(se),
551 rq_of(cfs_rq)->clock - se->wait_start);
552 }
553#endif
533 schedstat_set(se->wait_start, 0); 554 schedstat_set(se->wait_start, 0);
534} 555}
535 556
@@ -604,9 +625,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
604static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 625static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
605{ 626{
606#ifdef CONFIG_SCHEDSTATS 627#ifdef CONFIG_SCHEDSTATS
628 struct task_struct *tsk = NULL;
629
630 if (entity_is_task(se))
631 tsk = task_of(se);
632
607 if (se->sleep_start) { 633 if (se->sleep_start) {
608 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 634 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
609 struct task_struct *tsk = task_of(se);
610 635
611 if ((s64)delta < 0) 636 if ((s64)delta < 0)
612 delta = 0; 637 delta = 0;
@@ -617,11 +642,13 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
617 se->sleep_start = 0; 642 se->sleep_start = 0;
618 se->sum_sleep_runtime += delta; 643 se->sum_sleep_runtime += delta;
619 644
620 account_scheduler_latency(tsk, delta >> 10, 1); 645 if (tsk) {
646 account_scheduler_latency(tsk, delta >> 10, 1);
647 trace_sched_stat_sleep(tsk, delta);
648 }
621 } 649 }
622 if (se->block_start) { 650 if (se->block_start) {
623 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 651 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
624 struct task_struct *tsk = task_of(se);
625 652
626 if ((s64)delta < 0) 653 if ((s64)delta < 0)
627 delta = 0; 654 delta = 0;
@@ -632,17 +659,25 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
632 se->block_start = 0; 659 se->block_start = 0;
633 se->sum_sleep_runtime += delta; 660 se->sum_sleep_runtime += delta;
634 661
635 /* 662 if (tsk) {
636 * Blocking time is in units of nanosecs, so shift by 20 to 663 if (tsk->in_iowait) {
637 * get a milliseconds-range estimation of the amount of 664 se->iowait_sum += delta;
638 * time that the task spent sleeping: 665 se->iowait_count++;
639 */ 666 trace_sched_stat_iowait(tsk, delta);
640 if (unlikely(prof_on == SLEEP_PROFILING)) { 667 }
641 668
642 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 669 /*
643 delta >> 20); 670 * Blocking time is in units of nanosecs, so shift by
671 * 20 to get a milliseconds-range estimation of the
672 * amount of time that the task spent sleeping:
673 */
674 if (unlikely(prof_on == SLEEP_PROFILING)) {
675 profile_hits(SLEEP_PROFILING,
676 (void *)get_wchan(tsk),
677 delta >> 20);
678 }
679 account_scheduler_latency(tsk, delta >> 10, 0);
644 } 680 }
645 account_scheduler_latency(tsk, delta >> 10, 0);
646 } 681 }
647#endif 682#endif
648} 683}
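The two shift counts in the block above are cheap approximate unit conversions of the nanosecond delta:

	delta >> 20  ~  delta / 1048576  ~  milliseconds  (sleep profiling)
	delta >> 10  ~  delta / 1024     ~  microseconds  (latencytop accounting)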
@@ -676,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
676 711
677 if (!initial) { 712 if (!initial) {
678 /* sleeps upto a single latency don't count. */ 713 /* sleeps upto a single latency don't count. */
679 if (sched_feat(NEW_FAIR_SLEEPERS)) { 714 if (sched_feat(FAIR_SLEEPERS)) {
680 unsigned long thresh = sysctl_sched_latency; 715 unsigned long thresh = sysctl_sched_latency;
681 716
682 /* 717 /*
@@ -686,16 +721,24 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
686 * all of which have the same weight. 721 * all of which have the same weight.
687 */ 722 */
688 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) &&
689 task_of(se)->policy != SCHED_IDLE) 724 (!entity_is_task(se) ||
725 task_of(se)->policy != SCHED_IDLE))
690 thresh = calc_delta_fair(thresh, se); 726 thresh = calc_delta_fair(thresh, se);
691 727
728 /*
729 * Halve their sleep time's effect, to allow
730 * for a gentler effect of sleepers:
731 */
732 if (sched_feat(GENTLE_FAIR_SLEEPERS))
733 thresh >>= 1;
734
692 vruntime -= thresh; 735 vruntime -= thresh;
693 } 736 }
694
695 /* ensure we never gain time by being placed backwards. */
696 vruntime = max_vruntime(se->vruntime, vruntime);
697 } 737 }
698 738
739 /* ensure we never gain time by being placed backwards. */
740 vruntime = max_vruntime(se->vruntime, vruntime);
741
699 se->vruntime = vruntime; 742 se->vruntime = vruntime;
700} 743}
701 744
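With the 5 ms sysctl_sched_latency default set earlier in this patch, and ignoring the NORMALIZED_SLEEPER weight scaling, a waking task placed by the hunk above gets at most about 2.5 ms of vruntime credit when GENTLE_FAIR_SLEEPERS is enabled:

	thresh  = sysctl_sched_latency			= 5000000 ns
	thresh >>= 1	(GENTLE_FAIR_SLEEPERS)		= 2500000 ns

and the max_vruntime() clamp, now applied to initial placements as well, still prevents the entity from being placed behind its own previous vruntime.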
@@ -721,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
721 764
722static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 765static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
723{ 766{
724 if (cfs_rq->last == se) 767 if (!se || cfs_rq->last == se)
725 cfs_rq->last = NULL; 768 cfs_rq->last = NULL;
726 769
727 if (cfs_rq->next == se) 770 if (!se || cfs_rq->next == se)
728 cfs_rq->next = NULL; 771 cfs_rq->next = NULL;
729} 772}
730 773
@@ -1015,7 +1058,7 @@ static void yield_task_fair(struct rq *rq)
1015 /* 1058 /*
1016 * Already in the rightmost position? 1059 * Already in the rightmost position?
1017 */ 1060 */
1018 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1061 if (unlikely(!rightmost || entity_before(rightmost, se)))
1019 return; 1062 return;
1020 1063
1021 /* 1064 /*
@@ -1026,79 +1069,6 @@ static void yield_task_fair(struct rq *rq)
1026 se->vruntime = rightmost->vruntime + 1; 1069 se->vruntime = rightmost->vruntime + 1;
1027} 1070}
1028 1071
1029/*
1030 * wake_idle() will wake a task on an idle cpu if task->cpu is
1031 * not idle and an idle cpu is available. The span of cpus to
1032 * search starts with cpus closest then further out as needed,
1033 * so we always favor a closer, idle cpu.
1034 * Domains may include CPUs that are not usable for migration,
1035 * hence we need to mask them out (cpu_active_mask)
1036 *
1037 * Returns the CPU we should wake onto.
1038 */
1039#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1040static int wake_idle(int cpu, struct task_struct *p)
1041{
1042 struct sched_domain *sd;
1043 int i;
1044 unsigned int chosen_wakeup_cpu;
1045 int this_cpu;
1046
1047 /*
1048 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1049 * are idle and this is not a kernel thread and this task's affinity
1050 * allows it to be moved to preferred cpu, then just move!
1051 */
1052
1053 this_cpu = smp_processor_id();
1054 chosen_wakeup_cpu =
1055 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1056
1057 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1058 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1059 p->mm && !(p->flags & PF_KTHREAD) &&
1060 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1061 return chosen_wakeup_cpu;
1062
1063 /*
1064 * If it is idle, then it is the best cpu to run this task.
1065 *
1066 * This cpu is also the best, if it has more than one task already.
1067 * Siblings must be also busy(in most cases) as they didn't already
1068 * pickup the extra load from this cpu and hence we need not check
1069 * sibling runqueue info. This will avoid the checks and cache miss
1070 * penalities associated with that.
1071 */
1072 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1073 return cpu;
1074
1075 for_each_domain(cpu, sd) {
1076 if ((sd->flags & SD_WAKE_IDLE)
1077 || ((sd->flags & SD_WAKE_IDLE_FAR)
1078 && !task_hot(p, task_rq(p)->clock, sd))) {
1079 for_each_cpu_and(i, sched_domain_span(sd),
1080 &p->cpus_allowed) {
1081 if (cpu_active(i) && idle_cpu(i)) {
1082 if (i != task_cpu(p)) {
1083 schedstat_inc(p,
1084 se.nr_wakeups_idle);
1085 }
1086 return i;
1087 }
1088 }
1089 } else {
1090 break;
1091 }
1092 }
1093 return cpu;
1094}
1095#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1096static inline int wake_idle(int cpu, struct task_struct *p)
1097{
1098 return cpu;
1099}
1100#endif
1101
1102#ifdef CONFIG_SMP 1072#ifdef CONFIG_SMP
1103 1073
1104#ifdef CONFIG_FAIR_GROUP_SCHED 1074#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1185,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1185 1155
1186#endif 1156#endif
1187 1157
1188static int 1158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1189wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1190 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1191 int idx, unsigned long load, unsigned long this_load,
1192 unsigned int imbalance)
1193{ 1159{
1194 struct task_struct *curr = this_rq->curr; 1160 struct task_struct *curr = current;
1195 struct task_group *tg; 1161 unsigned long this_load, load;
1196 unsigned long tl = this_load; 1162 int idx, this_cpu, prev_cpu;
1197 unsigned long tl_per_task; 1163 unsigned long tl_per_task;
1164 unsigned int imbalance;
1165 struct task_group *tg;
1198 unsigned long weight; 1166 unsigned long weight;
1199 int balanced; 1167 int balanced;
1200 1168
1201 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1169 idx = sd->wake_idx;
1202 return 0; 1170 this_cpu = smp_processor_id();
1171 prev_cpu = task_cpu(p);
1172 load = source_load(prev_cpu, idx);
1173 this_load = target_load(this_cpu, idx);
1203 1174
1204 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1175 if (sync) {
1205 p->se.avg_overlap > sysctl_sched_migration_cost)) 1176 if (sched_feat(SYNC_LESS) &&
1206 sync = 0; 1177 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1178 p->se.avg_overlap > sysctl_sched_migration_cost))
1179 sync = 0;
1180 } else {
1181 if (sched_feat(SYNC_MORE) &&
1182 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1183 p->se.avg_overlap < sysctl_sched_migration_cost))
1184 sync = 1;
1185 }
1207 1186
1208 /* 1187 /*
1209 * If sync wakeup then subtract the (maximum possible) 1188 * If sync wakeup then subtract the (maximum possible)
@@ -1214,14 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1214 tg = task_group(current); 1193 tg = task_group(current);
1215 weight = current->se.load.weight; 1194 weight = current->se.load.weight;
1216 1195
1217 tl += effective_load(tg, this_cpu, -weight, -weight); 1196 this_load += effective_load(tg, this_cpu, -weight, -weight);
1218 load += effective_load(tg, prev_cpu, 0, -weight); 1197 load += effective_load(tg, prev_cpu, 0, -weight);
1219 } 1198 }
1220 1199
1221 tg = task_group(p); 1200 tg = task_group(p);
1222 weight = p->se.load.weight; 1201 weight = p->se.load.weight;
1223 1202
1224 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1203 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1204
1205 /*
1206 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1207 * due to the sync cause above having dropped this_load to 0, we'll
1208 * always have an imbalance, but there's really nothing you can do
1209 * about that, so that's good too.
1210 *
1211 * Otherwise check if either cpus are near enough in load to allow this
1212 * task to be woken on this_cpu.
1213 */
1214 balanced = !this_load ||
1215 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1225 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1216 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1226 1217
1227 /* 1218 /*
@@ -1235,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1235 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1226 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1236 tl_per_task = cpu_avg_load_per_task(this_cpu); 1227 tl_per_task = cpu_avg_load_per_task(this_cpu);
1237 1228
1238 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1229 if (balanced ||
1239 tl_per_task)) { 1230 (this_load <= load &&
1231 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1240 /* 1232 /*
1241 * This domain has SD_WAKE_AFFINE and 1233 * This domain has SD_WAKE_AFFINE and
1242 * p is cache cold in this domain, and 1234 * p is cache cold in this domain, and
1243 * there is no bad imbalance. 1235 * there is no bad imbalance.
1244 */ 1236 */
1245 schedstat_inc(this_sd, ttwu_move_affine); 1237 schedstat_inc(sd, ttwu_move_affine);
1246 schedstat_inc(p, se.nr_wakeups_affine); 1238 schedstat_inc(p, se.nr_wakeups_affine);
1247 1239
1248 return 1; 1240 return 1;
@@ -1250,67 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1250 return 0; 1242 return 0;
1251} 1243}
1252 1244
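The rewritten wake_affine() above reduces to a load comparison: accept the affine wakeup when the waking CPU is idle, or when its load plus the woken task's weight does not exceed the previous CPU's load by more than half the domain's imbalance percentage. A standalone sketch of just that comparison, using made-up load figures and ignoring effective_load()'s group-scheduling adjustments (names are illustrative):

#include <stdio.h>

/*
 * Illustrative restatement of the "balanced" test: returns 1 when the
 * waking CPU's load (including the woken task's weight) is within the
 * allowed imbalance of the previous CPU's load, or when the waking CPU
 * is idle.  imbalance_pct mirrors sd->imbalance_pct (e.g. 125).
 */
static int wake_balanced(unsigned long this_load, unsigned long prev_load,
                         unsigned long task_weight, unsigned int imbalance_pct)
{
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

        if (!this_load)                 /* idle target: always fine */
                return 1;

        return 100 * (this_load + task_weight) <=
               imbalance * (prev_load + task_weight);
}

int main(void)
{
        printf("%d\n", wake_balanced(1024, 2048, 1024, 125));  /* 1: affine OK */
        printf("%d\n", wake_balanced(4096, 1024, 1024, 125));  /* 0: too imbalanced */
        return 0;
}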
1253static int select_task_rq_fair(struct task_struct *p, int sync) 1245/*
1246 * find_idlest_group finds and returns the least busy CPU group within the
1247 * domain.
1248 */
1249static struct sched_group *
1250find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1251 int this_cpu, int load_idx)
1254{ 1252{
1255 struct sched_domain *sd, *this_sd = NULL; 1253 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1256 int prev_cpu, this_cpu, new_cpu; 1254 unsigned long min_load = ULONG_MAX, this_load = 0;
1257 unsigned long load, this_load; 1255 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1258 struct rq *this_rq;
1259 unsigned int imbalance;
1260 int idx;
1261 1256
1262 prev_cpu = task_cpu(p); 1257 do {
1263 this_cpu = smp_processor_id(); 1258 unsigned long load, avg_load;
1264 this_rq = cpu_rq(this_cpu); 1259 int local_group;
1265 new_cpu = prev_cpu; 1260 int i;
1266 1261
1267 if (prev_cpu == this_cpu) 1262 /* Skip over this group if it has no CPUs allowed */
1268 goto out; 1263 if (!cpumask_intersects(sched_group_cpus(group),
1269 /* 1264 &p->cpus_allowed))
1270 * 'this_sd' is the first domain that both 1265 continue;
1271 * this_cpu and prev_cpu are present in: 1266
1272 */ 1267 local_group = cpumask_test_cpu(this_cpu,
1273 for_each_domain(this_cpu, sd) { 1268 sched_group_cpus(group));
1274 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1269
1275 this_sd = sd; 1270 /* Tally up the load of all CPUs in the group */
1276 break; 1271 avg_load = 0;
1272
1273 for_each_cpu(i, sched_group_cpus(group)) {
1274 /* Bias balancing toward cpus of our domain */
1275 if (local_group)
1276 load = source_load(i, load_idx);
1277 else
1278 load = target_load(i, load_idx);
1279
1280 avg_load += load;
1281 }
1282
1283 /* Adjust by relative CPU power of the group */
1284 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1285
1286 if (local_group) {
1287 this_load = avg_load;
1288 this = group;
1289 } else if (avg_load < min_load) {
1290 min_load = avg_load;
1291 idlest = group;
1292 }
1293 } while (group = group->next, group != sd->groups);
1294
1295 if (!idlest || 100*this_load < imbalance*min_load)
1296 return NULL;
1297 return idlest;
1298}
1299
1300/*
1301 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1302 */
1303static int
1304find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1305{
1306 unsigned long load, min_load = ULONG_MAX;
1307 int idlest = -1;
1308 int i;
1309
1310 /* Traverse only the allowed CPUs */
1311 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1312 load = weighted_cpuload(i);
1313
1314 if (load < min_load || (load == min_load && i == this_cpu)) {
1315 min_load = load;
1316 idlest = i;
1277 } 1317 }
1278 } 1318 }
1279 1319
1280 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1320 return idlest;
1281 goto out; 1321}
1282 1322
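find_idlest_cpu() above is a plain minimum search over a group's allowed CPUs, breaking ties in favour of the current CPU; find_idlest_group() does the analogous scan over groups using power-scaled average load. A userspace sketch of the per-CPU scan, with arrays standing in for cpumasks and weighted_cpuload() (illustrative only):

#include <stdio.h>
#include <limits.h>

/* Return the allowed CPU with the lowest load, preferring this_cpu when
 * loads tie, or -1 if no CPU is allowed. */
static int find_idlest(const unsigned long *load, const int *allowed,
                       int ncpus, int this_cpu)
{
        unsigned long min_load = ULONG_MAX;
        int idlest = -1;
        int i;

        for (i = 0; i < ncpus; i++) {
                if (!allowed[i])
                        continue;
                if (load[i] < min_load || (load[i] == min_load && i == this_cpu)) {
                        min_load = load[i];
                        idlest = i;
                }
        }
        return idlest;
}

int main(void)
{
        unsigned long load[4] = { 300, 100, 100, 50 };
        int allowed[4]        = { 1, 1, 1, 0 };

        printf("idlest cpu: %d\n", find_idlest(load, allowed, 4, 2));
        return 0;
}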
1283 /* 1323/*
1284 * Check for affine wakeup and passive balancing possibilities. 1324 * sched_balance_self: balance the current task (running on cpu) in domains
1285 */ 1325 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1286 if (!this_sd) 1326 * SD_BALANCE_EXEC.
1327 *
 1328 * Balance, i.e. select the least loaded group.
1329 *
1330 * Returns the target CPU number, or the same CPU if no balancing is needed.
1331 *
1332 * preempt must be disabled.
1333 */
1334static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1335{
1336 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1337 int cpu = smp_processor_id();
1338 int prev_cpu = task_cpu(p);
1339 int new_cpu = cpu;
1340 int want_affine = 0;
1341 int want_sd = 1;
1342 int sync = wake_flags & WF_SYNC;
1343
1344 if (sd_flag & SD_BALANCE_WAKE) {
1345 if (sched_feat(AFFINE_WAKEUPS))
1346 want_affine = 1;
1347 new_cpu = prev_cpu;
1348 }
1349
1350 rcu_read_lock();
1351 for_each_domain(cpu, tmp) {
1352 /*
1353 * If power savings logic is enabled for a domain, see if we
 1354 * are not overloaded; if so, don't balance wider.
1355 */
1356 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1357 unsigned long power = 0;
1358 unsigned long nr_running = 0;
1359 unsigned long capacity;
1360 int i;
1361
1362 for_each_cpu(i, sched_domain_span(tmp)) {
1363 power += power_of(i);
1364 nr_running += cpu_rq(i)->cfs.nr_running;
1365 }
1366
1367 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1368
1369 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1370 nr_running /= 2;
1371
1372 if (nr_running < capacity)
1373 want_sd = 0;
1374 }
1375
1376 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1377 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1378
1379 affine_sd = tmp;
1380 want_affine = 0;
1381 }
1382
1383 if (!want_sd && !want_affine)
1384 break;
1385
1386 if (!(tmp->flags & sd_flag))
1387 continue;
1388
1389 if (want_sd)
1390 sd = tmp;
1391 }
1392
1393 if (sched_feat(LB_SHARES_UPDATE)) {
1394 /*
1395 * Pick the largest domain to update shares over
1396 */
1397 tmp = sd;
1398 if (affine_sd && (!tmp ||
1399 cpumask_weight(sched_domain_span(affine_sd)) >
1400 cpumask_weight(sched_domain_span(sd))))
1401 tmp = affine_sd;
1402
1403 if (tmp)
1404 update_shares(tmp);
1405 }
1406
1407 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1408 new_cpu = cpu;
1287 goto out; 1409 goto out;
1410 }
1288 1411
1289 idx = this_sd->wake_idx; 1412 while (sd) {
1413 int load_idx = sd->forkexec_idx;
1414 struct sched_group *group;
1415 int weight;
1290 1416
1291 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1417 if (!(sd->flags & sd_flag)) {
1418 sd = sd->child;
1419 continue;
1420 }
1292 1421
1293 load = source_load(prev_cpu, idx); 1422 if (sd_flag & SD_BALANCE_WAKE)
1294 this_load = target_load(this_cpu, idx); 1423 load_idx = sd->wake_idx;
1295 1424
1296 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1425 group = find_idlest_group(sd, p, cpu, load_idx);
1297 load, this_load, imbalance)) 1426 if (!group) {
1298 return this_cpu; 1427 sd = sd->child;
1428 continue;
1429 }
1299 1430
1300 /* 1431 new_cpu = find_idlest_cpu(group, p, cpu);
1301 * Start passive balancing when half the imbalance_pct 1432 if (new_cpu == -1 || new_cpu == cpu) {
1302 * limit is reached. 1433 /* Now try balancing at a lower domain level of cpu */
1303 */ 1434 sd = sd->child;
1304 if (this_sd->flags & SD_WAKE_BALANCE) { 1435 continue;
1305 if (imbalance*this_load <= 100*load) {
1306 schedstat_inc(this_sd, ttwu_move_balance);
1307 schedstat_inc(p, se.nr_wakeups_passive);
1308 return this_cpu;
1309 } 1436 }
1437
1438 /* Now try balancing at a lower domain level of new_cpu */
1439 cpu = new_cpu;
1440 weight = cpumask_weight(sched_domain_span(sd));
1441 sd = NULL;
1442 for_each_domain(cpu, tmp) {
1443 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1444 break;
1445 if (tmp->flags & sd_flag)
1446 sd = tmp;
1447 }
1448 /* while loop will break here if sd == NULL */
1310 } 1449 }
1311 1450
1312out: 1451out:
1313 return wake_idle(new_cpu, p); 1452 rcu_read_unlock();
1453 return new_cpu;
1314} 1454}
1315#endif /* CONFIG_SMP */ 1455#endif /* CONFIG_SMP */
1316 1456
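Inside the domain walk of select_task_rq_fair() above, the power-savings check sums CPU power and runnable counts over the domain and computes a capacity as DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); if the domain still has spare capacity, want_sd is cleared and balancing is not widened. A small sketch of that capacity test with illustrative numbers (SCHED_LOAD_SCALE is 1024 in this era of the scheduler; the halving applied to nr_running for SD_POWERSAVINGS_BALANCE domains is ignored):

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

/*
 * Illustrative: decide whether a domain is "not overloaded" for the
 * power-savings heuristic.  power[] holds per-CPU power values and
 * running[] per-CPU runnable counts for the CPUs spanned by the domain.
 */
static int domain_has_capacity(const unsigned long *power,
                               const unsigned long *running, int ncpus)
{
        unsigned long total_power = 0, nr_running = 0, capacity;
        int i;

        for (i = 0; i < ncpus; i++) {
                total_power += power[i];
                nr_running  += running[i];
        }

        capacity = DIV_ROUND_CLOSEST(total_power, SCHED_LOAD_SCALE);

        return nr_running < capacity;   /* room left: don't balance wider */
}

int main(void)
{
        unsigned long power[2]   = { 1024, 1024 };      /* two full-power CPUs */
        unsigned long running[2] = { 1, 0 };

        printf("has capacity: %d\n", domain_has_capacity(power, running, 2));
        return 0;
}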
@@ -1423,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se)
1423/* 1563/*
1424 * Preempt the current task with a newly woken task if needed: 1564 * Preempt the current task with a newly woken task if needed:
1425 */ 1565 */
1426static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1566static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1427{ 1567{
1428 struct task_struct *curr = rq->curr; 1568 struct task_struct *curr = rq->curr;
1429 struct sched_entity *se = &curr->se, *pse = &p->se; 1569 struct sched_entity *se = &curr->se, *pse = &p->se;
1430 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1570 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1571 int sync = wake_flags & WF_SYNC;
1431 1572
1432 update_curr(cfs_rq); 1573 update_curr(cfs_rq);
1433 1574
@@ -1453,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1453 */ 1594 */
1454 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1595 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1455 set_last_buddy(se); 1596 set_last_buddy(se);
1456 set_next_buddy(pse); 1597 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1598 set_next_buddy(pse);
1457 1599
1458 /* 1600 /*
1459 * We can come here with TIF_NEED_RESCHED already set from new task 1601 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1475,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1475 return; 1617 return;
1476 } 1618 }
1477 1619
1478 if (!sched_feat(WAKEUP_PREEMPT)) 1620 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1479 return; 1621 (sched_feat(WAKEUP_OVERLAP) &&
1480 1622 (se->avg_overlap < sysctl_sched_migration_cost &&
1481 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1623 pse->avg_overlap < sysctl_sched_migration_cost))) {
1482 (se->avg_overlap < sysctl_sched_migration_cost &&
1483 pse->avg_overlap < sysctl_sched_migration_cost))) {
1484 resched_task(curr); 1624 resched_task(curr);
1485 return; 1625 return;
1486 } 1626 }
1487 1627
1628 if (sched_feat(WAKEUP_RUNNING)) {
1629 if (pse->avg_running < se->avg_running) {
1630 set_next_buddy(pse);
1631 resched_task(curr);
1632 return;
1633 }
1634 }
1635
1636 if (!sched_feat(WAKEUP_PREEMPT))
1637 return;
1638
1488 find_matching_se(&se, &pse); 1639 find_matching_se(&se, &pse);
1489 1640
1490 BUG_ON(!pse); 1641 BUG_ON(!pse);
@@ -1507,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1507 /* 1658 /*
1508 * If se was a buddy, clear it so that it will have to earn 1659 * If se was a buddy, clear it so that it will have to earn
1509 * the favour again. 1660 * the favour again.
1661 *
 1662 * If se was not a buddy, clear the buddies because neither
 1663 * was eligible to run; let them earn it again.
1664 *
1665 * IOW. unconditionally clear buddies.
1510 */ 1666 */
1511 __clear_buddies(cfs_rq, se); 1667 __clear_buddies(cfs_rq, NULL);
1512 set_next_entity(cfs_rq, se); 1668 set_next_entity(cfs_rq, se);
1513 cfs_rq = group_cfs_rq(se); 1669 cfs_rq = group_cfs_rq(se);
1514 } while (cfs_rq); 1670 } while (cfs_rq);
@@ -1707,11 +1863,13 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1707 sched_info_queued(p); 1863 sched_info_queued(p);
1708 1864
1709 update_curr(cfs_rq); 1865 update_curr(cfs_rq);
1866 if (curr)
1867 se->vruntime = curr->vruntime;
1710 place_entity(cfs_rq, se, 1); 1868 place_entity(cfs_rq, se, 1);
1711 1869
1712 /* 'curr' will be NULL if the child belongs to a different group */ 1870 /* 'curr' will be NULL if the child belongs to a different group */
1713 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1871 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1714 curr && curr->vruntime < se->vruntime) { 1872 curr && entity_before(curr, se)) {
1715 /* 1873 /*
1716 * Upon rescheduling, sched_class::put_prev_task() will place 1874 * Upon rescheduling, sched_class::put_prev_task() will place
1717 * 'current' within the tree based on its new key value. 1875 * 'current' within the tree based on its new key value.
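The child-runs-first test in the last hunk now goes through entity_before() rather than comparing vruntimes directly; the helper compares through a signed difference so the ordering stays correct if the unsigned vruntime ever wraps. In the kernel the helper takes sched_entity pointers; the sketch below applies the same comparison to raw u64 values:

#include <stdio.h>
#include <stdint.h>

/* Compare two monotonically increasing u64 timestamps that may wrap:
 * "a before b" iff the signed difference is negative. */
static int entity_before(uint64_t vruntime_a, uint64_t vruntime_b)
{
        return (int64_t)(vruntime_a - vruntime_b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;

        printf("%d\n", entity_before(100, 200));        /* 1 */
        printf("%d\n", entity_before(near_wrap, 10));   /* 1: 10 is "after" the wrap */
        return 0;
}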
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
 16 * By not normalizing the sleep time, heavy tasks get an effectively
 17 * longer period, and lighter tasks an effectively shorter period during
 18 * which they are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
 62 * Use the SYNC wakeup hint; pipes and the like use this to indicate
 63 * that the remote end is likely to consume the data we just wrote, and
 64 * therefore benefits from being placed on the same cpu. See
 65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
 72 * to improve cache locality. Typically used with SYNC wakeups as
 73 * generated by pipes and the like; see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
 89 * wakeup-preemption), since it's likely going to consume the data we
 90 * touched; this increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
 96 * wake-preempt) as it will likely touch the same data; this increases
 97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
 102 * Consider buddies to be cache hot; this decreases the likelihood of a
 103 * cache buddy being migrated away and increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
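sched_features.h above is an X-macro list: kernel/sched.c includes it once with SCHED_FEAT defined to emit enum constants (the bit numbers) and once with it defined to OR the default-enabled bits into sysctl_sched_features, and sched_feat(x) then tests the corresponding bit at runtime (tunable via a debugfs file). A self-contained userspace sketch of that pattern with two made-up features; the exact kernel definitions differ in detail:

#include <stdio.h>

/* The feature list would normally live in a separate header included
 * several times; inline it via a macro list to stay self-contained. */
#define FEATURE_LIST(F)         \
        F(GENTLE_SLEEPERS, 1)   \
        F(NEXT_BUDDY, 0)

/* First expansion: one enum constant per feature, used as a bit index. */
#define F_ENUM(name, enabled)   __FEAT_##name,
enum { FEATURE_LIST(F_ENUM) __FEAT_NR };

/* Second expansion: OR together the default-enabled bits. */
#define F_DEFAULT(name, enabled)        ((enabled) << __FEAT_##name) |
static unsigned int sysctl_features = FEATURE_LIST(F_DEFAULT) 0;

#define feat_enabled(name)      (sysctl_features & (1u << __FEAT_##name))

int main(void)
{
        printf("GENTLE_SLEEPERS: %d\n", !!feat_enabled(GENTLE_SLEEPERS));
        printf("NEXT_BUDDY:      %d\n", !!feat_enabled(NEXT_BUDDY));

        sysctl_features |= 1u << __FEAT_NEXT_BUDDY;     /* runtime toggle */
        printf("NEXT_BUDDY now:  %d\n", !!feat_enabled(NEXT_BUDDY));
        return 0;
}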
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..a8b448af004b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
 11 return task_cpu(p); /* IDLE tasks are never migrated */ 11 return task_cpu(p); /* IDLE tasks are never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..13de7126a6ab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,13 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 19{
15 return rt_rq->rq; 20 return rt_rq->rq;
@@ -22,6 +27,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 27
23#else /* CONFIG_RT_GROUP_SCHED */ 28#else /* CONFIG_RT_GROUP_SCHED */
24 29
30#define rt_entity_is_task(rt_se) (1)
31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 38{
27 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +85,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 85
74static void update_rt_migration(struct rt_rq *rt_rq) 86static void update_rt_migration(struct rt_rq *rt_rq)
75{ 87{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 88 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 89 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 90 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 91 rt_rq->overloaded = 1;
@@ -86,6 +98,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 98
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 99static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 100{
101 if (!rt_entity_is_task(rt_se))
102 return;
103
104 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
105
106 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 107 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 108 rt_rq->rt_nr_migratory++;
91 109
@@ -94,6 +112,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 112
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 113static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 114{
115 if (!rt_entity_is_task(rt_se))
116 return;
117
118 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
119
120 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 121 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 122 rt_rq->rt_nr_migratory--;
99 123
@@ -112,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113} 137}
114 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
115#else 144#else
116 145
117static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -586,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
586 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
587 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
588 617
618 sched_rt_avg_update(rq, delta_exec);
619
589 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
590 return; 621 return;
591 622
@@ -858,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
858 889
859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
860 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
861
862 inc_cpu_load(rq, p->se.load.weight);
863} 892}
864 893
865static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -870,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
870 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
871 900
872 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
873
874 dec_cpu_load(rq, p->se.load.weight);
875} 902}
876 903
877/* 904/*
@@ -911,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
911#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
912static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
913 940
914static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
915{ 942{
916 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
917 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
918 /* 948 /*
919 * If the current task is an RT task, then 949 * If the current task is an RT task, then
920 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -972,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
972/* 1002/*
973 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
974 */ 1004 */
975static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
976{ 1006{
977 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
978 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1048,6 +1078,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1048 if (p) 1078 if (p)
1049 dequeue_pushable_task(rq, p); 1079 dequeue_pushable_task(rq, p);
1050 1080
1081#ifdef CONFIG_SMP
1082 /*
1083 * We detect this state here so that we can avoid taking the RQ
1084 * lock again later if there is no need to push
1085 */
1086 rq->post_schedule = has_pushable_tasks(rq);
1087#endif
1088
1051 return p; 1089 return p;
1052} 1090}
1053 1091
@@ -1146,13 +1184,6 @@ static int find_lowest_rq(struct task_struct *task)
1146 return -1; /* No targets found */ 1184 return -1; /* No targets found */
1147 1185
1148 /* 1186 /*
1149 * Only consider CPUs that are usable for migration.
1150 * I guess we might want to change cpupri_find() to ignore those
1151 * in the first place.
1152 */
1153 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1154
1155 /*
1156 * At this point we have built a mask of cpus representing the 1187 * At this point we have built a mask of cpus representing the
1157 * lowest priority tasks in the system. Now we want to elect 1188 * lowest priority tasks in the system. Now we want to elect
1158 * the best one based on our affinity and topology. 1189 * the best one based on our affinity and topology.
@@ -1246,11 +1277,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1246 return lowest_rq; 1277 return lowest_rq;
1247} 1278}
1248 1279
1249static inline int has_pushable_tasks(struct rq *rq)
1250{
1251 return !plist_head_empty(&rq->rt.pushable_tasks);
1252}
1253
1254static struct task_struct *pick_next_pushable_task(struct rq *rq) 1280static struct task_struct *pick_next_pushable_task(struct rq *rq)
1255{ 1281{
1256 struct task_struct *p; 1282 struct task_struct *p;
@@ -1450,23 +1476,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1450 pull_rt_task(rq); 1476 pull_rt_task(rq);
1451} 1477}
1452 1478
1453/*
1454 * assumes rq->lock is held
1455 */
1456static int needs_post_schedule_rt(struct rq *rq)
1457{
1458 return has_pushable_tasks(rq);
1459}
1460
1461static void post_schedule_rt(struct rq *rq) 1479static void post_schedule_rt(struct rq *rq)
1462{ 1480{
1463 /*
1464 * This is only called if needs_post_schedule_rt() indicates that
1465 * we need to push tasks away
1466 */
1467 spin_lock_irq(&rq->lock);
1468 push_rt_tasks(rq); 1481 push_rt_tasks(rq);
1469 spin_unlock_irq(&rq->lock);
1470} 1482}
1471 1483
1472/* 1484/*
@@ -1742,7 +1754,6 @@ static const struct sched_class rt_sched_class = {
1742 .rq_online = rq_online_rt, 1754 .rq_online = rq_online_rt,
1743 .rq_offline = rq_offline_rt, 1755 .rq_offline = rq_offline_rt,
1744 .pre_schedule = pre_schedule_rt, 1756 .pre_schedule = pre_schedule_rt,
1745 .needs_post_schedule = needs_post_schedule_rt,
1746 .post_schedule = post_schedule_rt, 1757 .post_schedule = post_schedule_rt,
1747 .task_wake_up = task_wake_up_rt, 1758 .task_wake_up = task_wake_up_rt,
1748 .switched_from = switched_from_rt, 1759 .switched_from = switched_from_rt,
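The rt_nr_total counter introduced above keeps overload detection working with group scheduling: only per-task entities are counted, the count is kept on the root rt runqueue of the CPU, and the runqueue is marked overloaded only when it holds more than one task and at least one of them may migrate. A small sketch of that counter-and-flag pattern (plain struct, no locking, purely illustrative):

#include <stdio.h>

/* Illustrative per-runqueue RT bookkeeping mirroring the counters the
 * patch maintains: total tasks, migratable tasks, and an overload flag. */
struct rt_rq_stats {
        unsigned long nr_total;         /* all queued RT tasks */
        unsigned long nr_migratory;     /* tasks allowed on more than one CPU */
        int overloaded;
};

static void update_overload(struct rt_rq_stats *rq)
{
        /* Overloaded iff something could be pushed elsewhere and more than
         * one task is competing for this CPU. */
        rq->overloaded = rq->nr_migratory && rq->nr_total > 1;
}

static void enqueue(struct rt_rq_stats *rq, int nr_cpus_allowed)
{
        rq->nr_total++;
        if (nr_cpus_allowed > 1)
                rq->nr_migratory++;
        update_overload(rq);
}

static void dequeue(struct rt_rq_stats *rq, int nr_cpus_allowed)
{
        rq->nr_total--;
        if (nr_cpus_allowed > 1)
                rq->nr_migratory--;
        update_overload(rq);
}

int main(void)
{
        struct rt_rq_stats rq = { 0, 0, 0 };

        enqueue(&rq, 4);                        /* one migratable task */
        printf("overloaded: %d\n", rq.overloaded);
        enqueue(&rq, 1);                        /* second (pinned) task */
        printf("overloaded: %d\n", rq.overloaded);
        dequeue(&rq, 1);
        printf("overloaded: %d\n", rq.overloaded);
        return 0;
}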
diff --git a/kernel/signal.c b/kernel/signal.c
index 809a228019ad..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
832{ 832{
833 struct sigpending *pending; 833 struct sigpending *pending;
834 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
835 836
836 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
837 838
@@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
863 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
864 pass on the info struct. */ 865 pass on the info struct. */
865 866
866 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
867 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
868 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
869 if (q) { 874 if (q) {
870 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
871 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
@@ -1405,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1405 /* do_notify_parent_cldstop should have been called instead. */ 1410 /* do_notify_parent_cldstop should have been called instead. */
1406 BUG_ON(task_is_stopped_or_traced(tsk)); 1411 BUG_ON(task_is_stopped_or_traced(tsk));
1407 1412
1408 BUG_ON(!tsk->ptrace && 1413 BUG_ON(!task_ptrace(tsk) &&
1409 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1414 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1410 1415
1411 info.si_signo = sig; 1416 info.si_signo = sig;
@@ -1444,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1444 1449
1445 psig = tsk->parent->sighand; 1450 psig = tsk->parent->sighand;
1446 spin_lock_irqsave(&psig->siglock, flags); 1451 spin_lock_irqsave(&psig->siglock, flags);
1447 if (!tsk->ptrace && sig == SIGCHLD && 1452 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1448 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1449 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1450 /* 1455 /*
@@ -1481,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1481 struct task_struct *parent; 1486 struct task_struct *parent;
1482 struct sighand_struct *sighand; 1487 struct sighand_struct *sighand;
1483 1488
1484 if (tsk->ptrace & PT_PTRACED) 1489 if (task_ptrace(tsk))
1485 parent = tsk->parent; 1490 parent = tsk->parent;
1486 else { 1491 else {
1487 tsk = tsk->group_leader; 1492 tsk = tsk->group_leader;
@@ -1494,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 1494 * see comment in do_notify_parent() about the following 3 lines 1499 * see comment in do_notify_parent() about the following 3 lines
1495 */ 1500 */
1496 rcu_read_lock(); 1501 rcu_read_lock();
1497 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1502 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1498 info.si_uid = __task_cred(tsk)->uid; 1503 info.si_uid = __task_cred(tsk)->uid;
1499 rcu_read_unlock(); 1504 rcu_read_unlock();
1500 1505
@@ -1530,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1530 1535
1531static inline int may_ptrace_stop(void) 1536static inline int may_ptrace_stop(void)
1532{ 1537{
1533 if (!likely(current->ptrace & PT_PTRACED)) 1538 if (!likely(task_ptrace(current)))
1534 return 0; 1539 return 0;
1535 /* 1540 /*
1536 * Are we in the middle of do_coredump? 1541 * Are we in the middle of do_coredump?
@@ -1748,7 +1753,7 @@ static int do_signal_stop(int signr)
1748static int ptrace_signal(int signr, siginfo_t *info, 1753static int ptrace_signal(int signr, siginfo_t *info,
1749 struct pt_regs *regs, void *cookie) 1754 struct pt_regs *regs, void *cookie)
1750{ 1755{
1751 if (!(current->ptrace & PT_PTRACED)) 1756 if (!task_ptrace(current))
1752 return signr; 1757 return signr;
1753 1758
1754 ptrace_signal_deliver(regs, cookie); 1759 ptrace_signal_deliver(regs, cookie);
@@ -2449,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2449 stack_t oss; 2454 stack_t oss;
2450 int error; 2455 int error;
2451 2456
2452 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2453 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2454 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2455 oss.ss_flags = sas_ss_flags(sp);
2456 }
2457 2460
2458 if (uss) { 2461 if (uss) {
2459 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2461,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2461 int ss_flags; 2464 int ss_flags;
2462 2465
2463 error = -EFAULT; 2466 error = -EFAULT;
2464 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2465 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2466 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2467 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2468 goto out; 2473 goto out;
2469 2474
2470 error = -EPERM; 2475 error = -EPERM;
@@ -2496,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2496 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2497 } 2502 }
2498 2503
2504 error = 0;
2499 if (uoss) { 2505 if (uoss) {
2500 error = -EFAULT; 2506 error = -EFAULT;
2501 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2502 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2503 } 2512 }
2504 2513
2505 error = 0;
2506out: 2514out:
2507 return error; 2515 return error;
2508} 2516}
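The do_sigaltstack() hunks above rework how the kernel snapshots and copies the stack_t: the old state is now captured before any new stack is installed, the copies go through access_ok() plus __get_user()/__put_user(), and error codes are accumulated. The userspace interface is unchanged; for reference, a minimal program exercising sigaltstack(2) together with SA_ONSTACK (the in-range pointer check and the printf inside the handler are demo shortcuts, not production practice):

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static stack_t altstack;

static void handler(int sig)
{
        /* Rough check: does a local variable live inside the alternate stack? */
        char probe;
        int on_alt = &probe >= (char *)altstack.ss_sp &&
                     &probe <  (char *)altstack.ss_sp + altstack.ss_size;

        printf("signal %d handled, on alternate stack: %d\n", sig, on_alt);
}

int main(void)
{
        struct sigaction sa;

        altstack.ss_sp = malloc(SIGSTKSZ);
        altstack.ss_size = SIGSTKSZ;
        altstack.ss_flags = 0;
        if (sigaltstack(&altstack, NULL) < 0) { /* the syscall the patch touches */
                perror("sigaltstack");
                return 1;
        }

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sa.sa_flags = SA_ONSTACK;               /* run the handler on the alt stack */
        sigaction(SIGUSR1, &sa, NULL);

        raise(SIGUSR1);
        return 0;
}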
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 521ed2004d63..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
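Two things change in slow-work.c above: the repeated mod_timer() expression is pulled into slow_work_schedule_cull(), and the deadline now goes through round_jiffies(), which nudges the expiry onto a whole-second tick (with a per-CPU skew) so unrelated timers tend to fire together. A userspace sketch of the rounding idea, using milliseconds and simply rounding up to the next second (a simplification of what round_jiffies() actually does):

#include <stdio.h>

#define MSEC_PER_SEC 1000UL

/* Round an absolute millisecond deadline up to the next whole second so
 * that many imprecise timers fire together instead of scattering. */
static unsigned long round_deadline_ms(unsigned long deadline_ms)
{
        return ((deadline_ms + MSEC_PER_SEC - 1) / MSEC_PER_SEC) * MSEC_PER_SEC;
}

int main(void)
{
        unsigned long now = 12340;      /* pretend "jiffies", in ms */
        unsigned long timeout = 2500;   /* stand-in for SLOW_WORK_CULL_TIMEOUT */

        printf("raw deadline:     %lu\n", now + timeout);
        printf("rounded deadline: %lu\n", round_deadline_ms(now + timeout));
        return 0;
}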
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..8e218500ab14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
@@ -177,6 +177,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 177 int cpu = get_cpu();
178 178
179 /* 179 /*
180 * Shouldn't receive this interrupt on a cpu that is not yet online.
181 */
182 WARN_ON_ONCE(!cpu_online(cpu));
183
184 /*
180 * Ensure entry is visible on call_function_queue after we have 185 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 186 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 187 * If we don't have this, then we may miss an entry on the list
@@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 235 unsigned int data_flags;
231 LIST_HEAD(list); 236 LIST_HEAD(list);
232 237
238 /*
239 * Shouldn't receive this interrupt on a cpu that is not yet online.
240 */
241 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
242
233 spin_lock(&q->lock); 243 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 244 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 245 spin_unlock(&q->lock);
@@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 295 */
286 this_cpu = get_cpu(); 296 this_cpu = get_cpu();
287 297
288 /* Can deadlock when called with interrupts disabled */ 298 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 299 * Can deadlock when called with interrupts disabled.
 300 * We allow cpus that are not yet online though, as no one else can
 301 * send an smp call function interrupt to this cpu and as such deadlocks
302 * can't happen.
303 */
304 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
305 && !oops_in_progress);
290 306
291 if (cpu == this_cpu) { 307 if (cpu == this_cpu) {
292 local_irq_save(flags); 308 local_irq_save(flags);
@@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 345{
330 csd_lock(data); 346 csd_lock(data);
331 347
332 /* Can deadlock when called with interrupts disabled */ 348 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 349 * Can deadlock when called with interrupts disabled.
 350 * We allow cpus that are not yet online though, as no one else can
 351 * send an smp call function interrupt to this cpu and as such deadlocks
352 * can't happen.
353 */
354 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
355 && !oops_in_progress);
334 356
335 generic_exec_single(cpu, data, wait); 357 generic_exec_single(cpu, data, wait);
336} 358}
@@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 387 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 388 int cpu, next_cpu, this_cpu = smp_processor_id();
367 389
368 /* Can deadlock when called with interrupts disabled */ 390 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 391 * Can deadlock when called with interrupts disabled.
 392 * We allow cpus that are not yet online though, as no one else can
 393 * send an smp call function interrupt to this cpu and as such deadlocks
394 * can't happen.
395 */
396 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
397 && !oops_in_progress);
370 398
371 /* So, what's a CPU they want? Ignoring this one. */ 399 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 400 cpu = cpumask_first_and(mask, cpu_online_mask);
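The widened WARN_ON_ONCE() conditions above all encode the same predicate: only warn about calling with interrupts disabled when the calling CPU is already online (an offline CPU cannot receive the answering IPI, so the deadlock cannot form) and no oops is in progress. Restated as a standalone predicate (illustrative; the real macro also limits itself to a single warning):

#include <stdio.h>

/* Illustrative restatement of the relaxed deadlock check used in the
 * smp_call_function_*() paths above. */
static int should_warn(int cpu_online, int irqs_disabled, int oops_in_progress)
{
        return cpu_online && irqs_disabled && !oops_in_progress;
}

int main(void)
{
        printf("%d\n", should_warn(1, 1, 0));   /* online with irqs off: warn */
        printf("%d\n", should_warn(0, 1, 0));   /* CPU still offline: allowed */
        return 0;
}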
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 258885a543db..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
@@ -213,6 +213,7 @@ restart:
213 do { 213 do {
214 if (pending & 1) { 214 if (pending & 1) {
215 int prev_count = preempt_count(); 215 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
216 217
217 trace_softirq_entry(h, softirq_vec); 218 trace_softirq_entry(h, softirq_vec);
218 h->action(h); 219 h->action(h);
@@ -226,7 +227,7 @@ restart:
226 preempt_count() = prev_count; 227 preempt_count() = prev_count;
227 } 228 }
228 229
229 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
230 } 231 }
231 h++; 232 h++;
232 pending >>= 1; 233 pending >>= 1;
@@ -344,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
344 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
345} 346}
346 347
347/* Tasklets */ 348/*
349 * Tasklets
350 */
348struct tasklet_head 351struct tasklet_head
349{ 352{
350 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -382,6 +385,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 385
383EXPORT_SYMBOL(__tasklet_hi_schedule); 386EXPORT_SYMBOL(__tasklet_hi_schedule);
384 387
388void __tasklet_hi_schedule_first(struct tasklet_struct *t)
389{
390 BUG_ON(!irqs_disabled());
391
392 t->next = __get_cpu_var(tasklet_hi_vec).head;
393 __get_cpu_var(tasklet_hi_vec).head = t;
394 __raise_softirq_irqoff(HI_SOFTIRQ);
395}
396
397EXPORT_SYMBOL(__tasklet_hi_schedule_first);
398
385static void tasklet_action(struct softirq_action *a) 399static void tasklet_action(struct softirq_action *a)
386{ 400{
387 struct tasklet_struct *list; 401 struct tasklet_struct *list;
@@ -481,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
481 495
482EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
483 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
 538 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
484DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
485EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
486 560
@@ -647,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
647 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
648 cond_resched(); 722 cond_resched();
649 preempt_disable(); 723 preempt_disable();
650 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
651 } 725 }
652 preempt_enable(); 726 preempt_enable();
653 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
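The tasklet_hrtimer code added above is a trampoline pattern: the object bundles the user's callback with a timer and a tasklet, the timer trampoline defers to the tasklet when it fires in hard-irq context (or calls through directly otherwise), and the tasklet trampoline re-arms the timer when the callback asks for a restart. A userspace sketch of the same shape using function pointers; the names mirror the kernel struct but none of this is kernel API:

#include <stdio.h>

enum restart { NORESTART, RESTART };

struct tasklet_hrtimer {
        enum restart (*function)(struct tasklet_hrtimer *t);    /* user callback */
        int hres_active;        /* stand-in for hrtimer_is_hres_active() */
};

/* Runs in "softirq context": call the user function, re-arm on request. */
static void tasklet_trampoline(struct tasklet_hrtimer *t)
{
        if (t->function(t) == RESTART)
                printf("  timer re-armed\n");
}

/* Runs when the "timer" fires: defer to the tasklet when in hard-irq
 * context, otherwise call the user function directly. */
static void hrtimer_trampoline(struct tasklet_hrtimer *t)
{
        if (t->hres_active) {
                printf("hard irq: deferring to tasklet\n");
                tasklet_trampoline(t);  /* pretend the softirq ran later */
        } else {
                printf("softirq already: calling directly\n");
                t->function(t);
        }
}

static enum restart my_callback(struct tasklet_hrtimer *t)
{
        (void)t;
        printf("  user callback ran\n");
        return NORESTART;
}

int main(void)
{
        struct tasklet_hrtimer t = { .function = my_callback, .hres_active = 1 };

        hrtimer_trampoline(&t);
        t.hres_active = 0;
        hrtimer_trampoline(&t);
        return 0;
}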
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
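
The hunks above turn each out-of-line lock function into a thin wrapper guarded by #ifndef, so an architecture (or a later patch) can supply its own, typically inlined, implementation and suppress the generic one. A rough sketch of the pattern for one function, assuming the __spin_unlock() helper lives in a shared header (the patch implies but does not show that header):

        /* Inline helper carrying the real work (lockdep release, raw unlock,
         * preemption re-enable), presumably provided by a common header: */
        static inline void __spin_unlock(spinlock_t *lock)
        {
                spin_release(&lock->dep_map, 1, _RET_IP_);
                _raw_spin_unlock(lock);
                preempt_enable();
        }

        /* Out-of-line wrapper, emitted only when nothing has already
         * defined _spin_unlock as a macro: */
        #ifndef _spin_unlock
        void __lockfunc _spin_unlock(spinlock_t *lock)
        {
                __spin_unlock(lock);
        }
        EXPORT_SYMBOL(_spin_unlock);
        #endif
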
diff --git a/kernel/sys.c b/kernel/sys.c
index 438d99a38c87..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1113,289 +1113,6 @@ out:
1113 return err; 1113 return err;
1114} 1114}
1115 1115
1116/*
1117 * Supplementary group IDs
1118 */
1119
1120/* init to 2 - one for init_task, one to ensure it is never freed */
1121struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1122
1123struct group_info *groups_alloc(int gidsetsize)
1124{
1125 struct group_info *group_info;
1126 int nblocks;
1127 int i;
1128
1129 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1130 /* Make sure we always allocate at least one indirect block pointer */
1131 nblocks = nblocks ? : 1;
1132 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1133 if (!group_info)
1134 return NULL;
1135 group_info->ngroups = gidsetsize;
1136 group_info->nblocks = nblocks;
1137 atomic_set(&group_info->usage, 1);
1138
1139 if (gidsetsize <= NGROUPS_SMALL)
1140 group_info->blocks[0] = group_info->small_block;
1141 else {
1142 for (i = 0; i < nblocks; i++) {
1143 gid_t *b;
1144 b = (void *)__get_free_page(GFP_USER);
1145 if (!b)
1146 goto out_undo_partial_alloc;
1147 group_info->blocks[i] = b;
1148 }
1149 }
1150 return group_info;
1151
1152out_undo_partial_alloc:
1153 while (--i >= 0) {
1154 free_page((unsigned long)group_info->blocks[i]);
1155 }
1156 kfree(group_info);
1157 return NULL;
1158}
1159
1160EXPORT_SYMBOL(groups_alloc);
1161
1162void groups_free(struct group_info *group_info)
1163{
1164 if (group_info->blocks[0] != group_info->small_block) {
1165 int i;
1166 for (i = 0; i < group_info->nblocks; i++)
1167 free_page((unsigned long)group_info->blocks[i]);
1168 }
1169 kfree(group_info);
1170}
1171
1172EXPORT_SYMBOL(groups_free);
1173
1174/* export the group_info to a user-space array */
1175static int groups_to_user(gid_t __user *grouplist,
1176 const struct group_info *group_info)
1177{
1178 int i;
1179 unsigned int count = group_info->ngroups;
1180
1181 for (i = 0; i < group_info->nblocks; i++) {
1182 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1183 unsigned int len = cp_count * sizeof(*grouplist);
1184
1185 if (copy_to_user(grouplist, group_info->blocks[i], len))
1186 return -EFAULT;
1187
1188 grouplist += NGROUPS_PER_BLOCK;
1189 count -= cp_count;
1190 }
1191 return 0;
1192}
1193
1194/* fill a group_info from a user-space array - it must be allocated already */
1195static int groups_from_user(struct group_info *group_info,
1196 gid_t __user *grouplist)
1197{
1198 int i;
1199 unsigned int count = group_info->ngroups;
1200
1201 for (i = 0; i < group_info->nblocks; i++) {
1202 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1203 unsigned int len = cp_count * sizeof(*grouplist);
1204
1205 if (copy_from_user(group_info->blocks[i], grouplist, len))
1206 return -EFAULT;
1207
1208 grouplist += NGROUPS_PER_BLOCK;
1209 count -= cp_count;
1210 }
1211 return 0;
1212}
1213
1214/* a simple Shell sort */
1215static void groups_sort(struct group_info *group_info)
1216{
1217 int base, max, stride;
1218 int gidsetsize = group_info->ngroups;
1219
1220 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1221 ; /* nothing */
1222 stride /= 3;
1223
1224 while (stride) {
1225 max = gidsetsize - stride;
1226 for (base = 0; base < max; base++) {
1227 int left = base;
1228 int right = left + stride;
1229 gid_t tmp = GROUP_AT(group_info, right);
1230
1231 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1232 GROUP_AT(group_info, right) =
1233 GROUP_AT(group_info, left);
1234 right = left;
1235 left -= stride;
1236 }
1237 GROUP_AT(group_info, right) = tmp;
1238 }
1239 stride /= 3;
1240 }
1241}
1242
1243/* a simple bsearch */
1244int groups_search(const struct group_info *group_info, gid_t grp)
1245{
1246 unsigned int left, right;
1247
1248 if (!group_info)
1249 return 0;
1250
1251 left = 0;
1252 right = group_info->ngroups;
1253 while (left < right) {
1254 unsigned int mid = (left+right)/2;
1255 int cmp = grp - GROUP_AT(group_info, mid);
1256 if (cmp > 0)
1257 left = mid + 1;
1258 else if (cmp < 0)
1259 right = mid;
1260 else
1261 return 1;
1262 }
1263 return 0;
1264}
1265
1266/**
1267 * set_groups - Change a group subscription in a set of credentials
1268 * @new: The newly prepared set of credentials to alter
1269 * @group_info: The group list to install
1270 *
1271 * Validate a group subscription and, if valid, insert it into a set
1272 * of credentials.
1273 */
1274int set_groups(struct cred *new, struct group_info *group_info)
1275{
1276 int retval;
1277
1278 retval = security_task_setgroups(group_info);
1279 if (retval)
1280 return retval;
1281
1282 put_group_info(new->group_info);
1283 groups_sort(group_info);
1284 get_group_info(group_info);
1285 new->group_info = group_info;
1286 return 0;
1287}
1288
1289EXPORT_SYMBOL(set_groups);
1290
1291/**
1292 * set_current_groups - Change current's group subscription
1293 * @group_info: The group list to impose
1294 *
1295 * Validate a group subscription and, if valid, impose it upon current's task
1296 * security record.
1297 */
1298int set_current_groups(struct group_info *group_info)
1299{
1300 struct cred *new;
1301 int ret;
1302
1303 new = prepare_creds();
1304 if (!new)
1305 return -ENOMEM;
1306
1307 ret = set_groups(new, group_info);
1308 if (ret < 0) {
1309 abort_creds(new);
1310 return ret;
1311 }
1312
1313 return commit_creds(new);
1314}
1315
1316EXPORT_SYMBOL(set_current_groups);
1317
1318SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1319{
1320 const struct cred *cred = current_cred();
1321 int i;
1322
1323 if (gidsetsize < 0)
1324 return -EINVAL;
1325
1326 /* no need to grab task_lock here; it cannot change */
1327 i = cred->group_info->ngroups;
1328 if (gidsetsize) {
1329 if (i > gidsetsize) {
1330 i = -EINVAL;
1331 goto out;
1332 }
1333 if (groups_to_user(grouplist, cred->group_info)) {
1334 i = -EFAULT;
1335 goto out;
1336 }
1337 }
1338out:
1339 return i;
1340}
1341
1342/*
1343 * SMP: Our groups are copy-on-write. We can set them safely
1344 * without another task interfering.
1345 */
1346
1347SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1348{
1349 struct group_info *group_info;
1350 int retval;
1351
1352 if (!capable(CAP_SETGID))
1353 return -EPERM;
1354 if ((unsigned)gidsetsize > NGROUPS_MAX)
1355 return -EINVAL;
1356
1357 group_info = groups_alloc(gidsetsize);
1358 if (!group_info)
1359 return -ENOMEM;
1360 retval = groups_from_user(group_info, grouplist);
1361 if (retval) {
1362 put_group_info(group_info);
1363 return retval;
1364 }
1365
1366 retval = set_current_groups(group_info);
1367 put_group_info(group_info);
1368
1369 return retval;
1370}
1371
1372/*
1373 * Check whether we're fsgid/egid or in the supplemental group..
1374 */
1375int in_group_p(gid_t grp)
1376{
1377 const struct cred *cred = current_cred();
1378 int retval = 1;
1379
1380 if (grp != cred->fsgid)
1381 retval = groups_search(cred->group_info, grp);
1382 return retval;
1383}
1384
1385EXPORT_SYMBOL(in_group_p);
1386
1387int in_egroup_p(gid_t grp)
1388{
1389 const struct cred *cred = current_cred();
1390 int retval = 1;
1391
1392 if (grp != cred->egid)
1393 retval = groups_search(cred->group_info, grp);
1394 return retval;
1395}
1396
1397EXPORT_SYMBOL(in_egroup_p);
1398
1399DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1400 1117
1401SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
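
The block removed here (the whole supplementary-group machinery, groups_alloc() through in_egroup_p()) appears to be relocated rather than dropped. The ordering it relies on is a Shell sort with the 3h+1 gap sequence. A self-contained, user-space illustration of that sort (the array contents and sizes are made up; only the gap logic mirrors groups_sort()):

        /* Sketch of the 3h+1 Shell sort used by groups_sort(), on a plain
         * gid_t array instead of struct group_info blocks. */
        #include <stdio.h>
        #include <sys/types.h>

        static void gid_shell_sort(gid_t *g, int n)
        {
                int base, max, stride;

                for (stride = 1; stride < n; stride = 3 * stride + 1)
                        ;       /* grow the gap sequence: 1, 4, 13, 40, ... */
                stride /= 3;

                while (stride) {
                        max = n - stride;
                        for (base = 0; base < max; base++) {
                                int left = base;
                                int right = left + stride;
                                gid_t tmp = g[right];

                                while (left >= 0 && g[left] > tmp) {
                                        g[right] = g[left];
                                        right = left;
                                        left -= stride;
                                }
                                g[right] = tmp;
                        }
                        stride /= 3;
                }
        }

        int main(void)
        {
                gid_t g[] = { 100, 20, 4, 1000, 4, 50 };        /* made-up GIDs */
                int i, n = sizeof(g) / sizeof(g[0]);

                gid_shell_sort(g, n);
                for (i = 0; i < n; i++)
                        printf("%u\n", (unsigned)g[i]);
                return 0;
        }
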
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0e51a35a4486..1a631ba684a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -90,6 +91,9 @@ extern int sysctl_nr_trim_pages;
90#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
91extern int rcutorture_runnable; 92extern int rcutorture_runnable;
92#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled;
96#endif
93 97
94/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
95#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -244,6 +248,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
244#endif 248#endif
245 249
246static struct ctl_table kern_table[] = { 250static struct ctl_table kern_table[] = {
251 {
252 .ctl_name = CTL_UNNUMBERED,
253 .procname = "sched_child_runs_first",
254 .data = &sysctl_sched_child_runs_first,
255 .maxlen = sizeof(unsigned int),
256 .mode = 0644,
257 .proc_handler = &proc_dointvec,
258 },
247#ifdef CONFIG_SCHED_DEBUG 259#ifdef CONFIG_SCHED_DEBUG
248 { 260 {
249 .ctl_name = CTL_UNNUMBERED, 261 .ctl_name = CTL_UNNUMBERED,
@@ -298,14 +310,6 @@ static struct ctl_table kern_table[] = {
298 }, 310 },
299 { 311 {
300 .ctl_name = CTL_UNNUMBERED, 312 .ctl_name = CTL_UNNUMBERED,
301 .procname = "sched_child_runs_first",
302 .data = &sysctl_sched_child_runs_first,
303 .maxlen = sizeof(unsigned int),
304 .mode = 0644,
305 .proc_handler = &proc_dointvec,
306 },
307 {
308 .ctl_name = CTL_UNNUMBERED,
309 .procname = "sched_features", 313 .procname = "sched_features",
310 .data = &sysctl_sched_features, 314 .data = &sysctl_sched_features,
311 .maxlen = sizeof(unsigned int), 315 .maxlen = sizeof(unsigned int),
@@ -330,11 +334,22 @@ static struct ctl_table kern_table[] = {
330 }, 334 },
331 { 335 {
332 .ctl_name = CTL_UNNUMBERED, 336 .ctl_name = CTL_UNNUMBERED,
337 .procname = "sched_time_avg",
338 .data = &sysctl_sched_time_avg,
339 .maxlen = sizeof(unsigned int),
340 .mode = 0644,
341 .proc_handler = &proc_dointvec,
342 },
343 {
344 .ctl_name = CTL_UNNUMBERED,
333 .procname = "timer_migration", 345 .procname = "timer_migration",
334 .data = &sysctl_timer_migration, 346 .data = &sysctl_timer_migration,
335 .maxlen = sizeof(unsigned int), 347 .maxlen = sizeof(unsigned int),
336 .mode = 0644, 348 .mode = 0644,
337 .proc_handler = &proc_dointvec, 349 .proc_handler = &proc_dointvec_minmax,
350 .strategy = &sysctl_intvec,
351 .extra1 = &zero,
352 .extra2 = &one,
338 }, 353 },
339#endif 354#endif
340 { 355 {
@@ -743,6 +758,14 @@ static struct ctl_table kern_table[] = {
743 .proc_handler = &proc_dointvec, 758 .proc_handler = &proc_dointvec,
744 }, 759 },
745 { 760 {
761 .ctl_name = CTL_UNNUMBERED,
762 .procname = "panic_on_io_nmi",
763 .data = &panic_on_io_nmi,
764 .maxlen = sizeof(int),
765 .mode = 0644,
766 .proc_handler = &proc_dointvec,
767 },
768 {
746 .ctl_name = KERN_BOOTLOADER_TYPE, 769 .ctl_name = KERN_BOOTLOADER_TYPE,
747 .procname = "bootloader_type", 770 .procname = "bootloader_type",
748 .data = &bootloader_type, 771 .data = &bootloader_type,
@@ -967,6 +990,26 @@ static struct ctl_table kern_table[] = {
967 .proc_handler = &proc_dointvec, 990 .proc_handler = &proc_dointvec,
968 }, 991 },
969#endif 992#endif
993#ifdef CONFIG_KMEMCHECK
994 {
995 .ctl_name = CTL_UNNUMBERED,
996 .procname = "kmemcheck",
997 .data = &kmemcheck_enabled,
998 .maxlen = sizeof(int),
999 .mode = 0644,
1000 .proc_handler = &proc_dointvec,
1001 },
1002#endif
1003#ifdef CONFIG_BLOCK
1004 {
1005 .ctl_name = CTL_UNNUMBERED,
1006 .procname = "blk_iopoll",
1007 .data = &blk_iopoll_enabled,
1008 .maxlen = sizeof(int),
1009 .mode = 0644,
1010 .proc_handler = &proc_dointvec,
1011 },
1012#endif
970/* 1013/*
971 * NOTE: do not add new entries to this table unless you have read 1014 * NOTE: do not add new entries to this table unless you have read
972 * Documentation/sysctl/ctl_unnumbered.txt 1015 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1283,10 +1326,10 @@ static struct ctl_table vm_table[] = {
1283 { 1326 {
1284 .ctl_name = CTL_UNNUMBERED, 1327 .ctl_name = CTL_UNNUMBERED,
1285 .procname = "mmap_min_addr", 1328 .procname = "mmap_min_addr",
1286 .data = &mmap_min_addr, 1329 .data = &dac_mmap_min_addr,
1287 .maxlen = sizeof(unsigned long), 1330 .maxlen = sizeof(unsigned long),
1288 .mode = 0644, 1331 .mode = 0644,
1289 .proc_handler = &proc_doulongvec_minmax, 1332 .proc_handler = &mmap_min_addr_handler,
1290 }, 1333 },
1291#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
1292 { 1335 {
@@ -1325,7 +1368,6 @@ static struct ctl_table vm_table[] = {
1325 .extra2 = &one, 1368 .extra2 = &one,
1326 }, 1369 },
1327#endif 1370#endif
1328#ifdef CONFIG_UNEVICTABLE_LRU
1329 { 1371 {
1330 .ctl_name = CTL_UNNUMBERED, 1372 .ctl_name = CTL_UNNUMBERED,
1331 .procname = "scan_unevictable_pages", 1373 .procname = "scan_unevictable_pages",
@@ -1334,7 +1376,6 @@ static struct ctl_table vm_table[] = {
1334 .mode = 0644, 1376 .mode = 0644,
1335 .proc_handler = &scan_unevictable_handler, 1377 .proc_handler = &scan_unevictable_handler,
1336 }, 1378 },
1337#endif
1338/* 1379/*
1339 * NOTE: do not add new entries to this table unless you have read 1380 * NOTE: do not add new entries to this table unless you have read
1340 * Documentation/sysctl/ctl_unnumbered.txt 1381 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2273,7 +2314,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2273 void *data) 2314 void *data)
2274{ 2315{
2275#define TMPBUFLEN 21 2316#define TMPBUFLEN 21
2276 int *i, vleft, first=1, neg, val; 2317 int *i, vleft, first = 1, neg;
2277 unsigned long lval; 2318 unsigned long lval;
2278 size_t left, len; 2319 size_t left, len;
2279 2320
@@ -2326,8 +2367,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2326 len = p-buf; 2367 len = p-buf;
2327 if ((len < left) && *p && !isspace(*p)) 2368 if ((len < left) && *p && !isspace(*p))
2328 break; 2369 break;
2329 if (neg)
2330 val = -val;
2331 s += len; 2370 s += len;
2332 left -= len; 2371 left -= len;
2333 2372
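
Besides the new sched_child_runs_first, sched_time_avg, panic_on_io_nmi, kmemcheck and blk_iopoll entries, the timer_migration knob switches from proc_dointvec to proc_dointvec_minmax so writes are clamped to the 0..1 range. A sketch of what such a bounded entry looks like (example_knob is a made-up variable; zero and one are the existing min/max ints in this file):

        static int example_knob;

        static struct ctl_table example_table[] = {
                {
                        .ctl_name       = CTL_UNNUMBERED,
                        .procname       = "example_knob",
                        .data           = &example_knob,
                        .maxlen         = sizeof(int),
                        .mode           = 0644,
                        .proc_handler   = &proc_dointvec_minmax,
                        .strategy       = &sysctl_intvec,
                        .extra1         = &zero,        /* reject values < 0 */
                        .extra2         = &one,         /* reject values > 1 */
                },
                { .ctl_name = 0 }
        };
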
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
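
Two generic-netlink call shapes change here: replies to a request now go through genlmsg_reply(), keyed by the request's genl_info, while genlmsg_unicast() gains an explicit network-namespace argument. A minimal sketch of both (example_send and its parameters are made up for illustration):

        static int example_send(struct sk_buff *rep_skb, struct genl_info *info,
                                struct sk_buff *listener_skb, pid_t listener_pid)
        {
                int rc;

                /* answer the task that asked for its stats */
                rc = genlmsg_reply(rep_skb, info);
                if (rc)
                        return rc;

                /* fan a record out to a registered per-cpu listener */
                return genlmsg_unicast(&init_net, listener_skb, listener_pid);
        }
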
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..620b58abdc32 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
137 */ 137 */
138int clockevents_register_notifier(struct notifier_block *nb) 138int clockevents_register_notifier(struct notifier_block *nb)
139{ 139{
140 unsigned long flags;
140 int ret; 141 int ret;
141 142
142 spin_lock(&clockevents_lock); 143 spin_lock_irqsave(&clockevents_lock, flags);
143 ret = raw_notifier_chain_register(&clockevents_chain, nb); 144 ret = raw_notifier_chain_register(&clockevents_chain, nb);
144 spin_unlock(&clockevents_lock); 145 spin_unlock_irqrestore(&clockevents_lock, flags);
145 146
146 return ret; 147 return ret;
147} 148}
@@ -178,16 +179,18 @@ static void clockevents_notify_released(void)
178 */ 179 */
179void clockevents_register_device(struct clock_event_device *dev) 180void clockevents_register_device(struct clock_event_device *dev)
180{ 181{
182 unsigned long flags;
183
181 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
182 BUG_ON(!dev->cpumask); 185 BUG_ON(!dev->cpumask);
183 186
184 spin_lock(&clockevents_lock); 187 spin_lock_irqsave(&clockevents_lock, flags);
185 188
186 list_add(&dev->list, &clockevent_devices); 189 list_add(&dev->list, &clockevent_devices);
187 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
188 clockevents_notify_released(); 191 clockevents_notify_released();
189 192
190 spin_unlock(&clockevents_lock); 193 spin_unlock_irqrestore(&clockevents_lock, flags);
191} 194}
192EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
193 196
@@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
235void clockevents_notify(unsigned long reason, void *arg) 238void clockevents_notify(unsigned long reason, void *arg)
236{ 239{
237 struct list_head *node, *tmp; 240 struct list_head *node, *tmp;
241 unsigned long flags;
238 242
239 spin_lock(&clockevents_lock); 243 spin_lock_irqsave(&clockevents_lock, flags);
240 clockevents_do_notify(reason, arg); 244 clockevents_do_notify(reason, arg);
241 245
242 switch (reason) { 246 switch (reason) {
@@ -251,18 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
251 default: 255 default:
252 break; 256 break;
253 } 257 }
254 spin_unlock(&clockevents_lock); 258 spin_unlock_irqrestore(&clockevents_lock, flags);
255} 259}
256EXPORT_SYMBOL_GPL(clockevents_notify); 260EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 261#endif
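
All three clockevents_lock sections switch from plain spin_lock() to spin_lock_irqsave(). The saved flags make the unlock restore the caller's interrupt state instead of unconditionally re-enabling IRQs, so the same critical section can be entered both from ordinary process context and from paths that already run with interrupts off. Generic sketch with a made-up lock:

        static DEFINE_SPINLOCK(example_lock);

        static void example_touch_shared_state(void)
        {
                unsigned long flags;

                spin_lock_irqsave(&example_lock, flags);
                /* ... touch state that is also reached with IRQs disabled ... */
                spin_unlock_irqrestore(&example_lock, flags);
        }
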
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..7466cb811251 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
513 * Check to make sure we don't switch to a non-highres capable 513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz) 514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */ 515 */
516 if (tick_oneshot_mode_active() && 516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { 517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. " 518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name); 519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 877dbedc3118..c2ec25087a35 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
205 * Powerstate information: The system enters/leaves a state, where 205 * Powerstate information: The system enters/leaves a state, where
206 * affected devices might stop 206 * affected devices might stop
207 */ 207 */
208static void tick_do_broadcast_on_off(void *why) 208static void tick_do_broadcast_on_off(unsigned long *reason)
209{ 209{
210 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
211 struct tick_device *td; 211 struct tick_device *td;
212 unsigned long flags, *reason = why; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 tick_do_broadcast_on_off(&reason);
280 &reason, 1);
281} 280}
282 281
283/* 282/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2aff39c6f10c..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a1277..fddd69d16e03 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
286{ 286{
287 struct proc_dir_entry *pe; 287 struct proc_dir_entry *pe;
288 288
289 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); 289 pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
290 if (!pe) 290 if (!pe)
291 return -ENOMEM; 291 return -ENOMEM;
292 return 0; 292 return 0;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166d..4cde8b9c716f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
96/* 96/*
97 * Collection status, active/inactive: 97 * Collection status, active/inactive:
98 */ 98 */
99static int __read_mostly active; 99int __read_mostly timer_stats_active;
100 100
101/* 101/*
102 * Beginning/end timestamps of measurement: 102 * Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
257 if (!active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
260 entry = tstat_lookup(&input, comm); 260 entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
290 /* 290 /*
291 * If still active then calculate up to now: 291 * If still active then calculate up to now:
292 */ 292 */
293 if (active) 293 if (timer_stats_active)
294 time_stop = ktime_get(); 294 time_stop = ktime_get();
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
368 mutex_lock(&show_mutex); 368 mutex_lock(&show_mutex);
369 switch (ctl[0]) { 369 switch (ctl[0]) {
370 case '0': 370 case '0':
371 if (active) { 371 if (timer_stats_active) {
372 active = 0; 372 timer_stats_active = 0;
373 time_stop = ktime_get(); 373 time_stop = ktime_get();
374 sync_access(); 374 sync_access();
375 } 375 }
376 break; 376 break;
377 case '1': 377 case '1':
378 if (!active) { 378 if (!timer_stats_active) {
379 reset_entries(); 379 reset_entries();
380 time_start = ktime_get(); 380 time_start = ktime_get();
381 smp_mb(); 381 smp_mb();
382 active = 1; 382 timer_stats_active = 1;
383 } 383 }
384 break; 384 break;
385 default: 385 default:
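
Renaming the static `active` flag to the global timer_stats_active makes the on/off state visible outside timer_stats.c, so hot paths can test it (or a cheaper proxy such as timer->start_site, as the timer.c hunk below does) before doing any accounting work. A sketch of that fast-path guard (account_example_event is hypothetical; timer_stats_active is the real, now-global flag):

        extern int timer_stats_active;

        static void account_example_event(void *timer)
        {
                if (likely(!timer_stats_active))
                        return;         /* collection off: no locking, no lookup */

                /* collection on: take the per-cpu lookup lock, find or create
                 * the entry for this timer/start-site pair and bump its count,
                 * re-checking timer_stats_active under the lock as the code
                 * above does */
        }
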
diff --git a/kernel/timer.c b/kernel/timer.c
index 54d3912f8cad..a3d25f415019 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
380{ 380{
381 unsigned int flag = 0; 381 unsigned int flag = 0;
382 382
383 if (likely(!timer->start_site))
384 return;
383 if (unlikely(tbase_get_deferrable(timer->base))) 385 if (unlikely(tbase_get_deferrable(timer->base)))
384 flag |= TIMER_STATS_FLAG_DEFERRABLE; 386 flag |= TIMER_STATS_FLAG_DEFERRABLE;
385 387
@@ -712,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
712 * networking code - if the timer is re-modified 714 * networking code - if the timer is re-modified
713 * to be the same thing then just return: 715 * to be the same thing then just return:
714 */ 716 */
715 if (timer->expires == expires && timer_pending(timer)) 717 if (timer_pending(timer) && timer->expires == expires)
716 return 1; 718 return 1;
717 719
718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
@@ -1154,8 +1156,7 @@ void update_process_times(int user_tick)
1154 /* Note: this timer irq context must be accounted for as well. */ 1156 /* Note: this timer irq context must be accounted for as well. */
1155 account_process_tick(p, user_tick); 1157 account_process_tick(p, user_tick);
1156 run_local_timers(); 1158 run_local_timers();
1157 if (rcu_pending(cpu)) 1159 rcu_check_callbacks(cpu, user_tick);
1158 rcu_check_callbacks(cpu, user_tick);
1159 printk_tick(); 1160 printk_tick();
1160 scheduler_tick(); 1161 scheduler_tick();
1161 run_posix_cpu_timers(p); 1162 run_posix_cpu_timers(p);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4a13e5a01ce3..e71634604400 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,31 +11,48 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
26
27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool
29 help
30 An arch may pass in a unique value (frame pointer) to both the
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
20 33
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 35 bool
23 help 36 help
24 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
25 variable at the mcount call site. Otherwise, this variable
26 is tested by the called function.
27 38
28config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
29 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
30 43
31config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
32 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
33 48
34config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
35 bool 50 bool
36 51
37config HAVE_FTRACE_SYSCALLS 52config HAVE_SYSCALL_TRACEPOINTS
38 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
39 56
40config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
41 bool 58 bool
@@ -53,9 +70,14 @@ config EVENT_TRACING
53 bool 70 bool
54 71
55config CONTEXT_SWITCH_TRACER 72config CONTEXT_SWITCH_TRACER
56 select MARKERS
57 bool 73 bool
58 74
75config RING_BUFFER_ALLOW_SWAP
76 bool
77 help
78 Allow the use of ring_buffer_swap_cpu.
79 Adds a very slight overhead to tracing when enabled.
80
59# All tracer options should select GENERIC_TRACER. For those options that are 81# All tracer options should select GENERIC_TRACER. For those options that are
60# enabled by all tracers (context switch and event tracer) they select TRACING. 82# enabled by all tracers (context switch and event tracer) they select TRACING.
61# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
@@ -121,6 +143,7 @@ config FUNCTION_GRAPH_TRACER
121 bool "Kernel Function Graph Tracer" 143 bool "Kernel Function Graph Tracer"
122 depends on HAVE_FUNCTION_GRAPH_TRACER 144 depends on HAVE_FUNCTION_GRAPH_TRACER
123 depends on FUNCTION_TRACER 145 depends on FUNCTION_TRACER
146 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
124 default y 147 default y
125 help 148 help
126 Enable the kernel to trace a function at both its return 149 Enable the kernel to trace a function at both its return
@@ -139,6 +162,7 @@ config IRQSOFF_TRACER
139 select TRACE_IRQFLAGS 162 select TRACE_IRQFLAGS
140 select GENERIC_TRACER 163 select GENERIC_TRACER
141 select TRACER_MAX_TRACE 164 select TRACER_MAX_TRACE
165 select RING_BUFFER_ALLOW_SWAP
142 help 166 help
143 This option measures the time spent in irqs-off critical 167 This option measures the time spent in irqs-off critical
144 sections, with microsecond accuracy. 168 sections, with microsecond accuracy.
@@ -147,7 +171,7 @@ config IRQSOFF_TRACER
147 disabled by default and can be runtime (re-)started 171 disabled by default and can be runtime (re-)started
148 via: 172 via:
149 173
150 echo 0 > /debugfs/tracing/tracing_max_latency 174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
151 175
152 (Note that kernel size and overhead increases with this option 176 (Note that kernel size and overhead increases with this option
153 enabled. This option and the preempt-off timing option can be 177 enabled. This option and the preempt-off timing option can be
@@ -160,6 +184,7 @@ config PREEMPT_TRACER
160 depends on PREEMPT 184 depends on PREEMPT
161 select GENERIC_TRACER 185 select GENERIC_TRACER
162 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP
163 help 188 help
164 This option measures the time spent in preemption off critical 189 This option measures the time spent in preemption off critical
165 sections, with microsecond accuracy. 190 sections, with microsecond accuracy.
@@ -168,7 +193,7 @@ config PREEMPT_TRACER
168 disabled by default and can be runtime (re-)started 193 disabled by default and can be runtime (re-)started
169 via: 194 via:
170 195
171 echo 0 > /debugfs/tracing/tracing_max_latency 196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
172 197
173 (Note that kernel size and overhead increases with this option 198 (Note that kernel size and overhead increases with this option
174 enabled. This option and the irqs-off timing option can be 199 enabled. This option and the irqs-off timing option can be
@@ -203,7 +228,7 @@ config ENABLE_DEFAULT_TRACERS
203 228
204config FTRACE_SYSCALLS 229config FTRACE_SYSCALLS
205 bool "Trace syscalls" 230 bool "Trace syscalls"
206 depends on HAVE_FTRACE_SYSCALLS 231 depends on HAVE_SYSCALL_TRACEPOINTS
207 select GENERIC_TRACER 232 select GENERIC_TRACER
208 select KALLSYMS 233 select KALLSYMS
209 help 234 help
@@ -218,13 +243,13 @@ config BOOT_TRACER
218 the timings of the initcalls and traces key events and the identity 243 the timings of the initcalls and traces key events and the identity
219 of tasks that can cause boot delays, such as context-switches. 244 of tasks that can cause boot delays, such as context-switches.
220 245
221 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 246 Its aim is to be parsed by the scripts/bootgraph.pl tool to
222 produce pretty graphics about boot inefficiencies, giving a visual 247 produce pretty graphics about boot inefficiencies, giving a visual
223 representation of the delays during initcalls - but the raw 248 representation of the delays during initcalls - but the raw
224 /debug/tracing/trace text output is readable too. 249 /debug/tracing/trace text output is readable too.
225 250
226 You must pass in ftrace=initcall to the kernel command line 251 You must pass in initcall_debug and ftrace=initcall to the kernel
227 to enable this on bootup. 252 command line to enable this on bootup.
228 253
229config TRACE_BRANCH_PROFILING 254config TRACE_BRANCH_PROFILING
230 bool 255 bool
@@ -261,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
 261 This tracer profiles all the likely and unlikely macros 286 This tracer profiles all the likely and unlikely macros
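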
262 in the kernel. It will display the results in: 287 in the kernel. It will display the results in:
263 288
264 /debugfs/tracing/profile_annotated_branch 289 /sys/kernel/debug/tracing/profile_annotated_branch
265 290
266 Note: this will add a significant overhead, only turn this 291 Note: this will add a significant overhead, only turn this
267 on if you need to profile the system's use of these macros. 292 on if you need to profile the system's use of these macros.
@@ -274,7 +299,7 @@ config PROFILE_ALL_BRANCHES
274 taken in the kernel is recorded whether it hit or miss. 299 taken in the kernel is recorded whether it hit or miss.
275 The results will be displayed in: 300 The results will be displayed in:
276 301
277 /debugfs/tracing/profile_branch 302 /sys/kernel/debug/tracing/profile_branch
278 303
279 This option also enables the likely/unlikely profiler. 304 This option also enables the likely/unlikely profiler.
280 305
@@ -323,7 +348,7 @@ config STACK_TRACER
323 select KALLSYMS 348 select KALLSYMS
324 help 349 help
325 This special tracer records the maximum stack footprint of the 350 This special tracer records the maximum stack footprint of the
326 kernel and displays it in debugfs/tracing/stack_trace. 351 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
327 352
328 This tracer works by hooking into every function call that the 353 This tracer works by hooking into every function call that the
329 kernel executes, and keeping a maximum stack depth value and 354 kernel executes, and keeping a maximum stack depth value and
@@ -454,6 +479,18 @@ config FTRACE_STARTUP_TEST
454 functioning properly. It will do tests on all the configured 479 functioning properly. It will do tests on all the configured
455 tracers of ftrace. 480 tracers of ftrace.
456 481
482config EVENT_TRACE_TEST_SYSCALLS
483 bool "Run selftest on syscall events"
484 depends on FTRACE_STARTUP_TEST
485 help
486 This option will also enable testing every syscall event.
487 It only enables the event and disables it and runs various loads
488 with the event enabled. This adds a bit more time for kernel boot
489 up since it runs this on every system call defined.
490
491 TBD - enable a way to actually call the syscalls as we test their
492 events
493
457config MMIOTRACE 494config MMIOTRACE
458 bool "Memory mapped IO tracing" 495 bool "Memory mapped IO tracing"
459 depends on HAVE_MMIOTRACE_SUPPORT && PCI 496 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
@@ -64,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
64{ 65{
65 struct blk_io_trace *t; 66 struct blk_io_trace *t;
66 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
67 int pc = 0; 69 int pc = 0;
68 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
69 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
70 72
71 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
72 pc = preempt_count(); 75 pc = preempt_count();
73 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
74 sizeof(*t) + len, 77 sizeof(*t) + len,
75 0, pc); 78 0, pc);
76 if (!event) 79 if (!event)
@@ -95,7 +98,7 @@ record_it:
95 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
96 99
97 if (blk_tracer) 100 if (blk_tracer)
98 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
99 } 102 }
100} 103}
101 104
@@ -178,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
178{ 181{
179 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
180 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
181 struct blk_io_trace *t; 185 struct blk_io_trace *t;
182 unsigned long flags = 0; 186 unsigned long flags = 0;
183 unsigned long *sequence; 187 unsigned long *sequence;
@@ -203,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
203 if (blk_tracer) { 207 if (blk_tracer) {
204 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
205 209
210 buffer = blk_tr->buffer;
206 pc = preempt_count(); 211 pc = preempt_count();
207 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
208 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
209 0, pc); 214 0, pc);
210 if (!event) 215 if (!event)
@@ -251,7 +256,7 @@ record_it:
251 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
252 257
253 if (blk_tracer) { 258 if (blk_tracer) {
254 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
255 return; 260 return;
256 } 261 }
257 } 262 }
@@ -266,8 +271,8 @@ static void blk_trace_free(struct blk_trace *bt)
266{ 271{
267 debugfs_remove(bt->msg_file); 272 debugfs_remove(bt->msg_file);
268 debugfs_remove(bt->dropped_file); 273 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
270 relay_close(bt->rchan); 274 relay_close(bt->rchan);
275 debugfs_remove(bt->dir);
271 free_percpu(bt->sequence); 276 free_percpu(bt->sequence);
272 free_percpu(bt->msg_data); 277 free_percpu(bt->msg_data);
273 kfree(bt); 278 kfree(bt);
@@ -377,18 +382,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
377 382
378static int blk_remove_buf_file_callback(struct dentry *dentry) 383static int blk_remove_buf_file_callback(struct dentry *dentry)
379{ 384{
380 struct dentry *parent = dentry->d_parent;
381 debugfs_remove(dentry); 385 debugfs_remove(dentry);
382 386
383 /*
384 * this will fail for all but the last file, but that is ok. what we
385 * care about is the top level buts->name directory going away, when
386 * the last trace file is gone. Then we don't have to rmdir() that
387 * manually on trace stop, so it nicely solves the issue with
388 * force killing of running traces.
389 */
390
391 debugfs_remove(parent);
392 return 0; 387 return 0;
393} 388}
394 389
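
The blktrace hunks follow a wider ftrace change in this patch: trace_buffer_lock_reserve()/trace_buffer_unlock_commit() now appear to operate on a struct ring_buffer rather than the trace_array, so the buffer is resolved once from blk_tr and passed through. A rough sketch of the reserve/fill/commit shape (record filling elided; t and len as in trace_note() above):

        struct ring_buffer *buffer = blk_tr->buffer;
        struct ring_buffer_event *event;
        int pc = preempt_count();

        event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
                                          sizeof(*t) + len, 0, pc);
        if (event) {
                t = ring_buffer_event_data(event);      /* the blk_io_trace record */
                /* ... fill *t and its payload ... */
                trace_buffer_unlock_commit(buffer, event, 0, pc);
        }
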
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb60732ade0c..cc615f84751b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -291,7 +291,9 @@ function_stat_next(void *v, int idx)
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); 291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292 292
293 again: 293 again:
294 rec++; 294 if (idx != 0)
295 rec++;
296
295 if ((void *)rec >= (void *)&pg->records[pg->index]) { 297 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next; 298 pg = pg->next;
297 if (!pg) 299 if (!pg)
@@ -766,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
766 .stat_show = function_stat_show 768 .stat_show = function_stat_show
767}; 769};
768 770
769static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
770{ 772{
771 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
772 struct dentry *entry; 774 struct dentry *entry;
@@ -784,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
784 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
785 * we still do not free memory. 787 * we still do not free memory.
786 */ 788 */
787 kfree(stat);
788 WARN(1, 789 WARN(1,
789 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
790 cpu); 791 cpu);
@@ -811,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
811} 812}
812 813
813#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
815{ 816{
816} 817}
817#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1015,71 +1016,35 @@ static int
1015__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1016{ 1017{
1017 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1018 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1019 1020
1020 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1021 1022
1022 ip = rec->ip;
1023
1024 /* 1023 /*
1025 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1026 * it is not enabled then do nothing. 1025 * then disable it.
1027 * 1026 *
1028 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1029 * it is enabled then disable it.
1030 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1031 */ 1031 */
1032 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1033 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1034 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1035 else 1035 }
1036 return 0;
1037
1038 } else if (ftrace_filtered && enable) {
1039 /*
1040 * Filtering is on:
1041 */
1042
1043 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1044
1045 /* Record is filtered and enabled, do nothing */
1046 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1047 return 0;
1048
1049 /* Record is not filtered or enabled, do nothing */
1050 if (!fl)
1051 return 0;
1052
1053 /* Record is not filtered but enabled, disable it */
1054 if (fl == FTRACE_FL_ENABLED)
1055 rec->flags &= ~FTRACE_FL_ENABLED;
1056 else
1057 /* Otherwise record is filtered but not enabled, enable it */
1058 rec->flags |= FTRACE_FL_ENABLED;
1059 } else {
1060 /* Disable or not filtered */
1061
1062 if (enable) {
1063 /* if record is enabled, do nothing */
1064 if (rec->flags & FTRACE_FL_ENABLED)
1065 return 0;
1066
1067 rec->flags |= FTRACE_FL_ENABLED;
1068
1069 } else {
1070 1036
1071 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1072 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1073 return 0; 1039 return 0;
1074 1040
1075 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1076 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1077 } 1044 }
1078 1045
1079 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1080 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1081 else
1082 return ftrace_make_nop(NULL, rec, ftrace_addr);
1083} 1048}
1084 1049
1085static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
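
The rewritten __ftrace_replace_code() above collapses the old branch ladder into a single "what should the ENABLED bit be" computation followed by a compare-and-patch. A sketch of that decision alone (desired_enabled() is a hypothetical helper; the flag names are the ones used above):

        static unsigned long desired_enabled(struct dyn_ftrace *rec, int enable)
        {
                if (!enable)                            /* tracing being turned off */
                        return 0;
                if (rec->flags & FTRACE_FL_NOTRACE)     /* explicitly excluded */
                        return 0;
                if (ftrace_filtered && !(rec->flags & FTRACE_FL_FILTER))
                        return 0;                       /* filter on, not matched */
                return FTRACE_FL_ENABLED;               /* otherwise: enable it */
        }

The record is then patched only when this desired state differs from its current FTRACE_FL_ENABLED bit, via ftrace_make_call() or ftrace_make_nop().
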
@@ -1224,6 +1189,13 @@ static void ftrace_shutdown(int command)
1224 return; 1189 return;
1225 1190
1226 ftrace_start_up--; 1191 ftrace_start_up--;
1192 /*
1193 * Just warn in case of unbalance, no need to kill ftrace, it's not
1194 * critical but the ftrace_call callers may be never nopped again after
1195 * further ftrace uses.
1196 */
1197 WARN_ON_ONCE(ftrace_start_up < 0);
1198
1227 if (!ftrace_start_up) 1199 if (!ftrace_start_up)
1228 command |= FTRACE_DISABLE_CALLS; 1200 command |= FTRACE_DISABLE_CALLS;
1229 1201
@@ -1351,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351 1323
1352enum { 1324enum {
1353 FTRACE_ITER_FILTER = (1 << 0), 1325 FTRACE_ITER_FILTER = (1 << 0),
1354 FTRACE_ITER_CONT = (1 << 1), 1326 FTRACE_ITER_NOTRACE = (1 << 1),
1355 FTRACE_ITER_NOTRACE = (1 << 2), 1327 FTRACE_ITER_FAILURES = (1 << 2),
1356 FTRACE_ITER_FAILURES = (1 << 3), 1328 FTRACE_ITER_PRINTALL = (1 << 3),
1357 FTRACE_ITER_PRINTALL = (1 << 4), 1329 FTRACE_ITER_HASH = (1 << 4),
1358 FTRACE_ITER_HASH = (1 << 5),
1359}; 1330};
1360 1331
1361#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1332#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,9 +1336,7 @@ struct ftrace_iterator {
1365 int hidx; 1336 int hidx;
1366 int idx; 1337 int idx;
1367 unsigned flags; 1338 unsigned flags;
1368 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1339 struct trace_parser parser;
1369 unsigned buffer_idx;
1370 unsigned filtered;
1371}; 1340};
1372 1341
1373static void * 1342static void *
@@ -1410,28 +1379,33 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410{ 1379{
1411 struct ftrace_iterator *iter = m->private; 1380 struct ftrace_iterator *iter = m->private;
1412 void *p = NULL; 1381 void *p = NULL;
1382 loff_t l;
1383
1384 if (!(iter->flags & FTRACE_ITER_HASH))
1385 *pos = 0;
1413 1386
1414 iter->flags |= FTRACE_ITER_HASH; 1387 iter->flags |= FTRACE_ITER_HASH;
1415 1388
1416 return t_hash_next(m, p, pos); 1389 iter->hidx = 0;
1390 for (l = 0; l <= *pos; ) {
1391 p = t_hash_next(m, p, &l);
1392 if (!p)
1393 break;
1394 }
1395 return p;
1417} 1396}
1418 1397
1419static int t_hash_show(struct seq_file *m, void *v) 1398static int t_hash_show(struct seq_file *m, void *v)
1420{ 1399{
1421 struct ftrace_func_probe *rec; 1400 struct ftrace_func_probe *rec;
1422 struct hlist_node *hnd = v; 1401 struct hlist_node *hnd = v;
1423 char str[KSYM_SYMBOL_LEN];
1424 1402
1425 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1403 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1426 1404
1427 if (rec->ops->print) 1405 if (rec->ops->print)
1428 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1406 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1429 1407
1430 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1408 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1431 seq_printf(m, "%s:", str);
1432
1433 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1434 seq_printf(m, "%s", str);
1435 1409
1436 if (rec->data) 1410 if (rec->data)
1437 seq_printf(m, ":%p", rec->data); 1411 seq_printf(m, ":%p", rec->data);
@@ -1460,8 +1434,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1460 iter->pg = iter->pg->next; 1434 iter->pg = iter->pg->next;
1461 iter->idx = 0; 1435 iter->idx = 0;
1462 goto retry; 1436 goto retry;
1463 } else {
1464 iter->idx = -1;
1465 } 1437 }
1466 } else { 1438 } else {
1467 rec = &iter->pg->records[iter->idx++]; 1439 rec = &iter->pg->records[iter->idx++];
@@ -1490,6 +1462,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1490{ 1462{
1491 struct ftrace_iterator *iter = m->private; 1463 struct ftrace_iterator *iter = m->private;
1492 void *p = NULL; 1464 void *p = NULL;
1465 loff_t l;
1493 1466
1494 mutex_lock(&ftrace_lock); 1467 mutex_lock(&ftrace_lock);
1495 /* 1468 /*
@@ -1501,23 +1474,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1501 if (*pos > 0) 1474 if (*pos > 0)
1502 return t_hash_start(m, pos); 1475 return t_hash_start(m, pos);
1503 iter->flags |= FTRACE_ITER_PRINTALL; 1476 iter->flags |= FTRACE_ITER_PRINTALL;
1504 (*pos)++;
1505 return iter; 1477 return iter;
1506 } 1478 }
1507 1479
1508 if (iter->flags & FTRACE_ITER_HASH) 1480 if (iter->flags & FTRACE_ITER_HASH)
1509 return t_hash_start(m, pos); 1481 return t_hash_start(m, pos);
1510 1482
1511 if (*pos > 0) { 1483 iter->pg = ftrace_pages_start;
1512 if (iter->idx < 0) 1484 iter->idx = 0;
1513 return p; 1485 for (l = 0; l <= *pos; ) {
1514 (*pos)--; 1486 p = t_next(m, p, &l);
1515 iter->idx--; 1487 if (!p)
1488 break;
1516 } 1489 }
1517 1490
1518 p = t_next(m, p, pos); 1491 if (!p && iter->flags & FTRACE_ITER_FILTER)
1519
1520 if (!p)
1521 return t_hash_start(m, pos); 1492 return t_hash_start(m, pos);
1522 1493
1523 return p; 1494 return p;
@@ -1532,7 +1503,6 @@ static int t_show(struct seq_file *m, void *v)
1532{ 1503{
1533 struct ftrace_iterator *iter = m->private; 1504 struct ftrace_iterator *iter = m->private;
1534 struct dyn_ftrace *rec = v; 1505 struct dyn_ftrace *rec = v;
1535 char str[KSYM_SYMBOL_LEN];
1536 1506
1537 if (iter->flags & FTRACE_ITER_HASH) 1507 if (iter->flags & FTRACE_ITER_HASH)
1538 return t_hash_show(m, v); 1508 return t_hash_show(m, v);
@@ -1545,9 +1515,7 @@ static int t_show(struct seq_file *m, void *v)
1545 if (!rec) 1515 if (!rec)
1546 return 0; 1516 return 0;
1547 1517
1548 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1518 seq_printf(m, "%ps\n", (void *)rec->ip);
1549
1550 seq_printf(m, "%s\n", str);
1551 1519
1552 return 0; 1520 return 0;
1553} 1521}
@@ -1586,17 +1554,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1586 return ret; 1554 return ret;
1587} 1555}
1588 1556
1589int ftrace_avail_release(struct inode *inode, struct file *file)
1590{
1591 struct seq_file *m = (struct seq_file *)file->private_data;
1592 struct ftrace_iterator *iter = m->private;
1593
1594 seq_release(inode, file);
1595 kfree(iter);
1596
1597 return 0;
1598}
1599
1600static int 1557static int
1601ftrace_failures_open(struct inode *inode, struct file *file) 1558ftrace_failures_open(struct inode *inode, struct file *file)
1602{ 1559{
@@ -1645,9 +1602,14 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1645 if (!iter) 1602 if (!iter)
1646 return -ENOMEM; 1603 return -ENOMEM;
1647 1604
1605 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1606 kfree(iter);
1607 return -ENOMEM;
1608 }
1609
1648 mutex_lock(&ftrace_regex_lock); 1610 mutex_lock(&ftrace_regex_lock);
1649 if ((file->f_mode & FMODE_WRITE) && 1611 if ((file->f_mode & FMODE_WRITE) &&
1650 !(file->f_flags & O_APPEND)) 1612 (file->f_flags & O_TRUNC))
1651 ftrace_filter_reset(enable); 1613 ftrace_filter_reset(enable);
1652 1614
1653 if (file->f_mode & FMODE_READ) { 1615 if (file->f_mode & FMODE_READ) {
@@ -2100,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2100 int i, len = 0; 2062 int i, len = 0;
2101 char *search; 2063 char *search;
2102 2064
2103 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2065 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2104 glob = NULL; 2066 glob = NULL;
2105 else { 2067 else if (glob) {
2106 int not; 2068 int not;
2107 2069
2108 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2237,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2237 size_t cnt, loff_t *ppos, int enable) 2199 size_t cnt, loff_t *ppos, int enable)
2238{ 2200{
2239 struct ftrace_iterator *iter; 2201 struct ftrace_iterator *iter;
2240 char ch; 2202 struct trace_parser *parser;
2241 size_t read = 0; 2203 ssize_t ret, read;
2242 ssize_t ret;
2243 2204
2244 if (!cnt || cnt < 0) 2205 if (!cnt || cnt < 0)
2245 return 0; 2206 return 0;
@@ -2252,68 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2252 } else 2213 } else
2253 iter = file->private_data; 2214 iter = file->private_data;
2254 2215
2255 if (!*ppos) { 2216 parser = &iter->parser;
2256 iter->flags &= ~FTRACE_ITER_CONT; 2217 read = trace_get_user(parser, ubuf, cnt, ppos);
2257 iter->buffer_idx = 0;
2258 }
2259
2260 ret = get_user(ch, ubuf++);
2261 if (ret)
2262 goto out;
2263 read++;
2264 cnt--;
2265
2266 if (!(iter->flags & ~FTRACE_ITER_CONT)) {
2267 /* skip white space */
2268 while (cnt && isspace(ch)) {
2269 ret = get_user(ch, ubuf++);
2270 if (ret)
2271 goto out;
2272 read++;
2273 cnt--;
2274 }
2275 2218
2276 if (isspace(ch)) { 2219 if (trace_parser_loaded(parser) &&
2277 file->f_pos += read; 2220 !trace_parser_cont(parser)) {
2278 ret = read; 2221 ret = ftrace_process_regex(parser->buffer,
2279 goto out; 2222 parser->idx, enable);
2280 }
2281
2282 iter->buffer_idx = 0;
2283 }
2284
2285 while (cnt && !isspace(ch)) {
2286 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2287 iter->buffer[iter->buffer_idx++] = ch;
2288 else {
2289 ret = -EINVAL;
2290 goto out;
2291 }
2292 ret = get_user(ch, ubuf++);
2293 if (ret) 2223 if (ret)
2294 goto out; 2224 goto out;
2295 read++;
2296 cnt--;
2297 }
2298 2225
2299 if (isspace(ch)) { 2226 trace_parser_clear(parser);
2300 iter->filtered++; 2227 }
2301 iter->buffer[iter->buffer_idx] = 0;
2302 ret = ftrace_process_regex(iter->buffer,
2303 iter->buffer_idx, enable);
2304 if (ret)
2305 goto out;
2306 iter->buffer_idx = 0;
2307 } else
2308 iter->flags |= FTRACE_ITER_CONT;
2309
2310
2311 file->f_pos += read;
2312 2228
2313 ret = read; 2229 ret = read;
2314 out:
2315 mutex_unlock(&ftrace_regex_lock);
2316 2230
2231 mutex_unlock(&ftrace_regex_lock);
2232out:
2317 return ret; 2233 return ret;
2318} 2234}
2319 2235
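ftrace_regex_write() above now hands tokenizing to the shared trace_parser helpers, which collect one whitespace-delimited word even when userspace delivers it across several partial writes and only act once a complete token is loaded. The sketch below is a simplified userspace model of that pattern; struct token_parser, parser_feed() and TOKEN_MAX are invented names for the illustration and are not the kernel API (overlong tokens are simply truncated here, where the kernel parser reports an error).

/* Accumulate whitespace-delimited tokens across arbitrarily split input
 * chunks and emit each completed token through a callback. */
#include <ctype.h>
#include <stdio.h>

#define TOKEN_MAX 64

struct token_parser {
	char buf[TOKEN_MAX + 1];
	int idx;			/* bytes accumulated so far */
};

static void parser_feed(struct token_parser *p, const char *chunk, size_t len,
			void (*emit)(const char *tok))
{
	for (size_t i = 0; i < len; i++) {
		if (isspace((unsigned char)chunk[i])) {
			if (p->idx) {			/* token finished */
				p->buf[p->idx] = '\0';
				emit(p->buf);
				p->idx = 0;
			}
		} else if (p->idx < TOKEN_MAX) {
			p->buf[p->idx++] = chunk[i];	/* still loading */
		}
		/* overflow: extra characters are silently dropped */
	}
}

static void print_token(const char *tok)
{
	printf("token: %s\n", tok);
}

int main(void)
{
	struct token_parser p = { .idx = 0 };

	/* The same token may arrive split across two writes. */
	parser_feed(&p, "sched_swi", 9, print_token);
	parser_feed(&p, "tch do_fork\n", 12, print_token);
	return 0;
}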
@@ -2418,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2418{ 2334{
2419 struct seq_file *m = (struct seq_file *)file->private_data; 2335 struct seq_file *m = (struct seq_file *)file->private_data;
2420 struct ftrace_iterator *iter; 2336 struct ftrace_iterator *iter;
2337 struct trace_parser *parser;
2421 2338
2422 mutex_lock(&ftrace_regex_lock); 2339 mutex_lock(&ftrace_regex_lock);
2423 if (file->f_mode & FMODE_READ) { 2340 if (file->f_mode & FMODE_READ) {
@@ -2427,10 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2427 } else 2344 } else
2428 iter = file->private_data; 2345 iter = file->private_data;
2429 2346
2430 if (iter->buffer_idx) { 2347 parser = &iter->parser;
2431 iter->filtered++; 2348 if (trace_parser_loaded(parser)) {
2432 iter->buffer[iter->buffer_idx] = 0; 2349 parser->buffer[parser->idx] = 0;
2433 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2350 ftrace_match_records(parser->buffer, parser->idx, enable);
2434 } 2351 }
2435 2352
2436 mutex_lock(&ftrace_lock); 2353 mutex_lock(&ftrace_lock);
@@ -2438,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2438 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2355 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2439 mutex_unlock(&ftrace_lock); 2356 mutex_unlock(&ftrace_lock);
2440 2357
2358 trace_parser_put(parser);
2441 kfree(iter); 2359 kfree(iter);
2360
2442 mutex_unlock(&ftrace_regex_lock); 2361 mutex_unlock(&ftrace_regex_lock);
2443 return 0; 2362 return 0;
2444} 2363}
@@ -2459,14 +2378,14 @@ static const struct file_operations ftrace_avail_fops = {
2459 .open = ftrace_avail_open, 2378 .open = ftrace_avail_open,
2460 .read = seq_read, 2379 .read = seq_read,
2461 .llseek = seq_lseek, 2380 .llseek = seq_lseek,
2462 .release = ftrace_avail_release, 2381 .release = seq_release_private,
2463}; 2382};
2464 2383
2465static const struct file_operations ftrace_failures_fops = { 2384static const struct file_operations ftrace_failures_fops = {
2466 .open = ftrace_failures_open, 2385 .open = ftrace_failures_open,
2467 .read = seq_read, 2386 .read = seq_read,
2468 .llseek = seq_lseek, 2387 .llseek = seq_lseek,
2469 .release = ftrace_avail_release, 2388 .release = seq_release_private,
2470}; 2389};
2471 2390
2472static const struct file_operations ftrace_filter_fops = { 2391static const struct file_operations ftrace_filter_fops = {
@@ -2493,32 +2412,31 @@ int ftrace_graph_count;
2493unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2412unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2494 2413
2495static void * 2414static void *
2496g_next(struct seq_file *m, void *v, loff_t *pos) 2415__g_next(struct seq_file *m, loff_t *pos)
2497{ 2416{
2498 unsigned long *array = m->private; 2417 unsigned long *array = m->private;
2499 int index = *pos;
2500
2501 (*pos)++;
2502 2418
2503 if (index >= ftrace_graph_count) 2419 if (*pos >= ftrace_graph_count)
2504 return NULL; 2420 return NULL;
2421 return &array[*pos];
2422}
2505 2423
2506 return &array[index]; 2424static void *
2425g_next(struct seq_file *m, void *v, loff_t *pos)
2426{
2427 (*pos)++;
2428 return __g_next(m, pos);
2507} 2429}
2508 2430
2509static void *g_start(struct seq_file *m, loff_t *pos) 2431static void *g_start(struct seq_file *m, loff_t *pos)
2510{ 2432{
2511 void *p = NULL;
2512
2513 mutex_lock(&graph_lock); 2433 mutex_lock(&graph_lock);
2514 2434
2515 /* Nothing, tell g_show to print that all functions are enabled */ 2435
2516 if (!ftrace_graph_count && !*pos) 2436 if (!ftrace_graph_count && !*pos)
2517 return (void *)1; 2437 return (void *)1;
2518 2438
2519 p = g_next(m, p, pos); 2439 return __g_next(m, pos);
2520
2521 return p;
2522} 2440}
2523 2441
2524static void g_stop(struct seq_file *m, void *p) 2442static void g_stop(struct seq_file *m, void *p)
@@ -2529,7 +2447,6 @@ static void g_stop(struct seq_file *m, void *p)
2529static int g_show(struct seq_file *m, void *v) 2447static int g_show(struct seq_file *m, void *v)
2530{ 2448{
2531 unsigned long *ptr = v; 2449 unsigned long *ptr = v;
2532 char str[KSYM_SYMBOL_LEN];
2533 2450
2534 if (!ptr) 2451 if (!ptr)
2535 return 0; 2452 return 0;
@@ -2539,9 +2456,7 @@ static int g_show(struct seq_file *m, void *v)
2539 return 0; 2456 return 0;
2540 } 2457 }
2541 2458
2542 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2459 seq_printf(m, "%ps\n", (void *)*ptr);
2543
2544 seq_printf(m, "%s\n", str);
2545 2460
2546 return 0; 2461 return 0;
2547} 2462}
@@ -2563,7 +2478,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2563 2478
2564 mutex_lock(&graph_lock); 2479 mutex_lock(&graph_lock);
2565 if ((file->f_mode & FMODE_WRITE) && 2480 if ((file->f_mode & FMODE_WRITE) &&
2566 !(file->f_flags & O_APPEND)) { 2481 (file->f_flags & O_TRUNC)) {
2567 ftrace_graph_count = 0; 2482 ftrace_graph_count = 0;
2568 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2483 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2569 } 2484 }
@@ -2582,6 +2497,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2582} 2497}
2583 2498
2584static int 2499static int
2500ftrace_graph_release(struct inode *inode, struct file *file)
2501{
2502 if (file->f_mode & FMODE_READ)
2503 seq_release(inode, file);
2504 return 0;
2505}
2506
2507static int
2585ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2508ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2586{ 2509{
2587 struct dyn_ftrace *rec; 2510 struct dyn_ftrace *rec;
@@ -2636,12 +2559,10 @@ static ssize_t
2636ftrace_graph_write(struct file *file, const char __user *ubuf, 2559ftrace_graph_write(struct file *file, const char __user *ubuf,
2637 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2638{ 2561{
2639 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2562 struct trace_parser parser;
2640 unsigned long *array; 2563 unsigned long *array;
2641 size_t read = 0; 2564 size_t read = 0;
2642 ssize_t ret; 2565 ssize_t ret;
2643 int index = 0;
2644 char ch;
2645 2566
2646 if (!cnt || cnt < 0) 2567 if (!cnt || cnt < 0)
2647 return 0; 2568 return 0;
@@ -2659,60 +2580,36 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2659 } else 2580 } else
2660 array = file->private_data; 2581 array = file->private_data;
2661 2582
2662 ret = get_user(ch, ubuf++); 2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2663 if (ret) 2584 ret = -ENOMEM;
2664 goto out; 2585 goto out;
2665 read++;
2666 cnt--;
2667
2668 /* skip white space */
2669 while (cnt && isspace(ch)) {
2670 ret = get_user(ch, ubuf++);
2671 if (ret)
2672 goto out;
2673 read++;
2674 cnt--;
2675 } 2586 }
2676 2587
2677 if (isspace(ch)) { 2588 read = trace_get_user(&parser, ubuf, cnt, ppos);
2678 *ppos += read;
2679 ret = read;
2680 goto out;
2681 }
2682 2589
2683 while (cnt && !isspace(ch)) { 2590 if (trace_parser_loaded((&parser))) {
2684 if (index < FTRACE_BUFF_MAX) 2591 parser.buffer[parser.idx] = 0;
2685 buffer[index++] = ch; 2592
2686 else { 2593 /* we allow only one expression at a time */
2687 ret = -EINVAL; 2594 ret = ftrace_set_func(array, &ftrace_graph_count,
2688 goto out; 2595 parser.buffer);
2689 }
2690 ret = get_user(ch, ubuf++);
2691 if (ret) 2596 if (ret)
2692 goto out; 2597 goto out;
2693 read++;
2694 cnt--;
2695 } 2598 }
2696 buffer[index] = 0;
2697
2698 /* we allow only one expression at a time */
2699 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2700 if (ret)
2701 goto out;
2702
2703 file->f_pos += read;
2704 2599
2705 ret = read; 2600 ret = read;
2706 out: 2601 out:
2602 trace_parser_put(&parser);
2707 mutex_unlock(&graph_lock); 2603 mutex_unlock(&graph_lock);
2708 2604
2709 return ret; 2605 return ret;
2710} 2606}
2711 2607
2712static const struct file_operations ftrace_graph_fops = { 2608static const struct file_operations ftrace_graph_fops = {
2713 .open = ftrace_graph_open, 2609 .open = ftrace_graph_open,
2714 .read = seq_read, 2610 .read = seq_read,
2715 .write = ftrace_graph_write, 2611 .write = ftrace_graph_write,
2612 .release = ftrace_graph_release,
2716}; 2613};
2717#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2614#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2718 2615
@@ -3145,10 +3042,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3145 3042
3146 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3043 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3147 3044
3148 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3045 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3149 goto out; 3046 goto out;
3150 3047
3151 last_ftrace_enabled = ftrace_enabled; 3048 last_ftrace_enabled = !!ftrace_enabled;
3152 3049
3153 if (ftrace_enabled) { 3050 if (ftrace_enabled) {
3154 3051
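The !! above matters because the sysctl accepts any integer from userspace while last_ftrace_enabled holds a 0/1 state; without normalization, writing 2 would look like a state change even though tracing is already on. A tiny standalone illustration of the double-negation idiom:

/* !!x collapses any non-zero value to 1, so saved 0/1 state can be
 * compared safely against a raw integer coming from userspace. */
#include <assert.h>

int main(void)
{
	int saved = 1;		/* feature currently enabled, stored as 0/1 */
	int user_input = 2;	/* userspace wrote "2" to the knob */

	assert(saved != user_input);	/* naive compare: looks like a change */
	assert(saved == !!user_input);	/* normalized: still enabled, a no-op */
	return 0;
}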
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 86cdf671d7e2..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu_mask(cpu, cpu_possible_map) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The following two provide a more minimalistic output */ 335
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
348 /* Node */ 390 /* Node and call site */
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
404 /* Skip node */ 441 /* Skip node and print call site */
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (!register_tracer(&kmem_tracer)) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
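The kmemtrace rework above replaces a switch on the entry type inside the tracer with struct trace_event handlers registered per event type (.trace for text output, .binary for the raw format), so the generic output path dispatches without knowing about kmemtrace. The userspace sketch below shows only that dispatch-table idea; the enum, struct event and register_printer() names are invented for the illustration.

/* Register one output handler per event type and dispatch through a
 * table instead of an ever-growing switch statement. */
#include <stdio.h>

enum ev_type { EV_ALLOC, EV_FREE, EV_MAX };

struct event {
	enum ev_type type;
	unsigned long ptr;
};

typedef void (*ev_printer)(const struct event *ev);

static void print_alloc(const struct event *ev)
{
	printf("alloc ptr=%lu\n", ev->ptr);
}

static void print_free(const struct event *ev)
{
	printf("free  ptr=%lu\n", ev->ptr);
}

static ev_printer printers[EV_MAX];

static int register_printer(enum ev_type type, ev_printer fn)
{
	if (type >= EV_MAX || printers[type])
		return -1;	/* unknown type or already registered */
	printers[type] = fn;
	return 0;
}

int main(void)
{
	struct event evs[] = { { EV_ALLOC, 42 }, { EV_FREE, 42 } };

	register_printer(EV_ALLOC, print_alloc);
	register_printer(EV_FREE, print_free);

	for (unsigned int i = 0; i < sizeof(evs) / sizeof(evs[0]); i++)
		printers[evs[i].type](&evs[i]);	/* no switch needed */
	return 0;
}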
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2e642b2b7253..6eef38923b07 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -205,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
206#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -216,17 +218,12 @@ enum {
216 218
217static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
218{ 220{
219 return event->type_len == RINGBUF_TYPE_PADDING 221 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
220 && event->time_delta == 0;
221}
222
223static inline int rb_discarded_event(struct ring_buffer_event *event)
224{
225 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
226} 222}
227 223
228static void rb_event_set_padding(struct ring_buffer_event *event) 224static void rb_event_set_padding(struct ring_buffer_event *event)
229{ 225{
226 /* padding has a NULL time_delta */
230 event->type_len = RINGBUF_TYPE_PADDING; 227 event->type_len = RINGBUF_TYPE_PADDING;
231 event->time_delta = 0; 228 event->time_delta = 0;
232} 229}
@@ -320,6 +317,14 @@ struct buffer_data_page {
320 unsigned char data[]; /* data of buffer page */ 317 unsigned char data[]; /* data of buffer page */
321}; 318};
322 319
320/*
321 * Note, the buffer_page list must be first. The buffer pages
322 * are allocated in cache lines, which means that each buffer
323 * page will be at the beginning of a cache line, and thus
324 * the least significant bits will be zero. We use this to
325 * add flags in the list struct pointers, to make the ring buffer
326 * lockless.
327 */
323struct buffer_page { 328struct buffer_page {
324 struct list_head list; /* list of buffer pages */ 329 struct list_head list; /* list of buffer pages */
325 local_t write; /* index for next write */ 330 local_t write; /* index for next write */
@@ -328,6 +333,21 @@ struct buffer_page {
328 struct buffer_data_page *page; /* Actual data page */ 333 struct buffer_data_page *page; /* Actual data page */
329}; 334};
330 335
336/*
337 * The buffer page counters, write and entries, must be reset
338 * atomically when crossing page boundaries. To synchronize this
339 * update, two counters are inserted into the number. One is
340 * the actual counter for the write position or count on the page.
341 *
342 * The other is a counter of updaters. Before an update happens
343 * the update partition of the counter is incremented. This will
344 * allow the updater to update the counter atomically.
345 *
346 * The counter is 20 bits, and the state data is 12.
347 */
348#define RB_WRITE_MASK 0xfffff
349#define RB_WRITE_INTCNT (1 << 20)
350
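RB_WRITE_MASK and RB_WRITE_INTCNT pack two quantities into one word: the low 20 bits hold the write offset on the page, and each addition of RB_WRITE_INTCNT bumps a separate updater count in the higher bits, which is how rb_tail_page_update() further down detects nested writers. A standalone arithmetic sketch of that packing, using a plain unsigned long rather than the kernel's local_t:

/* Keep a 20-bit write offset and an updater count in the same word:
 * adding RB_WRITE_INTCNT touches only the updater part, masking with
 * RB_WRITE_MASK recovers the offset. */
#include <assert.h>

#define RB_WRITE_MASK	0xfffffUL
#define RB_WRITE_INTCNT	(1UL << 20)

int main(void)
{
	unsigned long write = 100;	/* offset 100, no updaters yet */

	write += RB_WRITE_INTCNT;	/* one updater announces itself */
	write += RB_WRITE_INTCNT;	/* a nested one does too */

	assert((write & RB_WRITE_MASK) == 100);	/* offset is untouched   */
	assert((write >> 20) == 2);		/* two updaters recorded */

	write &= ~RB_WRITE_MASK;		/* zero the offset, keep the count */
	assert((write & RB_WRITE_MASK) == 0);
	return 0;
}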
331static void rb_init_page(struct buffer_data_page *bpage) 351static void rb_init_page(struct buffer_data_page *bpage)
332{ 352{
333 local_set(&bpage->commit, 0); 353 local_set(&bpage->commit, 0);
@@ -401,19 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
401struct ring_buffer_per_cpu { 421struct ring_buffer_per_cpu {
402 int cpu; 422 int cpu;
403 struct ring_buffer *buffer; 423 struct ring_buffer *buffer;
404 spinlock_t reader_lock; /* serialize readers */ 424 spinlock_t reader_lock; /* serialize readers */
405 raw_spinlock_t lock; 425 raw_spinlock_t lock;
406 struct lock_class_key lock_key; 426 struct lock_class_key lock_key;
407 struct list_head pages; 427 struct list_head *pages;
408 struct buffer_page *head_page; /* read from head */ 428 struct buffer_page *head_page; /* read from head */
409 struct buffer_page *tail_page; /* write to tail */ 429 struct buffer_page *tail_page; /* write to tail */
410 struct buffer_page *commit_page; /* committed pages */ 430 struct buffer_page *commit_page; /* committed pages */
411 struct buffer_page *reader_page; 431 struct buffer_page *reader_page;
412 unsigned long nmi_dropped; 432 local_t commit_overrun;
413 unsigned long commit_overrun; 433 local_t overrun;
414 unsigned long overrun;
415 unsigned long read;
416 local_t entries; 434 local_t entries;
435 local_t committing;
436 local_t commits;
437 unsigned long read;
417 u64 write_stamp; 438 u64 write_stamp;
418 u64 read_stamp; 439 u64 read_stamp;
419 atomic_t record_disabled; 440 atomic_t record_disabled;
@@ -446,14 +467,19 @@ struct ring_buffer_iter {
446}; 467};
447 468
448/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 469/* buffer may be either ring_buffer or ring_buffer_per_cpu */
449#define RB_WARN_ON(buffer, cond) \ 470#define RB_WARN_ON(b, cond) \
450 ({ \ 471 ({ \
451 int _____ret = unlikely(cond); \ 472 int _____ret = unlikely(cond); \
452 if (_____ret) { \ 473 if (_____ret) { \
453 atomic_inc(&buffer->record_disabled); \ 474 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
454 WARN_ON(1); \ 475 struct ring_buffer_per_cpu *__b = \
455 } \ 476 (void *)b; \
456 _____ret; \ 477 atomic_inc(&__b->buffer->record_disabled); \
478 } else \
479 atomic_inc(&b->record_disabled); \
480 WARN_ON(1); \
481 } \
482 _____ret; \
457 }) 483 })
458 484
459/* Up this if you want to test the TIME_EXTENTS and normalization */ 485/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -485,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
485} 511}
486EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
487 513
514/*
515 * Making the ring buffer lockless makes things tricky.
516 * Although writes only happen on the CPU that they are on,
517 * and they only need to worry about interrupts. Reads can
518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see, it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself. But only
580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
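The flag values above are stored in the low bits of an aligned list pointer, which rb_list_head() just below strips off before the pointer is followed. A self-contained userspace sketch of the same pointer-tagging trick, assuming only that the tagged object is at least 4-byte aligned; tag_head() and untag() are invented names for the illustration.

/* Hide a small flag in the low bits of an aligned pointer and mask it
 * off again before dereferencing - the trick behind RB_PAGE_HEAD. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define FLAG_MASK	3UL
#define FLAG_HEAD	1UL

struct node {
	int value;
};

static struct node *tag_head(struct node *n)
{
	return (struct node *)((uintptr_t)n | FLAG_HEAD);
}

static struct node *untag(struct node *n)
{
	return (struct node *)((uintptr_t)n & ~FLAG_MASK);
}

int main(void)
{
	/* a 64-byte aligned block guarantees the low bits are zero */
	struct node *n = aligned_alloc(64, 64);
	struct node *tagged;

	assert(n);
	n->value = 7;
	tagged = tag_head(n);

	assert(((uintptr_t)tagged & FLAG_MASK) == FLAG_HEAD); /* flag visible */
	assert(untag(tagged)->value == 7);	/* data reachable after masking */

	free(untag(tagged));
	return 0;
}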
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
602
603/*
604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
611static inline int
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
628 * The unique thing about the reader page is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss it in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798
799 return ret == val;
800}
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. In which case, we
847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
858 * No need to worry about races with clearing out the commit.
859 * it only can increment when a commit takes place. But that
860 * only happens in the outer most nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
873
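rb_tail_page_update() above publishes the new tail with cmpxchg so the pointer only moves if no interrupt got there first; a failed compare is simply discarded. The snippet below shows the same install-only-if-unchanged pattern with C11 atomics in userspace; it illustrates the idea, not the ring buffer's code.

/* Compare-and-swap installs a new value only while the location still
 * holds what we last observed; a stale caller fails harmlessly. */
#include <assert.h>
#include <stdatomic.h>

int main(void)
{
	atomic_int tail = 3;
	int seen = 3;		/* value read before deciding to move it */

	/* Fresh caller: tail still holds 'seen', so it becomes 4. */
	assert(atomic_compare_exchange_strong(&tail, &seen, 4));
	assert(atomic_load(&tail) == 4);

	/* Stale caller (still thinks the tail is 3): the CAS returns
	 * false, leaves the newer value alone and reports what it saw. */
	seen = 3;
	assert(!atomic_compare_exchange_strong(&tail, &seen, 5));
	assert(atomic_load(&tail) == 4);
	assert(seen == 4);	/* current value written back into 'seen' */
	return 0;
}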
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
886 * rb_check_list - make sure a pointer to a list has the last bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
488/** 898/**
489 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
490 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -494,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
494 */ 904 */
495static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
496{ 906{
497 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
498 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
499 909
910 rb_head_page_deactivate(cpu_buffer);
911
500 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
501 return -1; 913 return -1;
502 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
503 return -1; 915 return -1;
504 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
505 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
506 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
507 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -509,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
509 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
510 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
511 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
512 } 929 }
513 930
931 rb_head_page_activate(cpu_buffer);
932
514 return 0; 933 return 0;
515} 934}
516 935
517static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
518 unsigned nr_pages) 937 unsigned nr_pages)
519{ 938{
520 struct list_head *head = &cpu_buffer->pages;
521 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
522 unsigned long addr; 940 unsigned long addr;
523 LIST_HEAD(pages); 941 LIST_HEAD(pages);
524 unsigned i; 942 unsigned i;
525 943
944 WARN_ON(!nr_pages);
945
526 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
527 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
528 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
529 if (!bpage) 949 if (!bpage)
530 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
531 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
532 955
533 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -537,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
537 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
538 } 961 }
539 962
540 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
541 970
542 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
543 972
@@ -569,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
569 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
570 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
571 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
572 INIT_LIST_HEAD(&cpu_buffer->pages);
573 1001
574 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
575 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
576 if (!bpage) 1004 if (!bpage)
577 goto fail_free_buffer; 1005 goto fail_free_buffer;
578 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
579 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
580 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
581 if (!addr) 1011 if (!addr)
@@ -590,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
590 goto fail_free_reader; 1020 goto fail_free_reader;
591 1021
592 cpu_buffer->head_page 1022 cpu_buffer->head_page
593 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
594 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
595 1025
1026 rb_head_page_activate(cpu_buffer);
1027
596 return cpu_buffer; 1028 return cpu_buffer;
597 1029
598 fail_free_reader: 1030 fail_free_reader:
@@ -605,24 +1037,25 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
605 1037
606static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
607{ 1039{
608 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
609 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
610 1042
611 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
612 1044
613 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
614 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
615 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
616 } 1054 }
1055
617 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
618} 1057}
619 1058
620/*
621 * Causes compile errors if the struct buffer_page gets bigger
622 * than the struct page.
623 */
624extern int ring_buffer_page_too_big(void);
625
626#ifdef CONFIG_HOTPLUG_CPU 1059#ifdef CONFIG_HOTPLUG_CPU
627static int rb_cpu_notify(struct notifier_block *self, 1060static int rb_cpu_notify(struct notifier_block *self,
628 unsigned long action, void *hcpu); 1061 unsigned long action, void *hcpu);
@@ -645,11 +1078,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
645 int bsize; 1078 int bsize;
646 int cpu; 1079 int cpu;
647 1080
648 /* Paranoid! Optimizes out when all is well */
649 if (sizeof(struct buffer_page) > sizeof(struct page))
650 ring_buffer_page_too_big();
651
652
653 /* keep it in its own cache line */ 1081 /* keep it in its own cache line */
654 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1082 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
655 GFP_KERNEL); 1083 GFP_KERNEL);
@@ -665,8 +1093,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
665 buffer->reader_lock_key = key; 1093 buffer->reader_lock_key = key;
666 1094
667 /* need at least two pages */ 1095 /* need at least two pages */
668 if (buffer->pages == 1) 1096 if (buffer->pages < 2)
669 buffer->pages++; 1097 buffer->pages = 2;
670 1098
671 /* 1099 /*
672 * In case of non-hotplug cpu, if the ring-buffer is allocated 1100 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -742,6 +1170,7 @@ ring_buffer_free(struct ring_buffer *buffer)
742 1170
743 put_online_cpus(); 1171 put_online_cpus();
744 1172
1173 kfree(buffer->buffers);
745 free_cpumask_var(buffer->cpumask); 1174 free_cpumask_var(buffer->cpumask);
746 1175
747 kfree(buffer); 1176 kfree(buffer);
@@ -766,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
766 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
767 synchronize_sched(); 1196 synchronize_sched();
768 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
769 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
770 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
771 return; 1202 return;
772 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
773 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
774 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
775 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
776 } 1207 }
777 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
778 return; 1209 return;
779 1210
780 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -796,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
796 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
797 synchronize_sched(); 1228 synchronize_sched();
798 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
799 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
800 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
801 return; 1235 return;
802 p = pages->next; 1236 p = pages->next;
803 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
804 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
805 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
806 } 1240 }
807 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
808 1243
809 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
810 1245
@@ -955,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
955} 1390}
956 1391
957static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
958rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
959{
960 return __rb_page_index(cpu_buffer->head_page,
961 cpu_buffer->head_page->read);
962}
963
964static inline struct ring_buffer_event *
965rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
966{ 1394{
967 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
968} 1396}
969 1397
970static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
971{ 1399{
972 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
973} 1401}
974 1402
975static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -977,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
977 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
978} 1406}
979 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
980/* Size is determined by what has been committed */ 1413
981static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
982{ 1415{
@@ -989,33 +1422,17 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
989 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
990} 1423}
991 1424
992static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
993{
994 return rb_page_commit(cpu_buffer->head_page);
995}
996
997static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
998 struct buffer_page **bpage)
999{
1000 struct list_head *p = (*bpage)->list.next;
1001
1002 if (p == &cpu_buffer->pages)
1003 p = p->next;
1004
1005 *bpage = list_entry(p, struct buffer_page, list);
1006}
1007
1008static inline unsigned 1425static inline unsigned
1009rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1010{ 1427{
1011 unsigned long addr = (unsigned long)event; 1428 unsigned long addr = (unsigned long)event;
1012 1429
1013 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1430 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1014} 1431}
1015 1432
1016static inline int 1433static inline int
1017rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1434rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1018 struct ring_buffer_event *event) 1435 struct ring_buffer_event *event)
1019{ 1436{
1020 unsigned long addr = (unsigned long)event; 1437 unsigned long addr = (unsigned long)event;
1021 unsigned long index; 1438 unsigned long index;
@@ -1028,33 +1445,10 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1028} 1445}
1029 1446
1030static void 1447static void
1031rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1032 struct ring_buffer_event *event)
1033{
1034 unsigned long addr = (unsigned long)event;
1035 unsigned long index;
1036
1037 index = rb_event_index(event);
1038 addr &= PAGE_MASK;
1039
1040 while (cpu_buffer->commit_page->page != (void *)addr) {
1041 if (RB_WARN_ON(cpu_buffer,
1042 cpu_buffer->commit_page == cpu_buffer->tail_page))
1043 return;
1044 cpu_buffer->commit_page->page->commit =
1045 cpu_buffer->commit_page->write;
1046 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1047 cpu_buffer->write_stamp =
1048 cpu_buffer->commit_page->page->time_stamp;
1049 }
1050
1051 /* Now set the commit to the event's index */
1052 local_set(&cpu_buffer->commit_page->page->commit, index);
1053}
1054
1055static void
1056rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1057{ 1449{
1450 unsigned long max_count;
1451
1058 /* 1452 /*
1059 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1060 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1064,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1064 * assign the commit to the tail. 1458 * assign the commit to the tail.
1065 */ 1459 */
1066 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1067 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1068 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1069 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1070 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1071 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1072 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1075,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1075 } 1476 }
1076 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1077 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1078 cpu_buffer->commit_page->page->commit = 1479
1079 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1080 barrier(); 1485 barrier();
1081 } 1486 }
1082 1487
@@ -1109,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1109 * to the head page instead of next. 1514 * to the head page instead of next.
1110 */ 1515 */
1111 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1112 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1113 else 1518 else
1114 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1115 1520
@@ -1153,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1153 } 1558 }
1154} 1559}
1155 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
1618 * This is an interrupt that interrupted the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before.
1652 * Otherwise we are an interrupt, and only
1653 * want the outer most commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
1691 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
 1701 * If this was the outermost commit (the one that
1702 * If this was the outer most commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
1717
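The switch above hinges on the head page's link carrying a small state tag (NORMAL, HEAD, UPDATE) that is flipped atomically, so the outermost writer, a nested interrupt writer, and a reader on another CPU can each tell who won the race. A minimal userspace sketch of that pointer-tagging idea using C11 atomics; it is not the kernel implementation, and PAGE_HEAD, FLAG_MASK and tag_cmpxchg() are illustrative names:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK   3UL
#define PAGE_NORMAL 0UL   /* plain next pointer            */
#define PAGE_HEAD   1UL   /* this link points at the head  */
#define PAGE_UPDATE 2UL   /* a writer is moving the head   */

/* One "next" link whose low bits carry the state tag. */
static _Atomic uintptr_t next_link;

/* Atomically switch the tag from `old` to `new`; returns the tag we saw. */
static unsigned long tag_cmpxchg(unsigned long old, unsigned long new)
{
	uintptr_t cur = atomic_load(&next_link);
	uintptr_t want = (cur & ~FLAG_MASK) | old;
	uintptr_t repl = (cur & ~FLAG_MASK) | new;

	if (atomic_compare_exchange_strong(&next_link, &want, repl))
		return old;            /* we performed the transition   */
	return want & FLAG_MASK;       /* somebody else got there first */
}

int main(void)
{
	atomic_store(&next_link, (uintptr_t)0x1000 | PAGE_HEAD);

	/* Writer tries HEAD -> UPDATE; an interrupt doing the same
	 * transition first would leave us seeing UPDATE instead. */
	switch (tag_cmpxchg(PAGE_HEAD, PAGE_UPDATE)) {
	case PAGE_HEAD:
		printf("we own the move\n");
		break;
	case PAGE_UPDATE:
		printf("nested writer: outer context finishes the move\n");
		break;
	default:
		printf("someone already moved the head\n");
	}
	return 0;
}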
1156static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1157{ 1719{
1158 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1170,6 +1732,57 @@ static unsigned rb_calculate_event_length(unsigned length)
1170 return length; 1732 return length;
1171} 1733}
1172 1734
1735static inline void
1736rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1737 struct buffer_page *tail_page,
1738 unsigned long tail, unsigned long length)
1739{
1740 struct ring_buffer_event *event;
1741
1742 /*
1743 * Only the event that crossed the page boundary
1744 * must fill the old tail_page with padding.
1745 */
1746 if (tail >= BUF_PAGE_SIZE) {
1747 local_sub(length, &tail_page->write);
1748 return;
1749 }
1750
1751 event = __rb_page_index(tail_page, tail);
1752 kmemcheck_annotate_bitfield(event, bitfield);
1753
1754 /*
1755 * If this event is bigger than the minimum size, then
1756 * we need to be careful that we don't subtract the
1757 * write counter enough to allow another writer to slip
1758 * in on this page.
1759 * We put in a discarded commit instead, to make sure
1760 * that this space is not used again.
1761 *
1762 * If we are less than the minimum size, we don't need to
1763 * worry about it.
1764 */
1765 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1766 /* No room for any events */
1767
1768 /* Mark the rest of the page with padding */
1769 rb_event_set_padding(event);
1770
1771 /* Set the write back to the previous setting */
1772 local_sub(length, &tail_page->write);
1773 return;
1774 }
1775
1776 /* Put in a discarded event */
1777 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1778 event->type_len = RINGBUF_TYPE_PADDING;
1779 /* time delta must be non zero */
1780 event->time_delta = 1;
1781
1782 /* Set write to end of buffer */
1783 length = (tail + length) - BUF_PAGE_SIZE;
1784 local_sub(length, &tail_page->write);
1785}
1173 1786
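rb_reset_tail() above distinguishes three cases: the crossing event never started on the old page, it started too close to the end for even a padding header, or a padding event is written to cover the rest of the page. A standalone sketch of that arithmetic; BUF_PAGE_SIZE, RB_EVNT_HDR_SIZE and RB_EVNT_MIN_SIZE here are stand-in values, not the kernel's:

#include <stdio.h>

/* illustrative sizes, not the kernel's actual values */
#define BUF_PAGE_SIZE    4096UL
#define RB_EVNT_HDR_SIZE 8UL
#define RB_EVNT_MIN_SIZE 8UL

/*
 * Given the offset where a crossing event started (tail), decide
 * how the old page is closed off.
 */
static void close_old_page(unsigned long tail)
{
	if (tail >= BUF_PAGE_SIZE) {
		printf("event started on the next page, nothing to pad\n");
		return;
	}
	if (tail > BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE) {
		printf("no room for a payload: mark the tail as padding only\n");
		return;
	}
	/* array[0] of the padding event: bytes covered after its header */
	printf("padding event of %lu payload bytes fills the page\n",
	       (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE);
}

int main(void)
{
	close_old_page(4100);   /* crossed before writing anything here */
	close_old_page(4092);   /* too close to the end                 */
	close_old_page(4000);   /* normal case: 88 payload bytes        */
	return 0;
}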
1174static struct ring_buffer_event * 1787static struct ring_buffer_event *
1175rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1177,127 +1790,101 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1177 struct buffer_page *commit_page, 1790 struct buffer_page *commit_page,
1178 struct buffer_page *tail_page, u64 *ts) 1791 struct buffer_page *tail_page, u64 *ts)
1179{ 1792{
1180 struct buffer_page *next_page, *head_page, *reader_page;
1181 struct ring_buffer *buffer = cpu_buffer->buffer; 1793 struct ring_buffer *buffer = cpu_buffer->buffer;
1182 struct ring_buffer_event *event; 1794 struct buffer_page *next_page;
1183 bool lock_taken = false; 1795 int ret;
1184 unsigned long flags;
1185 1796
1186 next_page = tail_page; 1797 next_page = tail_page;
1187 1798
1188 local_irq_save(flags);
1189 /*
1190 * Since the write to the buffer is still not
1191 * fully lockless, we must be careful with NMIs.
1192 * The locks in the writers are taken when a write
1193 * crosses to a new page. The locks protect against
1194 * races with the readers (this will soon be fixed
1195 * with a lockless solution).
1196 *
1197 * Because we can not protect against NMIs, and we
1198 * want to keep traces reentrant, we need to manage
1199 * what happens when we are in an NMI.
1200 *
1201 * NMIs can happen after we take the lock.
1202 * If we are in an NMI, only take the lock
1203 * if it is not already taken. Otherwise
1204 * simply fail.
1205 */
1206 if (unlikely(in_nmi())) {
1207 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1208 cpu_buffer->nmi_dropped++;
1209 goto out_reset;
1210 }
1211 } else
1212 __raw_spin_lock(&cpu_buffer->lock);
1213
1214 lock_taken = true;
1215
1216 rb_inc_page(cpu_buffer, &next_page); 1799 rb_inc_page(cpu_buffer, &next_page);
1217 1800
1218 head_page = cpu_buffer->head_page;
1219 reader_page = cpu_buffer->reader_page;
1220
1221 /* we grabbed the lock before incrementing */
1222 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1223 goto out_reset;
1224
1225 /* 1801 /*
1226 * If for some reason, we had an interrupt storm that made 1802 * If for some reason, we had an interrupt storm that made
1227 * it all the way around the buffer, bail, and warn 1803 * it all the way around the buffer, bail, and warn
1228 * about it. 1804 * about it.
1229 */ 1805 */
1230 if (unlikely(next_page == commit_page)) { 1806 if (unlikely(next_page == commit_page)) {
1231 cpu_buffer->commit_overrun++; 1807 local_inc(&cpu_buffer->commit_overrun);
1232 goto out_reset; 1808 goto out_reset;
1233 } 1809 }
1234 1810
1235 if (next_page == head_page) {
1236 if (!(buffer->flags & RB_FL_OVERWRITE))
1237 goto out_reset;
1238
1239 /* tail_page has not moved yet? */
1240 if (tail_page == cpu_buffer->tail_page) {
1241 /* count overflows */
1242 cpu_buffer->overrun +=
1243 local_read(&head_page->entries);
1244
1245 rb_inc_page(cpu_buffer, &head_page);
1246 cpu_buffer->head_page = head_page;
1247 cpu_buffer->head_page->read = 0;
1248 }
1249 }
1250
1251 /* 1811 /*
1252 * If the tail page is still the same as what we think 1812 * This is where the fun begins!
1253 * it is, then it is up to us to update the tail 1813 *
1254 * pointer. 1814 * We are fighting against races between a reader that
1815 * could be on another CPU trying to swap its reader
1816 * page with the buffer head.
1817 *
1818 * We are also fighting against interrupts coming in and
1819 * moving the head or tail on us as well.
1820 *
1821 * If the next page is the head page then we have filled
1822 * the buffer, unless the commit page is still on the
1823 * reader page.
1255 */ 1824 */
1256 if (tail_page == cpu_buffer->tail_page) { 1825 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1257 local_set(&next_page->write, 0);
1258 local_set(&next_page->entries, 0);
1259 local_set(&next_page->page->commit, 0);
1260 cpu_buffer->tail_page = next_page;
1261 1826
1262 /* reread the time stamp */ 1827 /*
1263 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1828 * If the commit is not on the reader page, then
1264 cpu_buffer->tail_page->page->time_stamp = *ts; 1829 * move the header page.
1830 */
1831 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832 /*
1833 * If we are not in overwrite mode,
1834 * this is easy, just stop here.
1835 */
1836 if (!(buffer->flags & RB_FL_OVERWRITE))
1837 goto out_reset;
1838
1839 ret = rb_handle_head_page(cpu_buffer,
1840 tail_page,
1841 next_page);
1842 if (ret < 0)
1843 goto out_reset;
1844 if (ret)
1845 goto out_again;
1846 } else {
1847 /*
1848 * We need to be careful here too. The
1849 * commit page could still be on the reader
1850 * page. We could have a small buffer, and
1851 * have filled up the buffer with events
1852 * from interrupts and such, and wrapped.
1853 *
 1854 * Note, if the tail page is also on the
1855 * reader_page, we let it move out.
1856 */
1857 if (unlikely((cpu_buffer->commit_page !=
1858 cpu_buffer->tail_page) &&
1859 (cpu_buffer->commit_page ==
1860 cpu_buffer->reader_page))) {
1861 local_inc(&cpu_buffer->commit_overrun);
1862 goto out_reset;
1863 }
1864 }
1265 } 1865 }
1266 1866
1267 /* 1867 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1268 * The actual tail page has moved forward. 1868 if (ret) {
1269 */ 1869 /*
1270 if (tail < BUF_PAGE_SIZE) { 1870 * Nested commits always have zero deltas, so
1271 /* Mark the rest of the page with padding */ 1871 * just reread the time stamp
1272 event = __rb_page_index(tail_page, tail); 1872 */
1273 rb_event_set_padding(event); 1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1874 next_page->page->time_stamp = *ts;
1274 } 1875 }
1275 1876
1276 /* Set the write back to the previous setting */ 1877 out_again:
1277 local_sub(length, &tail_page->write);
1278 1878
1279 /* 1879 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1280 * If this was a commit entry that failed,
1281 * increment that too
1282 */
1283 if (tail_page == cpu_buffer->commit_page &&
1284 tail == rb_commit_index(cpu_buffer)) {
1285 rb_set_commit_to_write(cpu_buffer);
1286 }
1287
1288 __raw_spin_unlock(&cpu_buffer->lock);
1289 local_irq_restore(flags);
1290 1880
1291 /* fail and let the caller try again */ 1881 /* fail and let the caller try again */
1292 return ERR_PTR(-EAGAIN); 1882 return ERR_PTR(-EAGAIN);
1293 1883
1294 out_reset: 1884 out_reset:
1295 /* reset write */ 1885 /* reset write */
1296 local_sub(length, &tail_page->write); 1886 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1297 1887
1298 if (likely(lock_taken))
1299 __raw_spin_unlock(&cpu_buffer->lock);
1300 local_irq_restore(flags);
1301 return NULL; 1888 return NULL;
1302} 1889}
1303 1890
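rb_move_tail() tells its caller to retry by returning ERR_PTR(-EAGAIN) rather than NULL, which means failure codes are encoded in the pointer value itself. A userspace sketch of that convention with stand-in ERR_PTR()/IS_ERR()/PTR_ERR() helpers (the real ones live in the kernel headers):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095UL

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long PTR_ERR(const void *ptr)  { return (long)ptr; }
static inline int  IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* pretend reservation: fails twice with -EAGAIN, then succeeds */
static void *reserve(void)
{
	static int attempts;

	if (attempts++ < 2)
		return ERR_PTR(-EAGAIN);
	return &attempts;              /* any non-error pointer */
}

int main(void)
{
	void *event;
	int nr_loops = 0;

	do {
		event = reserve();
		if (++nr_loops > 1000)     /* same bail-out idea as the buffer */
			return 1;
	} while (IS_ERR(event) && PTR_ERR(event) == -EAGAIN);

	printf("reserved after %d tries\n", nr_loops);
	return 0;
}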
@@ -1314,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1314 barrier(); 1901 barrier();
1315 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1316 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904
1905 /* set write to only the index of the write */
1906 write &= RB_WRITE_MASK;
1317 tail = write - length; 1907 tail = write - length;
1318 1908
1319 /* See if we shot past the end of this buffer page */ 1909
@@ -1323,10 +1913,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1323 1913
1324 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1325 1915
1326 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1327 return NULL;
1328
1329 event = __rb_page_index(tail_page, tail); 1916 event = __rb_page_index(tail_page, tail);
1917 kmemcheck_annotate_bitfield(event, bitfield);
1330 rb_update_event(event, type, length); 1918 rb_update_event(event, type, length);
1331 1919
1332 /* The passed in type is zero for DATA */ 1920 /* The passed in type is zero for DATA */
@@ -1334,11 +1922,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1334 local_inc(&tail_page->entries); 1922 local_inc(&tail_page->entries);
1335 1923
1336 /* 1924 /*
1337 * If this is a commit and the tail is zero, then update 1925 * If this is the first commit on the page, then update
1338 * this page's time stamp. 1926 * its timestamp.
1339 */ 1927 */
1340 if (!tail && rb_is_commit(cpu_buffer, event)) 1928 if (!tail)
1341 cpu_buffer->commit_page->page->time_stamp = *ts; 1929 tail_page->page->time_stamp = *ts;
1342 1930
1343 return event; 1931 return event;
1344} 1932}
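With the lockless design the per-page write counter no longer holds a plain offset: the low bits are the byte index into the page and the high bits count writer passes over it, which is why the value returned by local_add_return() is masked with RB_WRITE_MASK before use. A sketch of that packing; the 20-bit split (WRITE_MASK/WRITE_INTCNT) is assumed for illustration:

#include <stdatomic.h>
#include <stdio.h>

/* assumed split: low 20 bits = byte index, high bits = writer passes */
#define WRITE_MASK   ((1UL << 20) - 1)
#define WRITE_INTCNT (1UL << 20)

static _Atomic unsigned long page_write;

int main(void)
{
	unsigned long write, tail;

	/* a "pass" counter bump, as done when the tail page is recycled */
	atomic_fetch_add(&page_write, WRITE_INTCNT);

	/* reserve 24 bytes the way __rb_reserve_next does */
	write = atomic_fetch_add(&page_write, 24) + 24;

	/* keep only the index part before using it as an offset */
	write &= WRITE_MASK;
	tail = write - 24;

	printf("event starts at offset %lu, ends at %lu\n", tail, write);
	return 0;
}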
@@ -1360,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1360 bpage = cpu_buffer->tail_page; 1948 bpage = cpu_buffer->tail_page;
1361 1949
1362 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1950 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951 unsigned long write_mask =
1952 local_read(&bpage->write) & ~RB_WRITE_MASK;
1363 /* 1953 /*
1364 * This is on the tail page. It is possible that 1954 * This is on the tail page. It is possible that
1365 * a write could come in and move the tail page 1955 * a write could come in and move the tail page
1366 * and write to the next page. That is fine 1956 * and write to the next page. That is fine
1367 * because we just shorten what is on this page. 1957 * because we just shorten what is on this page.
1368 */ 1958 */
1959 old_index += write_mask;
1960 new_index += write_mask;
1369 index = local_cmpxchg(&bpage->write, old_index, new_index); 1961 index = local_cmpxchg(&bpage->write, old_index, new_index);
1370 if (index == old_index) 1962 if (index == old_index)
1371 return 1; 1963 return 1;
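rb_try_to_discard() can only hand the space back if nothing was reserved after the event, so it widens both indices with the current pass bits and then does a compare-and-swap on the write counter. A sketch of the rollback itself, under the same assumed counter layout:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long page_write;

/* Roll the write index back from `old` to `new` only if we are
 * still the last reservation on the page. */
static int try_to_discard(unsigned long old, unsigned long new)
{
	unsigned long expected = old;

	return atomic_compare_exchange_strong(&page_write, &expected, new);
}

int main(void)
{
	atomic_store(&page_write, 64);          /* our event ends at 64   */

	if (try_to_discard(64, 40))             /* it started at 40       */
		printf("discarded: space reused by the next event\n");

	atomic_store(&page_write, 96);          /* someone wrote after us */
	if (!try_to_discard(64, 40))
		printf("too late: event is only marked as padding\n");
	return 0;
}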
@@ -1407,16 +1999,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1407 return -EAGAIN; 1999 return -EAGAIN;
1408 2000
1409 /* Only a committed time event can update the write stamp */ 2001
1410 if (rb_is_commit(cpu_buffer, event)) { 2002 if (rb_event_is_commit(cpu_buffer, event)) {
1411 /* 2003 /*
1412 * If this is the first on the page, then we need to 2004 * If this is the first on the page, then it was
1413 * update the page itself, and just put in a zero. 2005 * updated with the page itself. Try to discard it
 2006 * and if we can't, just make it zero.
1414 */ 2007 */
1415 if (rb_event_index(event)) { 2008 if (rb_event_index(event)) {
1416 event->time_delta = *delta & TS_MASK; 2009 event->time_delta = *delta & TS_MASK;
1417 event->array[0] = *delta >> TS_SHIFT; 2010 event->array[0] = *delta >> TS_SHIFT;
1418 } else { 2011 } else {
1419 cpu_buffer->commit_page->page->time_stamp = *ts;
1420 /* try to discard, since we do not need this */ 2012 /* try to discard, since we do not need this */
1421 if (!rb_try_to_discard(cpu_buffer, event)) { 2013 if (!rb_try_to_discard(cpu_buffer, event)) {
1422 /* nope, just zero it */ 2014 /* nope, just zero it */
@@ -1442,8 +2034,47 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1442 return ret; 2034 return ret;
1443} 2035}
1444 2036
2037static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2038{
2039 local_inc(&cpu_buffer->committing);
2040 local_inc(&cpu_buffer->commits);
2041}
2042
2043static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2044{
2045 unsigned long commits;
2046
2047 if (RB_WARN_ON(cpu_buffer,
2048 !local_read(&cpu_buffer->committing)))
2049 return;
2050
2051 again:
2052 commits = local_read(&cpu_buffer->commits);
2053 /* synchronize with interrupts */
2054 barrier();
2055 if (local_read(&cpu_buffer->committing) == 1)
2056 rb_set_commit_to_write(cpu_buffer);
2057
2058 local_dec(&cpu_buffer->committing);
2059
2060 /* synchronize with interrupts */
2061 barrier();
2062
2063 /*
2064 * Need to account for interrupts coming in between the
2065 * updating of the commit page and the clearing of the
2066 * committing counter.
2067 */
2068 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
2069 !local_read(&cpu_buffer->committing)) {
2070 local_inc(&cpu_buffer->committing);
2071 goto again;
2072 }
2073}
2074
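rb_start_commit()/rb_end_commit() track nesting: committing is the depth of writers on this CPU buffer and only the outermost one (depth 1) advances the commit pointer, while commits counts reservations so the outermost writer can notice an interrupt that slipped in while it was finishing. A single-threaded sketch of the same bookkeeping, with nesting simulated by direct calls; commit_pos/write_pos are stand-ins:

#include <stdio.h>

static int committing;   /* nesting depth of writers   */
static int commits;      /* total reservations so far  */
static int commit_pos;   /* "commit pointer" stand-in  */
static int write_pos;    /* "write pointer" stand-in   */

static void start_commit(void)
{
	committing++;
	commits++;
	write_pos++;                     /* pretend we reserved an event */
}

static void end_commit(void)
{
	int seen;
again:
	seen = commits;
	if (committing == 1)
		commit_pos = write_pos;  /* only the outermost advances  */
	committing--;

	/* an "interrupt" between the two steps re-opens the window */
	if (commits != seen && committing == 0) {
		committing++;
		goto again;
	}
}

int main(void)
{
	start_commit();                  /* outer writer                 */
	start_commit();                  /* nested (interrupt) writer    */
	end_commit();                    /* nested commit: no advance    */
	end_commit();                    /* outer commit: advance        */
	printf("committed %d of %d events\n", commit_pos, commits);
	return 0;
}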
1445static struct ring_buffer_event * 2075static struct ring_buffer_event *
1446rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2076rb_reserve_next_event(struct ring_buffer *buffer,
2077 struct ring_buffer_per_cpu *cpu_buffer,
1447 unsigned long length) 2078 unsigned long length)
1448{ 2079{
1449 struct ring_buffer_event *event; 2080 struct ring_buffer_event *event;
@@ -1451,6 +2082,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1451 int commit = 0; 2082 int commit = 0;
1452 int nr_loops = 0; 2083 int nr_loops = 0;
1453 2084
2085 rb_start_commit(cpu_buffer);
2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088 /*
 2089 * Because a cpu buffer can be swapped out of its ring buffer,
2090 * it is possible it was swapped before we committed.
2091 * (committing stops a swap). We check for it here and
2092 * if it happened, we have to fail the write.
2093 */
2094 barrier();
2095 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096 local_dec(&cpu_buffer->committing);
2097 local_dec(&cpu_buffer->commits);
2098 return NULL;
2099 }
2100#endif
2101
1454 length = rb_calculate_event_length(length); 2102 length = rb_calculate_event_length(length);
1455 again: 2103 again:
1456 /* 2104 /*
@@ -1463,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1463 * Bail! 2111 * Bail!
1464 */ 2112 */
1465 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1466 return NULL; 2114 goto out_fail;
1467 2115
1468 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2116 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1469 2117
@@ -1494,7 +2142,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1494 2142
1495 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2143 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1496 if (commit == -EBUSY) 2144 if (commit == -EBUSY)
1497 return NULL; 2145 goto out_fail;
1498 2146
1499 if (commit == -EAGAIN) 2147 if (commit == -EAGAIN)
1500 goto again; 2148 goto again;
@@ -1508,30 +2156,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1508 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2156 if (unlikely(PTR_ERR(event) == -EAGAIN))
1509 goto again; 2157 goto again;
1510 2158
1511 if (!event) { 2159 if (!event)
1512 if (unlikely(commit)) 2160 goto out_fail;
1513 /*
1514 * Ouch! We needed a timestamp and it was commited. But
1515 * we didn't get our event reserved.
1516 */
1517 rb_set_commit_to_write(cpu_buffer);
1518 return NULL;
1519 }
1520 2161
1521 /* 2162 if (!rb_event_is_commit(cpu_buffer, event))
1522 * If the timestamp was commited, make the commit our entry
1523 * now so that we will update it when needed.
1524 */
1525 if (unlikely(commit))
1526 rb_set_commit_event(cpu_buffer, event);
1527 else if (!rb_is_commit(cpu_buffer, event))
1528 delta = 0; 2163 delta = 0;
1529 2164
1530 event->time_delta = delta; 2165 event->time_delta = delta;
1531 2166
1532 return event; 2167 return event;
2168
2169 out_fail:
2170 rb_end_commit(cpu_buffer);
2171 return NULL;
1533} 2172}
1534 2173
2174#ifdef CONFIG_TRACING
2175
1535#define TRACE_RECURSIVE_DEPTH 16 2176#define TRACE_RECURSIVE_DEPTH 16
1536 2177
1537static int trace_recursive_lock(void) 2178static int trace_recursive_lock(void)
@@ -1562,6 +2203,13 @@ static void trace_recursive_unlock(void)
1562 current->trace_recursion--; 2203 current->trace_recursion--;
1563} 2204}
1564 2205
2206#else
2207
2208#define trace_recursive_lock() (0)
2209#define trace_recursive_unlock() do { } while (0)
2210
2211#endif
2212
1565static DEFINE_PER_CPU(int, rb_need_resched); 2213static DEFINE_PER_CPU(int, rb_need_resched);
1566 2214
1567/** 2215/**
@@ -1611,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1611 if (length > BUF_MAX_DATA_SIZE) 2259 if (length > BUF_MAX_DATA_SIZE)
1612 goto out; 2260 goto out;
1613 2261
1614 event = rb_reserve_next_event(cpu_buffer, length); 2262 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1615 if (!event) 2263 if (!event)
1616 goto out; 2264 goto out;
1617 2265
@@ -1634,18 +2282,24 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1634} 2282}
1635EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1636 2284
2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2287 struct ring_buffer_event *event)
2288{
2289 /*
 2290 * The first event in the commit queue updates the
2291 * time stamp.
2292 */
2293 if (rb_event_is_commit(cpu_buffer, event))
2294 cpu_buffer->write_stamp += event->time_delta;
2295}
2296
1637static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1638 struct ring_buffer_event *event) 2298 struct ring_buffer_event *event)
1639{ 2299{
1640 local_inc(&cpu_buffer->entries); 2300 local_inc(&cpu_buffer->entries);
1641 2301 rb_update_write_stamp(cpu_buffer, event);
1642 /* Only process further if we own the commit */ 2302 rb_end_commit(cpu_buffer);
1643 if (!rb_is_commit(cpu_buffer, event))
1644 return;
1645
1646 cpu_buffer->write_stamp += event->time_delta;
1647
1648 rb_set_commit_to_write(cpu_buffer);
1649} 2303}
1650 2304
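rb_update_write_stamp() keeps write_stamp at the absolute time of the last committed event: the page stores one full timestamp and each event only a small delta, so time is reconstructed by summing deltas. A tiny sketch with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long page_time_stamp = 1000000;       /* first commit */
	unsigned int deltas[] = { 0, 120, 35, 4000 };        /* per event    */
	unsigned long long write_stamp = page_time_stamp;

	for (unsigned int i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		write_stamp += deltas[i];
		printf("event %u committed at %llu\n", i, write_stamp);
	}
	return 0;
}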
1651/** 2305/**
@@ -1691,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1691 event->time_delta = 1; 2345 event->time_delta = 1;
1692} 2346}
1693 2347
1694/** 2348/*
1695 * ring_buffer_event_discard - discard any event in the ring buffer 2349 * Decrement the entries to the page that an event is on.
1696 * @event: the event to discard 2350 * The event does not even need to exist, only the pointer
1697 * 2351 * to the page it is on. This may only be called before the commit
1698 * Sometimes a event that is in the ring buffer needs to be ignored. 2352 * takes place.
1699 * This function lets the user discard an event in the ring buffer
1700 * and then that event will not be read later.
1701 *
1702 * Note, it is up to the user to be careful with this, and protect
1703 * against races. If the user discards an event that has been consumed
1704 * it is possible that it could corrupt the ring buffer.
1705 */ 2353 */
1706void ring_buffer_event_discard(struct ring_buffer_event *event) 2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356 struct ring_buffer_event *event)
1707{ 2357{
1708 rb_event_discard(event); 2358 unsigned long addr = (unsigned long)event;
2359 struct buffer_page *bpage = cpu_buffer->commit_page;
2360 struct buffer_page *start;
2361
2362 addr &= PAGE_MASK;
2363
2364 /* Do the likely case first */
2365 if (likely(bpage->page == (void *)addr)) {
2366 local_dec(&bpage->entries);
2367 return;
2368 }
2369
2370 /*
2371 * Because the commit page may be on the reader page we
 2372 * start with the next page and stop once we loop back to it.
2373 */
2374 rb_inc_page(cpu_buffer, &bpage);
2375 start = bpage;
2376 do {
2377 if (bpage->page == (void *)addr) {
2378 local_dec(&bpage->entries);
2379 return;
2380 }
2381 rb_inc_page(cpu_buffer, &bpage);
2382 } while (bpage != start);
2383
2384 /* commit not part of this buffer?? */
2385 RB_WARN_ON(cpu_buffer, 1);
1709} 2386}
1710EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1711 2387
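rb_decrement_entry() has to find the page an uncommitted event lives on; since the pages form a ring, it checks the commit page first and otherwise walks the ring once, starting at the following page. A sketch of that circular walk on a plain circular list (struct page, dec_entry() and the ids are illustrative):

#include <stdio.h>

struct page {
	struct page *next;
	int id;
	int entries;
};

/* decrement the entry count on whichever page holds `id` */
static void dec_entry(struct page *commit, int id)
{
	struct page *bpage, *start;

	if (commit->id == id) {            /* likely case: the commit page */
		commit->entries--;
		return;
	}

	bpage = commit->next;              /* otherwise walk the ring once */
	start = bpage;
	do {
		if (bpage->id == id) {
			bpage->entries--;
			return;
		}
		bpage = bpage->next;
	} while (bpage != start);

	fprintf(stderr, "event not on this buffer??\n");
}

int main(void)
{
	struct page p[3] = {
		{ &p[1], 0, 2 }, { &p[2], 1, 5 }, { &p[0], 2, 1 },
	};

	dec_entry(&p[1], 2);               /* event sits on page 2 */
	printf("page 2 now has %d entries\n", p[2].entries);
	return 0;
}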
1712/** 2388/**
1713 * ring_buffer_commit_discard - discard an event that has not been committed 2389 * ring_buffer_commit_discard - discard an event that has not been committed
1714 * @buffer: the ring buffer 2390 * @buffer: the ring buffer
1715 * @event: non committed event to discard 2391 * @event: non committed event to discard
1716 * 2392 *
1717 * This is similar to ring_buffer_event_discard but must only be 2393 * Sometimes an event that is in the ring buffer needs to be ignored.
1718 * performed on an event that has not been committed yet. The difference 2394 * This function lets the user discard an event in the ring buffer
1719 * is that this will also try to free the event from the ring buffer 2395 * and then that event will not be read later.
2396 *
 2397 * This function only works if it is called before the item has been
2398 * committed. It will try to free the event from the ring buffer
1720 * if another event has not been added behind it. 2399 * if another event has not been added behind it.
1721 * 2400 *
1722 * If another event has been added behind it, it will set the event 2401 * If another event has been added behind it, it will set the event
@@ -1734,32 +2413,27 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1734 /* The event is discarded regardless */ 2413 /* The event is discarded regardless */
1735 rb_event_discard(event); 2414 rb_event_discard(event);
1736 2415
2416 cpu = smp_processor_id();
2417 cpu_buffer = buffer->buffers[cpu];
2418
1737 /* 2419 /*
1738 * This must only be called if the event has not been 2420 * This must only be called if the event has not been
1739 * committed yet. Thus we can assume that preemption 2421 * committed yet. Thus we can assume that preemption
1740 * is still disabled. 2422 * is still disabled.
1741 */ 2423 */
1742 RB_WARN_ON(buffer, preemptible()); 2424 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1743
1744 cpu = smp_processor_id();
1745 cpu_buffer = buffer->buffers[cpu];
1746 2425
1747 if (!rb_try_to_discard(cpu_buffer, event)) 2426 rb_decrement_entry(cpu_buffer, event);
2427 if (rb_try_to_discard(cpu_buffer, event))
1748 goto out; 2428 goto out;
1749 2429
1750 /* 2430 /*
1751 * The commit is still visible by the reader, so we 2431 * The commit is still visible by the reader, so we
1752 * must increment entries. 2432 * must still update the timestamp.
1753 */ 2433 */
1754 local_inc(&cpu_buffer->entries); 2434 rb_update_write_stamp(cpu_buffer, event);
1755 out: 2435 out:
1756 /* 2436 rb_end_commit(cpu_buffer);
1757 * If a write came in and pushed the tail page
1758 * we still need to update the commit pointer
1759 * if we were the commit.
1760 */
1761 if (rb_is_commit(cpu_buffer, event))
1762 rb_set_commit_to_write(cpu_buffer);
1763 2437
1764 trace_recursive_unlock(); 2438 trace_recursive_unlock();
1765 2439
@@ -1818,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1818 if (length > BUF_MAX_DATA_SIZE) 2492 if (length > BUF_MAX_DATA_SIZE)
1819 goto out; 2493 goto out;
1820 2494
1821 event = rb_reserve_next_event(cpu_buffer, length); 2495 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1822 if (!event) 2496 if (!event)
1823 goto out; 2497 goto out;
1824 2498
@@ -1839,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1839static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1840{ 2514{
1841 struct buffer_page *reader = cpu_buffer->reader_page; 2515 struct buffer_page *reader = cpu_buffer->reader_page;
1842 struct buffer_page *head = cpu_buffer->head_page; 2516 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1843 struct buffer_page *commit = cpu_buffer->commit_page; 2517 struct buffer_page *commit = cpu_buffer->commit_page;
1844 2518
2519 /* In case of error, head will be NULL */
2520 if (unlikely(!head))
2521 return 1;
2522
1845 return reader->read == rb_page_commit(reader) && 2523 return reader->read == rb_page_commit(reader) &&
1846 (commit == reader || 2524 (commit == reader ||
1847 (commit == head && 2525 (commit == head &&
@@ -1932,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1932 return 0; 2610 return 0;
1933 2611
1934 cpu_buffer = buffer->buffers[cpu]; 2612 cpu_buffer = buffer->buffers[cpu];
1935 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2613 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1936 - cpu_buffer->read; 2614 - cpu_buffer->read;
1937 2615
1938 return ret; 2616 return ret;
@@ -1953,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1953 return 0; 2631 return 0;
1954 2632
1955 cpu_buffer = buffer->buffers[cpu]; 2633 cpu_buffer = buffer->buffers[cpu];
1956 ret = cpu_buffer->overrun; 2634 ret = local_read(&cpu_buffer->overrun);
1957 2635
1958 return ret; 2636 return ret;
1959} 2637}
1960EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1961 2639
1962/** 2640/**
1963 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
1964 * @buffer: The ring buffer
1965 * @cpu: The per CPU buffer to get the number of overruns from
1966 */
1967unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
1968{
1969 struct ring_buffer_per_cpu *cpu_buffer;
1970 unsigned long ret;
1971
1972 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1973 return 0;
1974
1975 cpu_buffer = buffer->buffers[cpu];
1976 ret = cpu_buffer->nmi_dropped;
1977
1978 return ret;
1979}
1980EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
1981
1982/**
1983 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
1984 * @buffer: The ring buffer 2642 * @buffer: The ring buffer
1985 * @cpu: The per CPU buffer to get the number of overruns from 2643 * @cpu: The per CPU buffer to get the number of overruns from
@@ -1994,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
1994 return 0; 2652 return 0;
1995 2653
1996 cpu_buffer = buffer->buffers[cpu]; 2654 cpu_buffer = buffer->buffers[cpu];
1997 ret = cpu_buffer->commit_overrun; 2655 ret = local_read(&cpu_buffer->commit_overrun);
1998 2656
1999 return ret; 2657 return ret;
2000} 2658}
@@ -2017,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2017 for_each_buffer_cpu(buffer, cpu) { 2675 for_each_buffer_cpu(buffer, cpu) {
2018 cpu_buffer = buffer->buffers[cpu]; 2676 cpu_buffer = buffer->buffers[cpu];
2019 entries += (local_read(&cpu_buffer->entries) - 2677 entries += (local_read(&cpu_buffer->entries) -
2020 cpu_buffer->overrun) - cpu_buffer->read; 2678 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2021 } 2679 }
2022 2680
2023 return entries; 2681 return entries;
@@ -2040,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2040 /* if you care about this being correct, lock the buffer */ 2698 /* if you care about this being correct, lock the buffer */
2041 for_each_buffer_cpu(buffer, cpu) { 2699 for_each_buffer_cpu(buffer, cpu) {
2042 cpu_buffer = buffer->buffers[cpu]; 2700 cpu_buffer = buffer->buffers[cpu];
2043 overruns += cpu_buffer->overrun; 2701 overruns += local_read(&cpu_buffer->overrun);
2044 } 2702 }
2045 2703
2046 return overruns; 2704 return overruns;
@@ -2053,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2053 2711
2054 /* Iterator usage is expected to have record disabled */ 2712 /* Iterator usage is expected to have record disabled */
2055 if (list_empty(&cpu_buffer->reader_page->list)) { 2713 if (list_empty(&cpu_buffer->reader_page->list)) {
2056 iter->head_page = cpu_buffer->head_page; 2714 iter->head_page = rb_set_head_page(cpu_buffer);
2057 iter->head = cpu_buffer->head_page->read; 2715 if (unlikely(!iter->head_page))
2716 return;
2717 iter->head = iter->head_page->read;
2058 } else { 2718 } else {
2059 iter->head_page = cpu_buffer->reader_page; 2719 iter->head_page = cpu_buffer->reader_page;
2060 iter->head = cpu_buffer->reader_page->read; 2720 iter->head = cpu_buffer->reader_page->read;
@@ -2171,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2171 struct buffer_page *reader = NULL; 2831 struct buffer_page *reader = NULL;
2172 unsigned long flags; 2832 unsigned long flags;
2173 int nr_loops = 0; 2833 int nr_loops = 0;
2834 int ret;
2174 2835
2175 local_irq_save(flags); 2836 local_irq_save(flags);
2176 __raw_spin_lock(&cpu_buffer->lock); 2837 __raw_spin_lock(&cpu_buffer->lock);
@@ -2204,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2204 goto out; 2865 goto out;
2205 2866
2206 /* 2867 /*
2207 * Splice the empty reader page into the list around the head.
2208 * Reset the reader page to size zero. 2868 * Reset the reader page to size zero.
2209 */ 2869 */
2870 local_set(&cpu_buffer->reader_page->write, 0);
2871 local_set(&cpu_buffer->reader_page->entries, 0);
2872 local_set(&cpu_buffer->reader_page->page->commit, 0);
2210 2873
2211 reader = cpu_buffer->head_page; 2874 spin:
2875 /*
2876 * Splice the empty reader page into the list around the head.
2877 */
2878 reader = rb_set_head_page(cpu_buffer);
2212 cpu_buffer->reader_page->list.next = reader->list.next; 2879 cpu_buffer->reader_page->list.next = reader->list.next;
2213 cpu_buffer->reader_page->list.prev = reader->list.prev; 2880 cpu_buffer->reader_page->list.prev = reader->list.prev;
2214 2881
2215 local_set(&cpu_buffer->reader_page->write, 0); 2882 /*
2216 local_set(&cpu_buffer->reader_page->entries, 0); 2883 * cpu_buffer->pages just needs to point to the buffer, it
2217 local_set(&cpu_buffer->reader_page->page->commit, 0); 2884 * has no specific buffer page to point to. Let's move it out
 2885 * of our way so we don't accidentally swap it.
2886 */
2887 cpu_buffer->pages = reader->list.prev;
2218 2888
2219 /* Make the reader page now replace the head */ 2889 /* The reader page will be pointing to the new head */
2220 reader->list.prev->next = &cpu_buffer->reader_page->list; 2890 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2221 reader->list.next->prev = &cpu_buffer->reader_page->list;
2222 2891
2223 /* 2892 /*
2224 * If the tail is on the reader, then we must set the head 2893 * Here's the tricky part.
2225 * to the inserted page, otherwise we set it one before. 2894 *
2895 * We need to move the pointer past the header page.
2896 * But we can only do that if a writer is not currently
2897 * moving it. The page before the header page has the
2898 * flag bit '1' set if it is pointing to the page we want.
 2899 * But if the writer is in the process of moving it
 2900 * then it will be '2' or already moved '0'.
2226 */ 2901 */
2227 cpu_buffer->head_page = cpu_buffer->reader_page;
2228 2902
2229 if (cpu_buffer->commit_page != reader) 2903 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2230 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2904
2905 /*
2906 * If we did not convert it, then we must try again.
2907 */
2908 if (!ret)
2909 goto spin;
2910
2911 /*
2912 * Yeah! We succeeded in replacing the page.
2913 *
2914 * Now make the new head point back to the reader page.
2915 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2231 2918
2232 /* Finally update the reader page to the new head */ 2919 /* Finally update the reader page to the new head */
2233 cpu_buffer->reader_page = reader; 2920 cpu_buffer->reader_page = reader;
@@ -2256,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2256 2943
2257 event = rb_reader_event(cpu_buffer); 2944 event = rb_reader_event(cpu_buffer);
2258 2945
2259 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2946 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2260 || rb_discarded_event(event))
2261 cpu_buffer->read++; 2947 cpu_buffer->read++;
2262 2948
2263 rb_update_read_stamp(cpu_buffer, event); 2949 rb_update_read_stamp(cpu_buffer, event);
@@ -2311,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2311} 2997}
2312 2998
2313static struct ring_buffer_event * 2999static struct ring_buffer_event *
2314rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3000rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
2315{ 3001{
2316 struct ring_buffer_per_cpu *cpu_buffer;
2317 struct ring_buffer_event *event; 3002 struct ring_buffer_event *event;
2318 struct buffer_page *reader; 3003 struct buffer_page *reader;
2319 int nr_loops = 0; 3004 int nr_loops = 0;
2320 3005
2321 cpu_buffer = buffer->buffers[cpu];
2322
2323 again: 3006 again:
2324 /* 3007 /*
2325 * We repeat when a timestamp is encountered. It is possible 3008 * We repeat when a timestamp is encountered. It is possible
@@ -2348,7 +3031,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2348 * the box. Return the padding, and we will release 3031 * the box. Return the padding, and we will release
2349 * the current locks, and try again. 3032 * the current locks, and try again.
2350 */ 3033 */
2351 rb_advance_reader(cpu_buffer);
2352 return event; 3034 return event;
2353 3035
2354 case RINGBUF_TYPE_TIME_EXTEND: 3036 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2364,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2364 case RINGBUF_TYPE_DATA: 3046 case RINGBUF_TYPE_DATA:
2365 if (ts) { 3047 if (ts) {
2366 *ts = cpu_buffer->read_stamp + event->time_delta; 3048 *ts = cpu_buffer->read_stamp + event->time_delta;
2367 ring_buffer_normalize_time_stamp(buffer, 3049 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
2368 cpu_buffer->cpu, ts); 3050 cpu_buffer->cpu, ts);
2369 } 3051 }
2370 return event; 3052 return event;
@@ -2443,6 +3125,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2443} 3125}
2444EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 3126EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2445 3127
3128static inline int rb_ok_to_lock(void)
3129{
3130 /*
 3131 * If an NMI die dump is printing the content of the ring buffer,
 3132 * do not grab locks. We also permanently disable the ring
 3133 * buffer. A one-time read is all you get from reading
 3134 * the ring buffer from an NMI.
3135 */
3136 if (likely(!in_nmi()))
3137 return 1;
3138
3139 tracing_off_permanent();
3140 return 0;
3141}
3142
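The read paths now take reader_lock only when rb_ok_to_lock() allows it: in NMI context they read unlocked and permanently disable the buffer rather than risk deadlocking on a lock the interrupted code may hold. A userspace sketch of the dolock pattern; in_nmi() and tracing_off_permanent() are stubbed since the real checks are kernel-only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reader_lock = PTHREAD_MUTEX_INITIALIZER;
static int buffer_disabled;

/* stand-ins for in_nmi() and tracing_off_permanent() */
static int in_nmi_context;
static void tracing_off_permanent(void) { buffer_disabled = 1; }

static int ok_to_lock(void)
{
	if (!in_nmi_context)
		return 1;
	tracing_off_permanent();   /* one-shot read, then shut down */
	return 0;
}

static void peek(void)
{
	int dolock = ok_to_lock();

	if (dolock)
		pthread_mutex_lock(&reader_lock);
	printf("peeking (locked=%d, disabled=%d)\n", dolock, buffer_disabled);
	if (dolock)
		pthread_mutex_unlock(&reader_lock);
}

int main(void)
{
	peek();                 /* normal context: take the lock    */
	in_nmi_context = 1;
	peek();                 /* "NMI": lockless, buffer disabled */
	return 0;
}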
2446/** 3143/**
2447 * ring_buffer_peek - peek at the next event to be read 3144 * ring_buffer_peek - peek at the next event to be read
2448 * @buffer: The ring buffer to read 3145 * @buffer: The ring buffer to read
@@ -2458,19 +3155,25 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2458 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3155 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2459 struct ring_buffer_event *event; 3156 struct ring_buffer_event *event;
2460 unsigned long flags; 3157 unsigned long flags;
3158 int dolock;
2461 3159
2462 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3160 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2463 return NULL; 3161 return NULL;
2464 3162
3163 dolock = rb_ok_to_lock();
2465 again: 3164 again:
2466 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3165 local_irq_save(flags);
2467 event = rb_buffer_peek(buffer, cpu, ts); 3166 if (dolock)
2468 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3167 spin_lock(&cpu_buffer->reader_lock);
3168 event = rb_buffer_peek(cpu_buffer, ts);
3169 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3170 rb_advance_reader(cpu_buffer);
3171 if (dolock)
3172 spin_unlock(&cpu_buffer->reader_lock);
3173 local_irq_restore(flags);
2469 3174
2470 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3175 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2471 cpu_relax();
2472 goto again; 3176 goto again;
2473 }
2474 3177
2475 return event; 3178 return event;
2476} 3179}
@@ -2495,10 +3198,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2495 event = rb_iter_peek(iter, ts); 3198 event = rb_iter_peek(iter, ts);
2496 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3199 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2497 3200
2498 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3201 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2499 cpu_relax();
2500 goto again; 3202 goto again;
2501 }
2502 3203
2503 return event; 3204 return event;
2504} 3205}
@@ -2517,6 +3218,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2517 struct ring_buffer_per_cpu *cpu_buffer; 3218 struct ring_buffer_per_cpu *cpu_buffer;
2518 struct ring_buffer_event *event = NULL; 3219 struct ring_buffer_event *event = NULL;
2519 unsigned long flags; 3220 unsigned long flags;
3221 int dolock;
3222
3223 dolock = rb_ok_to_lock();
2520 3224
2521 again: 3225 again:
2522 /* might be called in atomic */ 3226 /* might be called in atomic */
@@ -2526,24 +3230,23 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2526 goto out; 3230 goto out;
2527 3231
2528 cpu_buffer = buffer->buffers[cpu]; 3232 cpu_buffer = buffer->buffers[cpu];
2529 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3233 local_irq_save(flags);
2530 3234 if (dolock)
2531 event = rb_buffer_peek(buffer, cpu, ts); 3235 spin_lock(&cpu_buffer->reader_lock);
2532 if (!event)
2533 goto out_unlock;
2534 3236
2535 rb_advance_reader(cpu_buffer); 3237 event = rb_buffer_peek(cpu_buffer, ts);
3238 if (event)
3239 rb_advance_reader(cpu_buffer);
2536 3240
2537 out_unlock: 3241 if (dolock)
2538 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3242 spin_unlock(&cpu_buffer->reader_lock);
3243 local_irq_restore(flags);
2539 3244
2540 out: 3245 out:
2541 preempt_enable(); 3246 preempt_enable();
2542 3247
2543 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3248 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2544 cpu_relax();
2545 goto again; 3249 goto again;
2546 }
2547 3250
2548 return event; 3251 return event;
2549} 3252}
@@ -2623,21 +3326,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2623 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3326 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2624 unsigned long flags; 3327 unsigned long flags;
2625 3328
2626 again:
2627 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3329 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3330 again:
2628 event = rb_iter_peek(iter, ts); 3331 event = rb_iter_peek(iter, ts);
2629 if (!event) 3332 if (!event)
2630 goto out; 3333 goto out;
2631 3334
3335 if (event->type_len == RINGBUF_TYPE_PADDING)
3336 goto again;
3337
2632 rb_advance_iter(iter); 3338 rb_advance_iter(iter);
2633 out: 3339 out:
2634 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3340 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2635 3341
2636 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2637 cpu_relax();
2638 goto again;
2639 }
2640
2641 return event; 3342 return event;
2642} 3343}
2643EXPORT_SYMBOL_GPL(ring_buffer_read); 3344EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2655,8 +3356,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2655static void 3356static void
2656rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3357rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2657{ 3358{
3359 rb_head_page_deactivate(cpu_buffer);
3360
2658 cpu_buffer->head_page 3361 cpu_buffer->head_page
2659 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3362 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2660 local_set(&cpu_buffer->head_page->write, 0); 3363 local_set(&cpu_buffer->head_page->write, 0);
2661 local_set(&cpu_buffer->head_page->entries, 0); 3364 local_set(&cpu_buffer->head_page->entries, 0);
2662 local_set(&cpu_buffer->head_page->page->commit, 0); 3365 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2672,14 +3375,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2672 local_set(&cpu_buffer->reader_page->page->commit, 0); 3375 local_set(&cpu_buffer->reader_page->page->commit, 0);
2673 cpu_buffer->reader_page->read = 0; 3376 cpu_buffer->reader_page->read = 0;
2674 3377
2675 cpu_buffer->nmi_dropped = 0; 3378 local_set(&cpu_buffer->commit_overrun, 0);
2676 cpu_buffer->commit_overrun = 0; 3379 local_set(&cpu_buffer->overrun, 0);
2677 cpu_buffer->overrun = 0;
2678 cpu_buffer->read = 0;
2679 local_set(&cpu_buffer->entries, 0); 3380 local_set(&cpu_buffer->entries, 0);
3381 local_set(&cpu_buffer->committing, 0);
3382 local_set(&cpu_buffer->commits, 0);
3383 cpu_buffer->read = 0;
2680 3384
2681 cpu_buffer->write_stamp = 0; 3385 cpu_buffer->write_stamp = 0;
2682 cpu_buffer->read_stamp = 0; 3386 cpu_buffer->read_stamp = 0;
3387
3388 rb_head_page_activate(cpu_buffer);
2683} 3389}
2684 3390
2685/** 3391/**
@@ -2699,12 +3405,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2699 3405
2700 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3406 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2701 3407
3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409 goto out;
3410
2702 __raw_spin_lock(&cpu_buffer->lock); 3411 __raw_spin_lock(&cpu_buffer->lock);
2703 3412
2704 rb_reset_cpu(cpu_buffer); 3413 rb_reset_cpu(cpu_buffer);
2705 3414
2706 __raw_spin_unlock(&cpu_buffer->lock); 3415 __raw_spin_unlock(&cpu_buffer->lock);
2707 3416
3417 out:
2708 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2709 3419
2710 atomic_dec(&cpu_buffer->record_disabled); 3420 atomic_dec(&cpu_buffer->record_disabled);
@@ -2731,12 +3441,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2731int ring_buffer_empty(struct ring_buffer *buffer) 3441int ring_buffer_empty(struct ring_buffer *buffer)
2732{ 3442{
2733 struct ring_buffer_per_cpu *cpu_buffer; 3443 struct ring_buffer_per_cpu *cpu_buffer;
3444 unsigned long flags;
3445 int dolock;
2734 int cpu; 3446 int cpu;
3447 int ret;
3448
3449 dolock = rb_ok_to_lock();
2735 3450
2736 /* yes this is racy, but if you don't like the race, lock the buffer */ 3451 /* yes this is racy, but if you don't like the race, lock the buffer */
2737 for_each_buffer_cpu(buffer, cpu) { 3452 for_each_buffer_cpu(buffer, cpu) {
2738 cpu_buffer = buffer->buffers[cpu]; 3453 cpu_buffer = buffer->buffers[cpu];
2739 if (!rb_per_cpu_empty(cpu_buffer)) 3454 local_irq_save(flags);
3455 if (dolock)
3456 spin_lock(&cpu_buffer->reader_lock);
3457 ret = rb_per_cpu_empty(cpu_buffer);
3458 if (dolock)
3459 spin_unlock(&cpu_buffer->reader_lock);
3460 local_irq_restore(flags);
3461
3462 if (!ret)
2740 return 0; 3463 return 0;
2741 } 3464 }
2742 3465
@@ -2752,19 +3475,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2752int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 3475int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2753{ 3476{
2754 struct ring_buffer_per_cpu *cpu_buffer; 3477 struct ring_buffer_per_cpu *cpu_buffer;
3478 unsigned long flags;
3479 int dolock;
2755 int ret; 3480 int ret;
2756 3481
2757 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3482 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2758 return 1; 3483 return 1;
2759 3484
3485 dolock = rb_ok_to_lock();
3486
2760 cpu_buffer = buffer->buffers[cpu]; 3487 cpu_buffer = buffer->buffers[cpu];
3488 local_irq_save(flags);
3489 if (dolock)
3490 spin_lock(&cpu_buffer->reader_lock);
2761 ret = rb_per_cpu_empty(cpu_buffer); 3491 ret = rb_per_cpu_empty(cpu_buffer);
2762 3492 if (dolock)
3493 spin_unlock(&cpu_buffer->reader_lock);
3494 local_irq_restore(flags);
2763 3495
2764 return ret; 3496 return ret;
2765} 3497}
2766EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3498EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2767 3499
3500#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2768/** 3501/**
2769 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3502 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2770 * @buffer_a: One buffer to swap with 3503 * @buffer_a: One buffer to swap with
@@ -2819,20 +3552,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2819 atomic_inc(&cpu_buffer_a->record_disabled); 3552 atomic_inc(&cpu_buffer_a->record_disabled);
2820 atomic_inc(&cpu_buffer_b->record_disabled); 3553 atomic_inc(&cpu_buffer_b->record_disabled);
2821 3554
3555 ret = -EBUSY;
3556 if (local_read(&cpu_buffer_a->committing))
3557 goto out_dec;
3558 if (local_read(&cpu_buffer_b->committing))
3559 goto out_dec;
3560
2822 buffer_a->buffers[cpu] = cpu_buffer_b; 3561 buffer_a->buffers[cpu] = cpu_buffer_b;
2823 buffer_b->buffers[cpu] = cpu_buffer_a; 3562 buffer_b->buffers[cpu] = cpu_buffer_a;
2824 3563
2825 cpu_buffer_b->buffer = buffer_a; 3564 cpu_buffer_b->buffer = buffer_a;
2826 cpu_buffer_a->buffer = buffer_b; 3565 cpu_buffer_a->buffer = buffer_b;
2827 3566
3567 ret = 0;
3568
3569out_dec:
2828 atomic_dec(&cpu_buffer_a->record_disabled); 3570 atomic_dec(&cpu_buffer_a->record_disabled);
2829 atomic_dec(&cpu_buffer_b->record_disabled); 3571 atomic_dec(&cpu_buffer_b->record_disabled);
2830
2831 ret = 0;
2832out: 3572out:
2833 return ret; 3573 return ret;
2834} 3574}
2835EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3575EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3576#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2836 3577
2837/** 3578/**
2838 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3579 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3005,7 +3746,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3005 read = 0; 3746 read = 0;
3006 } else { 3747 } else {
3007 /* update the entry counter */ 3748 /* update the entry counter */
3008 cpu_buffer->read += local_read(&reader->entries); 3749 cpu_buffer->read += rb_page_entries(reader);
3009 3750
3010 /* swap the pages */ 3751 /* swap the pages */
3011 rb_init_page(bpage); 3752 rb_init_page(bpage);
@@ -3026,6 +3767,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3026} 3767}
3027EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3768EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3028 3769
3770#ifdef CONFIG_TRACING
3029static ssize_t 3771static ssize_t
3030rb_simple_read(struct file *filp, char __user *ubuf, 3772rb_simple_read(struct file *filp, char __user *ubuf,
3031 size_t cnt, loff_t *ppos) 3773 size_t cnt, loff_t *ppos)
@@ -3093,6 +3835,7 @@ static __init int rb_init_debugfs(void)
3093} 3835}
3094 3836
3095fs_initcall(rb_init_debugfs); 3837fs_initcall(rb_init_debugfs);
3838#endif
3096 3839
3097#ifdef CONFIG_HOTPLUG_CPU 3840#ifdef CONFIG_HOTPLUG_CPU
3098static int rb_cpu_notify(struct notifier_block *self, 3841static int rb_cpu_notify(struct notifier_block *self,
@@ -3105,7 +3848,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3105 switch (action) { 3848 switch (action) {
3106 case CPU_UP_PREPARE: 3849 case CPU_UP_PREPARE:
3107 case CPU_UP_PREPARE_FROZEN: 3850 case CPU_UP_PREPARE_FROZEN:
3108 if (cpu_isset(cpu, *buffer->cpumask)) 3851 if (cpumask_test_cpu(cpu, buffer->cpumask))
3109 return NOTIFY_OK; 3852 return NOTIFY_OK;
3110 3853
3111 buffer->buffers[cpu] = 3854 buffer->buffers[cpu] =
@@ -3116,7 +3859,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3116 return NOTIFY_OK; 3859 return NOTIFY_OK;
3117 } 3860 }
3118 smp_wmb(); 3861 smp_wmb();
3119 cpu_set(cpu, *buffer->cpumask); 3862 cpumask_set_cpu(cpu, buffer->cpumask);
3120 break; 3863 break;
3121 case CPU_DOWN_PREPARE: 3864 case CPU_DOWN_PREPARE:
3122 case CPU_DOWN_PREPARE_FROZEN: 3865 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8d68e149a8b3..573d3cc762c3 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -102,8 +102,10 @@ static enum event_status read_page(int cpu)
102 event = (void *)&rpage->data[i]; 102 event = (void *)&rpage->data[i];
103 switch (event->type_len) { 103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING: 104 case RINGBUF_TYPE_PADDING:
105 /* We don't expect any padding */ 105 /* failed writes may be discarded events */
106 KILL_TEST(); 106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
107 break; 109 break;
108 case RINGBUF_TYPE_TIME_EXTEND: 110 case RINGBUF_TYPE_TIME_EXTEND:
109 inc = 8; 111 inc = 8;
@@ -119,7 +121,7 @@ static enum event_status read_page(int cpu)
119 KILL_TEST(); 121 KILL_TEST();
120 break; 122 break;
121 } 123 }
122 inc = event->array[0]; 124 inc = event->array[0] + 4;
123 break; 125 break;
124 default: 126 default:
125 entry = ring_buffer_event_data(event); 127 entry = ring_buffer_event_data(event);
@@ -201,7 +203,7 @@ static void ring_buffer_producer(void)
201 * Hammer the buffer for 10 secs (this may 203 * Hammer the buffer for 10 secs (this may
202 * make the system stall) 204 * make the system stall)
203 */ 205 */
204 pr_info("Starting ring buffer hammer\n"); 206 trace_printk("Starting ring buffer hammer\n");
205 do_gettimeofday(&start_tv); 207 do_gettimeofday(&start_tv);
206 do { 208 do {
207 struct ring_buffer_event *event; 209 struct ring_buffer_event *event;
@@ -237,7 +239,7 @@ static void ring_buffer_producer(void)
237#endif 239#endif
238 240
239 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
240 pr_info("End ring buffer hammer\n"); 242 trace_printk("End ring buffer hammer\n");
241 243
242 if (consumer) { 244 if (consumer) {
243 /* Init both completions here to avoid races */ 245 /* Init both completions here to avoid races */
@@ -260,49 +262,50 @@ static void ring_buffer_producer(void)
260 overruns = ring_buffer_overruns(buffer); 262 overruns = ring_buffer_overruns(buffer);
261 263
262 if (kill_test) 264 if (kill_test)
263 pr_info("ERROR!\n"); 265 trace_printk("ERROR!\n");
264 pr_info("Time: %lld (usecs)\n", time); 266 trace_printk("Time: %lld (usecs)\n", time);
265 pr_info("Overruns: %lld\n", overruns); 267 trace_printk("Overruns: %lld\n", overruns);
266 if (disable_reader) 268 if (disable_reader)
267 pr_info("Read: (reader disabled)\n"); 269 trace_printk("Read: (reader disabled)\n");
268 else 270 else
269 pr_info("Read: %ld (by %s)\n", read, 271 trace_printk("Read: %ld (by %s)\n", read,
270 read_events ? "events" : "pages"); 272 read_events ? "events" : "pages");
271 pr_info("Entries: %lld\n", entries); 273 trace_printk("Entries: %lld\n", entries);
272 pr_info("Total: %lld\n", entries + overruns + read); 274 trace_printk("Total: %lld\n", entries + overruns + read);
273 pr_info("Missed: %ld\n", missed); 275 trace_printk("Missed: %ld\n", missed);
274 pr_info("Hit: %ld\n", hit); 276 trace_printk("Hit: %ld\n", hit);
275 277
276 /* Convert time from usecs to millisecs */ 278 /* Convert time from usecs to millisecs */
277 do_div(time, USEC_PER_MSEC); 279 do_div(time, USEC_PER_MSEC);
278 if (time) 280 if (time)
279 hit /= (long)time; 281 hit /= (long)time;
280 else 282 else
281 pr_info("TIME IS ZERO??\n"); 283 trace_printk("TIME IS ZERO??\n");
282 284
283 pr_info("Entries per millisec: %ld\n", hit); 285 trace_printk("Entries per millisec: %ld\n", hit);
284 286
285 if (hit) { 287 if (hit) {
286 /* Calculate the average time in nanosecs */ 288 /* Calculate the average time in nanosecs */
287 avg = NSEC_PER_MSEC / hit; 289 avg = NSEC_PER_MSEC / hit;
288 pr_info("%ld ns per entry\n", avg); 290 trace_printk("%ld ns per entry\n", avg);
289 } 291 }
290 292
291 if (missed) { 293 if (missed) {
292 if (time) 294 if (time)
293 missed /= (long)time; 295 missed /= (long)time;
294 296
295 pr_info("Total iterations per millisec: %ld\n", hit + missed); 297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
296 299
297 /* it is possible that hit + missed will overflow and be zero */ 300 /* it is possible that hit + missed will overflow and be zero */
298 if (!(hit + missed)) { 301 if (!(hit + missed)) {
299 pr_info("hit + missed overflowed and totalled zero!\n"); 302 trace_printk("hit + missed overflowed and totalled zero!\n");
300 hit--; /* make it non zero */ 303 hit--; /* make it non zero */
301 } 304 }
302 305
303 /* Calculate the average time in nanosecs */ 306
304 avg = NSEC_PER_MSEC / (hit + missed); 307 avg = NSEC_PER_MSEC / (hit + missed);
305 pr_info("%ld ns per entry\n", avg); 308 trace_printk("%ld ns per entry\n", avg);
306 } 309 }
307} 310}
308 311
@@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg)
353 356
354 ring_buffer_producer(); 357 ring_buffer_producer();
355 358
356 pr_info("Sleeping for 10 secs\n"); 359 trace_printk("Sleeping for 10 secs\n");
357 set_current_state(TASK_INTERRUPTIBLE); 360 set_current_state(TASK_INTERRUPTIBLE);
358 schedule_timeout(HZ * SLEEP_TIME); 361 schedule_timeout(HZ * SLEEP_TIME);
359 __set_current_state(TASK_RUNNING); 362 __set_current_state(TASK_RUNNING);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8acd9b81a5d7..fd52a19dd172 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -42,14 +43,11 @@
42 43
43#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
44 45
45unsigned long __read_mostly tracing_max_latency;
46unsigned long __read_mostly tracing_thresh;
47
48/* 46/*
49 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
50 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
51 */ 49 */
52static int ring_buffer_expanded; 50int ring_buffer_expanded;
53 51
54/* 52/*
55 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -63,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
63/* 61/*
64 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
65 */ 63 */
66static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
67 65
68/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
69static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -88,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
88 */ 86 */
89static int tracing_disabled = 1; 87static int tracing_disabled = 1;
90 88
91static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
92 90
93static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
94{ 92{
@@ -171,10 +169,11 @@ static struct trace_array global_trace;
171 169
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 171
174int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
175 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
176{ 175{
177 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
178} 177}
179EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
180 179
@@ -265,6 +264,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
265 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 264 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
266 TRACE_ITER_GRAPH_TIME; 265 TRACE_ITER_GRAPH_TIME;
267 266
267static int trace_stop_count;
268static DEFINE_SPINLOCK(tracing_start_lock);
269
268/** 270/**
269 * trace_wake_up - wake up tasks waiting for trace input 271 * trace_wake_up - wake up tasks waiting for trace input
270 * 272 *
@@ -284,13 +286,12 @@ void trace_wake_up(void)
284static int __init set_buf_size(char *str) 286static int __init set_buf_size(char *str)
285{ 287{
286 unsigned long buf_size; 288 unsigned long buf_size;
287 int ret;
288 289
289 if (!str) 290 if (!str)
290 return 0; 291 return 0;
291 ret = strict_strtoul(str, 0, &buf_size); 292 buf_size = memparse(str, &str);
292 /* nr_entries can not be zero */ 293 /* nr_entries can not be zero */
293 if (ret < 0 || buf_size == 0) 294 if (buf_size == 0)
294 return 0; 295 return 0;
295 trace_buf_size = buf_size; 296 trace_buf_size = buf_size;
296 return 1; 297 return 1;
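
Replacing strict_strtoul() with memparse() means the trace_buf_size= boot parameter now accepts the usual K/M/G size suffixes. A hedged sketch of the resulting behaviour (the function name below is illustrative; memparse() is the kernel helper declared in <linux/kernel.h>):

static int __init example_set_buf_size(char *str)
{
	unsigned long buf_size;

	if (!str)
		return 0;
	buf_size = memparse(str, &str);	/* "128K" -> 131072, "1M" -> 1048576 */
	if (buf_size == 0)		/* nr_entries can not be zero */
		return 0;
	/* trace_buf_size = buf_size; */
	return 1;
}
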
@@ -323,49 +324,125 @@ static const char *trace_options[] = {
323 "printk-msg-only", 324 "printk-msg-only",
324 "context-info", 325 "context-info",
325 "latency-format", 326 "latency-format",
326 "global-clock",
327 "sleep-time", 327 "sleep-time",
328 "graph-time", 328 "graph-time",
329 NULL 329 NULL
330}; 330};
331 331
332static struct {
333 u64 (*func)(void);
334 const char *name;
335} trace_clocks[] = {
336 { trace_clock_local, "local" },
337 { trace_clock_global, "global" },
338};
339
340int trace_clock_id;
341
332/* 342/*
333 * ftrace_max_lock is used to protect the swapping of buffers 343 * trace_parser_get_init - gets the buffer for trace parser
334 * when taking a max snapshot. The buffers themselves are
335 * protected by per_cpu spinlocks. But the action of the swap
336 * needs its own lock.
337 *
338 * This is defined as a raw_spinlock_t in order to help
339 * with performance when lockdep debugging is enabled.
340 */ 344 */
341static raw_spinlock_t ftrace_max_lock = 345int trace_parser_get_init(struct trace_parser *parser, int size)
342 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 346{
347 memset(parser, 0, sizeof(*parser));
348
349 parser->buffer = kmalloc(size, GFP_KERNEL);
350 if (!parser->buffer)
351 return 1;
352
353 parser->size = size;
354 return 0;
355}
343 356
344/* 357/*
345 * Copy the new maximum trace into the separate maximum-trace 358 * trace_parser_put - frees the buffer for trace parser
346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /debugfs/tracing/latency_trace)
348 */ 359 */
349static void 360void trace_parser_put(struct trace_parser *parser)
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
351{ 361{
352 struct trace_array_cpu *data = tr->data[cpu]; 362 kfree(parser->buffer);
363}
353 364
354 max_tr.cpu = cpu; 365/*
355 max_tr.time_start = data->preempt_timestamp; 366 * trace_get_user - reads the user input string separated by space
367 * (matched by isspace(ch))
368 *
369 * For each string found the 'struct trace_parser' is updated,
370 * and the function returns.
371 *
372 * Returns number of bytes read.
373 *
374 * See kernel/trace/trace.h for 'struct trace_parser' details.
375 */
376int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
377 size_t cnt, loff_t *ppos)
378{
379 char ch;
380 size_t read = 0;
381 ssize_t ret;
356 382
357 data = max_tr.data[cpu]; 383 if (!*ppos)
358 data->saved_latency = tracing_max_latency; 384 trace_parser_clear(parser);
359 385
360 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 386 ret = get_user(ch, ubuf++);
361 data->pid = tsk->pid; 387 if (ret)
362 data->uid = task_uid(tsk); 388 goto out;
363 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
364 data->policy = tsk->policy;
365 data->rt_priority = tsk->rt_priority;
366 389
367 /* record this tasks comm */ 390 read++;
368 tracing_record_cmdline(tsk); 391 cnt--;
392
393 /*
394 * The parser is not finished with the last write,
395 * continue reading the user input without skipping spaces.
396 */
397 if (!parser->cont) {
398 /* skip white space */
399 while (cnt && isspace(ch)) {
400 ret = get_user(ch, ubuf++);
401 if (ret)
402 goto out;
403 read++;
404 cnt--;
405 }
406
407 /* only spaces were written */
408 if (isspace(ch)) {
409 *ppos += read;
410 ret = read;
411 goto out;
412 }
413
414 parser->idx = 0;
415 }
416
417 /* read the non-space input */
418 while (cnt && !isspace(ch)) {
419 if (parser->idx < parser->size)
420 parser->buffer[parser->idx++] = ch;
421 else {
422 ret = -EINVAL;
423 goto out;
424 }
425 ret = get_user(ch, ubuf++);
426 if (ret)
427 goto out;
428 read++;
429 cnt--;
430 }
431
432 /* We either got finished input or we have to wait for another call. */
433 if (isspace(ch)) {
434 parser->buffer[parser->idx] = 0;
435 parser->cont = false;
436 } else {
437 parser->cont = true;
438 parser->buffer[parser->idx++] = ch;
439 }
440
441 *ppos += read;
442 ret = read;
443
444out:
445 return ret;
369} 446}
370 447
371ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 448ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
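
The trace_parser helpers introduced above are intended to be driven from a debugfs write handler: allocate the parser once, feed each write through trace_get_user(), and consume one space-delimited token from parser.buffer whenever parser.cont is false. A rough sketch of such a consumer (the handler name and the pr_info() token handling are assumptions, not part of this patch):

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read;

	if (trace_parser_get_init(&parser, 64))
		return -ENOMEM;

	read = trace_get_user(&parser, ubuf, cnt, ppos);
	if (read > 0 && !parser.cont) {
		/* parser.buffer holds one NUL-terminated token */
		pr_info("parsed token: %s\n", parser.buffer);
	}

	trace_parser_put(&parser);
	return read;
}
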
@@ -411,6 +488,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
411 return cnt; 488 return cnt;
412} 489}
413 490
491/*
492 * ftrace_max_lock is used to protect the swapping of buffers
493 * when taking a max snapshot. The buffers themselves are
494 * protected by per_cpu spinlocks. But the action of the swap
495 * needs its own lock.
496 *
497 * This is defined as a raw_spinlock_t in order to help
498 * with performance when lockdep debugging is enabled.
499 *
500 * It is also used in other places outside the update_max_tr
501 * so it needs to be defined outside of the
502 * CONFIG_TRACER_MAX_TRACE.
503 */
504static raw_spinlock_t ftrace_max_lock =
505 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
506
507#ifdef CONFIG_TRACER_MAX_TRACE
508unsigned long __read_mostly tracing_max_latency;
509unsigned long __read_mostly tracing_thresh;
510
511/*
512 * Copy the new maximum trace into the separate maximum-trace
513 * structure. (this way the maximum trace is permanently saved,
514 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
515 */
516static void
517__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
518{
519 struct trace_array_cpu *data = tr->data[cpu];
520 struct trace_array_cpu *max_data = tr->data[cpu];
521
522 max_tr.cpu = cpu;
523 max_tr.time_start = data->preempt_timestamp;
524
525 max_data = max_tr.data[cpu];
526 max_data->saved_latency = tracing_max_latency;
527 max_data->critical_start = data->critical_start;
528 max_data->critical_end = data->critical_end;
529
530 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
531 max_data->pid = tsk->pid;
532 max_data->uid = task_uid(tsk);
533 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
534 max_data->policy = tsk->policy;
535 max_data->rt_priority = tsk->rt_priority;
536
537 /* record this tasks comm */
538 tracing_record_cmdline(tsk);
539}
540
414/** 541/**
415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 542 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
416 * @tr: tracer 543 * @tr: tracer
@@ -425,16 +552,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
425{ 552{
426 struct ring_buffer *buf = tr->buffer; 553 struct ring_buffer *buf = tr->buffer;
427 554
555 if (trace_stop_count)
556 return;
557
428 WARN_ON_ONCE(!irqs_disabled()); 558 WARN_ON_ONCE(!irqs_disabled());
429 __raw_spin_lock(&ftrace_max_lock); 559 __raw_spin_lock(&ftrace_max_lock);
430 560
431 tr->buffer = max_tr.buffer; 561 tr->buffer = max_tr.buffer;
432 max_tr.buffer = buf; 562 max_tr.buffer = buf;
433 563
434 ftrace_disable_cpu();
435 ring_buffer_reset(tr->buffer);
436 ftrace_enable_cpu();
437
438 __update_max_tr(tr, tsk, cpu); 564 __update_max_tr(tr, tsk, cpu);
439 __raw_spin_unlock(&ftrace_max_lock); 565 __raw_spin_unlock(&ftrace_max_lock);
440} 566}
@@ -452,21 +578,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
452{ 578{
453 int ret; 579 int ret;
454 580
581 if (trace_stop_count)
582 return;
583
455 WARN_ON_ONCE(!irqs_disabled()); 584 WARN_ON_ONCE(!irqs_disabled());
456 __raw_spin_lock(&ftrace_max_lock); 585 __raw_spin_lock(&ftrace_max_lock);
457 586
458 ftrace_disable_cpu(); 587 ftrace_disable_cpu();
459 588
460 ring_buffer_reset(max_tr.buffer);
461 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 589 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
462 590
591 if (ret == -EBUSY) {
592 /*
593 * We failed to swap the buffer due to a commit taking
594 * place on this CPU. We fail to record, but we reset
595 * the max trace buffer (no one writes directly to it)
596 * and flag that it failed.
597 */
598 trace_array_printk(&max_tr, _THIS_IP_,
599 "Failed to swap buffers due to commit in progress\n");
600 }
601
463 ftrace_enable_cpu(); 602 ftrace_enable_cpu();
464 603
465 WARN_ON_ONCE(ret && ret != -EAGAIN); 604 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
466 605
467 __update_max_tr(tr, tsk, cpu); 606 __update_max_tr(tr, tsk, cpu);
468 __raw_spin_unlock(&ftrace_max_lock); 607 __raw_spin_unlock(&ftrace_max_lock);
469} 608}
609#endif /* CONFIG_TRACER_MAX_TRACE */
470 610
471/** 611/**
472 * register_tracer - register a tracer with the ftrace system. 612 * register_tracer - register a tracer with the ftrace system.
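
A note on the hunk above: update_max_tr_single() no longer resets max_tr.buffer before swapping, so ring_buffer_swap_cpu() can now fail with -EBUSY when a commit is in flight on that CPU. The resulting control flow, reduced to a sketch (locking and the ftrace_disable_cpu() pair omitted):

	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
	if (ret == -EBUSY) {
		/* swap skipped; note the failure in max_tr so the report explains it */
		trace_array_printk(&max_tr, _THIS_IP_,
				   "Failed to swap buffers due to commit in progress\n");
	}
	/* 0 means swapped; -EAGAIN and -EBUSY are tolerated without a warning */
	WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
	__update_max_tr(tr, tsk, cpu);
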
@@ -523,7 +663,6 @@ __acquires(kernel_lock)
523 if (type->selftest && !tracing_selftest_disabled) { 663 if (type->selftest && !tracing_selftest_disabled) {
524 struct tracer *saved_tracer = current_trace; 664 struct tracer *saved_tracer = current_trace;
525 struct trace_array *tr = &global_trace; 665 struct trace_array *tr = &global_trace;
526 int i;
527 666
528 /* 667 /*
529 * Run a selftest on this tracer. 668 * Run a selftest on this tracer.
@@ -532,8 +671,7 @@ __acquires(kernel_lock)
532 * internal tracing to verify that everything is in order. 671 * internal tracing to verify that everything is in order.
533 * If we fail, we do not register this tracer. 672 * If we fail, we do not register this tracer.
534 */ 673 */
535 for_each_tracing_cpu(i) 674 tracing_reset_online_cpus(tr);
536 tracing_reset(tr, i);
537 675
538 current_trace = type; 676 current_trace = type;
539 /* the test is responsible for initializing and enabling */ 677 /* the test is responsible for initializing and enabling */
@@ -546,8 +684,7 @@ __acquires(kernel_lock)
546 goto out; 684 goto out;
547 } 685 }
548 /* Only reset on passing, to avoid touching corrupted buffers */ 686 /* Only reset on passing, to avoid touching corrupted buffers */
549 for_each_tracing_cpu(i) 687 tracing_reset_online_cpus(tr);
550 tracing_reset(tr, i);
551 688
552 printk(KERN_CONT "PASSED\n"); 689 printk(KERN_CONT "PASSED\n");
553 } 690 }
@@ -622,21 +759,42 @@ void unregister_tracer(struct tracer *type)
622 mutex_unlock(&trace_types_lock); 759 mutex_unlock(&trace_types_lock);
623} 760}
624 761
625void tracing_reset(struct trace_array *tr, int cpu) 762static void __tracing_reset(struct trace_array *tr, int cpu)
626{ 763{
627 ftrace_disable_cpu(); 764 ftrace_disable_cpu();
628 ring_buffer_reset_cpu(tr->buffer, cpu); 765 ring_buffer_reset_cpu(tr->buffer, cpu);
629 ftrace_enable_cpu(); 766 ftrace_enable_cpu();
630} 767}
631 768
769void tracing_reset(struct trace_array *tr, int cpu)
770{
771 struct ring_buffer *buffer = tr->buffer;
772
773 ring_buffer_record_disable(buffer);
774
775 /* Make sure all commits have finished */
776 synchronize_sched();
777 __tracing_reset(tr, cpu);
778
779 ring_buffer_record_enable(buffer);
780}
781
632void tracing_reset_online_cpus(struct trace_array *tr) 782void tracing_reset_online_cpus(struct trace_array *tr)
633{ 783{
784 struct ring_buffer *buffer = tr->buffer;
634 int cpu; 785 int cpu;
635 786
787 ring_buffer_record_disable(buffer);
788
789 /* Make sure all commits have finished */
790 synchronize_sched();
791
636 tr->time_start = ftrace_now(tr->cpu); 792 tr->time_start = ftrace_now(tr->cpu);
637 793
638 for_each_online_cpu(cpu) 794 for_each_online_cpu(cpu)
639 tracing_reset(tr, cpu); 795 __tracing_reset(tr, cpu);
796
797 ring_buffer_record_enable(buffer);
640} 798}
641 799
642void tracing_reset_current(int cpu) 800void tracing_reset_current(int cpu)
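
Both reset paths above follow the same quiesce-then-reset sequence, so a writer can never commit into a buffer that is being cleared underneath it. The pattern, reduced to its four steps (buffer and cpu as used in the hunk):

	ring_buffer_record_disable(buffer);	/* refuse new events                 */
	synchronize_sched();			/* wait for in-flight commits to end */
	ring_buffer_reset_cpu(buffer, cpu);	/* now safe to drop the data         */
	ring_buffer_record_enable(buffer);	/* resume recording                  */
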
@@ -667,8 +825,10 @@ static void trace_init_cmdlines(void)
667 cmdline_idx = 0; 825 cmdline_idx = 0;
668} 826}
669 827
670static int trace_stop_count; 828int is_tracing_stopped(void)
671static DEFINE_SPINLOCK(tracing_start_lock); 829{
830 return trace_stop_count;
831}
672 832
673/** 833/**
674 * ftrace_off_permanent - disable all ftrace code permanently 834 * ftrace_off_permanent - disable all ftrace code permanently
@@ -837,7 +997,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
837 997
838 entry->preempt_count = pc & 0xff; 998 entry->preempt_count = pc & 0xff;
839 entry->pid = (tsk) ? tsk->pid : 0; 999 entry->pid = (tsk) ? tsk->pid : 0;
840 entry->tgid = (tsk) ? tsk->tgid : 0; 1000 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
841 entry->flags = 1001 entry->flags =
842#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1002#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
843 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1003 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -848,15 +1008,17 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1008 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1009 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
850} 1010}
1011EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
851 1012
852struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 1013struct ring_buffer_event *
853 int type, 1014trace_buffer_lock_reserve(struct ring_buffer *buffer,
854 unsigned long len, 1015 int type,
855 unsigned long flags, int pc) 1016 unsigned long len,
1017 unsigned long flags, int pc)
856{ 1018{
857 struct ring_buffer_event *event; 1019 struct ring_buffer_event *event;
858 1020
859 event = ring_buffer_lock_reserve(tr->buffer, len); 1021 event = ring_buffer_lock_reserve(buffer, len);
860 if (event != NULL) { 1022 if (event != NULL) {
861 struct trace_entry *ent = ring_buffer_event_data(event); 1023 struct trace_entry *ent = ring_buffer_event_data(event);
862 1024
@@ -866,58 +1028,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
866 1028
867 return event; 1029 return event;
868} 1030}
869static void ftrace_trace_stack(struct trace_array *tr,
870 unsigned long flags, int skip, int pc);
871static void ftrace_trace_userstack(struct trace_array *tr,
872 unsigned long flags, int pc);
873 1031
874static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 1032static inline void
875 struct ring_buffer_event *event, 1033__trace_buffer_unlock_commit(struct ring_buffer *buffer,
876 unsigned long flags, int pc, 1034 struct ring_buffer_event *event,
877 int wake) 1035 unsigned long flags, int pc,
1036 int wake)
878{ 1037{
879 ring_buffer_unlock_commit(tr->buffer, event); 1038 ring_buffer_unlock_commit(buffer, event);
880 1039
881 ftrace_trace_stack(tr, flags, 6, pc); 1040 ftrace_trace_stack(buffer, flags, 6, pc);
882 ftrace_trace_userstack(tr, flags, pc); 1041 ftrace_trace_userstack(buffer, flags, pc);
883 1042
884 if (wake) 1043 if (wake)
885 trace_wake_up(); 1044 trace_wake_up();
886} 1045}
887 1046
888void trace_buffer_unlock_commit(struct trace_array *tr, 1047void trace_buffer_unlock_commit(struct ring_buffer *buffer,
889 struct ring_buffer_event *event, 1048 struct ring_buffer_event *event,
890 unsigned long flags, int pc) 1049 unsigned long flags, int pc)
891{ 1050{
892 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 1051 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
893} 1052}
894 1053
895struct ring_buffer_event * 1054struct ring_buffer_event *
896trace_current_buffer_lock_reserve(int type, unsigned long len, 1055trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1056 int type, unsigned long len,
897 unsigned long flags, int pc) 1057 unsigned long flags, int pc)
898{ 1058{
899 return trace_buffer_lock_reserve(&global_trace, 1059 *current_rb = global_trace.buffer;
1060 return trace_buffer_lock_reserve(*current_rb,
900 type, len, flags, pc); 1061 type, len, flags, pc);
901} 1062}
902EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 1063EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
903 1064
904void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 1065void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1066 struct ring_buffer_event *event,
905 unsigned long flags, int pc) 1067 unsigned long flags, int pc)
906{ 1068{
907 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 1069 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
908} 1070}
909EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1071EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
910 1072
911void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 1073void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
912 unsigned long flags, int pc) 1074 struct ring_buffer_event *event,
1075 unsigned long flags, int pc)
913{ 1076{
914 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 1077 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
915} 1078}
916EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1079EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
917 1080
918void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 1081void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1082 struct ring_buffer_event *event)
919{ 1083{
920 ring_buffer_discard_commit(global_trace.buffer, event); 1084 ring_buffer_discard_commit(buffer, event);
921} 1085}
922EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 1086EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
923 1087
@@ -927,6 +1091,7 @@ trace_function(struct trace_array *tr,
927 int pc) 1091 int pc)
928{ 1092{
929 struct ftrace_event_call *call = &event_function; 1093 struct ftrace_event_call *call = &event_function;
1094 struct ring_buffer *buffer = tr->buffer;
930 struct ring_buffer_event *event; 1095 struct ring_buffer_event *event;
931 struct ftrace_entry *entry; 1096 struct ftrace_entry *entry;
932 1097
@@ -934,7 +1099,7 @@ trace_function(struct trace_array *tr,
934 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1099 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
935 return; 1100 return;
936 1101
937 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 1102 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
938 flags, pc); 1103 flags, pc);
939 if (!event) 1104 if (!event)
940 return; 1105 return;
@@ -942,58 +1107,10 @@ trace_function(struct trace_array *tr,
942 entry->ip = ip; 1107 entry->ip = ip;
943 entry->parent_ip = parent_ip; 1108 entry->parent_ip = parent_ip;
944 1109
945 if (!filter_check_discard(call, entry, tr->buffer, event)) 1110 if (!filter_check_discard(call, entry, buffer, event))
946 ring_buffer_unlock_commit(tr->buffer, event); 1111 ring_buffer_unlock_commit(buffer, event);
947} 1112}
948 1113
949#ifdef CONFIG_FUNCTION_GRAPH_TRACER
950static int __trace_graph_entry(struct trace_array *tr,
951 struct ftrace_graph_ent *trace,
952 unsigned long flags,
953 int pc)
954{
955 struct ftrace_event_call *call = &event_funcgraph_entry;
956 struct ring_buffer_event *event;
957 struct ftrace_graph_ent_entry *entry;
958
959 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
960 return 0;
961
962 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
963 sizeof(*entry), flags, pc);
964 if (!event)
965 return 0;
966 entry = ring_buffer_event_data(event);
967 entry->graph_ent = *trace;
968 if (!filter_current_check_discard(call, entry, event))
969 ring_buffer_unlock_commit(global_trace.buffer, event);
970
971 return 1;
972}
973
974static void __trace_graph_return(struct trace_array *tr,
975 struct ftrace_graph_ret *trace,
976 unsigned long flags,
977 int pc)
978{
979 struct ftrace_event_call *call = &event_funcgraph_exit;
980 struct ring_buffer_event *event;
981 struct ftrace_graph_ret_entry *entry;
982
983 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
984 return;
985
986 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
987 sizeof(*entry), flags, pc);
988 if (!event)
989 return;
990 entry = ring_buffer_event_data(event);
991 entry->ret = *trace;
992 if (!filter_current_check_discard(call, entry, event))
993 ring_buffer_unlock_commit(global_trace.buffer, event);
994}
995#endif
996
997void 1114void
998ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1115ftrace(struct trace_array *tr, struct trace_array_cpu *data,
999 unsigned long ip, unsigned long parent_ip, unsigned long flags, 1116 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1003,17 +1120,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1003 trace_function(tr, ip, parent_ip, flags, pc); 1120 trace_function(tr, ip, parent_ip, flags, pc);
1004} 1121}
1005 1122
1006static void __ftrace_trace_stack(struct trace_array *tr, 1123#ifdef CONFIG_STACKTRACE
1124static void __ftrace_trace_stack(struct ring_buffer *buffer,
1007 unsigned long flags, 1125 unsigned long flags,
1008 int skip, int pc) 1126 int skip, int pc)
1009{ 1127{
1010#ifdef CONFIG_STACKTRACE
1011 struct ftrace_event_call *call = &event_kernel_stack; 1128 struct ftrace_event_call *call = &event_kernel_stack;
1012 struct ring_buffer_event *event; 1129 struct ring_buffer_event *event;
1013 struct stack_entry *entry; 1130 struct stack_entry *entry;
1014 struct stack_trace trace; 1131 struct stack_trace trace;
1015 1132
1016 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1133 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1017 sizeof(*entry), flags, pc); 1134 sizeof(*entry), flags, pc);
1018 if (!event) 1135 if (!event)
1019 return; 1136 return;
@@ -1026,32 +1143,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1026 trace.entries = entry->caller; 1143 trace.entries = entry->caller;
1027 1144
1028 save_stack_trace(&trace); 1145 save_stack_trace(&trace);
1029 if (!filter_check_discard(call, entry, tr->buffer, event)) 1146 if (!filter_check_discard(call, entry, buffer, event))
1030 ring_buffer_unlock_commit(tr->buffer, event); 1147 ring_buffer_unlock_commit(buffer, event);
1031#endif
1032} 1148}
1033 1149
1034static void ftrace_trace_stack(struct trace_array *tr, 1150void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1035 unsigned long flags, 1151 int skip, int pc)
1036 int skip, int pc)
1037{ 1152{
1038 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1153 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1039 return; 1154 return;
1040 1155
1041 __ftrace_trace_stack(tr, flags, skip, pc); 1156 __ftrace_trace_stack(buffer, flags, skip, pc);
1042} 1157}
1043 1158
1044void __trace_stack(struct trace_array *tr, 1159void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1045 unsigned long flags, 1160 int pc)
1046 int skip, int pc)
1047{ 1161{
1048 __ftrace_trace_stack(tr, flags, skip, pc); 1162 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1049} 1163}
1050 1164
1051static void ftrace_trace_userstack(struct trace_array *tr, 1165void
1052 unsigned long flags, int pc) 1166ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1053{ 1167{
1054#ifdef CONFIG_STACKTRACE
1055 struct ftrace_event_call *call = &event_user_stack; 1168 struct ftrace_event_call *call = &event_user_stack;
1056 struct ring_buffer_event *event; 1169 struct ring_buffer_event *event;
1057 struct userstack_entry *entry; 1170 struct userstack_entry *entry;
@@ -1060,12 +1173,13 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1060 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1173 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1061 return; 1174 return;
1062 1175
1063 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1176 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1064 sizeof(*entry), flags, pc); 1177 sizeof(*entry), flags, pc);
1065 if (!event) 1178 if (!event)
1066 return; 1179 return;
1067 entry = ring_buffer_event_data(event); 1180 entry = ring_buffer_event_data(event);
1068 1181
1182 entry->tgid = current->tgid;
1069 memset(&entry->caller, 0, sizeof(entry->caller)); 1183 memset(&entry->caller, 0, sizeof(entry->caller));
1070 1184
1071 trace.nr_entries = 0; 1185 trace.nr_entries = 0;
@@ -1074,9 +1188,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1074 trace.entries = entry->caller; 1188 trace.entries = entry->caller;
1075 1189
1076 save_stack_trace_user(&trace); 1190 save_stack_trace_user(&trace);
1077 if (!filter_check_discard(call, entry, tr->buffer, event)) 1191 if (!filter_check_discard(call, entry, buffer, event))
1078 ring_buffer_unlock_commit(tr->buffer, event); 1192 ring_buffer_unlock_commit(buffer, event);
1079#endif
1080} 1193}
1081 1194
1082#ifdef UNUSED 1195#ifdef UNUSED
@@ -1086,16 +1199,20 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1086} 1199}
1087#endif /* UNUSED */ 1200#endif /* UNUSED */
1088 1201
1202#endif /* CONFIG_STACKTRACE */
1203
1089static void 1204static void
1090ftrace_trace_special(void *__tr, 1205ftrace_trace_special(void *__tr,
1091 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1206 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1092 int pc) 1207 int pc)
1093{ 1208{
1209 struct ftrace_event_call *call = &event_special;
1094 struct ring_buffer_event *event; 1210 struct ring_buffer_event *event;
1095 struct trace_array *tr = __tr; 1211 struct trace_array *tr = __tr;
1212 struct ring_buffer *buffer = tr->buffer;
1096 struct special_entry *entry; 1213 struct special_entry *entry;
1097 1214
1098 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1215 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1099 sizeof(*entry), 0, pc); 1216 sizeof(*entry), 0, pc);
1100 if (!event) 1217 if (!event)
1101 return; 1218 return;
@@ -1103,7 +1220,9 @@ ftrace_trace_special(void *__tr,
1103 entry->arg1 = arg1; 1220 entry->arg1 = arg1;
1104 entry->arg2 = arg2; 1221 entry->arg2 = arg2;
1105 entry->arg3 = arg3; 1222 entry->arg3 = arg3;
1106 trace_buffer_unlock_commit(tr, event, 0, pc); 1223
1224 if (!filter_check_discard(call, entry, buffer, event))
1225 trace_buffer_unlock_commit(buffer, event, 0, pc);
1107} 1226}
1108 1227
1109void 1228void
@@ -1114,62 +1233,6 @@ __trace_special(void *__tr, void *__data,
1114} 1233}
1115 1234
1116void 1235void
1117tracing_sched_switch_trace(struct trace_array *tr,
1118 struct task_struct *prev,
1119 struct task_struct *next,
1120 unsigned long flags, int pc)
1121{
1122 struct ftrace_event_call *call = &event_context_switch;
1123 struct ring_buffer_event *event;
1124 struct ctx_switch_entry *entry;
1125
1126 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1127 sizeof(*entry), flags, pc);
1128 if (!event)
1129 return;
1130 entry = ring_buffer_event_data(event);
1131 entry->prev_pid = prev->pid;
1132 entry->prev_prio = prev->prio;
1133 entry->prev_state = prev->state;
1134 entry->next_pid = next->pid;
1135 entry->next_prio = next->prio;
1136 entry->next_state = next->state;
1137 entry->next_cpu = task_cpu(next);
1138
1139 if (!filter_check_discard(call, entry, tr->buffer, event))
1140 trace_buffer_unlock_commit(tr, event, flags, pc);
1141}
1142
1143void
1144tracing_sched_wakeup_trace(struct trace_array *tr,
1145 struct task_struct *wakee,
1146 struct task_struct *curr,
1147 unsigned long flags, int pc)
1148{
1149 struct ftrace_event_call *call = &event_wakeup;
1150 struct ring_buffer_event *event;
1151 struct ctx_switch_entry *entry;
1152
1153 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1154 sizeof(*entry), flags, pc);
1155 if (!event)
1156 return;
1157 entry = ring_buffer_event_data(event);
1158 entry->prev_pid = curr->pid;
1159 entry->prev_prio = curr->prio;
1160 entry->prev_state = curr->state;
1161 entry->next_pid = wakee->pid;
1162 entry->next_prio = wakee->prio;
1163 entry->next_state = wakee->state;
1164 entry->next_cpu = task_cpu(wakee);
1165
1166 if (!filter_check_discard(call, entry, tr->buffer, event))
1167 ring_buffer_unlock_commit(tr->buffer, event);
1168 ftrace_trace_stack(tr, flags, 6, pc);
1169 ftrace_trace_userstack(tr, flags, pc);
1170}
1171
1172void
1173ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1236ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1174{ 1237{
1175 struct trace_array *tr = &global_trace; 1238 struct trace_array *tr = &global_trace;
@@ -1193,68 +1256,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1193 local_irq_restore(flags); 1256 local_irq_restore(flags);
1194} 1257}
1195 1258
1196#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1197int trace_graph_entry(struct ftrace_graph_ent *trace)
1198{
1199 struct trace_array *tr = &global_trace;
1200 struct trace_array_cpu *data;
1201 unsigned long flags;
1202 long disabled;
1203 int ret;
1204 int cpu;
1205 int pc;
1206
1207 if (!ftrace_trace_task(current))
1208 return 0;
1209
1210 if (!ftrace_graph_addr(trace->func))
1211 return 0;
1212
1213 local_irq_save(flags);
1214 cpu = raw_smp_processor_id();
1215 data = tr->data[cpu];
1216 disabled = atomic_inc_return(&data->disabled);
1217 if (likely(disabled == 1)) {
1218 pc = preempt_count();
1219 ret = __trace_graph_entry(tr, trace, flags, pc);
1220 } else {
1221 ret = 0;
1222 }
1223 /* Only do the atomic if it is not already set */
1224 if (!test_tsk_trace_graph(current))
1225 set_tsk_trace_graph(current);
1226
1227 atomic_dec(&data->disabled);
1228 local_irq_restore(flags);
1229
1230 return ret;
1231}
1232
1233void trace_graph_return(struct ftrace_graph_ret *trace)
1234{
1235 struct trace_array *tr = &global_trace;
1236 struct trace_array_cpu *data;
1237 unsigned long flags;
1238 long disabled;
1239 int cpu;
1240 int pc;
1241
1242 local_irq_save(flags);
1243 cpu = raw_smp_processor_id();
1244 data = tr->data[cpu];
1245 disabled = atomic_inc_return(&data->disabled);
1246 if (likely(disabled == 1)) {
1247 pc = preempt_count();
1248 __trace_graph_return(tr, trace, flags, pc);
1249 }
1250 if (!trace->depth)
1251 clear_tsk_trace_graph(current);
1252 atomic_dec(&data->disabled);
1253 local_irq_restore(flags);
1254}
1255#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1256
1257
1258/** 1259/**
1259 * trace_vbprintk - write binary msg to tracing buffer 1260 * trace_vbprintk - write binary msg to tracing buffer
1260 * 1261 *
@@ -1267,6 +1268,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1267 1268
1268 struct ftrace_event_call *call = &event_bprint; 1269 struct ftrace_event_call *call = &event_bprint;
1269 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1271 struct ring_buffer *buffer;
1270 struct trace_array *tr = &global_trace; 1272 struct trace_array *tr = &global_trace;
1271 struct trace_array_cpu *data; 1273 struct trace_array_cpu *data;
1272 struct bprint_entry *entry; 1274 struct bprint_entry *entry;
@@ -1299,7 +1301,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1299 goto out_unlock; 1301 goto out_unlock;
1300 1302
1301 size = sizeof(*entry) + sizeof(u32) * len; 1303 size = sizeof(*entry) + sizeof(u32) * len;
1302 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1304 buffer = tr->buffer;
1305 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1306 flags, pc);
1303 if (!event) 1307 if (!event)
1304 goto out_unlock; 1308 goto out_unlock;
1305 entry = ring_buffer_event_data(event); 1309 entry = ring_buffer_event_data(event);
@@ -1307,8 +1311,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1307 entry->fmt = fmt; 1311 entry->fmt = fmt;
1308 1312
1309 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1313 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1310 if (!filter_check_discard(call, entry, tr->buffer, event)) 1314 if (!filter_check_discard(call, entry, buffer, event))
1311 ring_buffer_unlock_commit(tr->buffer, event); 1315 ring_buffer_unlock_commit(buffer, event);
1312 1316
1313out_unlock: 1317out_unlock:
1314 __raw_spin_unlock(&trace_buf_lock); 1318 __raw_spin_unlock(&trace_buf_lock);
@@ -1323,14 +1327,30 @@ out:
1323} 1327}
1324EXPORT_SYMBOL_GPL(trace_vbprintk); 1328EXPORT_SYMBOL_GPL(trace_vbprintk);
1325 1329
1326int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1330int trace_array_printk(struct trace_array *tr,
1331 unsigned long ip, const char *fmt, ...)
1332{
1333 int ret;
1334 va_list ap;
1335
1336 if (!(trace_flags & TRACE_ITER_PRINTK))
1337 return 0;
1338
1339 va_start(ap, fmt);
1340 ret = trace_array_vprintk(tr, ip, fmt, ap);
1341 va_end(ap);
1342 return ret;
1343}
1344
1345int trace_array_vprintk(struct trace_array *tr,
1346 unsigned long ip, const char *fmt, va_list args)
1327{ 1347{
1328 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1348 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1329 static char trace_buf[TRACE_BUF_SIZE]; 1349 static char trace_buf[TRACE_BUF_SIZE];
1330 1350
1331 struct ftrace_event_call *call = &event_print; 1351 struct ftrace_event_call *call = &event_print;
1332 struct ring_buffer_event *event; 1352 struct ring_buffer_event *event;
1333 struct trace_array *tr = &global_trace; 1353 struct ring_buffer *buffer;
1334 struct trace_array_cpu *data; 1354 struct trace_array_cpu *data;
1335 int cpu, len = 0, size, pc; 1355 int cpu, len = 0, size, pc;
1336 struct print_entry *entry; 1356 struct print_entry *entry;
@@ -1358,7 +1378,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1358 trace_buf[len] = 0; 1378 trace_buf[len] = 0;
1359 1379
1360 size = sizeof(*entry) + len + 1; 1380 size = sizeof(*entry) + len + 1;
1361 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1381 buffer = tr->buffer;
1382 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1383 irq_flags, pc);
1362 if (!event) 1384 if (!event)
1363 goto out_unlock; 1385 goto out_unlock;
1364 entry = ring_buffer_event_data(event); 1386 entry = ring_buffer_event_data(event);
@@ -1366,8 +1388,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1366 1388
1367 memcpy(&entry->buf, trace_buf, len); 1389 memcpy(&entry->buf, trace_buf, len);
1368 entry->buf[len] = 0; 1390 entry->buf[len] = 0;
1369 if (!filter_check_discard(call, entry, tr->buffer, event)) 1391 if (!filter_check_discard(call, entry, buffer, event))
1370 ring_buffer_unlock_commit(tr->buffer, event); 1392 ring_buffer_unlock_commit(buffer, event);
1371 1393
1372 out_unlock: 1394 out_unlock:
1373 __raw_spin_unlock(&trace_buf_lock); 1395 __raw_spin_unlock(&trace_buf_lock);
@@ -1379,6 +1401,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1379 1401
1380 return len; 1402 return len;
1381} 1403}
1404
1405int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1406{
1407 return trace_array_printk(&global_trace, ip, fmt, args);
1408}
1382EXPORT_SYMBOL_GPL(trace_vprintk); 1409EXPORT_SYMBOL_GPL(trace_vprintk);
1383 1410
1384enum trace_file_type { 1411enum trace_file_type {
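
trace_array_printk() and trace_array_vprintk() let a message be written into a specific trace_array rather than the global one; the max-trace swap earlier in this patch uses that to note a skipped snapshot inside max_tr itself. An illustrative call (the message text is made up):

	trace_array_printk(&max_tr, _THIS_IP_, "snapshot skipped on cpu %d\n", cpu);
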
@@ -1518,6 +1545,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1518 return ent; 1545 return ent;
1519} 1546}
1520 1547
1548static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1549{
1550 struct trace_array *tr = iter->tr;
1551 struct ring_buffer_event *event;
1552 struct ring_buffer_iter *buf_iter;
1553 unsigned long entries = 0;
1554 u64 ts;
1555
1556 tr->data[cpu]->skipped_entries = 0;
1557
1558 if (!iter->buffer_iter[cpu])
1559 return;
1560
1561 buf_iter = iter->buffer_iter[cpu];
1562 ring_buffer_iter_reset(buf_iter);
1563
1564 /*
1565 * We could have the case with the max latency tracers
1566 * that a reset never took place on a cpu. This is evident
1567 * by the timestamp being before the start of the buffer.
1568 */
1569 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1570 if (ts >= iter->tr->time_start)
1571 break;
1572 entries++;
1573 ring_buffer_read(buf_iter, NULL);
1574 }
1575
1576 tr->data[cpu]->skipped_entries = entries;
1577}
1578
1521/* 1579/*
1522 * No necessary locking here. The worst thing which can 1580 * No necessary locking here. The worst thing which can
1523 * happen is losing events consumed at the same time 1581 * happen is losing events consumed at the same time
@@ -1556,10 +1614,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1556 1614
1557 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1615 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1558 for_each_tracing_cpu(cpu) 1616 for_each_tracing_cpu(cpu)
1559 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1617 tracing_iter_reset(iter, cpu);
1560 } else 1618 } else
1561 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1619 tracing_iter_reset(iter, cpu_file);
1562
1563 1620
1564 ftrace_enable_cpu(); 1621 ftrace_enable_cpu();
1565 1622
@@ -1588,10 +1645,10 @@ static void print_lat_help_header(struct seq_file *m)
1588 seq_puts(m, "# | / _----=> need-resched \n"); 1645 seq_puts(m, "# | / _----=> need-resched \n");
1589 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1646 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1590 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1647 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1591 seq_puts(m, "# |||| / \n"); 1648 seq_puts(m, "# |||| /_--=> lock-depth \n");
1592 seq_puts(m, "# ||||| delay \n"); 1649 seq_puts(m, "# |||||/ delay \n");
1593 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1650 seq_puts(m, "# cmd pid |||||| time | caller \n");
1594 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1651 seq_puts(m, "# \\ / |||||| \\ | / \n");
1595} 1652}
1596 1653
1597static void print_func_help_header(struct seq_file *m) 1654static void print_func_help_header(struct seq_file *m)
@@ -1608,16 +1665,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1608 struct trace_array *tr = iter->tr; 1665 struct trace_array *tr = iter->tr;
1609 struct trace_array_cpu *data = tr->data[tr->cpu]; 1666 struct trace_array_cpu *data = tr->data[tr->cpu];
1610 struct tracer *type = current_trace; 1667 struct tracer *type = current_trace;
1611 unsigned long total; 1668 unsigned long entries = 0;
1612 unsigned long entries; 1669 unsigned long total = 0;
1670 unsigned long count;
1613 const char *name = "preemption"; 1671 const char *name = "preemption";
1672 int cpu;
1614 1673
1615 if (type) 1674 if (type)
1616 name = type->name; 1675 name = type->name;
1617 1676
1618 entries = ring_buffer_entries(iter->tr->buffer); 1677
1619 total = entries + 1678 for_each_tracing_cpu(cpu) {
1620 ring_buffer_overruns(iter->tr->buffer); 1679 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1680 /*
1681 * If this buffer has skipped entries, then we hold all
1682 * entries for the trace and we need to ignore the
1683 * ones before the time stamp.
1684 */
1685 if (tr->data[cpu]->skipped_entries) {
1686 count -= tr->data[cpu]->skipped_entries;
1687 /* total is the same as the entries */
1688 total += count;
1689 } else
1690 total += count +
1691 ring_buffer_overrun_cpu(tr->buffer, cpu);
1692 entries += count;
1693 }
1621 1694
1622 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1695 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1623 name, UTS_RELEASE); 1696 name, UTS_RELEASE);
@@ -1659,7 +1732,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1659 seq_puts(m, "\n# => ended at: "); 1732 seq_puts(m, "\n# => ended at: ");
1660 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1733 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1661 trace_print_seq(m, &iter->seq); 1734 trace_print_seq(m, &iter->seq);
1662 seq_puts(m, "#\n"); 1735 seq_puts(m, "\n#\n");
1663 } 1736 }
1664 1737
1665 seq_puts(m, "#\n"); 1738 seq_puts(m, "#\n");
@@ -1678,6 +1751,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1678 if (cpumask_test_cpu(iter->cpu, iter->started)) 1751 if (cpumask_test_cpu(iter->cpu, iter->started))
1679 return; 1752 return;
1680 1753
1754 if (iter->tr->data[iter->cpu]->skipped_entries)
1755 return;
1756
1681 cpumask_set_cpu(iter->cpu, iter->started); 1757 cpumask_set_cpu(iter->cpu, iter->started);
1682 1758
1683 /* Don't print started cpu buffer for the first entry of the trace */ 1759 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1940,19 +2016,23 @@ __tracing_open(struct inode *inode, struct file *file)
1940 if (ring_buffer_overruns(iter->tr->buffer)) 2016 if (ring_buffer_overruns(iter->tr->buffer))
1941 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2017 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1942 2018
2019 /* stop the trace while dumping */
2020 tracing_stop();
2021
1943 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2022 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1944 for_each_tracing_cpu(cpu) { 2023 for_each_tracing_cpu(cpu) {
1945 2024
1946 iter->buffer_iter[cpu] = 2025 iter->buffer_iter[cpu] =
1947 ring_buffer_read_start(iter->tr->buffer, cpu); 2026 ring_buffer_read_start(iter->tr->buffer, cpu);
2027 tracing_iter_reset(iter, cpu);
1948 } 2028 }
1949 } else { 2029 } else {
1950 cpu = iter->cpu_file; 2030 cpu = iter->cpu_file;
1951 iter->buffer_iter[cpu] = 2031 iter->buffer_iter[cpu] =
1952 ring_buffer_read_start(iter->tr->buffer, cpu); 2032 ring_buffer_read_start(iter->tr->buffer, cpu);
2033 tracing_iter_reset(iter, cpu);
1953 } 2034 }
1954 2035
1955 /* TODO stop tracer */
1956 ret = seq_open(file, &tracer_seq_ops); 2036 ret = seq_open(file, &tracer_seq_ops);
1957 if (ret < 0) { 2037 if (ret < 0) {
1958 fail_ret = ERR_PTR(ret); 2038 fail_ret = ERR_PTR(ret);
@@ -1962,9 +2042,6 @@ __tracing_open(struct inode *inode, struct file *file)
1962 m = file->private_data; 2042 m = file->private_data;
1963 m->private = iter; 2043 m->private = iter;
1964 2044
1965 /* stop the trace while dumping */
1966 tracing_stop();
1967
1968 mutex_unlock(&trace_types_lock); 2045 mutex_unlock(&trace_types_lock);
1969 2046
1970 return iter; 2047 return iter;
@@ -1975,6 +2052,7 @@ __tracing_open(struct inode *inode, struct file *file)
1975 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2052 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1976 } 2053 }
1977 free_cpumask_var(iter->started); 2054 free_cpumask_var(iter->started);
2055 tracing_start();
1978 fail: 2056 fail:
1979 mutex_unlock(&trace_types_lock); 2057 mutex_unlock(&trace_types_lock);
1980 kfree(iter->trace); 2058 kfree(iter->trace);
@@ -2031,7 +2109,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2031 2109
2032 /* If this file was open for write, then erase contents */ 2110 /* If this file was open for write, then erase contents */
2033 if ((file->f_mode & FMODE_WRITE) && 2111 if ((file->f_mode & FMODE_WRITE) &&
2034 !(file->f_flags & O_APPEND)) { 2112 (file->f_flags & O_TRUNC)) {
2035 long cpu = (long) inode->i_private; 2113 long cpu = (long) inode->i_private;
2036 2114
2037 if (cpu == TRACE_PIPE_ALL_CPU) 2115 if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2053,25 +2131,23 @@ static int tracing_open(struct inode *inode, struct file *file)
2053static void * 2131static void *
2054t_next(struct seq_file *m, void *v, loff_t *pos) 2132t_next(struct seq_file *m, void *v, loff_t *pos)
2055{ 2133{
2056 struct tracer *t = m->private; 2134 struct tracer *t = v;
2057 2135
2058 (*pos)++; 2136 (*pos)++;
2059 2137
2060 if (t) 2138 if (t)
2061 t = t->next; 2139 t = t->next;
2062 2140
2063 m->private = t;
2064
2065 return t; 2141 return t;
2066} 2142}
2067 2143
2068static void *t_start(struct seq_file *m, loff_t *pos) 2144static void *t_start(struct seq_file *m, loff_t *pos)
2069{ 2145{
2070 struct tracer *t = m->private; 2146 struct tracer *t;
2071 loff_t l = 0; 2147 loff_t l = 0;
2072 2148
2073 mutex_lock(&trace_types_lock); 2149 mutex_lock(&trace_types_lock);
2074 for (; t && l < *pos; t = t_next(m, t, &l)) 2150 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
2075 ; 2151 ;
2076 2152
2077 return t; 2153 return t;
@@ -2107,18 +2183,10 @@ static struct seq_operations show_traces_seq_ops = {
2107 2183
2108static int show_traces_open(struct inode *inode, struct file *file) 2184static int show_traces_open(struct inode *inode, struct file *file)
2109{ 2185{
2110 int ret;
2111
2112 if (tracing_disabled) 2186 if (tracing_disabled)
2113 return -ENODEV; 2187 return -ENODEV;
2114 2188
2115 ret = seq_open(file, &show_traces_seq_ops); 2189 return seq_open(file, &show_traces_seq_ops);
2116 if (!ret) {
2117 struct seq_file *m = file->private_data;
2118 m->private = trace_types;
2119 }
2120
2121 return ret;
2122} 2190}
2123 2191
2124static ssize_t 2192static ssize_t
@@ -2191,11 +2259,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2191 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2259 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2192 return -ENOMEM; 2260 return -ENOMEM;
2193 2261
2194 mutex_lock(&tracing_cpumask_update_lock);
2195 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2262 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2196 if (err) 2263 if (err)
2197 goto err_unlock; 2264 goto err_unlock;
2198 2265
2266 mutex_lock(&tracing_cpumask_update_lock);
2267
2199 local_irq_disable(); 2268 local_irq_disable();
2200 __raw_spin_lock(&ftrace_max_lock); 2269 __raw_spin_lock(&ftrace_max_lock);
2201 for_each_tracing_cpu(cpu) { 2270 for_each_tracing_cpu(cpu) {
@@ -2223,8 +2292,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2223 return count; 2292 return count;
2224 2293
2225err_unlock: 2294err_unlock:
2226 mutex_unlock(&tracing_cpumask_update_lock); 2295 free_cpumask_var(tracing_cpumask_new);
2227 free_cpumask_var(tracing_cpumask);
2228 2296
2229 return err; 2297 return err;
2230} 2298}
@@ -2266,8 +2334,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2266 len += 3; /* "no" and newline */ 2334 len += 3; /* "no" and newline */
2267 } 2335 }
2268 2336
2269 /* +2 for \n and \0 */ 2337 /* +1 for \0 */
2270 buf = kmalloc(len + 2, GFP_KERNEL); 2338 buf = kmalloc(len + 1, GFP_KERNEL);
2271 if (!buf) { 2339 if (!buf) {
2272 mutex_unlock(&trace_types_lock); 2340 mutex_unlock(&trace_types_lock);
2273 return -ENOMEM; 2341 return -ENOMEM;
@@ -2290,7 +2358,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2290 } 2358 }
2291 mutex_unlock(&trace_types_lock); 2359 mutex_unlock(&trace_types_lock);
2292 2360
2293 WARN_ON(r >= len + 2); 2361 WARN_ON(r >= len + 1);
2294 2362
2295 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2363 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2296 2364
@@ -2301,23 +2369,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2301/* Try to assign a tracer specific option */ 2369/* Try to assign a tracer specific option */
2302static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2370static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2303{ 2371{
2304 struct tracer_flags *trace_flags = trace->flags; 2372 struct tracer_flags *tracer_flags = trace->flags;
2305 struct tracer_opt *opts = NULL; 2373 struct tracer_opt *opts = NULL;
2306 int ret = 0, i = 0; 2374 int ret = 0, i = 0;
2307 int len; 2375 int len;
2308 2376
2309 for (i = 0; trace_flags->opts[i].name; i++) { 2377 for (i = 0; tracer_flags->opts[i].name; i++) {
2310 opts = &trace_flags->opts[i]; 2378 opts = &tracer_flags->opts[i];
2311 len = strlen(opts->name); 2379 len = strlen(opts->name);
2312 2380
2313 if (strncmp(cmp, opts->name, len) == 0) { 2381 if (strncmp(cmp, opts->name, len) == 0) {
2314 ret = trace->set_flag(trace_flags->val, 2382 ret = trace->set_flag(tracer_flags->val,
2315 opts->bit, !neg); 2383 opts->bit, !neg);
2316 break; 2384 break;
2317 } 2385 }
2318 } 2386 }
2319 /* Not found */ 2387 /* Not found */
2320 if (!trace_flags->opts[i].name) 2388 if (!tracer_flags->opts[i].name)
2321 return -EINVAL; 2389 return -EINVAL;
2322 2390
2323 /* Refused to handle */ 2391 /* Refused to handle */
@@ -2325,9 +2393,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2325 return ret; 2393 return ret;
2326 2394
2327 if (neg) 2395 if (neg)
2328 trace_flags->val &= ~opts->bit; 2396 tracer_flags->val &= ~opts->bit;
2329 else 2397 else
2330 trace_flags->val |= opts->bit; 2398 tracer_flags->val |= opts->bit;
2331 2399
2332 return 0; 2400 return 0;
2333} 2401}
@@ -2342,22 +2410,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2342 trace_flags |= mask; 2410 trace_flags |= mask;
2343 else 2411 else
2344 trace_flags &= ~mask; 2412 trace_flags &= ~mask;
2345
2346 if (mask == TRACE_ITER_GLOBAL_CLK) {
2347 u64 (*func)(void);
2348
2349 if (enabled)
2350 func = trace_clock_global;
2351 else
2352 func = trace_clock_local;
2353
2354 mutex_lock(&trace_types_lock);
2355 ring_buffer_set_clock(global_trace.buffer, func);
2356
2357 if (max_tr.buffer)
2358 ring_buffer_set_clock(max_tr.buffer, func);
2359 mutex_unlock(&trace_types_lock);
2360 }
2361} 2413}
2362 2414
2363static ssize_t 2415static ssize_t
@@ -2414,21 +2466,20 @@ static const struct file_operations tracing_iter_fops = {
2414 2466
2415static const char readme_msg[] = 2467static const char readme_msg[] =
2416 "tracing mini-HOWTO:\n\n" 2468 "tracing mini-HOWTO:\n\n"
2417 "# mkdir /debug\n" 2469 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2418 "# mount -t debugfs nodev /debug\n\n" 2470 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2419 "# cat /debug/tracing/available_tracers\n"
2420 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2471 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2421 "# cat /debug/tracing/current_tracer\n" 2472 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2422 "nop\n" 2473 "nop\n"
2423 "# echo sched_switch > /debug/tracing/current_tracer\n" 2474 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2424 "# cat /debug/tracing/current_tracer\n" 2475 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2425 "sched_switch\n" 2476 "sched_switch\n"
2426 "# cat /debug/tracing/trace_options\n" 2477 "# cat /sys/kernel/debug/tracing/trace_options\n"
2427 "noprint-parent nosym-offset nosym-addr noverbose\n" 2478 "noprint-parent nosym-offset nosym-addr noverbose\n"
2428 "# echo print-parent > /debug/tracing/trace_options\n" 2479 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2429 "# echo 1 > /debug/tracing/tracing_enabled\n" 2480 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2430 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2481 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2431 "# echo 0 > /debug/tracing/tracing_enabled\n" 2482 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2432; 2483;
2433 2484
2434static ssize_t 2485static ssize_t
@@ -3096,7 +3147,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3096 break; 3147 break;
3097 } 3148 }
3098 3149
3099 trace_consume(iter); 3150 if (ret != TRACE_TYPE_NO_CONSUME)
3151 trace_consume(iter);
3100 rem -= count; 3152 rem -= count;
3101 if (!find_next_entry_inc(iter)) { 3153 if (!find_next_entry_inc(iter)) {
3102 rem = 0; 3154 rem = 0;
@@ -3325,6 +3377,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3325 return cnt; 3377 return cnt;
3326} 3378}
3327 3379
3380static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3381 size_t cnt, loff_t *ppos)
3382{
3383 char buf[64];
3384 int bufiter = 0;
3385 int i;
3386
3387 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3388 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3389 "%s%s%s%s", i ? " " : "",
3390 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3391 i == trace_clock_id ? "]" : "");
3392 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3393
3394 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3395}
3396
3397static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3398 size_t cnt, loff_t *fpos)
3399{
3400 char buf[64];
3401 const char *clockstr;
3402 int i;
3403
3404 if (cnt >= sizeof(buf))
3405 return -EINVAL;
3406
3407 if (copy_from_user(&buf, ubuf, cnt))
3408 return -EFAULT;
3409
3410 buf[cnt] = 0;
3411
3412 clockstr = strstrip(buf);
3413
3414 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3415 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3416 break;
3417 }
3418 if (i == ARRAY_SIZE(trace_clocks))
3419 return -EINVAL;
3420
3421 trace_clock_id = i;
3422
3423 mutex_lock(&trace_types_lock);
3424
3425 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3426 if (max_tr.buffer)
3427 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3428
3429 mutex_unlock(&trace_types_lock);
3430
3431 *fpos += cnt;
3432
3433 return cnt;
3434}
3435
3328static const struct file_operations tracing_max_lat_fops = { 3436static const struct file_operations tracing_max_lat_fops = {
3329 .open = tracing_open_generic, 3437 .open = tracing_open_generic,
3330 .read = tracing_max_lat_read, 3438 .read = tracing_max_lat_read,
@@ -3362,6 +3470,12 @@ static const struct file_operations tracing_mark_fops = {
3362 .write = tracing_mark_write, 3470 .write = tracing_mark_write,
3363}; 3471};
3364 3472
3473static const struct file_operations trace_clock_fops = {
3474 .open = tracing_open_generic,
3475 .read = tracing_clock_read,
3476 .write = tracing_clock_write,
3477};
3478
3365struct ftrace_buffer_info { 3479struct ftrace_buffer_info {
3366 struct trace_array *tr; 3480 struct trace_array *tr;
3367 void *spare; 3481 void *spare;
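
The new trace_clock file replaces the "global-clock" option bit removed earlier in this patch: writing a clock name picks an entry of trace_clocks[] and installs its function on both ring buffers, and reading shows the current choice in brackets, e.g. "local [global]". A hypothetical extension (trace_clock_counter below is not part of this patch) shows that exposing another clock only needs a new table entry:

static struct {
	u64 (*func)(void);
	const char *name;
} trace_clocks[] = {
	{ trace_clock_local,	"local"  },
	{ trace_clock_global,	"global" },
	/* { trace_clock_counter, "counter" },  hypothetical third clock */
};

Selecting it from user space would then be a matter of writing the new name to /sys/kernel/debug/tracing/trace_clock.
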
@@ -3627,7 +3741,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3627 struct trace_seq *s; 3741 struct trace_seq *s;
3628 unsigned long cnt; 3742 unsigned long cnt;
3629 3743
3630 s = kmalloc(sizeof(*s), GFP_ATOMIC); 3744 s = kmalloc(sizeof(*s), GFP_KERNEL);
3631 if (!s) 3745 if (!s)
3632 return ENOMEM; 3746 return ENOMEM;
3633 3747
@@ -3642,9 +3756,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3642 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3756 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3643 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3757 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3644 3758
3645 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3646 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3647
3648 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3759 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3649 3760
3650 kfree(s); 3761 kfree(s);
@@ -3905,17 +4016,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3905 if (ret < 0) 4016 if (ret < 0)
3906 return ret; 4017 return ret;
3907 4018
3908 switch (val) { 4019 if (val != 0 && val != 1)
3909 case 0:
3910 trace_flags &= ~(1 << index);
3911 break;
3912 case 1:
3913 trace_flags |= 1 << index;
3914 break;
3915
3916 default:
3917 return -EINVAL; 4020 return -EINVAL;
3918 } 4021 set_tracer_flags(1 << index, val);
3919 4022
3920 *ppos += cnt; 4023 *ppos += cnt;
3921 4024
@@ -4083,11 +4186,13 @@ static __init int tracer_init_debugfs(void)
4083 trace_create_file("current_tracer", 0644, d_tracer, 4186 trace_create_file("current_tracer", 0644, d_tracer,
4084 &global_trace, &set_tracer_fops); 4187 &global_trace, &set_tracer_fops);
4085 4188
4189#ifdef CONFIG_TRACER_MAX_TRACE
4086 trace_create_file("tracing_max_latency", 0644, d_tracer, 4190 trace_create_file("tracing_max_latency", 0644, d_tracer,
4087 &tracing_max_latency, &tracing_max_lat_fops); 4191 &tracing_max_latency, &tracing_max_lat_fops);
4088 4192
4089 trace_create_file("tracing_thresh", 0644, d_tracer, 4193 trace_create_file("tracing_thresh", 0644, d_tracer,
4090 &tracing_thresh, &tracing_max_lat_fops); 4194 &tracing_thresh, &tracing_max_lat_fops);
4195#endif
4091 4196
4092 trace_create_file("README", 0444, d_tracer, 4197 trace_create_file("README", 0444, d_tracer,
4093 NULL, &tracing_readme_fops); 4198 NULL, &tracing_readme_fops);
@@ -4104,6 +4209,9 @@ static __init int tracer_init_debugfs(void)
4104 trace_create_file("saved_cmdlines", 0444, d_tracer, 4209 trace_create_file("saved_cmdlines", 0444, d_tracer,
4105 NULL, &tracing_saved_cmdlines_fops); 4210 NULL, &tracing_saved_cmdlines_fops);
4106 4211
4212 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4213 &trace_clock_fops);
4214
4107#ifdef CONFIG_DYNAMIC_FTRACE 4215#ifdef CONFIG_DYNAMIC_FTRACE
4108 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4216 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4109 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4217 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4244,8 +4352,11 @@ static void __ftrace_dump(bool disable_tracing)
4244 iter.pos = -1; 4352 iter.pos = -1;
4245 4353
4246 if (find_next_entry_inc(&iter) != NULL) { 4354 if (find_next_entry_inc(&iter) != NULL) {
4247 print_trace_line(&iter); 4355 int ret;
4248 trace_consume(&iter); 4356
4357 ret = print_trace_line(&iter);
4358 if (ret != TRACE_TYPE_NO_CONSUME)
4359 trace_consume(&iter);
4249 } 4360 }
4250 4361
4251 trace_printk_seq(&iter.seq); 4362 trace_printk_seq(&iter.seq);
@@ -4279,7 +4390,6 @@ void ftrace_dump(void)
4279 4390
4280__init static int tracer_alloc_buffers(void) 4391__init static int tracer_alloc_buffers(void)
4281{ 4392{
4282 struct trace_array_cpu *data;
4283 int ring_buf_size; 4393 int ring_buf_size;
4284 int i; 4394 int i;
4285 int ret = -ENOMEM; 4395 int ret = -ENOMEM;
@@ -4329,7 +4439,7 @@ __init static int tracer_alloc_buffers(void)
4329 4439
4330 /* Allocate the first page for all buffers */ 4440 /* Allocate the first page for all buffers */
4331 for_each_tracing_cpu(i) { 4441 for_each_tracing_cpu(i) {
4332 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4442 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4333 max_tr.data[i] = &per_cpu(max_data, i); 4443 max_tr.data[i] = &per_cpu(max_data, i);
4334 } 4444 }
4335 4445
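The tracing_clock_write() hunk at the top of this file installs a new timestamp source at runtime: the user-supplied name is matched against the trace_clocks[] table, the winning index is cached in trace_clock_id, and ring_buffer_set_clock() switches both the live buffer and max_tr under trace_types_lock. The new trace_clock debugfs file created further down wires that handler up, so the clock is typically selected by writing a name such as "global" into <debugfs>/tracing/trace_clock. Below is a minimal user-space sketch of just the lookup step; the clock names in the table are illustrative assumptions, not a claim about the kernel's exact trace_clocks[] contents.

/*
 * Minimal user-space sketch of the name-lookup step in
 * tracing_clock_write(): find the index of the requested clock in a
 * name table, or fail with -EINVAL.  The table entries below are
 * illustrative only.
 */
#include <stdio.h>
#include <string.h>

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))
#define EINVAL		22

static const char *clock_names[] = { "local", "global" };

static int lookup_clock(const char *clockstr)
{
	size_t i;

	for (i = 0; i < ARRAY_SIZE(clock_names); i++) {
		if (strcmp(clock_names[i], clockstr) == 0)
			return (int)i;		/* becomes trace_clock_id */
	}
	return -EINVAL;				/* unknown clock name */
}

int main(void)
{
	printf("global -> %d\n", lookup_clock("global"));	/* 1 */
	printf("bogus  -> %d\n", lookup_clock("bogus"));	/* -22 */
	return 0;
}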
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f8..86bcff94791a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,6 +7,7 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
@@ -34,8 +35,6 @@ enum trace_type {
34 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
35 TRACE_USER_STACK, 36 TRACE_USER_STACK,
36 TRACE_HW_BRANCHES, 37 TRACE_HW_BRANCHES,
37 TRACE_SYSCALL_ENTER,
38 TRACE_SYSCALL_EXIT,
39 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
40 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
41 TRACE_POWER, 40 TRACE_POWER,
@@ -44,157 +43,54 @@ enum trace_type {
44 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
45}; 44};
46 45
47/* 46enum kmemtrace_type_id {
48 * Function trace entry - function address and parent function addres: 47 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
49 */ 48 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
50struct ftrace_entry { 49 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
51 struct trace_entry ent;
52 unsigned long ip;
53 unsigned long parent_ip;
54};
55
56/* Function call entry */
57struct ftrace_graph_ent_entry {
58 struct trace_entry ent;
59 struct ftrace_graph_ent graph_ent;
60}; 50};
61 51
62/* Function return entry */
63struct ftrace_graph_ret_entry {
64 struct trace_entry ent;
65 struct ftrace_graph_ret ret;
66};
67extern struct tracer boot_tracer; 52extern struct tracer boot_tracer;
68 53
69/* 54#undef __field
70 * Context switch trace entry - which task (and prio) we switched from/to: 55#define __field(type, item) type item;
71 */
72struct ctx_switch_entry {
73 struct trace_entry ent;
74 unsigned int prev_pid;
75 unsigned char prev_prio;
76 unsigned char prev_state;
77 unsigned int next_pid;
78 unsigned char next_prio;
79 unsigned char next_state;
80 unsigned int next_cpu;
81};
82
83/*
84 * Special (free-form) trace entry:
85 */
86struct special_entry {
87 struct trace_entry ent;
88 unsigned long arg1;
89 unsigned long arg2;
90 unsigned long arg3;
91};
92
93/*
94 * Stack-trace entry:
95 */
96
97#define FTRACE_STACK_ENTRIES 8
98
99struct stack_entry {
100 struct trace_entry ent;
101 unsigned long caller[FTRACE_STACK_ENTRIES];
102};
103
104struct userstack_entry {
105 struct trace_entry ent;
106 unsigned long caller[FTRACE_STACK_ENTRIES];
107};
108 56
109/* 57#undef __field_struct
110 * trace_printk entry: 58#define __field_struct(type, item) __field(type, item)
111 */
112struct bprint_entry {
113 struct trace_entry ent;
114 unsigned long ip;
115 const char *fmt;
116 u32 buf[];
117};
118 59
119struct print_entry { 60#undef __field_desc
120 struct trace_entry ent; 61#define __field_desc(type, container, item)
121 unsigned long ip;
122 char buf[];
123};
124 62
125#define TRACE_OLD_SIZE 88 63#undef __array
64#define __array(type, item, size) type item[size];
126 65
127struct trace_field_cont { 66#undef __array_desc
128 unsigned char type; 67#define __array_desc(type, container, item, size)
129 /* Temporary till we get rid of this completely */
130 char buf[TRACE_OLD_SIZE - 1];
131};
132 68
133struct trace_mmiotrace_rw { 69#undef __dynamic_array
134 struct trace_entry ent; 70#define __dynamic_array(type, item) type item[];
135 struct mmiotrace_rw rw;
136};
137 71
138struct trace_mmiotrace_map { 72#undef F_STRUCT
139 struct trace_entry ent; 73#define F_STRUCT(args...) args
140 struct mmiotrace_map map;
141};
142 74
143struct trace_boot_call { 75#undef FTRACE_ENTRY
144 struct trace_entry ent; 76#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
145 struct boot_trace_call boot_call; 77 struct struct_name { \
146}; 78 struct trace_entry ent; \
147 79 tstruct \
148struct trace_boot_ret { 80 }
149 struct trace_entry ent;
150 struct boot_trace_ret boot_ret;
151};
152
153#define TRACE_FUNC_SIZE 30
154#define TRACE_FILE_SIZE 20
155struct trace_branch {
156 struct trace_entry ent;
157 unsigned line;
158 char func[TRACE_FUNC_SIZE+1];
159 char file[TRACE_FILE_SIZE+1];
160 char correct;
161};
162
163struct hw_branch_entry {
164 struct trace_entry ent;
165 u64 from;
166 u64 to;
167};
168
169struct trace_power {
170 struct trace_entry ent;
171 struct power_trace state_data;
172};
173 81
174enum kmemtrace_type_id { 82#undef TP_ARGS
175 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 83#define TP_ARGS(args...) args
176 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
177 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
178};
179 84
180struct kmemtrace_alloc_entry { 85#undef FTRACE_ENTRY_DUP
181 struct trace_entry ent; 86#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
182 enum kmemtrace_type_id type_id;
183 unsigned long call_site;
184 const void *ptr;
185 size_t bytes_req;
186 size_t bytes_alloc;
187 gfp_t gfp_flags;
188 int node;
189};
190 87
191struct kmemtrace_free_entry { 88#include "trace_entries.h"
192 struct trace_entry ent;
193 enum kmemtrace_type_id type_id;
194 unsigned long call_site;
195 const void *ptr;
196};
197 89
90/*
91 * syscalls are special, and need special handling, this is why
92 * they are not included in trace_entries.h
93 */
198struct syscall_trace_enter { 94struct syscall_trace_enter {
199 struct trace_entry ent; 95 struct trace_entry ent;
200 int nr; 96 int nr;
@@ -207,13 +103,12 @@ struct syscall_trace_exit {
207 unsigned long ret; 103 unsigned long ret;
208}; 104};
209 105
210
211/* 106/*
212 * trace_flag_type is an enumeration that holds different 107 * trace_flag_type is an enumeration that holds different
213 * states when a trace occurs. These are: 108 * states when a trace occurs. These are:
214 * IRQS_OFF - interrupts were disabled 109 * IRQS_OFF - interrupts were disabled
215 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 110 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
216 * NEED_RESCED - reschedule is requested 111 * NEED_RESCHED - reschedule is requested
217 * HARDIRQ - inside an interrupt handler 112 * HARDIRQ - inside an interrupt handler
218 * SOFTIRQ - inside a softirq handler 113 * SOFTIRQ - inside a softirq handler
219 */ 114 */
@@ -236,9 +131,6 @@ struct trace_array_cpu {
236 atomic_t disabled; 131 atomic_t disabled;
237 void *buffer_page; /* ring buffer spare */ 132 void *buffer_page; /* ring buffer spare */
238 133
239 /* these fields get copied into max-trace: */
240 unsigned long trace_idx;
241 unsigned long overrun;
242 unsigned long saved_latency; 134 unsigned long saved_latency;
243 unsigned long critical_start; 135 unsigned long critical_start;
244 unsigned long critical_end; 136 unsigned long critical_end;
@@ -246,6 +138,7 @@ struct trace_array_cpu {
246 unsigned long nice; 138 unsigned long nice;
247 unsigned long policy; 139 unsigned long policy;
248 unsigned long rt_priority; 140 unsigned long rt_priority;
141 unsigned long skipped_entries;
249 cycle_t preempt_timestamp; 142 cycle_t preempt_timestamp;
250 pid_t pid; 143 pid_t pid;
251 uid_t uid; 144 uid_t uid;
@@ -319,10 +212,6 @@ extern void __ftrace_bad_type(void);
319 TRACE_KMEM_ALLOC); \ 212 TRACE_KMEM_ALLOC); \
320 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 213 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
321 TRACE_KMEM_FREE); \ 214 TRACE_KMEM_FREE); \
322 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
323 TRACE_SYSCALL_ENTER); \
324 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
325 TRACE_SYSCALL_EXIT); \
326 __ftrace_bad_type(); \ 215 __ftrace_bad_type(); \
327 } while (0) 216 } while (0)
328 217
@@ -398,7 +287,6 @@ struct tracer {
398 struct tracer *next; 287 struct tracer *next;
399 int print_max; 288 int print_max;
400 struct tracer_flags *flags; 289 struct tracer_flags *flags;
401 struct tracer_stat *stats;
402}; 290};
403 291
404 292
@@ -423,12 +311,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
423 311
424struct ring_buffer_event; 312struct ring_buffer_event;
425 313
426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 314struct ring_buffer_event *
427 int type, 315trace_buffer_lock_reserve(struct ring_buffer *buffer,
428 unsigned long len, 316 int type,
429 unsigned long flags, 317 unsigned long len,
430 int pc); 318 unsigned long flags,
431void trace_buffer_unlock_commit(struct trace_array *tr, 319 int pc);
320void trace_buffer_unlock_commit(struct ring_buffer *buffer,
432 struct ring_buffer_event *event, 321 struct ring_buffer_event *event,
433 unsigned long flags, int pc); 322 unsigned long flags, int pc);
434 323
@@ -438,10 +327,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 327struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
439 int *ent_cpu, u64 *ent_ts); 328 int *ent_cpu, u64 *ent_ts);
440 329
441void tracing_generic_entry_update(struct trace_entry *entry,
442 unsigned long flags,
443 int pc);
444
445void default_wait_pipe(struct trace_iterator *iter); 330void default_wait_pipe(struct trace_iterator *iter);
446void poll_wait_pipe(struct trace_iterator *iter); 331void poll_wait_pipe(struct trace_iterator *iter);
447 332
@@ -471,6 +356,7 @@ void trace_function(struct trace_array *tr,
471 356
472void trace_graph_return(struct ftrace_graph_ret *trace); 357void trace_graph_return(struct ftrace_graph_ret *trace);
473int trace_graph_entry(struct ftrace_graph_ent *trace); 358int trace_graph_entry(struct ftrace_graph_ent *trace);
359void set_graph_array(struct trace_array *tr);
474 360
475void tracing_start_cmdline_record(void); 361void tracing_start_cmdline_record(void);
476void tracing_stop_cmdline_record(void); 362void tracing_stop_cmdline_record(void);
@@ -479,35 +365,46 @@ void tracing_stop_sched_switch_record(void);
479void tracing_start_sched_switch_record(void); 365void tracing_start_sched_switch_record(void);
480int register_tracer(struct tracer *type); 366int register_tracer(struct tracer *type);
481void unregister_tracer(struct tracer *type); 367void unregister_tracer(struct tracer *type);
368int is_tracing_stopped(void);
482 369
483extern unsigned long nsecs_to_usecs(unsigned long nsecs); 370extern unsigned long nsecs_to_usecs(unsigned long nsecs);
484 371
372#ifdef CONFIG_TRACER_MAX_TRACE
485extern unsigned long tracing_max_latency; 373extern unsigned long tracing_max_latency;
486extern unsigned long tracing_thresh; 374extern unsigned long tracing_thresh;
487 375
488void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 376void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
489void update_max_tr_single(struct trace_array *tr, 377void update_max_tr_single(struct trace_array *tr,
490 struct task_struct *tsk, int cpu); 378 struct task_struct *tsk, int cpu);
379#endif /* CONFIG_TRACER_MAX_TRACE */
491 380
492void __trace_stack(struct trace_array *tr, 381#ifdef CONFIG_STACKTRACE
493 unsigned long flags, 382void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
494 int skip, int pc); 383 int skip, int pc);
495 384
496extern cycle_t ftrace_now(int cpu); 385void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
386 int pc);
497 387
498#ifdef CONFIG_CONTEXT_SWITCH_TRACER 388void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
499typedef void 389 int pc);
500(*tracer_switch_func_t)(void *private, 390#else
501 void *__rq, 391static inline void ftrace_trace_stack(struct trace_array *tr,
502 struct task_struct *prev, 392 unsigned long flags, int skip, int pc)
503 struct task_struct *next); 393{
504 394}
505struct tracer_switch_ops { 395
506 tracer_switch_func_t func; 396static inline void ftrace_trace_userstack(struct trace_array *tr,
507 void *private; 397 unsigned long flags, int pc)
508 struct tracer_switch_ops *next; 398{
509}; 399}
510#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 400
401static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
402 int skip, int pc)
403{
404}
405#endif /* CONFIG_STACKTRACE */
406
407extern cycle_t ftrace_now(int cpu);
511 408
512extern void trace_find_cmdline(int pid, char comm[]); 409extern void trace_find_cmdline(int pid, char comm[]);
513 410
@@ -517,6 +414,10 @@ extern unsigned long ftrace_update_tot_cnt;
517extern int DYN_FTRACE_TEST_NAME(void); 414extern int DYN_FTRACE_TEST_NAME(void);
518#endif 415#endif
519 416
417extern int ring_buffer_expanded;
418extern bool tracing_selftest_disabled;
419DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
420
520#ifdef CONFIG_FTRACE_STARTUP_TEST 421#ifdef CONFIG_FTRACE_STARTUP_TEST
521extern int trace_selftest_startup_function(struct tracer *trace, 422extern int trace_selftest_startup_function(struct tracer *trace,
522 struct trace_array *tr); 423 struct trace_array *tr);
@@ -548,9 +449,16 @@ extern int
548trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 449trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
549extern int 450extern int
550trace_vprintk(unsigned long ip, const char *fmt, va_list args); 451trace_vprintk(unsigned long ip, const char *fmt, va_list args);
452extern int
453trace_array_vprintk(struct trace_array *tr,
454 unsigned long ip, const char *fmt, va_list args);
455int trace_array_printk(struct trace_array *tr,
456 unsigned long ip, const char *fmt, ...);
551 457
552extern unsigned long trace_flags; 458extern unsigned long trace_flags;
553 459
460extern int trace_clock_id;
461
554/* Standard output formatting function used for function return traces */ 462/* Standard output formatting function used for function return traces */
555#ifdef CONFIG_FUNCTION_GRAPH_TRACER 463#ifdef CONFIG_FUNCTION_GRAPH_TRACER
556extern enum print_line_t print_graph_function(struct trace_iterator *iter); 464extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -597,6 +505,7 @@ print_graph_function(struct trace_iterator *iter)
597 505
598extern struct pid *ftrace_pid_trace; 506extern struct pid *ftrace_pid_trace;
599 507
508#ifdef CONFIG_FUNCTION_TRACER
600static inline int ftrace_trace_task(struct task_struct *task) 509static inline int ftrace_trace_task(struct task_struct *task)
601{ 510{
602 if (!ftrace_pid_trace) 511 if (!ftrace_pid_trace)
@@ -604,6 +513,47 @@ static inline int ftrace_trace_task(struct task_struct *task)
604 513
605 return test_tsk_trace_trace(task); 514 return test_tsk_trace_trace(task);
606} 515}
516#else
517static inline int ftrace_trace_task(struct task_struct *task)
518{
519 return 1;
520}
521#endif
522
523/*
524 * struct trace_parser - helper for reading user input separated by spaces
525 * @cont: set if the input is not complete - no final space char was found
526 * @buffer: holds the parsed user input
527 * @idx: user input length
528 * @size: buffer size
529 */
530struct trace_parser {
531 bool cont;
532 char *buffer;
533 unsigned idx;
534 unsigned size;
535};
536
537static inline bool trace_parser_loaded(struct trace_parser *parser)
538{
539 return (parser->idx != 0);
540}
541
542static inline bool trace_parser_cont(struct trace_parser *parser)
543{
544 return parser->cont;
545}
546
547static inline void trace_parser_clear(struct trace_parser *parser)
548{
549 parser->cont = false;
550 parser->idx = 0;
551}
552
553extern int trace_parser_get_init(struct trace_parser *parser, int size);
554extern void trace_parser_put(struct trace_parser *parser);
555extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
556 size_t cnt, loff_t *ppos);
607 557
608/* 558/*
609 * trace_iterator_flags is an enumeration that defines bit 559 * trace_iterator_flags is an enumeration that defines bit
@@ -632,9 +582,8 @@ enum trace_iterator_flags {
632 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 582 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
633 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 583 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
634 TRACE_ITER_LATENCY_FMT = 0x40000, 584 TRACE_ITER_LATENCY_FMT = 0x40000,
635 TRACE_ITER_GLOBAL_CLK = 0x80000, 585 TRACE_ITER_SLEEP_TIME = 0x80000,
636 TRACE_ITER_SLEEP_TIME = 0x100000, 586 TRACE_ITER_GRAPH_TIME = 0x100000,
637 TRACE_ITER_GRAPH_TIME = 0x200000,
638}; 587};
639 588
640/* 589/*
@@ -731,6 +680,7 @@ struct ftrace_event_field {
731 struct list_head link; 680 struct list_head link;
732 char *name; 681 char *name;
733 char *type; 682 char *type;
683 int filter_type;
734 int offset; 684 int offset;
735 int size; 685 int size;
736 int is_signed; 686 int is_signed;
@@ -740,13 +690,15 @@ struct event_filter {
740 int n_preds; 690 int n_preds;
741 struct filter_pred **preds; 691 struct filter_pred **preds;
742 char *filter_string; 692 char *filter_string;
693 bool no_reset;
743}; 694};
744 695
745struct event_subsystem { 696struct event_subsystem {
746 struct list_head list; 697 struct list_head list;
747 const char *name; 698 const char *name;
748 struct dentry *entry; 699 struct dentry *entry;
749 void *filter; 700 struct event_filter *filter;
701 int nr_events;
750}; 702};
751 703
752struct filter_pred; 704struct filter_pred;
@@ -774,6 +726,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
774 char *filter_string); 726 char *filter_string);
775extern void print_subsystem_event_filter(struct event_subsystem *system, 727extern void print_subsystem_event_filter(struct event_subsystem *system,
776 struct trace_seq *s); 728 struct trace_seq *s);
729extern int filter_assign_type(const char *type);
777 730
778static inline int 731static inline int
779filter_check_discard(struct ftrace_event_call *call, void *rec, 732filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -788,58 +741,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
788 return 0; 741 return 0;
789} 742}
790 743
791#define DEFINE_COMPARISON_PRED(type) \
792static int filter_pred_##type(struct filter_pred *pred, void *event, \
793 int val1, int val2) \
794{ \
795 type *addr = (type *)(event + pred->offset); \
796 type val = (type)pred->val; \
797 int match = 0; \
798 \
799 switch (pred->op) { \
800 case OP_LT: \
801 match = (*addr < val); \
802 break; \
803 case OP_LE: \
804 match = (*addr <= val); \
805 break; \
806 case OP_GT: \
807 match = (*addr > val); \
808 break; \
809 case OP_GE: \
810 match = (*addr >= val); \
811 break; \
812 default: \
813 break; \
814 } \
815 \
816 return match; \
817}
818
819#define DEFINE_EQUALITY_PRED(size) \
820static int filter_pred_##size(struct filter_pred *pred, void *event, \
821 int val1, int val2) \
822{ \
823 u##size *addr = (u##size *)(event + pred->offset); \
824 u##size val = (u##size)pred->val; \
825 int match; \
826 \
827 match = (val == *addr) ^ pred->not; \
828 \
829 return match; \
830}
831
832extern struct mutex event_mutex; 744extern struct mutex event_mutex;
833extern struct list_head ftrace_events; 745extern struct list_head ftrace_events;
834 746
835extern const char *__start___trace_bprintk_fmt[]; 747extern const char *__start___trace_bprintk_fmt[];
836extern const char *__stop___trace_bprintk_fmt[]; 748extern const char *__stop___trace_bprintk_fmt[];
837 749
838#undef TRACE_EVENT_FORMAT 750#undef FTRACE_ENTRY
839#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 751#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
840 extern struct ftrace_event_call event_##call; 752 extern struct ftrace_event_call event_##call;
841#undef TRACE_EVENT_FORMAT_NOFILTER 753#undef FTRACE_ENTRY_DUP
842#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 754#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
843#include "trace_event_types.h" 755 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
756#include "trace_entries.h"
844 757
845#endif /* _LINUX_KERNEL_TRACE_H */ 758#endif /* _LINUX_KERNEL_TRACE_H */
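The trace_parser helpers declared above replace the hand-rolled get_user() loops that several debugfs write handlers used to carry; ftrace_event_write() later in this patch is one consumer. A hypothetical write handler built on them might look like the sketch below. The parser calls are the ones declared in this header; my_debugfs_write(), my_handle_token() and the 128-byte buffer size are made up for illustration.

/*
 * Hypothetical debugfs write handler using the trace_parser helpers
 * declared above: read one space-separated token from user space,
 * NUL-terminate it and hand it to a made-up consumer.
 */
static ssize_t my_debugfs_write(struct file *filp, const char __user *ubuf,
				size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read;

	if (trace_parser_get_init(&parser, 128))	/* size chosen arbitrarily */
		return -ENOMEM;

	read = trace_get_user(&parser, ubuf, cnt, ppos);

	if (read > 0 && trace_parser_loaded(&parser)) {
		parser.buffer[parser.idx] = 0;		/* terminate the token */
		my_handle_token(parser.buffer);		/* hypothetical consumer */
	}

	trace_parser_put(&parser);			/* frees parser.buffer */
	return read;
}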
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -131,7 +129,9 @@ struct tracer boot_tracer __read_mostly =
131 129
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
134 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 136 struct trace_array *tr = boot_trace;
137 137
@@ -144,20 +144,24 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 145 preempt_disable();
146 146
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 149 sizeof(*entry), 0, 0);
149 if (!event) 150 if (!event)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 166 struct trace_array *tr = boot_trace;
163 167
@@ -167,13 +171,15 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 171 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 172 preempt_disable();
169 173
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 176 sizeof(*entry), 0, 0);
172 if (!event) 177 if (!event)
173 goto out; 178 goto out;
174 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 183 out:
178 preempt_enable(); 184 preempt_enable();
179} 185}
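Both boot-trace hooks now follow the same reserve/fill/commit sequence against a struct ring_buffer rather than a trace_array, with the event filter consulted before the commit. Reduced to a hypothetical event, the shape of that pattern is roughly the sketch below; struct my_entry, MY_EVENT_ID and my_event_call are placeholders, while the reserve, data, filter and commit calls are the ones used in the hunk above.

/* Sketch of the reserve/fill/filter/commit pattern used above. */
static void my_trace_event(struct trace_array *tr, unsigned long value)
{
	struct ring_buffer *buffer = tr->buffer;
	struct ring_buffer_event *event;
	struct my_entry *entry;			/* hypothetical record type */

	event = trace_buffer_lock_reserve(buffer, MY_EVENT_ID,
					  sizeof(*entry), 0, 0);
	if (!event)
		return;				/* buffer full or disabled */
	entry = ring_buffer_event_data(event);
	entry->value = value;			/* fill in the record */
	if (!filter_check_discard(&my_event_call, entry, buffer, event))
		trace_buffer_unlock_commit(buffer, event, 0, 0);
}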
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
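Folding prev_time and the spinlock into one cacheline-aligned struct keeps the two fields the global clock touches together, but the monotonicity rule is unchanged: under the lock, a timestamp that appears to run backwards is clamped to prev_time + 1. A stand-alone sketch of that clamp, without the locking and NMI handling, is below.

/* User-space sketch of the monotonic clamp in trace_clock_global(). */
#include <stdint.h>
#include <stdio.h>

static struct {
	uint64_t prev_time;	/* in the kernel, the lock shares this cacheline */
} clk;

static uint64_t clamp_monotonic(uint64_t now)
{
	if ((int64_t)(now - clk.prev_time) < 0)	/* local clock ran behind */
		now = clk.prev_time + 1;
	clk.prev_time = now;
	return now;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)clamp_monotonic(100));	/* 100 */
	printf("%llu\n", (unsigned long long)clamp_monotonic(90));	/* 101 */
	return 0;
}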
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..a431748ddd6e
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,383 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
10 * @name: the name used the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(power, trace_power,
334
335 TRACE_POWER,
336
337 F_STRUCT(
338 __field_struct( struct power_trace, state_data )
339 __field_desc( s64, state_data, stamp )
340 __field_desc( s64, state_data, end )
341 __field_desc( int, state_data, type )
342 __field_desc( int, state_data, state )
343 ),
344
345 F_printk("%llx->%llx type:%u state:%u",
346 __entry->stamp, __entry->end,
347 __entry->type, __entry->state)
348);
349
350FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
351
352 TRACE_KMEM_ALLOC,
353
354 F_STRUCT(
355 __field( enum kmemtrace_type_id, type_id )
356 __field( unsigned long, call_site )
357 __field( const void *, ptr )
358 __field( size_t, bytes_req )
359 __field( size_t, bytes_alloc )
360 __field( gfp_t, gfp_flags )
361 __field( int, node )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
365 " flags:%x node:%d",
366 __entry->type_id, __entry->call_site, __entry->ptr,
367 __entry->bytes_req, __entry->bytes_alloc,
368 __entry->gfp_flags, __entry->node)
369);
370
371FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
372
373 TRACE_KMEM_FREE,
374
375 F_STRUCT(
376 __field( enum kmemtrace_type_id, type_id )
377 __field( unsigned long, call_site )
378 __field( const void *, ptr )
379 ),
380
381 F_printk("type:%u call_site:%lx ptr:%p",
382 __entry->type_id, __entry->call_site, __entry->ptr)
383);
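Each FTRACE_ENTRY() invocation above only becomes a structure because trace.h, earlier in this patch, redefines __field(), __array(), F_STRUCT() and FTRACE_ENTRY() before including this file; other includers redefine the same macros to emit format strings or field descriptions instead. The stand-alone program below reproduces the struct-generating expansion with simplified placeholder macros; the struct trace_entry here is a stub, not the kernel's, and the macro bodies are trimmed for illustration.

/*
 * Stand-alone demonstration of the macro scheme: the same
 * FTRACE_ENTRY() text becomes a C structure once the helper macros
 * are defined the way trace.h defines them for this include.
 */
#include <stdio.h>

struct trace_entry { unsigned short type; };	/* stub, not the kernel's */

#define __field(type, item)		type item;
#define F_STRUCT(args...)		args
#define F_printk(fmt, args...)		fmt

#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\
	struct struct_name {					\
		struct trace_entry	ent;			\
		tstruct						\
	};

/* Same shape as the "function" entry defined above: */
FTRACE_ENTRY(function, ftrace_entry, 1,
	F_STRUCT(
		__field(unsigned long, ip)
		__field(unsigned long, parent_ip)
	),
	F_printk(" %lx <-- %lx", ip, parent_ip)
)

int main(void)
{
	struct ftrace_entry e = { { 1 }, 0xc0de, 0xbeef };

	printf("ip=%lx parent_ip=%lx\n", e.ip, e.parent_ip);
	return 0;
}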
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..55a25c933d15 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,6 +5,7 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
10int ftrace_profile_enable(int event_id) 11int ftrace_profile_enable(int event_id)
@@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id)
14 15
15 mutex_lock(&event_mutex); 16 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 17 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 18 if (event->id == event_id && event->profile_enable &&
19 try_module_get(event->mod)) {
18 ret = event->profile_enable(event); 20 ret = event->profile_enable(event);
19 break; 21 break;
20 } 22 }
@@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id)
32 list_for_each_entry(event, &ftrace_events, list) { 34 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 35 if (event->id == event_id) {
34 event->profile_disable(event); 36 event->profile_disable(event);
37 module_put(event->mod);
35 break; 38 break;
36 } 39 }
37 } 40 }
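The profile hooks now pin the module that owns an event for as long as profiling is enabled on it: ftrace_profile_enable() only proceeds when the event has a profile_enable handler and try_module_get() succeeds, and ftrace_profile_disable() drops that reference with module_put(). The same pairing, reduced to a hypothetical hook structure, is sketched below; struct my_hook and its fields are made up, while try_module_get()/module_put() are the real primitives being balanced.

/* Sketch of the reference pairing added above. */
static int my_hook_enable(struct my_hook *hook)
{
	if (!hook->enable || !try_module_get(hook->owner))
		return -ENOENT;			/* no handler, or module unloading */

	return hook->enable(hook);
}

static void my_hook_disable(struct my_hook *hook)
{
	hook->disable(hook);
	module_put(hook->owner);		/* balances try_module_get() */
}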
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 5e32e375134d..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(int, ret.depth, depth)
30 ),
31 TP_RAW_FMT("<-- %lx (%d)")
32);
33
34TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
35 TRACE_STRUCT(
36 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
37 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
38 TRACE_FIELD(unsigned char, prev_state, prev_state)
39 TRACE_FIELD(unsigned int, next_pid, next_pid)
40 TRACE_FIELD(unsigned char, next_prio, next_prio)
41 TRACE_FIELD(unsigned char, next_state, next_state)
42 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
43 ),
44 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
45);
46
47TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
48 TRACE_STRUCT(
49 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
50 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
51 TRACE_FIELD(unsigned char, prev_state, prev_state)
52 TRACE_FIELD(unsigned int, next_pid, next_pid)
53 TRACE_FIELD(unsigned char, next_prio, next_prio)
54 TRACE_FIELD(unsigned char, next_state, next_state)
55 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
56 ),
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58);
59
60TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2)
64 TRACE_FIELD(unsigned long, arg3, arg3)
65 ),
66 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
67);
68
69/*
70 * Stack-trace entry:
71 */
72
73/* #define FTRACE_STACK_ENTRIES 8 */
74
75TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
76 TRACE_STRUCT(
77 TRACE_FIELD(unsigned long, caller[0], stack0)
78 TRACE_FIELD(unsigned long, caller[1], stack1)
79 TRACE_FIELD(unsigned long, caller[2], stack2)
80 TRACE_FIELD(unsigned long, caller[3], stack3)
81 TRACE_FIELD(unsigned long, caller[4], stack4)
82 TRACE_FIELD(unsigned long, caller[5], stack5)
83 TRACE_FIELD(unsigned long, caller[6], stack6)
84 TRACE_FIELD(unsigned long, caller[7], stack7)
85 ),
86 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
87 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
88);
89
90TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
91 TRACE_STRUCT(
92 TRACE_FIELD(unsigned long, caller[0], stack0)
93 TRACE_FIELD(unsigned long, caller[1], stack1)
94 TRACE_FIELD(unsigned long, caller[2], stack2)
95 TRACE_FIELD(unsigned long, caller[3], stack3)
96 TRACE_FIELD(unsigned long, caller[4], stack4)
97 TRACE_FIELD(unsigned long, caller[5], stack5)
98 TRACE_FIELD(unsigned long, caller[6], stack6)
99 TRACE_FIELD(unsigned long, caller[7], stack7)
100 ),
101 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
102 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
103);
104
105TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
106 TRACE_STRUCT(
107 TRACE_FIELD(unsigned long, ip, ip)
108 TRACE_FIELD(char *, fmt, fmt)
109 TRACE_FIELD_ZERO_CHAR(buf)
110 ),
111 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
112);
113
114TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
115 TRACE_STRUCT(
116 TRACE_FIELD(unsigned long, ip, ip)
117 TRACE_FIELD_ZERO_CHAR(buf)
118 ),
119 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
120);
121
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
126 TRACE_FUNC_SIZE+1, func)
127 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
128 TRACE_FUNC_SIZE+1, file)
129 TRACE_FIELD(char, correct, correct)
130 ),
131 TP_RAW_FMT("%u:%s:%s (%u)")
132);
133
134TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
135 TRACE_STRUCT(
136 TRACE_FIELD(u64, from, from)
137 TRACE_FIELD(u64, to, to)
138 ),
139 TP_RAW_FMT("from: %llx to: %llx")
140);
141
142TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
143 TRACE_STRUCT(
144 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
145 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
146 TRACE_FIELD(int, state_data.type, type)
147 TRACE_FIELD(int, state_data.state, state)
148 ),
149 TP_RAW_FMT("%llx->%llx type:%u state:%u")
150);
151
152TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
153 TRACE_STRUCT(
154 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
155 TRACE_FIELD(unsigned long, call_site, call_site)
156 TRACE_FIELD(const void *, ptr, ptr)
157 TRACE_FIELD(size_t, bytes_req, bytes_req)
158 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
159 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
160 TRACE_FIELD(int, node, node)
161 ),
162 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
163 " flags:%x node:%d")
164);
165
166TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
167 TRACE_STRUCT(
168 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
169 TRACE_FIELD(unsigned long, call_site, call_site)
170 TRACE_FIELD(const void *, ptr, ptr)
171 ),
172 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
173);
174
175#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b6..56c260b83a9c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,16 +17,20 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
24#undef TRACE_SYSTEM
22#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
23 26
24DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
25 28
26LIST_HEAD(ftrace_events); 29LIST_HEAD(ftrace_events);
27 30
28int trace_define_field(struct ftrace_event_call *call, char *type, 31int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 32 const char *name, int offset, int size, int is_signed,
33 int filter_type)
30{ 34{
31 struct ftrace_event_field *field; 35 struct ftrace_event_field *field;
32 36
@@ -42,9 +46,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 46 if (!field->type)
43 goto err; 47 goto err;
44 48
49 if (filter_type == FILTER_OTHER)
50 field->filter_type = filter_assign_type(type);
51 else
52 field->filter_type = filter_type;
53
45 field->offset = offset; 54 field->offset = offset;
46 field->size = size; 55 field->size = size;
47 field->is_signed = is_signed; 56 field->is_signed = is_signed;
57
48 list_add(&field->link, &call->fields); 58 list_add(&field->link, &call->fields);
49 59
50 return 0; 60 return 0;
@@ -60,6 +70,29 @@ err:
60} 70}
61EXPORT_SYMBOL_GPL(trace_define_field); 71EXPORT_SYMBOL_GPL(trace_define_field);
62 72
73#define __common_field(type, item) \
74 ret = trace_define_field(call, #type, "common_" #item, \
75 offsetof(typeof(ent), item), \
76 sizeof(ent.item), \
77 is_signed_type(type), FILTER_OTHER); \
78 if (ret) \
79 return ret;
80
81int trace_define_common_fields(struct ftrace_event_call *call)
82{
83 int ret;
84 struct trace_entry ent;
85
86 __common_field(unsigned short, type);
87 __common_field(unsigned char, flags);
88 __common_field(unsigned char, preempt_count);
89 __common_field(int, pid);
90 __common_field(int, lock_depth);
91
92 return ret;
93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95
63#ifdef CONFIG_MODULES 96#ifdef CONFIG_MODULES
64 97
65static void trace_destroy_fields(struct ftrace_event_call *call) 98static void trace_destroy_fields(struct ftrace_event_call *call)
@@ -84,14 +117,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 117 if (call->enabled) {
85 call->enabled = 0; 118 call->enabled = 0;
86 tracing_stop_cmdline_record(); 119 tracing_stop_cmdline_record();
87 call->unregfunc(); 120 call->unregfunc(call->data);
88 } 121 }
89 break; 122 break;
90 case 1: 123 case 1:
91 if (!call->enabled) { 124 if (!call->enabled) {
92 call->enabled = 1; 125 call->enabled = 1;
93 tracing_start_cmdline_record(); 126 tracing_start_cmdline_record();
94 call->regfunc(); 127 call->regfunc(call->data);
95 } 128 }
96 break; 129 break;
97 } 130 }
@@ -198,11 +231,9 @@ static ssize_t
198ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
199 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
200{ 233{
234 struct trace_parser parser;
201 size_t read = 0; 235 size_t read = 0;
202 int i, set = 1;
203 ssize_t ret; 236 ssize_t ret;
204 char *buf;
205 char ch;
206 237
207 if (!cnt || cnt < 0) 238 if (!cnt || cnt < 0)
208 return 0; 239 return 0;
@@ -211,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
211 if (ret < 0) 242 if (ret < 0)
212 return ret; 243 return ret;
213 244
214 ret = get_user(ch, ubuf++); 245 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
215 if (ret)
216 return ret;
217 read++;
218 cnt--;
219
220 /* skip white space */
221 while (cnt && isspace(ch)) {
222 ret = get_user(ch, ubuf++);
223 if (ret)
224 return ret;
225 read++;
226 cnt--;
227 }
228
229 /* Only white space found? */
230 if (isspace(ch)) {
231 file->f_pos += read;
232 ret = read;
233 return ret;
234 }
235
236 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
237 if (!buf)
238 return -ENOMEM; 246 return -ENOMEM;
239 247
240 if (cnt > EVENT_BUF_SIZE) 248 read = trace_get_user(&parser, ubuf, cnt, ppos);
241 cnt = EVENT_BUF_SIZE; 249
250 if (trace_parser_loaded((&parser))) {
251 int set = 1;
242 252
243 i = 0; 253 if (*parser.buffer == '!')
244 while (cnt && !isspace(ch)) {
245 if (!i && ch == '!')
246 set = 0; 254 set = 0;
247 else
248 buf[i++] = ch;
249 255
250 ret = get_user(ch, ubuf++); 256 parser.buffer[parser.idx] = 0;
257
258 ret = ftrace_set_clr_event(parser.buffer + !set, set);
251 if (ret) 259 if (ret)
252 goto out_free; 260 goto out_put;
253 read++;
254 cnt--;
255 } 261 }
256 buf[i] = 0;
257
258 file->f_pos += read;
259
260 ret = ftrace_set_clr_event(buf, set);
261 if (ret)
262 goto out_free;
263 262
264 ret = read; 263 ret = read;
265 264
266 out_free: 265 out_put:
267 kfree(buf); 266 trace_parser_put(&parser);
268 267
269 return ret; 268 return ret;
270} 269}
@@ -300,10 +299,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
300 299
301static void *t_start(struct seq_file *m, loff_t *pos) 300static void *t_start(struct seq_file *m, loff_t *pos)
302{ 301{
302 struct ftrace_event_call *call = NULL;
303 loff_t l;
304
303 mutex_lock(&event_mutex); 305 mutex_lock(&event_mutex);
304 if (*pos == 0) 306
305 m->private = ftrace_events.next; 307 m->private = ftrace_events.next;
306 return t_next(m, NULL, pos); 308 for (l = 0; l <= *pos; ) {
309 call = t_next(m, NULL, &l);
310 if (!call)
311 break;
312 }
313 return call;
307} 314}
308 315
309static void * 316static void *
@@ -332,10 +339,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
332 339
333static void *s_start(struct seq_file *m, loff_t *pos) 340static void *s_start(struct seq_file *m, loff_t *pos)
334{ 341{
342 struct ftrace_event_call *call = NULL;
343 loff_t l;
344
335 mutex_lock(&event_mutex); 345 mutex_lock(&event_mutex);
336 if (*pos == 0) 346
337 m->private = ftrace_events.next; 347 m->private = ftrace_events.next;
338 return s_next(m, NULL, pos); 348 for (l = 0; l <= *pos; ) {
349 call = s_next(m, NULL, &l);
350 if (!call)
351 break;
352 }
353 return call;
339} 354}
340 355
341static int t_show(struct seq_file *m, void *v) 356static int t_show(struct seq_file *m, void *v)
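Both t_start() and s_start() above are rewritten to honour *pos by stepping the cursor forward one element at a time, which is the standard way to make a list-backed seq_file restartable after a partial read. A generic sketch of that start/next pairing over a plain linked list follows; my_list, struct my_item and my_mutex are hypothetical names used only for illustration.

/*
 * Generic seq_file start/next pair over a hypothetical list, following
 * the same "walk forward until *pos" pattern used above.
 */
static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct list_head *node = v ? ((struct my_item *)v)->list.next
				   : my_list.next;

	(*pos)++;
	if (node == &my_list)
		return NULL;			/* end of the list */
	return list_entry(node, struct my_item, list);
}

static void *my_seq_start(struct seq_file *m, loff_t *pos)
{
	struct my_item *item = NULL;
	loff_t l;

	mutex_lock(&my_mutex);			/* released in ->stop() */
	for (l = 0; l <= *pos; ) {
		item = my_seq_next(m, item, &l);
		if (!item)
			break;
	}
	return item;				/* element at index *pos, or NULL */
}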
@@ -360,7 +375,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
360 const struct seq_operations *seq_ops; 375 const struct seq_operations *seq_ops;
361 376
362 if ((file->f_mode & FMODE_WRITE) && 377 if ((file->f_mode & FMODE_WRITE) &&
363 !(file->f_flags & O_APPEND)) 378 (file->f_flags & O_TRUNC))
364 ftrace_clear_events(); 379 ftrace_clear_events();
365 380
366 seq_ops = inode->i_private; 381 seq_ops = inode->i_private;
@@ -530,7 +545,7 @@ static int trace_write_header(struct trace_seq *s)
530 FIELD(unsigned char, flags), 545 FIELD(unsigned char, flags),
531 FIELD(unsigned char, preempt_count), 546 FIELD(unsigned char, preempt_count),
532 FIELD(int, pid), 547 FIELD(int, pid),
533 FIELD(int, tgid)); 548 FIELD(int, lock_depth));
534} 549}
535 550
536static ssize_t 551static ssize_t
@@ -558,7 +573,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
558 trace_seq_printf(s, "format:\n"); 573 trace_seq_printf(s, "format:\n");
559 trace_write_header(s); 574 trace_write_header(s);
560 575
561 r = call->show_format(s); 576 r = call->show_format(call, s);
562 if (!r) { 577 if (!r) {
563 /* 578 /*
564 * ug! The format output is bigger than a PAGE!! 579 * ug! The format output is bigger than a PAGE!!
@@ -833,8 +848,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
833 848
834 /* First see if we did not already create this dir */ 849 /* First see if we did not already create this dir */
835 list_for_each_entry(system, &event_subsystems, list) { 850 list_for_each_entry(system, &event_subsystems, list) {
836 if (strcmp(system->name, name) == 0) 851 if (strcmp(system->name, name) == 0) {
852 system->nr_events++;
837 return system->entry; 853 return system->entry;
854 }
838 } 855 }
839 856
840 /* need to create new entry */ 857 /* need to create new entry */
@@ -853,6 +870,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
853 return d_events; 870 return d_events;
854 } 871 }
855 872
873 system->nr_events = 1;
856 system->name = kstrdup(name, GFP_KERNEL); 874 system->name = kstrdup(name, GFP_KERNEL);
857 if (!system->name) { 875 if (!system->name) {
858 debugfs_remove(system->entry); 876 debugfs_remove(system->entry);
@@ -904,15 +922,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
904 if (strcmp(call->system, TRACE_SYSTEM) != 0) 922 if (strcmp(call->system, TRACE_SYSTEM) != 0)
905 d_events = event_subsystem_dir(call->system, d_events); 923 d_events = event_subsystem_dir(call->system, d_events);
906 924
907 if (call->raw_init) {
908 ret = call->raw_init();
909 if (ret < 0) {
910 pr_warning("Could not initialize trace point"
911 " events/%s\n", call->name);
912 return ret;
913 }
914 }
915
916 call->dir = debugfs_create_dir(call->name, d_events); 925 call->dir = debugfs_create_dir(call->name, d_events);
917 if (!call->dir) { 926 if (!call->dir) {
918 pr_warning("Could not create debugfs " 927 pr_warning("Could not create debugfs "
@@ -924,12 +933,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
924 entry = trace_create_file("enable", 0644, call->dir, call, 933 entry = trace_create_file("enable", 0644, call->dir, call,
925 enable); 934 enable);
926 935
927 if (call->id) 936 if (call->id && call->profile_enable)
928 entry = trace_create_file("id", 0444, call->dir, call, 937 entry = trace_create_file("id", 0444, call->dir, call,
929 id); 938 id);
930 939
931 if (call->define_fields) { 940 if (call->define_fields) {
932 ret = call->define_fields(); 941 ret = call->define_fields(call);
933 if (ret < 0) { 942 if (ret < 0) {
934 pr_warning("Could not initialize trace point" 943 pr_warning("Could not initialize trace point"
935 " events/%s\n", call->name); 944 " events/%s\n", call->name);
@@ -971,6 +980,32 @@ struct ftrace_module_file_ops {
971 struct file_operations filter; 980 struct file_operations filter;
972}; 981};
973 982
983static void remove_subsystem_dir(const char *name)
984{
985 struct event_subsystem *system;
986
987 if (strcmp(name, TRACE_SYSTEM) == 0)
988 return;
989
990 list_for_each_entry(system, &event_subsystems, list) {
991 if (strcmp(system->name, name) == 0) {
992 if (!--system->nr_events) {
993 struct event_filter *filter = system->filter;
994
995 debugfs_remove_recursive(system->entry);
996 list_del(&system->list);
997 if (filter) {
998 kfree(filter->filter_string);
999 kfree(filter);
1000 }
1001 kfree(system->name);
1002 kfree(system);
1003 }
1004 break;
1005 }
1006 }
1007}
1008
974static struct ftrace_module_file_ops * 1009static struct ftrace_module_file_ops *
975trace_create_file_ops(struct module *mod) 1010trace_create_file_ops(struct module *mod)
976{ 1011{
@@ -1011,6 +1046,7 @@ static void trace_module_add_events(struct module *mod)
1011 struct ftrace_module_file_ops *file_ops = NULL; 1046 struct ftrace_module_file_ops *file_ops = NULL;
1012 struct ftrace_event_call *call, *start, *end; 1047 struct ftrace_event_call *call, *start, *end;
1013 struct dentry *d_events; 1048 struct dentry *d_events;
1049 int ret;
1014 1050
1015 start = mod->trace_events; 1051 start = mod->trace_events;
1016 end = mod->trace_events + mod->num_trace_events; 1052 end = mod->trace_events + mod->num_trace_events;
@@ -1026,7 +1062,15 @@ static void trace_module_add_events(struct module *mod)
1026 /* The linker may leave blanks */ 1062 /* The linker may leave blanks */
1027 if (!call->name) 1063 if (!call->name)
1028 continue; 1064 continue;
1029 1065 if (call->raw_init) {
1066 ret = call->raw_init();
1067 if (ret < 0) {
1068 if (ret != -ENOSYS)
1069 pr_warning("Could not initialize trace "
1070 "point events/%s\n", call->name);
1071 continue;
1072 }
1073 }
1030 /* 1074 /*
1031 * This module has events, create file ops for this module 1075 * This module has events, create file ops for this module
1032 * if not already done. 1076 * if not already done.
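
The module path above now calls ->raw_init() before creating any files and silently skips events whose init returns -ENOSYS, warning only about real failures. A minimal sketch of that "unsupported is not an error" convention; register_one_event() and demo_raw_init() are illustrative stand-ins, not kernel functions:

#include <errno.h>
#include <stdio.h>

/* Hypothetical per-event init hook; -ENOSYS means "nothing to do here". */
static int demo_raw_init(void)
{
        return -ENOSYS;
}

static int register_one_event(const char *name, int (*raw_init)(void))
{
        if (raw_init) {
                int ret = raw_init();

                if (ret < 0) {
                        if (ret != -ENOSYS)     /* only real errors deserve a warning */
                                fprintf(stderr, "could not initialize %s: %d\n",
                                        name, ret);
                        return ret;             /* the caller skips this event either way */
                }
        }
        return 0;
}

int main(void)
{
        return register_one_event("demo_event", demo_raw_init) == -ENOSYS ? 0 : 1;
}
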
@@ -1061,6 +1105,7 @@ static void trace_module_remove_events(struct module *mod)
1061 list_del(&call->list); 1105 list_del(&call->list);
1062 trace_destroy_fields(call); 1106 trace_destroy_fields(call);
1063 destroy_preds(call); 1107 destroy_preds(call);
1108 remove_subsystem_dir(call->system);
1064 } 1109 }
1065 } 1110 }
1066 1111
@@ -1109,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self,
1109} 1154}
1110#endif /* CONFIG_MODULES */ 1155#endif /* CONFIG_MODULES */
1111 1156
1112struct notifier_block trace_module_nb = { 1157static struct notifier_block trace_module_nb = {
1113 .notifier_call = trace_module_notify, 1158 .notifier_call = trace_module_notify,
1114 .priority = 0, 1159 .priority = 0,
1115}; 1160};
@@ -1117,6 +1162,18 @@ struct notifier_block trace_module_nb = {
1117extern struct ftrace_event_call __start_ftrace_events[]; 1162extern struct ftrace_event_call __start_ftrace_events[];
1118extern struct ftrace_event_call __stop_ftrace_events[]; 1163extern struct ftrace_event_call __stop_ftrace_events[];
1119 1164
1165static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1166
1167static __init int setup_trace_event(char *str)
1168{
1169 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1170 ring_buffer_expanded = 1;
1171 tracing_selftest_disabled = 1;
1172
1173 return 1;
1174}
1175__setup("trace_event=", setup_trace_event);
1176
1120static __init int event_trace_init(void) 1177static __init int event_trace_init(void)
1121{ 1178{
1122 struct ftrace_event_call *call; 1179 struct ftrace_event_call *call;
@@ -1124,6 +1181,8 @@ static __init int event_trace_init(void)
1124 struct dentry *entry; 1181 struct dentry *entry;
1125 struct dentry *d_events; 1182 struct dentry *d_events;
1126 int ret; 1183 int ret;
1184 char *buf = bootup_event_buf;
1185 char *token;
1127 1186
1128 d_tracer = tracing_init_dentry(); 1187 d_tracer = tracing_init_dentry();
1129 if (!d_tracer) 1188 if (!d_tracer)
@@ -1163,12 +1222,34 @@ static __init int event_trace_init(void)
1163 /* The linker may leave blanks */ 1222 /* The linker may leave blanks */
1164 if (!call->name) 1223 if (!call->name)
1165 continue; 1224 continue;
1225 if (call->raw_init) {
1226 ret = call->raw_init();
1227 if (ret < 0) {
1228 if (ret != -ENOSYS)
1229 pr_warning("Could not initialize trace "
1230 "point events/%s\n", call->name);
1231 continue;
1232 }
1233 }
1166 list_add(&call->list, &ftrace_events); 1234 list_add(&call->list, &ftrace_events);
1167 event_create_dir(call, d_events, &ftrace_event_id_fops, 1235 event_create_dir(call, d_events, &ftrace_event_id_fops,
1168 &ftrace_enable_fops, &ftrace_event_filter_fops, 1236 &ftrace_enable_fops, &ftrace_event_filter_fops,
1169 &ftrace_event_format_fops); 1237 &ftrace_event_format_fops);
1170 } 1238 }
1171 1239
1240 while (true) {
1241 token = strsep(&buf, ",");
1242
1243 if (!token)
1244 break;
1245 if (!*token)
1246 continue;
1247
1248 ret = ftrace_set_clr_event(token, 1);
1249 if (ret)
1250 pr_warning("Failed to enable trace event: %s\n", token);
1251 }
1252
1172 ret = register_module_notifier(&trace_module_nb); 1253 ret = register_module_notifier(&trace_module_nb);
1173 if (ret) 1254 if (ret)
1174 pr_warning("Failed to register trace events module notifier\n"); 1255 pr_warning("Failed to register trace events module notifier\n");
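
event_trace_init() above now walks the saved trace_event= boot string with strsep(), ignoring empty tokens left by stray commas. The same splitting loop can be exercised in plain C; enable_event() below is a stub standing in for ftrace_set_clr_event():

#define _DEFAULT_SOURCE         /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static int enable_event(const char *name)       /* stand-in for ftrace_set_clr_event() */
{
        printf("enabling %s\n", name);
        return 0;
}

int main(void)
{
        char bootup_buf[] = "sched:sched_switch,,irq:*";
        char *buf = bootup_buf;
        char *token;

        while ((token = strsep(&buf, ",")) != NULL) {
                if (!*token)            /* skip empty tokens such as ",," */
                        continue;
                if (enable_event(token))
                        fprintf(stderr, "failed to enable trace event: %s\n", token);
        }
        return 0;
}
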
@@ -1245,6 +1326,18 @@ static __init void event_trace_self_tests(void)
1245 if (!call->regfunc) 1326 if (!call->regfunc)
1246 continue; 1327 continue;
1247 1328
1329/*
1330 * Testing syscall events here is pretty useless, but
1331 * we still do it if configured. But this is time consuming.
1332 * What we really need is a user thread to perform the
1333 * syscalls as we test.
1334 */
1335#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1336 if (call->system &&
1337 strcmp(call->system, "syscalls") == 0)
1338 continue;
1339#endif
1340
1248 pr_info("Testing event %s: ", call->name); 1341 pr_info("Testing event %s: ", call->name);
1249 1342
1250 /* 1343 /*
@@ -1318,12 +1411,13 @@ static __init void event_trace_self_tests(void)
1318 1411
1319#ifdef CONFIG_FUNCTION_TRACER 1412#ifdef CONFIG_FUNCTION_TRACER
1320 1413
1321static DEFINE_PER_CPU(atomic_t, test_event_disable); 1414static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1322 1415
1323static void 1416static void
1324function_test_events_call(unsigned long ip, unsigned long parent_ip) 1417function_test_events_call(unsigned long ip, unsigned long parent_ip)
1325{ 1418{
1326 struct ring_buffer_event *event; 1419 struct ring_buffer_event *event;
1420 struct ring_buffer *buffer;
1327 struct ftrace_entry *entry; 1421 struct ftrace_entry *entry;
1328 unsigned long flags; 1422 unsigned long flags;
1329 long disabled; 1423 long disabled;
@@ -1334,14 +1428,15 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1334 pc = preempt_count(); 1428 pc = preempt_count();
1335 resched = ftrace_preempt_disable(); 1429 resched = ftrace_preempt_disable();
1336 cpu = raw_smp_processor_id(); 1430 cpu = raw_smp_processor_id();
1337 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1431 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1338 1432
1339 if (disabled != 1) 1433 if (disabled != 1)
1340 goto out; 1434 goto out;
1341 1435
1342 local_save_flags(flags); 1436 local_save_flags(flags);
1343 1437
1344 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1438 event = trace_current_buffer_lock_reserve(&buffer,
1439 TRACE_FN, sizeof(*entry),
1345 flags, pc); 1440 flags, pc);
1346 if (!event) 1441 if (!event)
1347 goto out; 1442 goto out;
@@ -1349,10 +1444,10 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1349 entry->ip = ip; 1444 entry->ip = ip;
1350 entry->parent_ip = parent_ip; 1445 entry->parent_ip = parent_ip;
1351 1446
1352 trace_nowake_buffer_unlock_commit(event, flags, pc); 1447 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1353 1448
1354 out: 1449 out:
1355 atomic_dec(&per_cpu(test_event_disable, cpu)); 1450 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1356 ftrace_preempt_enable(resched); 1451 ftrace_preempt_enable(resched);
1357} 1452}
1358 1453
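
function_test_events_call() protects itself against recursion by bumping a per-CPU counter and only recording when it is the sole holder. Reduced to its core and moved to userspace C11 atomics (the kernel uses atomic_t plus per-CPU variables and preemption control), the guard looks roughly like this:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int test_event_disable;   /* one counter per CPU in the kernel version */

static void traced_callback(void)
{
        /* The first entrant sees 1; nested or concurrent entries see more and bail. */
        if (atomic_fetch_add(&test_event_disable, 1) + 1 != 1)
                goto out;

        printf("recording one test event\n");
out:
        atomic_fetch_sub(&test_event_disable, 1);
}

int main(void)
{
        traced_callback();
        return 0;
}
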
@@ -1376,10 +1471,10 @@ static __init void event_trace_self_test_with_function(void)
1376 1471
1377static __init int event_trace_self_tests_init(void) 1472static __init int event_trace_self_tests_init(void)
1378{ 1473{
1379 1474 if (!tracing_selftest_disabled) {
1380 event_trace_self_tests(); 1475 event_trace_self_tests();
1381 1476 event_trace_self_test_with_function();
1382 event_trace_self_test_with_function(); 1477 }
1383 1478
1384 return 0; 1479 return 0;
1385} 1480}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index db6e54bdb596..23245785927f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,8 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30static DEFINE_MUTEX(filter_mutex);
31
32enum filter_op_ids 30enum filter_op_ids
33{ 31{
34 OP_OR, 32 OP_OR,
@@ -123,6 +121,47 @@ struct filter_parse_state {
123 } operand; 121 } operand;
124}; 122};
125 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
126DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
127DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
128DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
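
DEFINE_COMPARISON_PRED() above stamps out one comparison function per integer type so each predicate can carry a single function pointer. Expanded by hand for a 64-bit signed field, and with the pred struct trimmed to the fields the sketch needs, the generated code is approximately:

#include <stdint.h>

enum { OP_LT, OP_LE, OP_GT, OP_GE };

struct filter_pred_demo {
        int op;                 /* one of the comparison operators above */
        int offset;             /* byte offset of the field inside the event record */
        unsigned long long val; /* user-supplied constant to compare against */
};

/* Roughly what DEFINE_COMPARISON_PRED(s64) expands to, with simplified arguments. */
static int filter_pred_s64_demo(struct filter_pred_demo *pred, void *event)
{
        int64_t *addr = (int64_t *)((char *)event + pred->offset);
        int64_t val = (int64_t)pred->val;

        switch (pred->op) {
        case OP_LT: return *addr < val;
        case OP_LE: return *addr <= val;
        case OP_GT: return *addr > val;
        case OP_GE: return *addr >= val;
        }
        return 0;
}

int main(void)
{
        int64_t event[2] = { 0, 42 };
        struct filter_pred_demo pred = { .op = OP_GT, .offset = 8, .val = 40 };

        return filter_pred_s64_demo(&pred, event) ? 0 : 1;      /* 42 > 40, exits 0 */
}
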
@@ -165,6 +204,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
165 return match; 204 return match;
166} 205}
167 206
207/* Filter predicate for char * pointers */
208static int filter_pred_pchar(struct filter_pred *pred, void *event,
209 int val1, int val2)
210{
211 char **addr = (char **)(event + pred->offset);
212 int cmp, match;
213
214 cmp = strncmp(*addr, pred->str_val, pred->str_len);
215
216 match = (!cmp) ^ pred->not;
217
218 return match;
219}
220
168/* 221/*
169 * Filter predicate for dynamic sized arrays of characters. 222 * Filter predicate for dynamic sized arrays of characters.
170 * These are implemented through a list of strings at the end 223 * These are implemented through a list of strings at the end
@@ -178,11 +231,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
178static int filter_pred_strloc(struct filter_pred *pred, void *event, 231static int filter_pred_strloc(struct filter_pred *pred, void *event,
179 int val1, int val2) 232 int val1, int val2)
180{ 233{
181 int str_loc = *(int *)(event + pred->offset); 234 u32 str_item = *(u32 *)(event + pred->offset);
235 int str_loc = str_item & 0xffff;
236 int str_len = str_item >> 16;
182 char *addr = (char *)(event + str_loc); 237 char *addr = (char *)(event + str_loc);
183 int cmp, match; 238 int cmp, match;
184 239
185 cmp = strncmp(addr, pred->str_val, pred->str_len); 240 cmp = strncmp(addr, pred->str_val, str_len);
186 241
187 match = (!cmp) ^ pred->not; 242 match = (!cmp) ^ pred->not;
188 243
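
filter_pred_strloc() now decodes one u32 that packs the dynamic string's buffer offset into the low 16 bits and its length into the high 16 bits. A small stand-alone sketch of that pack/unpack convention (function names here are invented for illustration):

#include <assert.h>
#include <stdint.h>

static uint32_t pack_data_loc(uint16_t offset, uint16_t len)
{
        return (uint32_t)len << 16 | offset;
}

static void unpack_data_loc(uint32_t item, uint16_t *offset, uint16_t *len)
{
        *offset = item & 0xffff;        /* low 16 bits: where the string starts */
        *len = item >> 16;              /* high 16 bits: how long it is */
}

int main(void)
{
        uint16_t off, len;

        unpack_data_loc(pack_data_loc(64, 12), &off, &len);
        assert(off == 64 && len == 12);
        return 0;
}

Using the recorded length for the strncmp() bound, as the new code does, keeps the comparison inside the event payload even when the user-supplied string is longer.
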
@@ -294,12 +349,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
294{ 349{
295 struct event_filter *filter = call->filter; 350 struct event_filter *filter = call->filter;
296 351
297 mutex_lock(&filter_mutex); 352 mutex_lock(&event_mutex);
298 if (filter->filter_string) 353 if (filter && filter->filter_string)
299 trace_seq_printf(s, "%s\n", filter->filter_string); 354 trace_seq_printf(s, "%s\n", filter->filter_string);
300 else 355 else
301 trace_seq_printf(s, "none\n"); 356 trace_seq_printf(s, "none\n");
302 mutex_unlock(&filter_mutex); 357 mutex_unlock(&event_mutex);
303} 358}
304 359
305void print_subsystem_event_filter(struct event_subsystem *system, 360void print_subsystem_event_filter(struct event_subsystem *system,
@@ -307,12 +362,12 @@ void print_subsystem_event_filter(struct event_subsystem *system,
307{ 362{
308 struct event_filter *filter = system->filter; 363 struct event_filter *filter = system->filter;
309 364
310 mutex_lock(&filter_mutex); 365 mutex_lock(&event_mutex);
311 if (filter->filter_string) 366 if (filter && filter->filter_string)
312 trace_seq_printf(s, "%s\n", filter->filter_string); 367 trace_seq_printf(s, "%s\n", filter->filter_string);
313 else 368 else
314 trace_seq_printf(s, "none\n"); 369 trace_seq_printf(s, "none\n");
315 mutex_unlock(&filter_mutex); 370 mutex_unlock(&event_mutex);
316} 371}
317 372
318static struct ftrace_event_field * 373static struct ftrace_event_field *
@@ -376,26 +431,32 @@ void destroy_preds(struct ftrace_event_call *call)
376 struct event_filter *filter = call->filter; 431 struct event_filter *filter = call->filter;
377 int i; 432 int i;
378 433
434 if (!filter)
435 return;
436
379 for (i = 0; i < MAX_FILTER_PRED; i++) { 437 for (i = 0; i < MAX_FILTER_PRED; i++) {
380 if (filter->preds[i]) 438 if (filter->preds[i])
381 filter_free_pred(filter->preds[i]); 439 filter_free_pred(filter->preds[i]);
382 } 440 }
383 kfree(filter->preds); 441 kfree(filter->preds);
442 kfree(filter->filter_string);
384 kfree(filter); 443 kfree(filter);
385 call->filter = NULL; 444 call->filter = NULL;
386} 445}
387 446
388int init_preds(struct ftrace_event_call *call) 447static int init_preds(struct ftrace_event_call *call)
389{ 448{
390 struct event_filter *filter; 449 struct event_filter *filter;
391 struct filter_pred *pred; 450 struct filter_pred *pred;
392 int i; 451 int i;
393 452
453 if (call->filter)
454 return 0;
455
394 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
395 if (!call->filter) 457 if (!call->filter)
396 return -ENOMEM; 458 return -ENOMEM;
397 459
398 call->filter_active = 0;
399 filter->n_preds = 0; 460 filter->n_preds = 0;
400 461
401 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 462 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -417,33 +478,56 @@ oom:
417 478
418 return -ENOMEM; 479 return -ENOMEM;
419} 480}
420EXPORT_SYMBOL_GPL(init_preds);
421 481
422static void filter_free_subsystem_preds(struct event_subsystem *system) 482static int init_subsystem_preds(struct event_subsystem *system)
423{ 483{
424 struct event_filter *filter = system->filter;
425 struct ftrace_event_call *call; 484 struct ftrace_event_call *call;
426 int i; 485 int err;
427 486
428 if (filter->n_preds) { 487 list_for_each_entry(call, &ftrace_events, list) {
429 for (i = 0; i < filter->n_preds; i++) 488 if (!call->define_fields)
430 filter_free_pred(filter->preds[i]); 489 continue;
431 kfree(filter->preds); 490
432 filter->preds = NULL; 491 if (strcmp(call->system, system->name) != 0)
433 filter->n_preds = 0; 492 continue;
493
494 err = init_preds(call);
495 if (err)
496 return err;
434 } 497 }
435 498
436 mutex_lock(&event_mutex); 499 return 0;
500}
501
502enum {
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{
511 struct ftrace_event_call *call;
512
437 list_for_each_entry(call, &ftrace_events, list) { 513 list_for_each_entry(call, &ftrace_events, list) {
438 if (!call->define_fields) 514 if (!call->define_fields)
439 continue; 515 continue;
440 516
441 if (!strcmp(call->system, system->name)) { 517 if (strcmp(call->system, system->name) != 0)
442 filter_disable_preds(call); 518 continue;
443 remove_filter_string(call->filter); 519
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
444 } 523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call);
529 remove_filter_string(call->filter);
445 } 530 }
446 mutex_unlock(&event_mutex);
447} 531}
448 532
449static int filter_add_pred_fn(struct filter_parse_state *ps, 533static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -471,12 +555,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
471 return 0; 555 return 0;
472} 556}
473 557
474enum { 558int filter_assign_type(const char *type)
475 FILTER_STATIC_STRING = 1,
476 FILTER_DYN_STRING
477};
478
479static int is_string_field(const char *type)
480{ 559{
481 if (strstr(type, "__data_loc") && strstr(type, "char")) 560 if (strstr(type, "__data_loc") && strstr(type, "char"))
482 return FILTER_DYN_STRING; 561 return FILTER_DYN_STRING;
@@ -484,12 +563,19 @@ static int is_string_field(const char *type)
484 if (strchr(type, '[') && strstr(type, "char")) 563 if (strchr(type, '[') && strstr(type, "char"))
485 return FILTER_STATIC_STRING; 564 return FILTER_STATIC_STRING;
486 565
487 return 0; 566 return FILTER_OTHER;
567}
568
569static bool is_string_field(struct ftrace_event_field *field)
570{
571 return field->filter_type == FILTER_DYN_STRING ||
572 field->filter_type == FILTER_STATIC_STRING ||
573 field->filter_type == FILTER_PTR_STRING;
488} 574}
489 575
490static int is_legal_op(struct ftrace_event_field *field, int op) 576static int is_legal_op(struct ftrace_event_field *field, int op)
491{ 577{
492 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
493 return 0; 579 return 0;
494 580
495 return 1; 581 return 1;
@@ -540,21 +626,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
540 626
541static int filter_add_pred(struct filter_parse_state *ps, 627static int filter_add_pred(struct filter_parse_state *ps,
542 struct ftrace_event_call *call, 628 struct ftrace_event_call *call,
543 struct filter_pred *pred) 629 struct filter_pred *pred,
630 bool dry_run)
544{ 631{
545 struct ftrace_event_field *field; 632 struct ftrace_event_field *field;
546 filter_pred_fn_t fn; 633 filter_pred_fn_t fn;
547 unsigned long long val; 634 unsigned long long val;
548 int string_type; 635 int ret;
549 636
550 pred->fn = filter_pred_none; 637 pred->fn = filter_pred_none;
551 638
552 if (pred->op == OP_AND) { 639 if (pred->op == OP_AND) {
553 pred->pop_n = 2; 640 pred->pop_n = 2;
554 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 641 fn = filter_pred_and;
642 goto add_pred_fn;
555 } else if (pred->op == OP_OR) { 643 } else if (pred->op == OP_OR) {
556 pred->pop_n = 2; 644 pred->pop_n = 2;
557 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 645 fn = filter_pred_or;
646 goto add_pred_fn;
558 } 647 }
559 648
560 field = find_event_field(call, pred->field_name); 649 field = find_event_field(call, pred->field_name);
@@ -570,62 +659,55 @@ static int filter_add_pred(struct filter_parse_state *ps,
570 return -EINVAL; 659 return -EINVAL;
571 } 660 }
572 661
573 string_type = is_string_field(field->type); 662 if (is_string_field(field)) {
574 if (string_type) { 663 pred->str_len = field->size;
575 if (string_type == FILTER_STATIC_STRING) 664
665 if (field->filter_type == FILTER_STATIC_STRING)
576 fn = filter_pred_string; 666 fn = filter_pred_string;
577 else 667 else if (field->filter_type == FILTER_DYN_STRING)
578 fn = filter_pred_strloc; 668 fn = filter_pred_strloc;
579 pred->str_len = field->size; 669 else {
580 if (pred->op == OP_NE) 670 fn = filter_pred_pchar;
581 pred->not = 1; 671 pred->str_len = strlen(pred->str_val);
582 return filter_add_pred_fn(ps, call, pred, fn); 672 }
583 } else { 673 } else {
584 if (strict_strtoull(pred->str_val, 0, &val)) { 674 if (field->is_signed)
675 ret = strict_strtoll(pred->str_val, 0, &val);
676 else
677 ret = strict_strtoull(pred->str_val, 0, &val);
678 if (ret) {
585 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 679 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
586 return -EINVAL; 680 return -EINVAL;
587 } 681 }
588 pred->val = val; 682 pred->val = val;
589 }
590 683
591 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 684 fn = select_comparison_fn(pred->op, field->size,
592 if (!fn) { 685 field->is_signed);
593 parse_error(ps, FILT_ERR_INVALID_OP, 0); 686 if (!fn) {
594 return -EINVAL; 687 parse_error(ps, FILT_ERR_INVALID_OP, 0);
688 return -EINVAL;
689 }
595 } 690 }
596 691
597 if (pred->op == OP_NE) 692 if (pred->op == OP_NE)
598 pred->not = 1; 693 pred->not = 1;
599 694
600 return filter_add_pred_fn(ps, call, pred, fn); 695add_pred_fn:
696 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn);
698 return 0;
601} 699}
602 700
603static int filter_add_subsystem_pred(struct filter_parse_state *ps, 701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
604 struct event_subsystem *system, 702 struct event_subsystem *system,
605 struct filter_pred *pred, 703 struct filter_pred *pred,
606 char *filter_string) 704 char *filter_string,
705 bool dry_run)
607{ 706{
608 struct event_filter *filter = system->filter;
609 struct ftrace_event_call *call; 707 struct ftrace_event_call *call;
610 int err = 0; 708 int err = 0;
709 bool fail = true;
611 710
612 if (!filter->preds) {
613 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
614 GFP_KERNEL);
615
616 if (!filter->preds)
617 return -ENOMEM;
618 }
619
620 if (filter->n_preds == MAX_FILTER_PRED) {
621 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
622 return -ENOSPC;
623 }
624
625 filter->preds[filter->n_preds] = pred;
626 filter->n_preds++;
627
628 mutex_lock(&event_mutex);
629 list_for_each_entry(call, &ftrace_events, list) { 711 list_for_each_entry(call, &ftrace_events, list) {
630 712
631 if (!call->define_fields) 713 if (!call->define_fields)
@@ -634,18 +716,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
634 if (strcmp(call->system, system->name)) 716 if (strcmp(call->system, system->name))
635 continue; 717 continue;
636 718
637 err = filter_add_pred(ps, call, pred); 719 if (call->filter->no_reset)
638 if (err) { 720 continue;
639 mutex_unlock(&event_mutex); 721
640 filter_free_subsystem_preds(system); 722 err = filter_add_pred(ps, call, pred, dry_run);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 723 if (err)
642 goto out; 724 call->filter->no_reset = true;
643 } 725 else
644 replace_filter_string(call->filter, filter_string); 726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
645 } 730 }
646 mutex_unlock(&event_mutex); 731
647out: 732 if (fail) {
648 return err; 733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0;
649} 737}
650 738
651static void parse_init(struct filter_parse_state *ps, 739static void parse_init(struct filter_parse_state *ps,
@@ -1004,12 +1092,14 @@ static int check_preds(struct filter_parse_state *ps)
1004static int replace_preds(struct event_subsystem *system, 1092static int replace_preds(struct event_subsystem *system,
1005 struct ftrace_event_call *call, 1093 struct ftrace_event_call *call,
1006 struct filter_parse_state *ps, 1094 struct filter_parse_state *ps,
1007 char *filter_string) 1095 char *filter_string,
1096 bool dry_run)
1008{ 1097{
1009 char *operand1 = NULL, *operand2 = NULL; 1098 char *operand1 = NULL, *operand2 = NULL;
1010 struct filter_pred *pred; 1099 struct filter_pred *pred;
1011 struct postfix_elt *elt; 1100 struct postfix_elt *elt;
1012 int err; 1101 int err;
1102 int n_preds = 0;
1013 1103
1014 err = check_preds(ps); 1104 err = check_preds(ps);
1015 if (err) 1105 if (err)
@@ -1028,19 +1118,14 @@ static int replace_preds(struct event_subsystem *system,
1028 continue; 1118 continue;
1029 } 1119 }
1030 1120
1121 if (n_preds++ == MAX_FILTER_PRED) {
1122 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1123 return -ENOSPC;
1124 }
1125
1031 if (elt->op == OP_AND || elt->op == OP_OR) { 1126 if (elt->op == OP_AND || elt->op == OP_OR) {
1032 pred = create_logical_pred(elt->op); 1127 pred = create_logical_pred(elt->op);
1033 if (call) { 1128 goto add_pred;
1034 err = filter_add_pred(ps, call, pred);
1035 filter_free_pred(pred);
1036 } else
1037 err = filter_add_subsystem_pred(ps, system,
1038 pred, filter_string);
1039 if (err)
1040 return err;
1041
1042 operand1 = operand2 = NULL;
1043 continue;
1044 } 1129 }
1045 1130
1046 if (!operand1 || !operand2) { 1131 if (!operand1 || !operand2) {
@@ -1049,12 +1134,15 @@ static int replace_preds(struct event_subsystem *system,
1049 } 1134 }
1050 1135
1051 pred = create_pred(elt->op, operand1, operand2); 1136 pred = create_pred(elt->op, operand1, operand2);
1052 if (call) { 1137add_pred:
1053 err = filter_add_pred(ps, call, pred); 1138 if (!pred)
1054 filter_free_pred(pred); 1139 return -ENOMEM;
1055 } else 1140 if (call)
1141 err = filter_add_pred(ps, call, pred, false);
1142 else
1056 err = filter_add_subsystem_pred(ps, system, pred, 1143 err = filter_add_subsystem_pred(ps, system, pred,
1057 filter_string); 1144 filter_string, dry_run);
1145 filter_free_pred(pred);
1058 if (err) 1146 if (err)
1059 return err; 1147 return err;
1060 1148
@@ -1070,12 +1158,16 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1070 1158
1071 struct filter_parse_state *ps; 1159 struct filter_parse_state *ps;
1072 1160
1073 mutex_lock(&filter_mutex); 1161 mutex_lock(&event_mutex);
1162
1163 err = init_preds(call);
1164 if (err)
1165 goto out_unlock;
1074 1166
1075 if (!strcmp(strstrip(filter_string), "0")) { 1167 if (!strcmp(strstrip(filter_string), "0")) {
1076 filter_disable_preds(call); 1168 filter_disable_preds(call);
1077 remove_filter_string(call->filter); 1169 remove_filter_string(call->filter);
1078 mutex_unlock(&filter_mutex); 1170 mutex_unlock(&event_mutex);
1079 return 0; 1171 return 0;
1080 } 1172 }
1081 1173
@@ -1094,7 +1186,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1094 goto out; 1186 goto out;
1095 } 1187 }
1096 1188
1097 err = replace_preds(NULL, call, ps, filter_string); 1189 err = replace_preds(NULL, call, ps, filter_string, false);
1098 if (err) 1190 if (err)
1099 append_filter_err(ps, call->filter); 1191 append_filter_err(ps, call->filter);
1100 1192
@@ -1103,7 +1195,7 @@ out:
1103 postfix_clear(ps); 1195 postfix_clear(ps);
1104 kfree(ps); 1196 kfree(ps);
1105out_unlock: 1197out_unlock:
1106 mutex_unlock(&filter_mutex); 1198 mutex_unlock(&event_mutex);
1107 1199
1108 return err; 1200 return err;
1109} 1201}
@@ -1115,12 +1207,16 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1115 1207
1116 struct filter_parse_state *ps; 1208 struct filter_parse_state *ps;
1117 1209
1118 mutex_lock(&filter_mutex); 1210 mutex_lock(&event_mutex);
1211
1212 err = init_subsystem_preds(system);
1213 if (err)
1214 goto out_unlock;
1119 1215
1120 if (!strcmp(strstrip(filter_string), "0")) { 1216 if (!strcmp(strstrip(filter_string), "0")) {
1121 filter_free_subsystem_preds(system); 1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1122 remove_filter_string(system->filter); 1218 remove_filter_string(system->filter);
1123 mutex_unlock(&filter_mutex); 1219 mutex_unlock(&event_mutex);
1124 return 0; 1220 return 0;
1125 } 1221 }
1126 1222
@@ -1129,7 +1225,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1129 if (!ps) 1225 if (!ps)
1130 goto out_unlock; 1226 goto out_unlock;
1131 1227
1132 filter_free_subsystem_preds(system);
1133 replace_filter_string(system->filter, filter_string); 1228 replace_filter_string(system->filter, filter_string);
1134 1229
1135 parse_init(ps, filter_ops, filter_string); 1230 parse_init(ps, filter_ops, filter_string);
@@ -1139,16 +1234,30 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1139 goto out; 1234 goto out;
1140 } 1235 }
1141 1236
1142 err = replace_preds(system, NULL, ps, filter_string); 1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1143 if (err) 1238
1239 /* try to see the filter can be applied to which events */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1144 append_filter_err(ps, system->filter); 1242 append_filter_err(ps, system->filter);
1243 goto out;
1244 }
1245
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1247
1248 /* really apply the filter to the events */
1249 err = replace_preds(system, NULL, ps, filter_string, false);
1250 if (err) {
1251 append_filter_err(ps, system->filter);
1252 filter_free_subsystem_preds(system, 2);
1253 }
1145 1254
1146out: 1255out:
1147 filter_opstack_clear(ps); 1256 filter_opstack_clear(ps);
1148 postfix_clear(ps); 1257 postfix_clear(ps);
1149 kfree(ps); 1258 kfree(ps);
1150out_unlock: 1259out_unlock:
1151 mutex_unlock(&filter_mutex); 1260 mutex_unlock(&event_mutex);
1152 1261
1153 return err; 1262 return err;
1154} 1263}
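
apply_subsystem_event_filter() above now calls replace_preds() twice: a dry run that only checks which events can take the filter, then a second pass that really installs it, so a partial failure no longer leaves half-applied state behind. The shape of that check-then-commit pattern, with stand-in names:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for replace_preds(); a dry run must not modify any event. */
static int apply_to_events(const char *filter, bool dry_run)
{
        printf("%s filter \"%s\"\n", dry_run ? "checking" : "installing", filter);
        return 0;       /* 0 on success, a negative error code otherwise */
}

static int set_subsystem_filter(const char *filter)
{
        int err;

        err = apply_to_events(filter, true);    /* pass 1: find out where it fits */
        if (err)
                return err;                     /* nothing was touched, safe to bail out */

        return apply_to_events(filter, false);  /* pass 2: really apply it */
}

int main(void)
{
        return set_subsystem_filter("common_pid != 0");
}
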
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..9753fcc61bc5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,116 +15,209 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
55 /* force cmpile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
83
84#undef __array
85#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \
88 offsetof(typeof(field), item), \
89 sizeof(field.item)); \
90 if (!ret) \
91 return 0;
35 92
36#undef TRACE_FIELD_SPECIAL 93#undef __array_desc
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 94#define __array_desc(type, container, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 97 offsetof(typeof(field), container.item), \
41 (unsigned int)sizeof(field.item)); \ 98 sizeof(field.container.item)); \
42 if (!ret) \ 99 if (!ret) \
43 return 0; 100 return 0;
44 101
45#undef TRACE_FIELD_ZERO_CHAR 102#undef __dynamic_array
46#define TRACE_FIELD_ZERO_CHAR(item) \ 103#define __dynamic_array(type, item) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
48 "offset:%u;\tsize:0;\n", \ 105 "offset:%zu;\tsize:0;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 106 offsetof(typeof(field), item)); \
50 if (!ret) \ 107 if (!ret) \
51 return 0; 108 return 0;
52 109
53#undef TRACE_FIELD_SIGN 110#undef F_printk
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
55 TRACE_FIELD(type, item, assign)
56 112
57#undef TP_RAW_FMT 113#undef __entry
58#define TP_RAW_FMT(args...) args 114#define __entry REC
59 115
60#undef TRACE_EVENT_FORMAT 116#undef FTRACE_ENTRY
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
62static int \ 118static int \
63ftrace_format_##call(struct trace_seq *s) \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
120 struct trace_seq *s) \
64{ \ 121{ \
65 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
66 int ret; \ 123 int ret = 0; \
67 \ 124 \
68 tstruct; \ 125 tstruct; \
69 \ 126 \
70 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
71 \ 128 \
72 return ret; \ 129 return ret; \
73} 130}
74 131
75#undef TRACE_EVENT_FORMAT_NOFILTER 132#include "trace_entries.h"
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 133
77 tpfmt) \ 134
78static int \ 135#undef __field
79ftrace_format_##call(struct trace_seq *s) \ 136#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \
138 offsetof(typeof(field), item), \
139 sizeof(field.item), \
140 is_signed_type(type), FILTER_OTHER); \
141 if (ret) \
142 return ret;
143
144#undef __field_desc
145#define __field_desc(type, container, item) \
146 ret = trace_define_field(event_call, #type, #item, \
147 offsetof(typeof(field), \
148 container.item), \
149 sizeof(field.container.item), \
150 is_signed_type(type), FILTER_OTHER); \
151 if (ret) \
152 return ret;
153
154#undef __array
155#define __array(type, item, len) \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), 0, FILTER_OTHER); \
160 if (ret) \
161 return ret;
162
163#undef __array_desc
164#define __array_desc(type, container, item, len) \
165 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
167 offsetof(typeof(field), \
168 container.item), \
169 sizeof(field.container.item), 0, \
170 FILTER_OTHER); \
171 if (ret) \
172 return ret;
173
174#undef __dynamic_array
175#define __dynamic_array(type, item)
176
177#undef FTRACE_ENTRY
178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
179int \
180ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
80{ \ 181{ \
81 struct args field; \ 182 struct struct_name field; \
82 int ret; \ 183 int ret; \
83 \ 184 \
84 tstruct; \ 185 ret = trace_define_common_fields(event_call); \
186 if (ret) \
187 return ret; \
85 \ 188 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 189 tstruct; \
87 \ 190 \
88 return ret; \ 191 return ret; \
89} 192}
90 193
91#include "trace_event_types.h" 194#include "trace_entries.h"
92
93#undef TRACE_ZERO_CHAR
94#define TRACE_ZERO_CHAR(arg)
95 195
96#undef TRACE_FIELD
97#define TRACE_FIELD(type, item, assign)\
98 entry->item = assign;
99 196
100#undef TRACE_FIELD 197#undef __field
101#define TRACE_FIELD(type, item, assign)\ 198#define __field(type, item)
102 entry->item = assign;
103 199
104#undef TRACE_FIELD_SIGN 200#undef __field_desc
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 201#define __field_desc(type, container, item)
106 TRACE_FIELD(type, item, assign)
107 202
108#undef TP_CMD 203#undef __array
109#define TP_CMD(cmd...) cmd 204#define __array(type, item, len)
110 205
111#undef TRACE_ENTRY 206#undef __array_desc
112#define TRACE_ENTRY entry 207#define __array_desc(type, container, item, len)
113 208
114#undef TRACE_FIELD_SPECIAL 209#undef __dynamic_array
115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 210#define __dynamic_array(type, item)
116 cmd;
117 211
118#undef TRACE_EVENT_FORMAT 212#undef FTRACE_ENTRY
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \ 214static int ftrace_raw_init_event_##call(void); \
122 \ 215 \
123struct ftrace_event_call __used \ 216struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \ 217__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \ 218__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \ 219 .name = #call, \
127 .id = proto, \ 220 .id = type, \
128 .system = __stringify(TRACE_SYSTEM), \ 221 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \ 222 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \ 223 .show_format = ftrace_format_##call, \
@@ -133,74 +226,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 226static int ftrace_raw_init_event_##call(void) \
134{ \ 227{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 228 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 229 return 0; \
138} \ 230} \
139 231
140#undef TRACE_EVENT_FORMAT_NOFILTER 232#include "trace_entries.h"
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
143 \
144struct ftrace_event_call __used \
145__attribute__((__aligned__(4))) \
146__attribute__((section("_ftrace_events"))) event_##call = { \
147 .name = #call, \
148 .id = proto, \
149 .system = __stringify(TRACE_SYSTEM), \
150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
206#include "trace_event_types.h"
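
This trace_export.c rewrite keeps each event's layout in trace_entries.h and re-includes that header several times under different definitions of __field(), __array() and FTRACE_ENTRY(), generating the format printer, the field definitions and the event registration from one description. A compact userspace illustration of the same multi-include "X-macro" technique, with a made-up item list instead of trace entries:

#include <stdio.h>

/* The single source of truth; trace_entries.h plays this role in the kernel. */
#define ITEM_LIST(X)    \
        X(alpha, 1)     \
        X(beta,  2)     \
        X(gamma, 3)

/* Pass 1: generate an enum from the list. */
#define AS_ENUM(name, id)       ITEM_##name = id,
enum item_id { ITEM_LIST(AS_ENUM) };
#undef AS_ENUM

/* Pass 2: generate a name table from the very same list. */
#define AS_STRING(name, id)     [id] = #name,
static const char *item_names[] = { ITEM_LIST(AS_STRING) };
#undef AS_STRING

int main(void)
{
        printf("%d -> %s\n", ITEM_beta, item_names[ITEM_beta]);
        return 0;
}
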
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -286,11 +288,9 @@ static int
286ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
287 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
288{ 290{
289 char str[KSYM_SYMBOL_LEN];
290 long count = (long)data; 291 long count = (long)data;
291 292
292 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%ps:", (void *)ip);
293 seq_printf(m, "%s:", str);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
@@ -300,8 +300,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
300 if (count == -1) 300 if (count == -1)
301 seq_printf(m, ":unlimited\n"); 301 seq_printf(m, ":unlimited\n");
302 else 302 else
303 seq_printf(m, ":count=%ld", count); 303 seq_printf(m, ":count=%ld\n", count);
304 seq_putc(m, '\n');
305 304
306 return 0; 305 return 0;
307} 306}
@@ -362,7 +361,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
362 out_reg: 361 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count); 362 ret = register_ftrace_function_probe(glob, ops, count);
364 363
365 return ret; 364 return ret < 0 ? ret : 0;
366} 365}
367 366
368static struct ftrace_func_command ftrace_traceon_cmd = { 367static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8b592418d8b2..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,12 +52,13 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
85 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
86 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
87 current->ret_stack[index].subtime = 0; 88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
88 *depth = index; 90 *depth = index;
89 91
90 return 0; 92 return 0;
@@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
92 94
93/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
94static void 96static void
95ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
96{ 99{
97 int index; 100 int index;
98 101
@@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
106 return; 109 return;
107 } 110 }
108 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
121 * Currently, x86_32 with optimize for size (-Os) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
109 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
110 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
111 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
@@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
117 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
118 * @return the original return address. 146 * @return the original return address.
119 */ 147 */
120unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
121{ 149{
122 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
123 unsigned long ret; 151 unsigned long ret;
124 152
125 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
126 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
127 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
128 barrier(); 156 barrier();
@@ -138,10 +166,123 @@ unsigned long ftrace_return_to_handler(void)
138 return ret; 166 return ret;
139} 167}
140 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
141static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
142{ 280{
143 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
144 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
145 if (ret) 286 if (ret)
146 return ret; 287 return ret;
147 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -149,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
149 return 0; 290 return 0;
150} 291}
151 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
152static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
153{ 299{
154 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
155 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
156} 302}
157 303
158static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
159{
160 if (nb / 100)
161 return 3;
162 if (nb / 10)
163 return 2;
164 return 1;
165}
166 305
167static enum print_line_t 306static enum print_line_t
168print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
169{ 308{
170 int i;
171 int ret; 309 int ret;
172 int log10_this = log10_cpu(cpu);
173 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
174
175 310
176 /* 311 /*
177 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
178 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
179 * email: 314 * email:
180 */ 315 */
181 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
182
183 /*
184 * Tricky - we space the CPU field according to the max
185 * number of online CPUs. On a 2-cpu system it would take
186 * a maximum of 1 digit - on a 128 cpu system it would
187 * take up to 3 digits:
188 */
189 for (i = 0; i < log10_all - log10_this; i++) {
190 ret = trace_seq_printf(s, " ");
191 if (!ret)
192 return TRACE_TYPE_PARTIAL_LINE;
193 }
194 ret = trace_seq_printf(s, "%d) ", cpu);
195 if (!ret) 317 if (!ret)
196 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
197 319
@@ -242,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
242} 364}
243 365
244 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
245/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
246static enum print_line_t 377static enum print_line_t
247verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -399,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
399 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
400 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
401 } 532 }
533
402 /* Proc */ 534 /* Proc */
403 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
404 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -537,11 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
537 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
538 } 670 }
539 671
540 ret = seq_print_ip_sym(s, call->func, 0); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
541 if (!ret)
542 return TRACE_TYPE_PARTIAL_LINE;
543
544 ret = trace_seq_printf(s, "();\n");
545 if (!ret) 673 if (!ret)
546 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
547 675
@@ -584,11 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
584 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
585 } 713 }
586 714
587 ret = seq_print_ip_sym(s, call->func, 0); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
588 if (!ret)
589 return TRACE_TYPE_PARTIAL_LINE;
590
591 ret = trace_seq_printf(s, "() {\n");
592 if (!ret) 716 if (!ret)
593 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
594 718
@@ -644,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
644 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
645 } 769 }
646 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
647 return 0; 778 return 0;
648} 779}
649 780
@@ -815,9 +946,16 @@ print_graph_function(struct trace_iterator *iter)
815 946
816 switch (entry->type) { 947 switch (entry->type) {
817 case TRACE_GRAPH_ENT: { 948 case TRACE_GRAPH_ENT: {
818 struct ftrace_graph_ent_entry *field; 949 /*
950 * print_graph_entry() may consume the current event,
951 * thus @field may become invalid, so we need to save it.
952 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack.
954 */
955 struct ftrace_graph_ent_entry *field, saved;
819 trace_assign_type(field, entry); 956 trace_assign_type(field, entry);
820 return print_graph_entry(field, s, iter); 957 saved = *field;
958 return print_graph_entry(&saved, s, iter);
821 } 959 }
822 case TRACE_GRAPH_RET: { 960 case TRACE_GRAPH_RET: {
823 struct ftrace_graph_ret_entry *field; 961 struct ftrace_graph_ret_entry *field;
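
The comment added above notes that print_graph_entry() may consume the current ring-buffer event, so the entry is copied to a stack variable while the pointer is still valid. The defensive pattern is just an early struct copy; a hedged sketch with invented names:

#include <stdio.h>

struct sample_entry {
        unsigned long func;
        int depth;
};

/* Pretend this may recycle the ring-buffer slot that *e points into. */
static void consume_and_maybe_invalidate(struct sample_entry *e)
{
        e->func = 0;    /* simulate the slot being reused */
}

static void handle_entry(struct sample_entry *field)
{
        struct sample_entry saved = *field;     /* small struct: cheap to copy */

        consume_and_maybe_invalidate(field);

        /* From here on, only the stack copy is trusted. */
        printf("func=%#lx depth=%d\n", saved.func, saved.depth);
}

int main(void)
{
        struct sample_entry e = { .func = 0xc0ffee, .depth = 1 };

        handle_entry(&e);
        return 0;
}
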
@@ -831,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
831 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
832} 970}
833 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
834static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
835{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
836 /* 1st line */ 1001 /* 1st line */
837 seq_printf(s, "# "); 1002 seq_printf(s, "#");
838 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
839 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
840 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
841 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
842 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
843 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
844 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
845 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
846 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
847 1014
848 /* 2nd line */ 1015 /* 2nd line */
849 seq_printf(s, "# "); 1016 seq_printf(s, "#");
850 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
851 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
852 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
853 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
854 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
855 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
856 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
857 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
858 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
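
print_lat_header() sizes its left margin by printing a prefix of a fixed spaces[] string through the "%.*s" precision specifier, so the latency flags line up under whichever optional columns are enabled. The same trick in isolation (the column widths below mirror the ones in the diff, but the option flags are plain ints here):

#include <stdio.h>

int main(void)
{
        static const char spaces[] = "                "    /* 16 spaces */
                                     "    "                /* 4 spaces */
                                     "                 ";  /* 17 spaces */
        int abs_time = 1, cpu = 1, proc = 0;
        int size = 0;

        if (abs_time)
                size += 16;     /* width of the TIME column */
        if (cpu)
                size += 4;      /* width of the CPU column */
        if (proc)
                size += 17;     /* width of the TASK/PID column */

        /* "%.*s" prints at most `size` characters of spaces[], i.e. the margin. */
        printf("#%.*s _-----=> irqs-off\n", size, spaces);
        printf("#%.*s / _----=> need-resched\n", size, spaces);
        return 0;
}
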
@@ -899,6 +1068,8 @@ static struct tracer graph_trace __read_mostly = {
899 1068
900static __init int init_graph_trace(void) 1069static __init int init_graph_trace(void)
901{ 1070{
1071 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1072
902 return register_tracer(&graph_trace); 1073 return register_tracer(&graph_trace);
903} 1074}
904 1075
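
A note on the graph-tracer hunks above: print_lat_header() and init_graph_trace() lean on two small printf idioms, "%.*s" to emit a caller-chosen amount of padding so the latency annotations line up under whichever optional columns are enabled, and snprintf(NULL, 0, ...) to measure how many bytes the largest CPU number needs. Below is a minimal userspace sketch of both; the column widths and CPU count are made up, and nothing beyond the C standard library is assumed.

    #include <stdio.h>

    int main(void)
    {
        /* Padding pool; "%.*s" prints at most `size` of these characters. */
        static const char spaces[] = "                                 ";
        int size = 4 + 17;  /* pretend the CPU and TASK/PID columns are enabled */

        printf("#%.*s _-----=> irqs-off\n", size, spaces);
        printf("#%.*s / _----=> need-resched\n", size, spaces);

        /* snprintf() with a NULL buffer returns the length the output would
         * need, here the width of the largest CPU id (15 for 16 CPUs). */
        int max_bytes_for_cpu = snprintf(NULL, 0, "%d", 16 - 1);
        printf("CPU column width: %d\n", max_bytes_for_cpu);

        return 0;
    }
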
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
@@ -178,7 +170,6 @@ out_unlock:
178out: 170out:
179 data->critical_sequence = max_sequence; 171 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 172 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 173 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 174}
184 175
@@ -208,7 +199,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 199 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 200 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 201 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 202
213 local_save_flags(flags); 203 local_save_flags(flags);
214 204
@@ -379,6 +369,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 369 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 370 /* make sure that the tracer is visible */
381 smp_wmb(); 371 smp_wmb();
372 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 373 start_irqsoff_tracer(tr);
383} 374}
384 375
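
The trace_irqsoff.c change drops the early nsecs_to_usecs() conversions (those values were never printed from here) and only publishes a new maximum when tracing is actually running, so a max-latency snapshot taken while tracing is stopped is not overwritten; the per-cpu reset also moves out of the hot path into __irqsoff_tracer_init() via tracing_reset_online_cpus(). A userspace sketch of the guarded-maximum idea follows; tracing_stopped and the helper name are illustrative stand-ins, not the kernel API.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t tracing_max_latency;  /* worst latency recorded so far */
    static bool tracing_stopped;          /* stand-in for is_tracing_stopped() */

    /* Record a candidate latency only if it beats the current maximum and
     * tracing has not been stopped in the meantime. */
    static void maybe_update_max(uint64_t delta)
    {
        if (delta <= tracing_max_latency)
            return;                          /* report_latency() equivalent */
        if (!tracing_stopped)
            tracing_max_latency = delta;     /* the kernel also snapshots the trace here */
    }

    int main(void)
    {
        maybe_update_max(120);
        tracing_stopped = true;
        maybe_update_max(500);               /* ignored: tracing is stopped */
        printf("max latency: %llu\n", (unsigned long long)tracing_max_latency);
        return 0;
    }
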
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
311 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 314 int pc = preempt_count();
313 315
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 316 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 317 sizeof(*entry), 0, pc);
316 if (!event) { 318 if (!event) {
317 atomic_inc(&dropped_count); 319 atomic_inc(&dropped_count);
@@ -319,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 321 }
320 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 323 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 327}
324 328
325void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +337,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
335{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
341 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 344 int pc = preempt_count();
339 345
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 346 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 347 sizeof(*entry), 0, pc);
342 if (!event) { 348 if (!event) {
343 atomic_inc(&dropped_count); 349 atomic_inc(&dropped_count);
@@ -345,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 351 }
346 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
347 entry->map = *map; 353 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 357}
350 358
351void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
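
Both mmiotrace hunks switch to the event-commit shape used throughout this series: reserve space in the ring buffer, fill the entry in place, then either commit it or let the event filter discard it silently. The sketch below is a plain-C stand-in for that flow; trace_buffer_lock_reserve(), filter_check_discard() and trace_buffer_unlock_commit() are only imitated here, not reproduced.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct event {
        char payload[32];
        bool committed;
    };

    static struct event ring[8];     /* toy ring buffer */
    static unsigned int head;

    /* Reserve a slot; it is not visible to readers until committed. */
    static struct event *reserve(void)
    {
        struct event *ev = &ring[head++ % 8];

        memset(ev, 0, sizeof(*ev));
        return ev;
    }

    /* Filter predicate: discard events whose payload matches. */
    static bool filter_discard(const struct event *ev)
    {
        return strstr(ev->payload, "drop") != NULL;
    }

    static void commit(struct event *ev)
    {
        ev->committed = true;
    }

    int main(void)
    {
        struct event *ev = reserve();

        snprintf(ev->payload, sizeof(ev->payload), "mmio rw at 0x%x", 0xf000);

        if (!filter_discard(ev))     /* mirrors filter_check_discard() */
            commit(ev);

        printf("committed: %d\n", ev->committed);
        return 0;
    }
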
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..f572f44c6e1e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
@@ -408,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
408 * since individual threads might have already quit! 407 * since individual threads might have already quit!
409 */ 408 */
410 rcu_read_lock(); 409 rcu_read_lock();
411 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
412 if (task) 411 if (task)
413 mm = get_task_mm(task); 412 mm = get_task_mm(task);
414 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -461,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
461 return ret; 460 return ret;
462} 461}
463 462
464static int 463/**
465lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
466{ 472{
467 int hardirq, softirq; 473 int hardirq, softirq;
468 char comm[TASK_COMM_LEN]; 474 int ret;
469 475
470 trace_find_cmdline(entry->pid, comm);
471 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
472 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
473 478
474 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
475 comm, entry->pid, cpu,
476 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
477 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
478 'X' : '.', 482 'X' : '.',
@@ -482,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
482 hardirq ? 'h' : softirq ? 's' : '.')) 486 hardirq ? 'h' : softirq ? 's' : '.'))
483 return 0; 487 return 0;
484 488
489 if (entry->lock_depth < 0)
490 ret = trace_seq_putc(s, '.');
491 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth);
493 if (!ret)
494 return 0;
495
485 if (entry->preempt_count) 496 if (entry->preempt_count)
486 return trace_seq_printf(s, "%x", entry->preempt_count); 497 return trace_seq_printf(s, "%x", entry->preempt_count);
487 return trace_seq_puts(s, "."); 498 return trace_seq_putc(s, '.');
499}
500
501static int
502lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
503{
504 char comm[TASK_COMM_LEN];
505
506 trace_find_cmdline(entry->pid, comm);
507
508 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
509 comm, entry->pid, cpu))
510 return 0;
511
512 return trace_print_lat_fmt(s, entry);
488} 513}
489 514
490static unsigned long preempt_mark_thresh = 100; 515static unsigned long preempt_mark_thresh = 100;
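
trace_print_lat_fmt() is split out of lat_print_generic() so other output paths (such as the graph tracer's new latency header) can reuse it. It prints one character per field, irqs-off, need-resched, hardirq/softirq, then the lock depth and preempt count, with '.' standing for "nothing notable". A hedged userspace sketch of that encoding, using made-up flag names and skipping the 'X' (irqs-nosupport) case:

    #include <stdio.h>

    #define FLAG_IRQS_OFF     0x1  /* illustrative flag bits */
    #define FLAG_NEED_RESCHED 0x2
    #define FLAG_HARDIRQ      0x4
    #define FLAG_SOFTIRQ      0x8

    static void print_lat_fmt(unsigned int flags, int lock_depth, int preempt_count)
    {
        printf("%c%c%c",
               (flags & FLAG_IRQS_OFF)     ? 'd' : '.',
               (flags & FLAG_NEED_RESCHED) ? 'N' : '.',
               (flags & FLAG_HARDIRQ) ? 'h' :
               (flags & FLAG_SOFTIRQ) ? 's' : '.');

        if (lock_depth < 0)
            putchar('.');              /* no lock held */
        else
            printf("%d", lock_depth);

        if (preempt_count)
            printf("%x", preempt_count);
        else
            putchar('.');

        putchar('\n');
    }

    int main(void)
    {
        print_lat_fmt(FLAG_IRQS_OFF | FLAG_HARDIRQ, 1, 2);  /* prints "d.h12" */
        print_lat_fmt(0, -1, 0);                            /* prints "....." */
        return 0;
    }
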
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8a30d9874cd4..fe1a00f1445a 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power; 39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
41 struct trace_power *entry; 42 struct trace_power *entry;
42 struct trace_array_cpu *data; 43 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace; 44 struct trace_array *tr = power_trace;
@@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it)
45 if (!trace_power_enabled) 46 if (!trace_power_enabled)
46 return; 47 return;
47 48
49 buffer = tr->buffer;
50
48 preempt_disable(); 51 preempt_disable();
49 it->end = ktime_get(); 52 it->end = ktime_get();
50 data = tr->data[smp_processor_id()]; 53 data = tr->data[smp_processor_id()];
51 54
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
53 sizeof(*entry), 0, 0); 56 sizeof(*entry), 0, 0);
54 if (!event) 57 if (!event)
55 goto out; 58 goto out;
56 entry = ring_buffer_event_data(event); 59 entry = ring_buffer_event_data(event);
57 entry->state_data = *it; 60 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event)) 61 if (!filter_check_discard(call, entry, buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0); 62 trace_buffer_unlock_commit(buffer, event, 0, 0);
60 out: 63 out:
61 preempt_enable(); 64 preempt_enable();
62} 65}
@@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
66{ 69{
67 struct ftrace_event_call *call = &event_power; 70 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event; 71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
69 struct trace_power *entry; 73 struct trace_power *entry;
70 struct trace_array_cpu *data; 74 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace; 75 struct trace_array *tr = power_trace;
@@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
73 if (!trace_power_enabled) 77 if (!trace_power_enabled)
74 return; 78 return;
75 79
80 buffer = tr->buffer;
81
76 memset(it, 0, sizeof(struct power_trace)); 82 memset(it, 0, sizeof(struct power_trace));
77 it->state = level; 83 it->state = level;
78 it->type = type; 84 it->type = type;
@@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
81 it->end = it->stamp; 87 it->end = it->stamp;
82 data = tr->data[smp_processor_id()]; 88 data = tr->data[smp_processor_id()];
83 89
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
85 sizeof(*entry), 0, 0); 91 sizeof(*entry), 0, 0);
86 if (!event) 92 if (!event)
87 goto out; 93 goto out;
88 entry = ring_buffer_event_data(event); 94 entry = ring_buffer_event_data(event);
89 entry->state_data = *it; 95 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event)) 96 if (!filter_check_discard(call, entry, buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0); 97 trace_buffer_unlock_commit(buffer, event, 0, 0);
92 out: 98 out:
93 preempt_enable(); 99 preempt_enable();
94} 100}
@@ -144,14 +150,12 @@ static void power_trace_reset(struct trace_array *tr)
144 150
145static int power_trace_init(struct trace_array *tr) 151static int power_trace_init(struct trace_array *tr)
146{ 152{
147 int cpu;
148 power_trace = tr; 153 power_trace = tr;
149 154
150 trace_power_enabled = 1; 155 trace_power_enabled = 1;
151 tracing_power_register(); 156 tracing_power_register();
152 157
153 for_each_cpu(cpu, cpu_possible_mask) 158 tracing_reset_online_cpus(tr);
154 tracing_reset(tr, cpu);
155 return 0; 159 return 0;
156} 160}
157 161
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b62..687699d365ae 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
155EXPORT_SYMBOL_GPL(__ftrace_vprintk); 155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156 156
157static void * 157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos) 158t_start(struct seq_file *m, loff_t *pos)
159{ 159{
160 const char **fmt = m->private; 160 const char **fmt = __start___trace_bprintk_fmt + *pos;
161 const char **next = fmt;
162
163 (*pos)++;
164 161
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) 162 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL; 163 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt; 164 return fmt;
172} 165}
173 166
174static void *t_start(struct seq_file *m, loff_t *pos) 167static void *t_next(struct seq_file *m, void * v, loff_t *pos)
175{ 168{
176 return t_next(m, NULL, pos); 169 (*pos)++;
170 return t_start(m, pos);
177} 171}
178 172
179static int t_show(struct seq_file *m, void *v) 173static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
182 const char *str = *fmt; 176 const char *str = *fmt;
183 int i; 177 int i;
184 178
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 179 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
186 180
187 /* 181 /*
188 * Tabs and new lines need to be converted. 182 * Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
224static int 218static int
225ftrace_formats_open(struct inode *inode, struct file *file) 219ftrace_formats_open(struct inode *inode, struct file *file)
226{ 220{
227 int ret; 221 return seq_open(file, &show_format_seq_ops);
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236} 222}
237 223
238static const struct file_operations ftrace_formats_fops = { 224static const struct file_operations ftrace_formats_fops = {
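
The trace_printk change is a seq_file cleanup that reappears in trace_stack.c below: instead of threading a cursor through m->private, t_start() derives the element directly from *pos, and t_next() simply bumps *pos and calls t_start() again, which also removes the need to seed m->private in the open routine. A minimal userspace sketch of that position-driven iterator, with a NULL-terminated array standing in for the __trace_bprintk_fmt section:

    #include <stdio.h>

    static const char *formats[] = { "fmt one", "fmt two", "fmt three", NULL };

    /* Map the position straight to an element; no private cursor to maintain. */
    static const char **fmt_start(long long *pos)
    {
        long long last = (long long)(sizeof(formats) / sizeof(formats[0])) - 1;

        if (*pos >= last)        /* the final slot is the NULL terminator */
            return NULL;
        return &formats[*pos];
    }

    /* next() is just "advance the position and restart from it". */
    static const char **fmt_next(long long *pos)
    {
        (*pos)++;
        return fmt_start(pos);
    }

    int main(void)
    {
        long long pos = 0;

        for (const char **f = fmt_start(&pos); f; f = fmt_next(&pos))
            printf("%s\n", *f);
        return 0;
    }
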
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -186,11 +177,6 @@ out:
186 177
187static void __wakeup_reset(struct trace_array *tr) 178static void __wakeup_reset(struct trace_array *tr)
188{ 179{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 180 wakeup_cpu = -1;
195 wakeup_prio = -1; 181 wakeup_prio = -1;
196 182
@@ -204,6 +190,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 190{
205 unsigned long flags; 191 unsigned long flags;
206 192
193 tracing_reset_online_cpus(tr);
194
207 local_irq_save(flags); 195 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 196 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 197 __wakeup_reset(tr);
@@ -247,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
247 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
248 236
249 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
250 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
251 240
252 wakeup_task = p; 241 wakeup_task = p;
@@ -296,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
296 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
297 } 286 }
298 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
299 wakeup_reset(tr); 295 wakeup_reset(tr);
300 296
301 /* 297 /*
@@ -328,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
328 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
329 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
330 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
331} 328}
332 329
333static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192
193 (*pos)++;
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201 192
202 if (i >= max_stack_trace.nr_entries || 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239 227
240 sprint_symbol(str, addr); 228 return seq_printf(m, "%pF\n", (void *)addr);
241
242 return seq_printf(m, "%s\n", str);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
@@ -301,17 +284,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 284
302static int stack_trace_open(struct inode *inode, struct file *file) 285static int stack_trace_open(struct inode *inode, struct file *file)
303{ 286{
304 int ret; 287 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 288}
310 289
311static const struct file_operations stack_trace_fops = { 290static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 291 .open = stack_trace_open,
313 .read = seq_read, 292 .read = seq_read,
314 .llseek = seq_lseek, 293 .llseek = seq_lseek,
294 .release = seq_release,
315}; 295};
316 296
317int 297int
@@ -326,10 +306,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 307
328 if (ret || !write || 308 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 310 goto out;
331 311
332 last_stack_tracer_enabled = stack_tracer_enabled; 312 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 313
334 if (stack_tracer_enabled) 314 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 315 register_ftrace_function(&trace_ops);
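
One detail in the stack-tracer sysctl hunk is easy to miss: the remembered state is compared against !!stack_tracer_enabled, so any non-zero value written by userspace is folded to 1 before deciding whether the ftrace hook really needs to be registered or removed. Without the normalisation, writing 2 to an already-enabled tracer would look like a state change. A tiny sketch of the idiom (variable names are made up):

    #include <stdio.h>

    static int enabled;       /* value last written through the knob */
    static int last_enabled;  /* normalised 0/1 state we acted on */

    /* Returns 1 if the tracer actually has to be (un)registered. */
    static int state_changed(int new_value)
    {
        enabled = new_value;
        if (last_enabled == !!enabled)   /* !! folds any non-zero value to 1 */
            return 0;
        last_enabled = !!enabled;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", state_changed(1));  /* 1: off -> on */
        printf("%d\n", state_changed(2));  /* 0: still on, just a different non-zero */
        printf("%d\n", state_changed(0));  /* 1: on -> off */
        return 0;
    }
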
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4c..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,26 +68,35 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
73 } 76 }
74} 77}
75 78
76static void reset_stat_session(struct stat_session *session) 79static void __reset_stat_session(struct stat_session *session)
77{ 80{
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
85 88
89static void reset_stat_session(struct stat_session *session)
90{
91 mutex_lock(&session->stat_mutex);
92 __reset_stat_session(session);
93 mutex_unlock(&session->stat_mutex);
94}
95
86static void destroy_session(struct stat_session *session) 96static void destroy_session(struct stat_session *session)
87{ 97{
88 debugfs_remove(session->file); 98 debugfs_remove(session->file);
89 reset_stat_session(session); 99 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 100 mutex_destroy(&session->stat_mutex);
91 kfree(session); 101 kfree(session);
92} 102}
@@ -150,7 +160,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 160 int i;
151 161
152 mutex_lock(&session->stat_mutex); 162 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 163 __reset_stat_session(session);
154 164
155 if (!ts->stat_cmp) 165 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 166 ts->stat_cmp = dummy_cmp;
@@ -183,7 +193,7 @@ exit:
183 return ret; 193 return ret;
184 194
185exit_free_rbtree: 195exit_free_rbtree:
186 reset_stat_session(session); 196 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 197 mutex_unlock(&session->stat_mutex);
188 return ret; 198 return ret;
189} 199}
@@ -193,23 +203,23 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
193{ 203{
194 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
195 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
196 int i; 207 int i;
197 208
198 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
199 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
200 211
201 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) { 213 if (session->ts->stat_headers) {
203 (*pos)++; 214 if (n == 0)
204 return SEQ_START_TOKEN; 215 return SEQ_START_TOKEN;
216 n--;
205 } 217 }
206 218
207 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
209 node = rb_next(node); 221 node = rb_next(node);
210 222
211 (*pos)++;
212
213 return node; 223 return node;
214} 224}
215 225
@@ -254,16 +264,21 @@ static const struct seq_operations trace_stat_seq_ops = {
254static int tracing_stat_open(struct inode *inode, struct file *file) 264static int tracing_stat_open(struct inode *inode, struct file *file)
255{ 265{
256 int ret; 266 int ret;
257 267 struct seq_file *m;
258 struct stat_session *session = inode->i_private; 268 struct stat_session *session = inode->i_private;
259 269
270 ret = stat_seq_init(session);
271 if (ret)
272 return ret;
273
260 ret = seq_open(file, &trace_stat_seq_ops); 274 ret = seq_open(file, &trace_stat_seq_ops);
261 if (!ret) { 275 if (ret) {
262 struct seq_file *m = file->private_data; 276 reset_stat_session(session);
263 m->private = session; 277 return ret;
264 ret = stat_seq_init(session);
265 } 278 }
266 279
280 m = file->private_data;
281 m->private = session;
267 return ret; 282 return ret;
268} 283}
269 284
@@ -274,11 +289,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
274{ 289{
275 struct stat_session *session = i->i_private; 290 struct stat_session *session = i->i_private;
276 291
277 mutex_lock(&session->stat_mutex);
278 reset_stat_session(session); 292 reset_stat_session(session);
279 mutex_unlock(&session->stat_mutex);
280 293
281 return 0; 294 return seq_release(i, f);
282} 295}
283 296
284static const struct file_operations tracing_stat_fops = { 297static const struct file_operations tracing_stat_fops = {
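
The trace_stat.c rework follows a common kernel locking idiom: the lock-free worker keeps the double-underscore name (__reset_stat_session()) for callers that already hold stat_mutex, while a thin reset_stat_session() wrapper takes the mutex for everyone else; tracing_stat_open() now builds the rb-tree before seq_open() and tears it back down if seq_open() fails. Below is a userspace sketch of that wrapper pattern, assuming pthreads; the names and the "rb-tree" counter are stand-ins, and the locked helper is renamed because double-underscore identifiers are reserved in userspace C.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int nr_entries;  /* stands in for the stat rb-tree */

    /* Callers that already hold stat_mutex use this helper
     * (the kernel spells it __reset_stat_session()). */
    static void reset_stat_session_locked(void)
    {
        nr_entries = 0;
    }

    /* External callers go through the locking wrapper instead. */
    static void reset_stat_session(void)
    {
        pthread_mutex_lock(&stat_mutex);
        reset_stat_session_locked();
        pthread_mutex_unlock(&stat_mutex);
    }

    /* Open: build the session first, so a later failure can simply undo
     * the work with reset_stat_session() before returning the error. */
    static int stat_open(void)
    {
        pthread_mutex_lock(&stat_mutex);
        reset_stat_session_locked();
        nr_entries = 42;
        pthread_mutex_unlock(&stat_mutex);
        return 0;
    }

    int main(void)
    {
        stat_open();
        printf("entries after open: %d\n", nr_entries);
        reset_stat_session();            /* what the release path does */
        printf("entries after release: %d\n", nr_entries);
        return 0;
    }
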
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..8712ce3c6a0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_counter.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
@@ -193,58 +280,244 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
302 pr_info("event trace: Could not activate"
 303                                 " syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate"
 344                                 " syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data;
389 int syscall_nr;
390 int size;
233 391
234 ret = register_ftrace_event(&syscall_enter_event); 392 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 394 return;
237 syscall_enter_event.type); 395
238 WARN_ON_ONCE(1); 396 sys_data = syscall_nr_to_meta(syscall_nr);
397 if (!sys_data)
398 return;
399
400 /* get the size after alignment with the u32 buffer size field */
401 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
402 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32);
404
405 do {
406 char raw_data[size];
407
408 /* zero the dead bytes from align to not leak stack to user */
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
410
411 rec = (struct syscall_trace_enter *) raw_data;
412 tracing_generic_entry_update(&rec->ent, 0, 0);
413 rec->ent.type = sys_data->enter_id;
414 rec->nr = syscall_nr;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
416 (unsigned long *)&rec->args);
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
418 } while(0);
419}
420
421int reg_prof_syscall_enter(char *name)
422{
423 int ret = 0;
424 int num;
425
426 num = syscall_name_to_nr(name);
427 if (num < 0 || num >= NR_syscalls)
428 return -ENOSYS;
429
430 mutex_lock(&syscall_trace_lock);
431 if (!sys_prof_refcount_enter)
432 ret = register_trace_sys_enter(prof_syscall_enter);
433 if (ret) {
434 pr_info("event trace: Could not activate"
 435                                 " syscall entry trace point");
436 } else {
437 set_bit(num, enabled_prof_enter_syscalls);
438 sys_prof_refcount_enter++;
239 } 439 }
440 mutex_unlock(&syscall_trace_lock);
441 return ret;
442}
240 443
241 ret = register_ftrace_event(&syscall_exit_event); 444void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 445{
243 printk(KERN_WARNING "event %d failed to register\n", 446 int num;
244 syscall_exit_event.type); 447
245 WARN_ON_ONCE(1); 448 num = syscall_name_to_nr(name);
449 if (num < 0 || num >= NR_syscalls)
450 return;
451
452 mutex_lock(&syscall_trace_lock);
453 sys_prof_refcount_enter--;
454 clear_bit(num, enabled_prof_enter_syscalls);
455 if (!sys_prof_refcount_enter)
456 unregister_trace_sys_enter(prof_syscall_enter);
457 mutex_unlock(&syscall_trace_lock);
458}
459
460static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{
462 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec;
464 int syscall_nr;
465
466 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
468 return;
469
470 sys_data = syscall_nr_to_meta(syscall_nr);
471 if (!sys_data)
472 return;
473
474 tracing_generic_entry_update(&rec.ent, 0, 0);
475 rec.ent.type = sys_data->exit_id;
476 rec.nr = syscall_nr;
477 rec.ret = syscall_get_return_value(current, regs);
478
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
480}
481
482int reg_prof_syscall_exit(char *name)
483{
484 int ret = 0;
485 int num;
486
487 num = syscall_name_to_nr(name);
488 if (num < 0 || num >= NR_syscalls)
489 return -ENOSYS;
490
491 mutex_lock(&syscall_trace_lock);
492 if (!sys_prof_refcount_exit)
493 ret = register_trace_sys_exit(prof_syscall_exit);
494 if (ret) {
495 pr_info("event trace: Could not activate"
 496                                 " syscall exit trace point");
497 } else {
498 set_bit(num, enabled_prof_exit_syscalls);
499 sys_prof_refcount_exit++;
246 } 500 }
501 mutex_unlock(&syscall_trace_lock);
502 return ret;
503}
247 504
248 return register_tracer(&syscall_tracer); 505void unreg_prof_syscall_exit(char *name)
506{
507 int num;
508
509 num = syscall_name_to_nr(name);
510 if (num < 0 || num >= NR_syscalls)
511 return;
512
513 mutex_lock(&syscall_trace_lock);
514 sys_prof_refcount_exit--;
515 clear_bit(num, enabled_prof_exit_syscalls);
516 if (!sys_prof_refcount_exit)
517 unregister_trace_sys_exit(prof_syscall_exit);
518 mutex_unlock(&syscall_trace_lock);
249} 519}
250device_initcall(register_ftrace_syscalls); 520
521#endif
522
523
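
The new syscall event code replaces the old "flip TIF_SYSCALL_FTRACE on every thread" scheme with two pieces of shared state guarded by syscall_trace_lock: a per-syscall bitmap saying which syscalls are enabled, and a refcount so the sys_enter/sys_exit tracepoint probe is registered only when the first syscall is enabled and unregistered when the last one goes away; the probe itself bails out early for syscalls whose bit is clear. A simplified, lock-free userspace sketch of that bitmap-plus-refcount shape (the bit helpers below are hand-rolled, not the kernel's set_bit()/test_bit()):

    #include <stdio.h>

    #define NR_SYSCALLS   64
    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    static unsigned long enabled_enter[(NR_SYSCALLS + BITS_PER_LONG - 1) / BITS_PER_LONG];
    static int sys_refcount_enter;  /* number of currently enabled syscalls */
    static int probe_registered;    /* stands in for register_trace_sys_enter() */

    static void set_bit_(int nr, unsigned long *map)
    {
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
    }

    static int test_bit_(int nr, const unsigned long *map)
    {
        return !!(map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG)));
    }

    /* Register the single shared probe only for the first enabled syscall;
     * every later syscall just sets its bit and bumps the refcount. */
    static int enable_syscall(int nr)
    {
        if (nr < 0 || nr >= NR_SYSCALLS)
            return -1;
        if (!sys_refcount_enter)
            probe_registered = 1;
        set_bit_(nr, enabled_enter);
        sys_refcount_enter++;
        return 0;
    }

    /* The probe filters out syscalls nobody asked to trace. */
    static void syscall_enter_probe(int nr)
    {
        if (nr < 0 || nr >= NR_SYSCALLS || !test_bit_(nr, enabled_enter))
            return;
        printf("traced syscall %d\n", nr);
    }

    int main(void)
    {
        enable_syscall(3);
        syscall_enter_probe(3);     /* traced */
        syscall_enter_probe(4);     /* bit clear: ignored */
        printf("probe registered: %d, refcount: %d\n",
               probe_registered, sys_refcount_enter);
        return 0;
    }
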
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 
-	if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+	if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
 		ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
 				 struct cpu_workqueue_stats, list);
+		kref_get(&ret->kref);
+	}
 
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
 static void *workqueue_stat_next(void *prev, int idx)
 {
 	struct cpu_workqueue_stats *prev_cws = prev;
+	struct cpu_workqueue_stats *ret;
 	int cpu = prev_cws->cpu;
 	unsigned long flags;
-	void *ret = NULL;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
 			return NULL;
 		} while (!(ret = workqueue_stat_start_cpu(cpu)));
 		return ret;
+	} else {
+		ret = list_entry(prev_cws->list.next,
+				 struct cpu_workqueue_stats, list);
+		kref_get(&ret->kref);
 	}
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
-	return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
-			list);
+	return ret;
 }
 
 static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+static void workqueue_stat_release(void *stat)
+{
+	struct cpu_workqueue_stats *node = stat;
+
+	kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
 static int workqueue_stat_headers(struct seq_file *s)
 {
 	seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
 	.stat_start = workqueue_stat_start,
 	.stat_next = workqueue_stat_next,
 	.stat_show = workqueue_stat_show,
+	.stat_release = workqueue_stat_release,
 	.stat_headers = workqueue_stat_headers
 };
 
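
The trace_workqueue.c changes above switch cpu_workqueue_stats from a plain kfree() to kref-based lifetime management: the stat iterator takes a reference under the per-cpu lock and the new .stat_release callback drops it, so an entry removed by probe_workqueue_destruction() stays valid while a reader still holds it. A small stand-alone imitation of that get/put discipline (user-space sketch; struct and function names are hypothetical):

/* Minimal user-space imitation of the kref get/put pattern used above. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct stat_node {
	atomic_int refcount;
	int cpu;
};

static struct stat_node *node_alloc(int cpu)
{
	struct stat_node *n = malloc(sizeof(*n));
	if (n) {
		atomic_init(&n->refcount, 1);	/* like kref_init() */
		n->cpu = cpu;
	}
	return n;
}

static void node_get(struct stat_node *n)
{
	atomic_fetch_add(&n->refcount, 1);	/* like kref_get() */
}

static void node_put(struct stat_node *n)
{
	if (atomic_fetch_sub(&n->refcount, 1) == 1)	/* like kref_put() */
		free(n);
}

int main(void)
{
	struct stat_node *n = node_alloc(0);

	if (!n)
		return 1;
	node_get(n);	/* reader starts iterating */
	node_put(n);	/* list owner drops its reference */
	printf("cpu %d still readable here\n", n->cpu);
	node_put(n);	/* reader done: last put frees the node */
	return 0;
}
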
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..9489a0a9b1be 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
 #include <linux/tracepoint.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 extern struct tracepoint __start___tracepoints[];
 extern struct tracepoint __stop___tracepoints[];
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 {
 	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
 
+	if (elem->regfunc && !elem->state && active)
+		elem->regfunc();
+	else if (elem->unregfunc && elem->state && !active)
+		elem->unregfunc();
+
 	/*
 	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
 	 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
  */
 static void disable_tracepoint(struct tracepoint *elem)
 {
+	if (elem->unregfunc && elem->state)
+		elem->unregfunc();
+
 	elem->state = 0;
 	rcu_assign_pointer(elem->funcs, NULL);
 }
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
 
 	switch (val) {
 	case MODULE_STATE_COMING:
-		tracepoint_update_probe_range(mod->tracepoints,
-			mod->tracepoints + mod->num_tracepoints);
-		break;
 	case MODULE_STATE_GOING:
 		tracepoint_update_probe_range(mod->tracepoints,
 			mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
 __initcall(init_tracepoints);
 
 #endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+static int sys_tracepoint_refcount;
+
+void syscall_regfunc(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	if (!sys_tracepoint_refcount) {
+		read_lock_irqsave(&tasklist_lock, flags);
+		do_each_thread(g, t) {
+			/* Skip kernel threads. */
+			if (t->mm)
+				set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+		} while_each_thread(g, t);
+		read_unlock_irqrestore(&tasklist_lock, flags);
+	}
+	sys_tracepoint_refcount++;
+}
+
+void syscall_unregfunc(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	sys_tracepoint_refcount--;
+	if (!sys_tracepoint_refcount) {
+		read_lock_irqsave(&tasklist_lock, flags);
+		do_each_thread(g, t) {
+			clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+		} while_each_thread(g, t);
+		read_unlock_irqrestore(&tasklist_lock, flags);
+	}
+}
+#endif
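
Two ideas are combined in the tracepoint.c hunks: set_tracepoint()/disable_tracepoint() now invoke optional regfunc/unregfunc hooks exactly when a tracepoint flips between enabled and disabled, and syscall_regfunc()/syscall_unregfunc() use a refcount to set or clear TIF_SYSCALL_TRACEPOINT on all user tasks for the first and last user. A compact sketch of the edge-triggered hook part (hypothetical names, not the kernel structures):

/* Edge-triggered enable/disable hooks, in the spirit of the set_tracepoint() change. */
#include <stdbool.h>
#include <stdio.h>

struct point {
	bool state;			/* currently enabled? */
	void (*regfunc)(void);		/* optional: run on the 0 -> 1 transition */
	void (*unregfunc)(void);	/* optional: run on the 1 -> 0 transition */
};

static void update_point(struct point *p, bool active)
{
	if (p->regfunc && !p->state && active)
		p->regfunc();		/* becoming active */
	else if (p->unregfunc && p->state && !active)
		p->unregfunc();		/* becoming inactive */
	p->state = active;
}

static void on(void)  { puts("regfunc: first enable"); }
static void off(void) { puts("unregfunc: last disable"); }

int main(void)
{
	struct point p = { .state = false, .regfunc = on, .unregfunc = off };

	update_point(&p, true);		/* fires regfunc once */
	update_point(&p, true);		/* no transition, no hook */
	update_point(&p, false);	/* fires unregfunc */
	return 0;
}
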
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
 	put_user_ns(up->user_ns);
 }
 
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
-	struct user_struct *user;
-	struct hlist_node *h;
-
-	hlist_for_each_entry(user, h, hashent, uidhash_node) {
-		if (user->uid == uid) {
-			atomic_inc(&user->__count);
-			return user;
-		}
-	}
-
-	return NULL;
-}
-
 #ifdef CONFIG_USER_SCHED
 
 static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
 
 #if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
 
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+	struct user_struct *user;
+	struct hlist_node *h;
+
+	hlist_for_each_entry(user, h, hashent, uidhash_node) {
+		if (user->uid == uid) {
+			/* possibly resurrect an "almost deleted" object */
+			if (atomic_inc_return(&user->__count) == 1)
+				cancel_delayed_work(&user->work);
+			return user;
+		}
+	}
+
+	return NULL;
+}
+
 static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
 static DEFINE_MUTEX(uids_mutex);
 
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
 	return uids_user_create(&root_user);
 }
 
-/* work function to remove sysfs directory for a user and free up
+/* delayed work function to remove sysfs directory for a user and free up
  * corresponding structures.
  */
 static void cleanup_user_struct(struct work_struct *w)
 {
-	struct user_struct *up = container_of(w, struct user_struct, work);
+	struct user_struct *up = container_of(w, struct user_struct, work.work);
 	unsigned long flags;
 	int remove_user = 0;
 
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
 	 */
 	uids_mutex_lock();
 
-	local_irq_save(flags);
-
-	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+	spin_lock_irqsave(&uidhash_lock, flags);
+	if (atomic_read(&up->__count) == 0) {
 		uid_hash_remove(up);
 		remove_user = 1;
-		spin_unlock_irqrestore(&uidhash_lock, flags);
-	} else {
-		local_irq_restore(flags);
 	}
+	spin_unlock_irqrestore(&uidhash_lock, flags);
 
 	if (!remove_user)
 		goto done;
@@ -331,16 +330,28 @@ done:
  */
 static void free_user(struct user_struct *up, unsigned long flags)
 {
-	/* restore back the count */
-	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
-
-	INIT_WORK(&up->work, cleanup_user_struct);
-	schedule_work(&up->work);
+	INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
+	schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
 }
 
 #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
 
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+	struct user_struct *user;
+	struct hlist_node *h;
+
+	hlist_for_each_entry(user, h, hashent, uidhash_node) {
+		if (user->uid == uid) {
+			atomic_inc(&user->__count);
+			return user;
+		}
+	}
+
+	return NULL;
+}
+
 int uids_sysfs_init(void) { return 0; }
 static inline int uids_user_create(struct user_struct *up) { return 0; }
 static inline void uids_mutex_lock(void) { }
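
In kernel/user.c the immediate atomic_dec_and_lock() teardown becomes a deferred one: free_user() schedules cleanup_user_struct() as delayed work roughly a second later, and uid_hash_find() can resurrect an entry whose count climbed back from zero by cancelling the pending work. A single-threaded user-space sketch of that "deferred free with resurrection" idea (a boolean stands in for the delayed work item; every name here is hypothetical):

/* Sketch: deferred free that a later lookup can cancel. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct uid_obj {
	int uid;
	int count;		/* reference count */
	bool free_pending;	/* stands in for the scheduled delayed work */
};

static void put_obj(struct uid_obj *o)
{
	if (--o->count == 0)
		o->free_pending = true;		/* like schedule_delayed_work() */
}

static struct uid_obj *find_obj(struct uid_obj *o, int uid)
{
	if (o->uid != uid)
		return NULL;
	if (++o->count == 1)
		o->free_pending = false;	/* like cancel_delayed_work(): resurrect */
	return o;
}

static void delayed_cleanup(struct uid_obj *o)
{
	/* runs "later"; only frees if nobody resurrected the object */
	if (o->free_pending && o->count == 0) {
		printf("freeing uid %d\n", o->uid);
		free(o);
	}
}

int main(void)
{
	struct uid_obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	o->uid = 1000;
	o->count = 1;
	o->free_pending = false;

	put_obj(o);		/* count hits zero, free is deferred */
	find_obj(o, 1000);	/* lookup resurrects it in time */
	delayed_cleanup(o);	/* does nothing: object was revived */
	put_obj(o);
	delayed_cleanup(o);	/* now the deferred free really runs */
	return 0;
}
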
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af8..8a82b4b8ea52 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 
+static struct uts_namespace *create_uts_ns(void)
+{
+	struct uts_namespace *uts_ns;
+
+	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+	if (uts_ns)
+		kref_init(&uts_ns->kref);
+	return uts_ns;
+}
+
 /*
  * Clone a new ns copying an original utsname, setting refcount to 1
  * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
 
-	ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+	ns = create_uts_ns();
 	if (!ns)
 		return ERR_PTR(-ENOMEM);
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
 	up_read(&uts_sem);
-	kref_init(&ns->kref);
 	return ns;
 }
 
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275cf..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
 #include <linux/wait.h>
 #include <linux/hash.h>
 
-void init_waitqueue_head(wait_queue_head_t *q)
+void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
 {
 	spin_lock_init(&q->lock);
+	lockdep_set_class(&q->lock, key);
 	INIT_LIST_HEAD(&q->task_list);
 }
 
-EXPORT_SYMBOL(init_waitqueue_head);
+EXPORT_SYMBOL(__init_waitqueue_head);
 
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
 {
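
The wait.c change threads a struct lock_class_key through wait-queue initialization so each call site gets its own lockdep class via lockdep_set_class(). Callers would keep using an init_waitqueue_head() wrapper that supplies a static key per call site; a sketch of what such a companion macro looks like (illustrative, not quoted from include/linux/wait.h):

/* Illustrative wrapper: one static lock_class_key per init_waitqueue_head() call site. */
#define init_waitqueue_head(q)				\
	do {						\
		static struct lock_class_key __key;	\
							\
		__init_waitqueue_head((q), &__key);	\
	} while (0)
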
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
 	if (cwq->wq->freezeable)
 		set_freezable();
 
-	set_user_nice(current, -5);
-
 	for (;;) {
 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
  * schedule_work - put work task in global workqueue
  * @work: job to be done
  *
- * This puts a job in the kernel-global workqueue.
+ * Returns zero if @work was already on the kernel-global workqueue and
+ * non-zero otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
  */
 int schedule_work(struct work_struct *work)
 {