aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt25
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/acct.c7
-rw-r--r--kernel/audit.c32
-rw-r--r--kernel/auditsc.c279
-rw-r--r--kernel/capability.c288
-rw-r--r--kernel/cgroup.c27
-rw-r--r--kernel/cred-internals.h21
-rw-r--r--kernel/cred.c588
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/exit.c32
-rw-r--r--kernel/extable.c21
-rw-r--r--kernel/fork.c96
-rw-r--r--kernel/futex.c371
-rw-r--r--kernel/futex_compat.c7
-rw-r--r--kernel/hrtimer.c331
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/autoprobe.c15
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/handle.c189
-rw-r--r--kernel/irq/internals.h5
-rw-r--r--kernel/irq/manage.c27
-rw-r--r--kernel/irq/numa_migrate.c122
-rw-r--r--kernel/irq/proc.c6
-rw-r--r--kernel/irq/spurious.c5
-rw-r--r--kernel/kallsyms.c16
-rw-r--r--kernel/kmod.c30
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/latencytop.c2
-rw-r--r--kernel/lockdep.c61
-rw-r--r--kernel/lockdep_proc.c28
-rw-r--r--kernel/marker.c192
-rw-r--r--kernel/module.c13
-rw-r--r--kernel/mutex.c10
-rw-r--r--kernel/notifier.c8
-rw-r--r--kernel/nsproxy.c15
-rw-r--r--kernel/panic.c32
-rw-r--r--kernel/posix-cpu-timers.c12
-rw-r--r--kernel/posix-timers.c46
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk.c2
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c41
-rw-r--r--kernel/rcuclassic.c4
-rw-r--r--kernel/rcupreempt.c10
-rw-r--r--kernel/rcupreempt_trace.c10
-rw-r--r--kernel/rcutorture.c66
-rw-r--r--kernel/rcutree.c1535
-rw-r--r--kernel/rcutree_trace.c271
-rw-r--r--kernel/relay.c7
-rw-r--r--kernel/resource.c9
-rw-r--r--kernel/sched.c414
-rw-r--r--kernel/sched_clock.c6
-rw-r--r--kernel/sched_debug.c57
-rw-r--r--kernel/sched_fair.c9
-rw-r--r--kernel/sched_rt.c9
-rw-r--r--kernel/sched_stats.h5
-rw-r--r--kernel/signal.c62
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/stacktrace.c11
-rw-r--r--kernel/sys.c588
-rw-r--r--kernel/sysctl.c36
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/tick-sched.c44
-rw-r--r--kernel/time/timekeeping.c22
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/trace/Kconfig115
-rw-r--r--kernel/trace/Makefile9
-rw-r--r--kernel/trace/ftrace.c929
-rw-r--r--kernel/trace/ring_buffer.c709
-rw-r--r--kernel/trace/trace.c978
-rw-r--r--kernel/trace/trace.h265
-rw-r--r--kernel/trace/trace_boot.c158
-rw-r--r--kernel/trace/trace_branch.c342
-rw-r--r--kernel/trace/trace_functions.c30
-rw-r--r--kernel/trace/trace_functions_graph.c669
-rw-r--r--kernel/trace/trace_hw_branches.c195
-rw-r--r--kernel/trace/trace_irqsoff.c61
-rw-r--r--kernel/trace/trace_mmiotrace.c33
-rw-r--r--kernel/trace/trace_nop.c65
-rw-r--r--kernel/trace/trace_power.c179
-rw-r--r--kernel/trace/trace_sched_switch.c121
-rw-r--r--kernel/trace/trace_sched_wakeup.c72
-rw-r--r--kernel/trace/trace_selftest.c173
-rw-r--r--kernel/trace/trace_stack.c70
-rw-r--r--kernel/trace/trace_sysprof.c32
-rw-r--r--kernel/tracepoint.c295
-rw-r--r--kernel/tsacct.c6
-rw-r--r--kernel/uid16.c31
-rw-r--r--kernel/user.c98
-rw-r--r--kernel/user_namespace.c65
-rw-r--r--kernel/workqueue.c8
95 files changed, 9268 insertions, 2707 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
69config RCU_TRACE
70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
72 select DEBUG_FS
73 default y
74 help
75 This option provides tracing in RCU which presents stats
76 in debugfs for debugging RCU implementation.
77
78 Say Y here if you want to enable RCU tracing
79 Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d..e1c5bf3365c0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
13 13
14ifdef CONFIG_FUNCTION_TRACER 14ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
19CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
20CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
21CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_sched.o = -pg
23endif 22endif
24 23
25obj-$(CONFIG_FREEZER) += freezer.o 24obj-$(CONFIG_FREEZER) += freezer.o
@@ -74,10 +73,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
74obj-$(CONFIG_SECCOMP) += seccomp.o 73obj-$(CONFIG_SECCOMP) += seccomp.o
75obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
76obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o 75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
76obj-$(CONFIG_TREE_RCU) += rcutree.o
77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
78ifeq ($(CONFIG_PREEMPT_RCU),y) 78obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
79obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o 79obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
80endif
81obj-$(CONFIG_RELAY) += relay.o 80obj-$(CONFIG_RELAY) += relay.o
82obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
90obj-$(CONFIG_TRACING) += trace/ 89obj-$(CONFIG_TRACING) += trace/
91obj-$(CONFIG_SMP) += sched_cpupri.o 90obj-$(CONFIG_SMP) += sched_cpupri.o
92 91
93ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 92ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 93# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
95# needed for x86 only. Why this used to be enabled for all architectures is beyond 94# needed for x86 only. Why this used to be enabled for all architectures is beyond
96# me. I suspect most platforms don't need this, but until we know that for sure 95# me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/acct.c b/kernel/acct.c
index f6006a60df5d..d57b7cbb98b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct,
530 do_div(elapsed, AHZ); 530 do_div(elapsed, AHZ);
531 ac.ac_btime = get_seconds() - elapsed; 531 ac.ac_btime = get_seconds() - elapsed;
532 /* we really need to bite the bullet and change layout */ 532 /* we really need to bite the bullet and change layout */
533 ac.ac_uid = current->uid; 533 current_uid_gid(&ac.ac_uid, &ac.ac_gid);
534 ac.ac_gid = current->gid;
535#if ACCT_VERSION==2 534#if ACCT_VERSION==2
536 ac.ac_ahz = AHZ; 535 ac.ac_ahz = AHZ;
537#endif 536#endif
538#if ACCT_VERSION==1 || ACCT_VERSION==2 537#if ACCT_VERSION==1 || ACCT_VERSION==2
539 /* backward-compatible 16 bit fields */ 538 /* backward-compatible 16 bit fields */
540 ac.ac_uid16 = current->uid; 539 ac.ac_uid16 = ac.ac_uid;
541 ac.ac_gid16 = current->gid; 540 ac.ac_gid16 = ac.ac_gid;
542#endif 541#endif
543#if ACCT_VERSION==3 542#if ACCT_VERSION==3
544 ac.ac_pid = task_tgid_nr_ns(current, ns); 543 ac.ac_pid = task_tgid_nr_ns(current, ns);
diff --git a/kernel/audit.c b/kernel/audit.c
index 4414e93d8750..ce6d8ea3131e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,8 +61,11 @@
61 61
62#include "audit.h" 62#include "audit.h"
63 63
64/* No auditing will take place until audit_initialized != 0. 64/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66#define AUDIT_DISABLED -1
67#define AUDIT_UNINITIALIZED 0
68#define AUDIT_INITIALIZED 1
66static int audit_initialized; 69static int audit_initialized;
67 70
68#define AUDIT_OFF 0 71#define AUDIT_OFF 0
@@ -965,6 +968,9 @@ static int __init audit_init(void)
965{ 968{
966 int i; 969 int i;
967 970
971 if (audit_initialized == AUDIT_DISABLED)
972 return 0;
973
968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 974 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
969 audit_default ? "enabled" : "disabled"); 975 audit_default ? "enabled" : "disabled");
970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, 976 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
@@ -976,7 +982,7 @@ static int __init audit_init(void)
976 982
977 skb_queue_head_init(&audit_skb_queue); 983 skb_queue_head_init(&audit_skb_queue);
978 skb_queue_head_init(&audit_skb_hold_queue); 984 skb_queue_head_init(&audit_skb_hold_queue);
979 audit_initialized = 1; 985 audit_initialized = AUDIT_INITIALIZED;
980 audit_enabled = audit_default; 986 audit_enabled = audit_default;
981 audit_ever_enabled |= !!audit_default; 987 audit_ever_enabled |= !!audit_default;
982 988
@@ -999,13 +1005,21 @@ __initcall(audit_init);
999static int __init audit_enable(char *str) 1005static int __init audit_enable(char *str)
1000{ 1006{
1001 audit_default = !!simple_strtol(str, NULL, 0); 1007 audit_default = !!simple_strtol(str, NULL, 0);
1002 printk(KERN_INFO "audit: %s%s\n", 1008 if (!audit_default)
1003 audit_default ? "enabled" : "disabled", 1009 audit_initialized = AUDIT_DISABLED;
1004 audit_initialized ? "" : " (after initialization)"); 1010
1005 if (audit_initialized) { 1011 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
1012
1013 if (audit_initialized == AUDIT_INITIALIZED) {
1006 audit_enabled = audit_default; 1014 audit_enabled = audit_default;
1007 audit_ever_enabled |= !!audit_default; 1015 audit_ever_enabled |= !!audit_default;
1016 } else if (audit_initialized == AUDIT_UNINITIALIZED) {
1017 printk(" (after initialization)");
1018 } else {
1019 printk(" (until reboot)");
1008 } 1020 }
1021 printk("\n");
1022
1009 return 1; 1023 return 1;
1010} 1024}
1011 1025
@@ -1107,9 +1121,7 @@ unsigned int audit_serial(void)
1107static inline void audit_get_stamp(struct audit_context *ctx, 1121static inline void audit_get_stamp(struct audit_context *ctx,
1108 struct timespec *t, unsigned int *serial) 1122 struct timespec *t, unsigned int *serial)
1109{ 1123{
1110 if (ctx) 1124 if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
1111 auditsc_get_stamp(ctx, t, serial);
1112 else {
1113 *t = CURRENT_TIME; 1125 *t = CURRENT_TIME;
1114 *serial = audit_serial(); 1126 *serial = audit_serial();
1115 } 1127 }
@@ -1146,7 +1158,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1146 int reserve; 1158 int reserve;
1147 unsigned long timeout_start = jiffies; 1159 unsigned long timeout_start = jiffies;
1148 1160
1149 if (!audit_initialized) 1161 if (audit_initialized != AUDIT_INITIALIZED)
1150 return NULL; 1162 return NULL;
1151 1163
1152 if (unlikely(audit_filter_type(type))) 1164 if (unlikely(audit_filter_type(type)))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c3..4819f3711973 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,6 +65,7 @@
65#include <linux/highmem.h> 65#include <linux/highmem.h>
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h>
68 69
69#include "audit.h" 70#include "audit.h"
70 71
@@ -84,6 +85,15 @@ int audit_n_rules;
84/* determines whether we collect data for signals sent */ 85/* determines whether we collect data for signals sent */
85int audit_signals; 86int audit_signals;
86 87
88struct audit_cap_data {
89 kernel_cap_t permitted;
90 kernel_cap_t inheritable;
91 union {
92 unsigned int fE; /* effective bit of a file capability */
93 kernel_cap_t effective; /* effective set of a process */
94 };
95};
96
87/* When fs/namei.c:getname() is called, we store the pointer in name and 97/* When fs/namei.c:getname() is called, we store the pointer in name and
88 * we don't let putname() free it (instead we free all of the saved 98 * we don't let putname() free it (instead we free all of the saved
89 * pointers at syscall exit time). 99 * pointers at syscall exit time).
@@ -100,6 +110,8 @@ struct audit_names {
100 gid_t gid; 110 gid_t gid;
101 dev_t rdev; 111 dev_t rdev;
102 u32 osid; 112 u32 osid;
113 struct audit_cap_data fcap;
114 unsigned int fcap_ver;
103}; 115};
104 116
105struct audit_aux_data { 117struct audit_aux_data {
@@ -184,6 +196,20 @@ struct audit_aux_data_pids {
184 int pid_count; 196 int pid_count;
185}; 197};
186 198
199struct audit_aux_data_bprm_fcaps {
200 struct audit_aux_data d;
201 struct audit_cap_data fcap;
202 unsigned int fcap_ver;
203 struct audit_cap_data old_pcap;
204 struct audit_cap_data new_pcap;
205};
206
207struct audit_aux_data_capset {
208 struct audit_aux_data d;
209 pid_t pid;
210 struct audit_cap_data cap;
211};
212
187struct audit_tree_refs { 213struct audit_tree_refs {
188 struct audit_tree_refs *next; 214 struct audit_tree_refs *next;
189 struct audit_chunk *c[31]; 215 struct audit_chunk *c[31];
@@ -421,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
421 struct audit_names *name, 447 struct audit_names *name,
422 enum audit_state *state) 448 enum audit_state *state)
423{ 449{
450 const struct cred *cred = get_task_cred(tsk);
424 int i, j, need_sid = 1; 451 int i, j, need_sid = 1;
425 u32 sid; 452 u32 sid;
426 453
@@ -440,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk,
440 } 467 }
441 break; 468 break;
442 case AUDIT_UID: 469 case AUDIT_UID:
443 result = audit_comparator(tsk->uid, f->op, f->val); 470 result = audit_comparator(cred->uid, f->op, f->val);
444 break; 471 break;
445 case AUDIT_EUID: 472 case AUDIT_EUID:
446 result = audit_comparator(tsk->euid, f->op, f->val); 473 result = audit_comparator(cred->euid, f->op, f->val);
447 break; 474 break;
448 case AUDIT_SUID: 475 case AUDIT_SUID:
449 result = audit_comparator(tsk->suid, f->op, f->val); 476 result = audit_comparator(cred->suid, f->op, f->val);
450 break; 477 break;
451 case AUDIT_FSUID: 478 case AUDIT_FSUID:
452 result = audit_comparator(tsk->fsuid, f->op, f->val); 479 result = audit_comparator(cred->fsuid, f->op, f->val);
453 break; 480 break;
454 case AUDIT_GID: 481 case AUDIT_GID:
455 result = audit_comparator(tsk->gid, f->op, f->val); 482 result = audit_comparator(cred->gid, f->op, f->val);
456 break; 483 break;
457 case AUDIT_EGID: 484 case AUDIT_EGID:
458 result = audit_comparator(tsk->egid, f->op, f->val); 485 result = audit_comparator(cred->egid, f->op, f->val);
459 break; 486 break;
460 case AUDIT_SGID: 487 case AUDIT_SGID:
461 result = audit_comparator(tsk->sgid, f->op, f->val); 488 result = audit_comparator(cred->sgid, f->op, f->val);
462 break; 489 break;
463 case AUDIT_FSGID: 490 case AUDIT_FSGID:
464 result = audit_comparator(tsk->fsgid, f->op, f->val); 491 result = audit_comparator(cred->fsgid, f->op, f->val);
465 break; 492 break;
466 case AUDIT_PERS: 493 case AUDIT_PERS:
467 result = audit_comparator(tsk->personality, f->op, f->val); 494 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -615,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk,
615 break; 642 break;
616 } 643 }
617 644
618 if (!result) 645 if (!result) {
646 put_cred(cred);
619 return 0; 647 return 0;
648 }
620 } 649 }
621 if (rule->filterkey && ctx) 650 if (rule->filterkey && ctx)
622 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 651 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -624,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk,
624 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 653 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
625 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 654 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
626 } 655 }
656 put_cred(cred);
627 return 1; 657 return 1;
628} 658}
629 659
@@ -1171,8 +1201,38 @@ static void audit_log_execve_info(struct audit_context *context,
1171 kfree(buf); 1201 kfree(buf);
1172} 1202}
1173 1203
1204static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1205{
1206 int i;
1207
1208 audit_log_format(ab, " %s=", prefix);
1209 CAP_FOR_EACH_U32(i) {
1210 audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
1211 }
1212}
1213
1214static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1215{
1216 kernel_cap_t *perm = &name->fcap.permitted;
1217 kernel_cap_t *inh = &name->fcap.inheritable;
1218 int log = 0;
1219
1220 if (!cap_isclear(*perm)) {
1221 audit_log_cap(ab, "cap_fp", perm);
1222 log = 1;
1223 }
1224 if (!cap_isclear(*inh)) {
1225 audit_log_cap(ab, "cap_fi", inh);
1226 log = 1;
1227 }
1228
1229 if (log)
1230 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1231}
1232
1174static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1233static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1175{ 1234{
1235 const struct cred *cred;
1176 int i, call_panic = 0; 1236 int i, call_panic = 0;
1177 struct audit_buffer *ab; 1237 struct audit_buffer *ab;
1178 struct audit_aux_data *aux; 1238 struct audit_aux_data *aux;
@@ -1182,14 +1242,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1182 context->pid = tsk->pid; 1242 context->pid = tsk->pid;
1183 if (!context->ppid) 1243 if (!context->ppid)
1184 context->ppid = sys_getppid(); 1244 context->ppid = sys_getppid();
1185 context->uid = tsk->uid; 1245 cred = current_cred();
1186 context->gid = tsk->gid; 1246 context->uid = cred->uid;
1187 context->euid = tsk->euid; 1247 context->gid = cred->gid;
1188 context->suid = tsk->suid; 1248 context->euid = cred->euid;
1189 context->fsuid = tsk->fsuid; 1249 context->suid = cred->suid;
1190 context->egid = tsk->egid; 1250 context->fsuid = cred->fsuid;
1191 context->sgid = tsk->sgid; 1251 context->egid = cred->egid;
1192 context->fsgid = tsk->fsgid; 1252 context->sgid = cred->sgid;
1253 context->fsgid = cred->fsgid;
1193 context->personality = tsk->personality; 1254 context->personality = tsk->personality;
1194 1255
1195 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1256 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1334,6 +1395,28 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1334 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); 1395 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
1335 break; } 1396 break; }
1336 1397
1398 case AUDIT_BPRM_FCAPS: {
1399 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1400 audit_log_format(ab, "fver=%x", axs->fcap_ver);
1401 audit_log_cap(ab, "fp", &axs->fcap.permitted);
1402 audit_log_cap(ab, "fi", &axs->fcap.inheritable);
1403 audit_log_format(ab, " fe=%d", axs->fcap.fE);
1404 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
1405 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
1406 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
1407 audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
1408 audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
1409 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1410 break; }
1411
1412 case AUDIT_CAPSET: {
1413 struct audit_aux_data_capset *axs = (void *)aux;
1414 audit_log_format(ab, "pid=%d", axs->pid);
1415 audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
1416 audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
1417 audit_log_cap(ab, "cap_pe", &axs->cap.effective);
1418 break; }
1419
1337 } 1420 }
1338 audit_log_end(ab); 1421 audit_log_end(ab);
1339 } 1422 }
@@ -1421,6 +1504,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1421 } 1504 }
1422 } 1505 }
1423 1506
1507 audit_log_fcaps(ab, n);
1508
1424 audit_log_end(ab); 1509 audit_log_end(ab);
1425 } 1510 }
1426 1511
@@ -1459,7 +1544,6 @@ void audit_free(struct task_struct *tsk)
1459 1544
1460/** 1545/**
1461 * audit_syscall_entry - fill in an audit record at syscall entry 1546 * audit_syscall_entry - fill in an audit record at syscall entry
1462 * @tsk: task being audited
1463 * @arch: architecture type 1547 * @arch: architecture type
1464 * @major: major syscall type (function) 1548 * @major: major syscall type (function)
1465 * @a1: additional syscall register 1 1549 * @a1: additional syscall register 1
@@ -1548,9 +1632,25 @@ void audit_syscall_entry(int arch, int major,
1548 context->ppid = 0; 1632 context->ppid = 0;
1549} 1633}
1550 1634
1635void audit_finish_fork(struct task_struct *child)
1636{
1637 struct audit_context *ctx = current->audit_context;
1638 struct audit_context *p = child->audit_context;
1639 if (!p || !ctx || !ctx->auditable)
1640 return;
1641 p->arch = ctx->arch;
1642 p->major = ctx->major;
1643 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1644 p->ctime = ctx->ctime;
1645 p->dummy = ctx->dummy;
1646 p->auditable = ctx->auditable;
1647 p->in_syscall = ctx->in_syscall;
1648 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1649 p->ppid = current->pid;
1650}
1651
1551/** 1652/**
1552 * audit_syscall_exit - deallocate audit context after a system call 1653 * audit_syscall_exit - deallocate audit context after a system call
1553 * @tsk: task being audited
1554 * @valid: success/failure flag 1654 * @valid: success/failure flag
1555 * @return_code: syscall return value 1655 * @return_code: syscall return value
1556 * 1656 *
@@ -1787,8 +1887,36 @@ static int audit_inc_name_count(struct audit_context *context,
1787 return 0; 1887 return 0;
1788} 1888}
1789 1889
1890
1891static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1892{
1893 struct cpu_vfs_cap_data caps;
1894 int rc;
1895
1896 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1897 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1898 name->fcap.fE = 0;
1899 name->fcap_ver = 0;
1900
1901 if (!dentry)
1902 return 0;
1903
1904 rc = get_vfs_caps_from_disk(dentry, &caps);
1905 if (rc)
1906 return rc;
1907
1908 name->fcap.permitted = caps.permitted;
1909 name->fcap.inheritable = caps.inheritable;
1910 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
1911 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
1912
1913 return 0;
1914}
1915
1916
1790/* Copy inode data into an audit_names. */ 1917/* Copy inode data into an audit_names. */
1791static void audit_copy_inode(struct audit_names *name, const struct inode *inode) 1918static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
1919 const struct inode *inode)
1792{ 1920{
1793 name->ino = inode->i_ino; 1921 name->ino = inode->i_ino;
1794 name->dev = inode->i_sb->s_dev; 1922 name->dev = inode->i_sb->s_dev;
@@ -1797,6 +1925,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
1797 name->gid = inode->i_gid; 1925 name->gid = inode->i_gid;
1798 name->rdev = inode->i_rdev; 1926 name->rdev = inode->i_rdev;
1799 security_inode_getsecid(inode, &name->osid); 1927 security_inode_getsecid(inode, &name->osid);
1928 audit_copy_fcaps(name, dentry);
1800} 1929}
1801 1930
1802/** 1931/**
@@ -1831,7 +1960,7 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1831 context->names[idx].name = NULL; 1960 context->names[idx].name = NULL;
1832 } 1961 }
1833 handle_path(dentry); 1962 handle_path(dentry);
1834 audit_copy_inode(&context->names[idx], inode); 1963 audit_copy_inode(&context->names[idx], dentry, inode);
1835} 1964}
1836 1965
1837/** 1966/**
@@ -1892,7 +2021,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
1892 if (!strcmp(dname, n->name) || 2021 if (!strcmp(dname, n->name) ||
1893 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2022 !audit_compare_dname_path(dname, n->name, &dirlen)) {
1894 if (inode) 2023 if (inode)
1895 audit_copy_inode(n, inode); 2024 audit_copy_inode(n, NULL, inode);
1896 else 2025 else
1897 n->ino = (unsigned long)-1; 2026 n->ino = (unsigned long)-1;
1898 found_child = n->name; 2027 found_child = n->name;
@@ -1906,7 +2035,7 @@ add_names:
1906 return; 2035 return;
1907 idx = context->name_count - 1; 2036 idx = context->name_count - 1;
1908 context->names[idx].name = NULL; 2037 context->names[idx].name = NULL;
1909 audit_copy_inode(&context->names[idx], parent); 2038 audit_copy_inode(&context->names[idx], NULL, parent);
1910 } 2039 }
1911 2040
1912 if (!found_child) { 2041 if (!found_child) {
@@ -1927,7 +2056,7 @@ add_names:
1927 } 2056 }
1928 2057
1929 if (inode) 2058 if (inode)
1930 audit_copy_inode(&context->names[idx], inode); 2059 audit_copy_inode(&context->names[idx], NULL, inode);
1931 else 2060 else
1932 context->names[idx].ino = (unsigned long)-1; 2061 context->names[idx].ino = (unsigned long)-1;
1933 } 2062 }
@@ -1942,15 +2071,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
1942 * 2071 *
1943 * Also sets the context as auditable. 2072 * Also sets the context as auditable.
1944 */ 2073 */
1945void auditsc_get_stamp(struct audit_context *ctx, 2074int auditsc_get_stamp(struct audit_context *ctx,
1946 struct timespec *t, unsigned int *serial) 2075 struct timespec *t, unsigned int *serial)
1947{ 2076{
2077 if (!ctx->in_syscall)
2078 return 0;
1948 if (!ctx->serial) 2079 if (!ctx->serial)
1949 ctx->serial = audit_serial(); 2080 ctx->serial = audit_serial();
1950 t->tv_sec = ctx->ctime.tv_sec; 2081 t->tv_sec = ctx->ctime.tv_sec;
1951 t->tv_nsec = ctx->ctime.tv_nsec; 2082 t->tv_nsec = ctx->ctime.tv_nsec;
1952 *serial = ctx->serial; 2083 *serial = ctx->serial;
1953 ctx->auditable = 1; 2084 ctx->auditable = 1;
2085 return 1;
1954} 2086}
1955 2087
1956/* global counter which is incremented every time something logs in */ 2088/* global counter which is incremented every time something logs in */
@@ -1978,7 +2110,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1978 audit_log_format(ab, "login pid=%d uid=%u " 2110 audit_log_format(ab, "login pid=%d uid=%u "
1979 "old auid=%u new auid=%u" 2111 "old auid=%u new auid=%u"
1980 " old ses=%u new ses=%u", 2112 " old ses=%u new ses=%u",
1981 task->pid, task->uid, 2113 task->pid, task_uid(task),
1982 task->loginuid, loginuid, 2114 task->loginuid, loginuid,
1983 task->sessionid, sessionid); 2115 task->sessionid, sessionid);
1984 audit_log_end(ab); 2116 audit_log_end(ab);
@@ -2361,7 +2493,7 @@ void __audit_ptrace(struct task_struct *t)
2361 2493
2362 context->target_pid = t->pid; 2494 context->target_pid = t->pid;
2363 context->target_auid = audit_get_loginuid(t); 2495 context->target_auid = audit_get_loginuid(t);
2364 context->target_uid = t->uid; 2496 context->target_uid = task_uid(t);
2365 context->target_sessionid = audit_get_sessionid(t); 2497 context->target_sessionid = audit_get_sessionid(t);
2366 security_task_getsecid(t, &context->target_sid); 2498 security_task_getsecid(t, &context->target_sid);
2367 memcpy(context->target_comm, t->comm, TASK_COMM_LEN); 2499 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2380,6 +2512,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2380 struct audit_aux_data_pids *axp; 2512 struct audit_aux_data_pids *axp;
2381 struct task_struct *tsk = current; 2513 struct task_struct *tsk = current;
2382 struct audit_context *ctx = tsk->audit_context; 2514 struct audit_context *ctx = tsk->audit_context;
2515 uid_t uid = current_uid(), t_uid = task_uid(t);
2383 2516
2384 if (audit_pid && t->tgid == audit_pid) { 2517 if (audit_pid && t->tgid == audit_pid) {
2385 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2518 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
@@ -2387,7 +2520,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2387 if (tsk->loginuid != -1) 2520 if (tsk->loginuid != -1)
2388 audit_sig_uid = tsk->loginuid; 2521 audit_sig_uid = tsk->loginuid;
2389 else 2522 else
2390 audit_sig_uid = tsk->uid; 2523 audit_sig_uid = uid;
2391 security_task_getsecid(tsk, &audit_sig_sid); 2524 security_task_getsecid(tsk, &audit_sig_sid);
2392 } 2525 }
2393 if (!audit_signals || audit_dummy_context()) 2526 if (!audit_signals || audit_dummy_context())
@@ -2399,7 +2532,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2399 if (!ctx->target_pid) { 2532 if (!ctx->target_pid) {
2400 ctx->target_pid = t->tgid; 2533 ctx->target_pid = t->tgid;
2401 ctx->target_auid = audit_get_loginuid(t); 2534 ctx->target_auid = audit_get_loginuid(t);
2402 ctx->target_uid = t->uid; 2535 ctx->target_uid = t_uid;
2403 ctx->target_sessionid = audit_get_sessionid(t); 2536 ctx->target_sessionid = audit_get_sessionid(t);
2404 security_task_getsecid(t, &ctx->target_sid); 2537 security_task_getsecid(t, &ctx->target_sid);
2405 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); 2538 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2420,7 +2553,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2420 2553
2421 axp->target_pid[axp->pid_count] = t->tgid; 2554 axp->target_pid[axp->pid_count] = t->tgid;
2422 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2555 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2423 axp->target_uid[axp->pid_count] = t->uid; 2556 axp->target_uid[axp->pid_count] = t_uid;
2424 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2557 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2425 security_task_getsecid(t, &axp->target_sid[axp->pid_count]); 2558 security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
2426 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); 2559 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
@@ -2430,6 +2563,86 @@ int __audit_signal_info(int sig, struct task_struct *t)
2430} 2563}
2431 2564
2432/** 2565/**
2566 * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
2567 * @bprm: pointer to the bprm being processed
2568 * @new: the proposed new credentials
2569 * @old: the old credentials
2570 *
2571 * Simply check if the proc already has the caps given by the file and if not
2572 * store the priv escalation info for later auditing at the end of the syscall
2573 *
2574 * -Eric
2575 */
2576int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2577 const struct cred *new, const struct cred *old)
2578{
2579 struct audit_aux_data_bprm_fcaps *ax;
2580 struct audit_context *context = current->audit_context;
2581 struct cpu_vfs_cap_data vcaps;
2582 struct dentry *dentry;
2583
2584 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2585 if (!ax)
2586 return -ENOMEM;
2587
2588 ax->d.type = AUDIT_BPRM_FCAPS;
2589 ax->d.next = context->aux;
2590 context->aux = (void *)ax;
2591
2592 dentry = dget(bprm->file->f_dentry);
2593 get_vfs_caps_from_disk(dentry, &vcaps);
2594 dput(dentry);
2595
2596 ax->fcap.permitted = vcaps.permitted;
2597 ax->fcap.inheritable = vcaps.inheritable;
2598 ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2599 ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
2600
2601 ax->old_pcap.permitted = old->cap_permitted;
2602 ax->old_pcap.inheritable = old->cap_inheritable;
2603 ax->old_pcap.effective = old->cap_effective;
2604
2605 ax->new_pcap.permitted = new->cap_permitted;
2606 ax->new_pcap.inheritable = new->cap_inheritable;
2607 ax->new_pcap.effective = new->cap_effective;
2608 return 0;
2609}
2610
2611/**
2612 * __audit_log_capset - store information about the arguments to the capset syscall
2613 * @pid: target pid of the capset call
2614 * @new: the new credentials
2615 * @old: the old (current) credentials
2616 *
2617 * Record the aguments userspace sent to sys_capset for later printing by the
2618 * audit system if applicable
2619 */
2620int __audit_log_capset(pid_t pid,
2621 const struct cred *new, const struct cred *old)
2622{
2623 struct audit_aux_data_capset *ax;
2624 struct audit_context *context = current->audit_context;
2625
2626 if (likely(!audit_enabled || !context || context->dummy))
2627 return 0;
2628
2629 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2630 if (!ax)
2631 return -ENOMEM;
2632
2633 ax->d.type = AUDIT_CAPSET;
2634 ax->d.next = context->aux;
2635 context->aux = (void *)ax;
2636
2637 ax->pid = pid;
2638 ax->cap.effective = new->cap_effective;
2639 ax->cap.inheritable = new->cap_effective;
2640 ax->cap.permitted = new->cap_permitted;
2641
2642 return 0;
2643}
2644
2645/**
2433 * audit_core_dumps - record information about processes that end abnormally 2646 * audit_core_dumps - record information about processes that end abnormally
2434 * @signr: signal value 2647 * @signr: signal value
2435 * 2648 *
@@ -2440,7 +2653,8 @@ void audit_core_dumps(long signr)
2440{ 2653{
2441 struct audit_buffer *ab; 2654 struct audit_buffer *ab;
2442 u32 sid; 2655 u32 sid;
2443 uid_t auid = audit_get_loginuid(current); 2656 uid_t auid = audit_get_loginuid(current), uid;
2657 gid_t gid;
2444 unsigned int sessionid = audit_get_sessionid(current); 2658 unsigned int sessionid = audit_get_sessionid(current);
2445 2659
2446 if (!audit_enabled) 2660 if (!audit_enabled)
@@ -2450,8 +2664,9 @@ void audit_core_dumps(long signr)
2450 return; 2664 return;
2451 2665
2452 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2667 current_uid_gid(&uid, &gid);
2453 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2668 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2454 auid, current->uid, current->gid, sessionid); 2669 auid, uid, gid, sessionid);
2455 security_task_getsecid(current, &sid); 2670 security_task_getsecid(current, &sid);
2456 if (sid) { 2671 if (sid) {
2457 char *ctx = NULL; 2672 char *ctx = NULL;
diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d8..36b4b4daebec 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#include <linux/audit.h>
10#include <linux/capability.h> 11#include <linux/capability.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
@@ -14,12 +15,7 @@
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
15#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18#include "cred-internals.h"
18/*
19 * This lock protects task->cap_* for all tasks including current.
20 * Locking rule: acquire this prior to tasklist_lock.
21 */
22static DEFINE_SPINLOCK(task_capability_lock);
23 19
24/* 20/*
25 * Leveraged for setting/resetting capabilities 21 * Leveraged for setting/resetting capabilities
@@ -33,6 +29,17 @@ EXPORT_SYMBOL(__cap_empty_set);
33EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
34EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
35 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1;
34
35static int __init file_caps_disable(char *str)
36{
37 file_caps_enabled = 0;
38 return 1;
39}
40__setup("no_file_caps", file_caps_disable);
41#endif
42
36/* 43/*
37 * More recent versions of libcap are available from: 44 * More recent versions of libcap are available from:
38 * 45 *
@@ -115,167 +122,12 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
115 return 0; 122 return 0;
116} 123}
117 124
118#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
119
120/*
121 * Without filesystem capability support, we nominally support one process
122 * setting the capabilities of another
123 */
124static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
125 kernel_cap_t *pIp, kernel_cap_t *pPp)
126{
127 struct task_struct *target;
128 int ret;
129
130 spin_lock(&task_capability_lock);
131 read_lock(&tasklist_lock);
132
133 if (pid && pid != task_pid_vnr(current)) {
134 target = find_task_by_vpid(pid);
135 if (!target) {
136 ret = -ESRCH;
137 goto out;
138 }
139 } else
140 target = current;
141
142 ret = security_capget(target, pEp, pIp, pPp);
143
144out:
145 read_unlock(&tasklist_lock);
146 spin_unlock(&task_capability_lock);
147
148 return ret;
149}
150
151/*
152 * cap_set_pg - set capabilities for all processes in a given process
153 * group. We call this holding task_capability_lock and tasklist_lock.
154 */
155static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
156 kernel_cap_t *inheritable,
157 kernel_cap_t *permitted)
158{
159 struct task_struct *g, *target;
160 int ret = -EPERM;
161 int found = 0;
162 struct pid *pgrp;
163
164 spin_lock(&task_capability_lock);
165 read_lock(&tasklist_lock);
166
167 pgrp = find_vpid(pgrp_nr);
168 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
169 target = g;
170 while_each_thread(g, target) {
171 if (!security_capset_check(target, effective,
172 inheritable, permitted)) {
173 security_capset_set(target, effective,
174 inheritable, permitted);
175 ret = 0;
176 }
177 found = 1;
178 }
179 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
180
181 read_unlock(&tasklist_lock);
182 spin_unlock(&task_capability_lock);
183
184 if (!found)
185 ret = 0;
186 return ret;
187}
188
189/*
190 * cap_set_all - set capabilities for all processes other than init
191 * and self. We call this holding task_capability_lock and tasklist_lock.
192 */
193static inline int cap_set_all(kernel_cap_t *effective,
194 kernel_cap_t *inheritable,
195 kernel_cap_t *permitted)
196{
197 struct task_struct *g, *target;
198 int ret = -EPERM;
199 int found = 0;
200
201 spin_lock(&task_capability_lock);
202 read_lock(&tasklist_lock);
203
204 do_each_thread(g, target) {
205 if (target == current
206 || is_container_init(target->group_leader))
207 continue;
208 found = 1;
209 if (security_capset_check(target, effective, inheritable,
210 permitted))
211 continue;
212 ret = 0;
213 security_capset_set(target, effective, inheritable, permitted);
214 } while_each_thread(g, target);
215
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 if (!found)
220 ret = 0;
221
222 return ret;
223}
224
225/*
226 * Given the target pid does not refer to the current process we
227 * need more elaborate support... (This support is not present when
228 * filesystem capabilities are configured.)
229 */
230static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
231 kernel_cap_t *inheritable,
232 kernel_cap_t *permitted)
233{
234 struct task_struct *target;
235 int ret;
236
237 if (!capable(CAP_SETPCAP))
238 return -EPERM;
239
240 if (pid == -1) /* all procs other than current and init */
241 return cap_set_all(effective, inheritable, permitted);
242
243 else if (pid < 0) /* all procs in process group */
244 return cap_set_pg(-pid, effective, inheritable, permitted);
245
246 /* target != current */
247 spin_lock(&task_capability_lock);
248 read_lock(&tasklist_lock);
249
250 target = find_task_by_vpid(pid);
251 if (!target)
252 ret = -ESRCH;
253 else {
254 ret = security_capset_check(target, effective, inheritable,
255 permitted);
256
257 /* having verified that the proposed changes are legal,
258 we now put them into effect. */
259 if (!ret)
260 security_capset_set(target, effective, inheritable,
261 permitted);
262 }
263
264 read_unlock(&tasklist_lock);
265 spin_unlock(&task_capability_lock);
266
267 return ret;
268}
269
270#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
271
272/* 125/*
273 * If we have configured with filesystem capability support, then the 126 * The only thing that can change the capabilities of the current
274 * only thing that can change the capabilities of the current process 127 * process is the current process. As such, we can't be in this code
275 * is the current process. As such, we can't be in this code at the 128 * at the same time as we are in the process of setting capabilities
276 * same time as we are in the process of setting capabilities in this 129 * in this process. The net result is that we can limit our use of
277 * process. The net result is that we can limit our use of locks to 130 * locks to when we are reading the caps of another process.
278 * when we are reading the caps of another process.
279 */ 131 */
280static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, 132static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
281 kernel_cap_t *pIp, kernel_cap_t *pPp) 133 kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -285,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
285 if (pid && (pid != task_pid_vnr(current))) { 137 if (pid && (pid != task_pid_vnr(current))) {
286 struct task_struct *target; 138 struct task_struct *target;
287 139
288 spin_lock(&task_capability_lock);
289 read_lock(&tasklist_lock); 140 read_lock(&tasklist_lock);
290 141
291 target = find_task_by_vpid(pid); 142 target = find_task_by_vpid(pid);
@@ -295,50 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
295 ret = security_capget(target, pEp, pIp, pPp); 146 ret = security_capget(target, pEp, pIp, pPp);
296 147
297 read_unlock(&tasklist_lock); 148 read_unlock(&tasklist_lock);
298 spin_unlock(&task_capability_lock);
299 } else 149 } else
300 ret = security_capget(current, pEp, pIp, pPp); 150 ret = security_capget(current, pEp, pIp, pPp);
301 151
302 return ret; 152 return ret;
303} 153}
304 154
305/*
306 * With filesystem capability support configured, the kernel does not
307 * permit the changing of capabilities in one process by another
308 * process. (CAP_SETPCAP has much less broad semantics when configured
309 * this way.)
310 */
311static inline int do_sys_capset_other_tasks(pid_t pid,
312 kernel_cap_t *effective,
313 kernel_cap_t *inheritable,
314 kernel_cap_t *permitted)
315{
316 return -EPERM;
317}
318
319#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
320
321/*
322 * Atomically modify the effective capabilities returning the original
323 * value. No permission check is performed here - it is assumed that the
324 * caller is permitted to set the desired effective capabilities.
325 */
326kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
327{
328 kernel_cap_t pE_old;
329
330 spin_lock(&task_capability_lock);
331
332 pE_old = current->cap_effective;
333 current->cap_effective = pE_new;
334
335 spin_unlock(&task_capability_lock);
336
337 return pE_old;
338}
339
340EXPORT_SYMBOL(cap_set_effective);
341
342/** 155/**
343 * sys_capget - get the capabilities of a given process. 156 * sys_capget - get the capabilities of a given process.
344 * @header: pointer to struct that contains capability version and 157 * @header: pointer to struct that contains capability version and
@@ -366,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
366 return -EINVAL; 179 return -EINVAL;
367 180
368 ret = cap_get_target_pid(pid, &pE, &pI, &pP); 181 ret = cap_get_target_pid(pid, &pE, &pI, &pP);
369
370 if (!ret) { 182 if (!ret) {
371 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 183 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
372 unsigned i; 184 unsigned i;
@@ -412,16 +224,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
412 * @data: pointer to struct that contains the effective, permitted, 224 * @data: pointer to struct that contains the effective, permitted,
413 * and inheritable capabilities 225 * and inheritable capabilities
414 * 226 *
415 * Set capabilities for a given process, all processes, or all 227 * Set capabilities for the current process only. The ability to any other
416 * processes in a given process group. 228 * process(es) has been deprecated and removed.
417 * 229 *
418 * The restrictions on setting capabilities are specified as: 230 * The restrictions on setting capabilities are specified as:
419 * 231 *
420 * [pid is for the 'target' task. 'current' is the calling task.] 232 * I: any raised capabilities must be a subset of the old permitted
421 * 233 * P: any raised capabilities must be a subset of the old permitted
422 * I: any raised capabilities must be a subset of the (old current) permitted 234 * E: must be set to a subset of new permitted
423 * P: any raised capabilities must be a subset of the (old current) permitted
424 * E: must be set to a subset of (new target) permitted
425 * 235 *
426 * Returns 0 on success and < 0 on error. 236 * Returns 0 on success and < 0 on error.
427 */ 237 */
@@ -430,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
430 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
431 unsigned i, tocopy; 241 unsigned i, tocopy;
432 kernel_cap_t inheritable, permitted, effective; 242 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new;
433 int ret; 244 int ret;
434 pid_t pid; 245 pid_t pid;
435 246
@@ -440,10 +251,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
440 if (get_user(pid, &header->pid)) 251 if (get_user(pid, &header->pid))
441 return -EFAULT; 252 return -EFAULT;
442 253
443 if (copy_from_user(&kdata, data, tocopy 254 /* may only affect current now */
444 * sizeof(struct __user_cap_data_struct))) { 255 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM;
257
258 if (copy_from_user(&kdata, data,
259 tocopy * sizeof(struct __user_cap_data_struct)))
445 return -EFAULT; 260 return -EFAULT;
446 }
447 261
448 for (i = 0; i < tocopy; i++) { 262 for (i = 0; i < tocopy; i++) {
449 effective.cap[i] = kdata[i].effective; 263 effective.cap[i] = kdata[i].effective;
@@ -457,32 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
457 i++; 271 i++;
458 } 272 }
459 273
460 if (pid && (pid != task_pid_vnr(current))) 274 new = prepare_creds();
461 ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, 275 if (!new)
462 &permitted); 276 return -ENOMEM;
463 else {
464 /*
465 * This lock is required even when filesystem
466 * capability support is configured - it protects the
467 * sys_capget() call from returning incorrect data in
468 * the case that the targeted process is not the
469 * current one.
470 */
471 spin_lock(&task_capability_lock);
472 277
473 ret = security_capset_check(current, &effective, &inheritable, 278 ret = security_capset(new, current_cred(),
474 &permitted); 279 &effective, &inheritable, &permitted);
475 /* 280 if (ret < 0)
476 * Having verified that the proposed changes are 281 goto error;
477 * legal, we now put them into effect. 282
478 */ 283 ret = audit_log_capset(pid, new, current_cred());
479 if (!ret) 284 if (ret < 0)
480 security_capset_set(current, &effective, &inheritable, 285 return ret;
481 &permitted);
482 spin_unlock(&task_capability_lock);
483 }
484 286
287 return commit_creds(new);
485 288
289error:
290 abort_creds(new);
486 return ret; 291 return ret;
487} 292}
488 293
@@ -498,6 +303,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
498 */ 303 */
499int capable(int cap) 304int capable(int cap)
500{ 305{
306 if (unlikely(!cap_valid(cap))) {
307 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
308 BUG();
309 }
310
501 if (has_capability(current, cap)) { 311 if (has_capability(current, cap)) {
502 current->flags |= PF_SUPERPRIV; 312 current->flags |= PF_SUPERPRIV;
503 return 1; 313 return 1;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fe00b3b983a8..48348dde6d81 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -571,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
571 571
572 if (inode) { 572 if (inode) {
573 inode->i_mode = mode; 573 inode->i_mode = mode;
574 inode->i_uid = current->fsuid; 574 inode->i_uid = current_fsuid();
575 inode->i_gid = current->fsgid; 575 inode->i_gid = current_fsgid();
576 inode->i_blocks = 0; 576 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
@@ -702,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
702 * any child cgroups exist. This is theoretically supportable 702 * any child cgroups exist. This is theoretically supportable
703 * but involves complex error handling, so it's being left until 703 * but involves complex error handling, so it's being left until
704 * later */ 704 * later */
705 if (!list_empty(&cgrp->children)) 705 if (root->number_of_cgroups > 1)
706 return -EBUSY; 706 return -EBUSY;
707 707
708 /* Process each subsystem */ 708 /* Process each subsystem */
@@ -1024,7 +1024,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1024 if (ret == -EBUSY) { 1024 if (ret == -EBUSY) {
1025 mutex_unlock(&cgroup_mutex); 1025 mutex_unlock(&cgroup_mutex);
1026 mutex_unlock(&inode->i_mutex); 1026 mutex_unlock(&inode->i_mutex);
1027 goto drop_new_super; 1027 goto free_cg_links;
1028 } 1028 }
1029 1029
1030 /* EBUSY should be the only error here */ 1030 /* EBUSY should be the only error here */
@@ -1073,10 +1073,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1073 1073
1074 return simple_set_mnt(mnt, sb); 1074 return simple_set_mnt(mnt, sb);
1075 1075
1076 free_cg_links:
1077 free_cg_links(&tmp_cg_links);
1076 drop_new_super: 1078 drop_new_super:
1077 up_write(&sb->s_umount); 1079 up_write(&sb->s_umount);
1078 deactivate_super(sb); 1080 deactivate_super(sb);
1079 free_cg_links(&tmp_cg_links);
1080 return ret; 1081 return ret;
1081} 1082}
1082 1083
@@ -1279,6 +1280,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1279static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 1280static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1280{ 1281{
1281 struct task_struct *tsk; 1282 struct task_struct *tsk;
1283 const struct cred *cred = current_cred(), *tcred;
1282 int ret; 1284 int ret;
1283 1285
1284 if (pid) { 1286 if (pid) {
@@ -1288,14 +1290,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1288 rcu_read_unlock(); 1290 rcu_read_unlock();
1289 return -ESRCH; 1291 return -ESRCH;
1290 } 1292 }
1291 get_task_struct(tsk);
1292 rcu_read_unlock();
1293 1293
1294 if ((current->euid) && (current->euid != tsk->uid) 1294 tcred = __task_cred(tsk);
1295 && (current->euid != tsk->suid)) { 1295 if (cred->euid &&
1296 put_task_struct(tsk); 1296 cred->euid != tcred->uid &&
1297 cred->euid != tcred->suid) {
1298 rcu_read_unlock();
1297 return -EACCES; 1299 return -EACCES;
1298 } 1300 }
1301 get_task_struct(tsk);
1302 rcu_read_unlock();
1299 } else { 1303 } else {
1300 tsk = current; 1304 tsk = current;
1301 get_task_struct(tsk); 1305 get_task_struct(tsk);
@@ -2934,9 +2938,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2934 again: 2938 again:
2935 root = subsys->root; 2939 root = subsys->root;
2936 if (root == &rootnode) { 2940 if (root == &rootnode) {
2937 printk(KERN_INFO
2938 "Not cloning cgroup for unused subsystem %s\n",
2939 subsys->name);
2940 mutex_unlock(&cgroup_mutex); 2941 mutex_unlock(&cgroup_mutex);
2941 return 0; 2942 return 0;
2942 } 2943 }
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 000000000000..2dc4fc2d0bf1
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 000000000000..ff7bc071991c
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,588 @@
1/* Task credentials management - see Documentation/credentials.txt
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11#include <linux/module.h>
12#include <linux/cred.h>
13#include <linux/sched.h>
14#include <linux/key.h>
15#include <linux/keyctl.h>
16#include <linux/init_task.h>
17#include <linux/security.h>
18#include <linux/cn_proc.h>
19#include "cred-internals.h"
20
21static struct kmem_cache *cred_jar;
22
23/*
24 * The common credentials for the initial task's thread group
25 */
26#ifdef CONFIG_KEYS
27static struct thread_group_cred init_tgcred = {
28 .usage = ATOMIC_INIT(2),
29 .tgid = 0,
30 .lock = SPIN_LOCK_UNLOCKED,
31};
32#endif
33
34/*
35 * The initial credentials for the initial task
36 */
37struct cred init_cred = {
38 .usage = ATOMIC_INIT(4),
39 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET,
42 .cap_effective = CAP_INIT_EFF_SET,
43 .cap_bset = CAP_INIT_BSET,
44 .user = INIT_USER,
45 .group_info = &init_groups,
46#ifdef CONFIG_KEYS
47 .tgcred = &init_tgcred,
48#endif
49};
50
51/*
52 * Dispose of the shared task group credentials
53 */
54#ifdef CONFIG_KEYS
55static void release_tgcred_rcu(struct rcu_head *rcu)
56{
57 struct thread_group_cred *tgcred =
58 container_of(rcu, struct thread_group_cred, rcu);
59
60 BUG_ON(atomic_read(&tgcred->usage) != 0);
61
62 key_put(tgcred->session_keyring);
63 key_put(tgcred->process_keyring);
64 kfree(tgcred);
65}
66#endif
67
68/*
69 * Release a set of thread group credentials.
70 */
71static void release_tgcred(struct cred *cred)
72{
73#ifdef CONFIG_KEYS
74 struct thread_group_cred *tgcred = cred->tgcred;
75
76 if (atomic_dec_and_test(&tgcred->usage))
77 call_rcu(&tgcred->rcu, release_tgcred_rcu);
78#endif
79}
80
81/*
82 * The RCU callback to actually dispose of a set of credentials
83 */
84static void put_cred_rcu(struct rcu_head *rcu)
85{
86 struct cred *cred = container_of(rcu, struct cred, rcu);
87
88 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage));
91
92 security_cred_free(cred);
93 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth);
95 release_tgcred(cred);
96 put_group_info(cred->group_info);
97 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred);
99}
100
101/**
102 * __put_cred - Destroy a set of credentials
103 * @cred: The record to release
104 *
105 * Destroy a set of credentials on which no references remain.
106 */
107void __put_cred(struct cred *cred)
108{
109 BUG_ON(atomic_read(&cred->usage) != 0);
110
111 call_rcu(&cred->rcu, put_cred_rcu);
112}
113EXPORT_SYMBOL(__put_cred);
114
115/**
116 * prepare_creds - Prepare a new set of credentials for modification
117 *
118 * Prepare a new set of task credentials for modification. A task's creds
119 * shouldn't generally be modified directly, therefore this function is used to
120 * prepare a new copy, which the caller then modifies and then commits by
121 * calling commit_creds().
122 *
123 * Preparation involves making a copy of the objective creds for modification.
124 *
125 * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
126 *
127 * Call commit_creds() or abort_creds() to clean up.
128 */
129struct cred *prepare_creds(void)
130{
131 struct task_struct *task = current;
132 const struct cred *old;
133 struct cred *new;
134
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1);
136
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new)
139 return NULL;
140
141 old = task->cred;
142 memcpy(new, old, sizeof(struct cred));
143
144 atomic_set(&new->usage, 1);
145 get_group_info(new->group_info);
146 get_uid(new->user);
147
148#ifdef CONFIG_KEYS
149 key_get(new->thread_keyring);
150 key_get(new->request_key_auth);
151 atomic_inc(&new->tgcred->usage);
152#endif
153
154#ifdef CONFIG_SECURITY
155 new->security = NULL;
156#endif
157
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error;
160 return new;
161
162error:
163 abort_creds(new);
164 return NULL;
165}
166EXPORT_SYMBOL(prepare_creds);
167
168/*
169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex
171 */
172struct cred *prepare_exec_creds(void)
173{
174 struct thread_group_cred *tgcred = NULL;
175 struct cred *new;
176
177#ifdef CONFIG_KEYS
178 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
179 if (!tgcred)
180 return NULL;
181#endif
182
183 new = prepare_creds();
184 if (!new) {
185 kfree(tgcred);
186 return new;
187 }
188
189#ifdef CONFIG_KEYS
190 /* newly exec'd tasks don't get a thread keyring */
191 key_put(new->thread_keyring);
192 new->thread_keyring = NULL;
193
194 /* create a new per-thread-group creds for all this set of threads to
195 * share */
196 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
197
198 atomic_set(&tgcred->usage, 1);
199 spin_lock_init(&tgcred->lock);
200
201 /* inherit the session keyring; new process keyring */
202 key_get(tgcred->session_keyring);
203 tgcred->process_keyring = NULL;
204
205 release_tgcred(new);
206 new->tgcred = tgcred;
207#endif
208
209 return new;
210}
211
212/*
213 * prepare new credentials for the usermode helper dispatcher
214 */
215struct cred *prepare_usermodehelper_creds(void)
216{
217#ifdef CONFIG_KEYS
218 struct thread_group_cred *tgcred = NULL;
219#endif
220 struct cred *new;
221
222#ifdef CONFIG_KEYS
223 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
224 if (!tgcred)
225 return NULL;
226#endif
227
228 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
229 if (!new)
230 return NULL;
231
232 memcpy(new, &init_cred, sizeof(struct cred));
233
234 atomic_set(&new->usage, 1);
235 get_group_info(new->group_info);
236 get_uid(new->user);
237
238#ifdef CONFIG_KEYS
239 new->thread_keyring = NULL;
240 new->request_key_auth = NULL;
241 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
242
243 atomic_set(&tgcred->usage, 1);
244 spin_lock_init(&tgcred->lock);
245 new->tgcred = tgcred;
246#endif
247
248#ifdef CONFIG_SECURITY
249 new->security = NULL;
250#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error;
253
254 BUG_ON(atomic_read(&new->usage) != 1);
255 return new;
256
257error:
258 put_cred(new);
259 return NULL;
260}
261
262/*
263 * Copy credentials for the new process created by fork()
264 *
265 * We share if we can, but under some circumstances we have to generate a new
266 * set.
267 *
268 * The new process gets the current process's subjective credentials as its
269 * objective and subjective credentials
270 */
271int copy_creds(struct task_struct *p, unsigned long clone_flags)
272{
273#ifdef CONFIG_KEYS
274 struct thread_group_cred *tgcred;
275#endif
276 struct cred *new;
277 int ret;
278
279 mutex_init(&p->cred_exec_mutex);
280
281 if (
282#ifdef CONFIG_KEYS
283 !p->cred->thread_keyring &&
284#endif
285 clone_flags & CLONE_THREAD
286 ) {
287 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred);
289 atomic_inc(&p->cred->user->processes);
290 return 0;
291 }
292
293 new = prepare_creds();
294 if (!new)
295 return -ENOMEM;
296
297 if (clone_flags & CLONE_NEWUSER) {
298 ret = create_user_ns(new);
299 if (ret < 0)
300 goto error_put;
301 }
302
303#ifdef CONFIG_KEYS
304 /* new threads get their own thread keyrings if their parent already
305 * had one */
306 if (new->thread_keyring) {
307 key_put(new->thread_keyring);
308 new->thread_keyring = NULL;
309 if (clone_flags & CLONE_THREAD)
310 install_thread_keyring_to_cred(new);
311 }
312
313 /* we share the process and session keyrings between all the threads in
314 * a process - this is slightly icky as we violate COW credentials a
315 * bit */
316 if (!(clone_flags & CLONE_THREAD)) {
317 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
318 if (!tgcred) {
319 ret = -ENOMEM;
320 goto error_put;
321 }
322 atomic_set(&tgcred->usage, 1);
323 spin_lock_init(&tgcred->lock);
324 tgcred->process_keyring = NULL;
325 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
326
327 release_tgcred(new);
328 new->tgcred = tgcred;
329 }
330#endif
331
332 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new);
334 return 0;
335
336error_put:
337 put_cred(new);
338 return ret;
339}
340
341/**
342 * commit_creds - Install new credentials upon the current task
343 * @new: The credentials to be assigned
344 *
345 * Install a new set of credentials to the current task, using RCU to replace
346 * the old set. Both the objective and the subjective credentials pointers are
347 * updated. This function may not be called if the subjective credentials are
348 * in an overridden state.
349 *
350 * This function eats the caller's reference to the new credentials.
351 *
352 * Always returns 0 thus allowing this function to be tail-called at the end
353 * of, say, sys_setgid().
354 */
355int commit_creds(struct cred *new)
356{
357 struct task_struct *task = current;
358 const struct cred *old;
359
360 BUG_ON(task->cred != task->real_cred);
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2);
362 BUG_ON(atomic_read(&new->usage) < 1);
363
364 old = task->real_cred;
365 security_commit_creds(new, old);
366
367 get_cred(new); /* we will require a ref for the subj creds too */
368
369 /* dumpability changes */
370 if (old->euid != new->euid ||
371 old->egid != new->egid ||
372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0;
377 smp_wmb();
378 }
379
380 /* alter the thread keyring */
381 if (new->fsuid != old->fsuid)
382 key_fsuid_changed(task);
383 if (new->fsgid != old->fsgid)
384 key_fsgid_changed(task);
385
386 /* do it
387 * - What if a process setreuid()'s and this brings the
388 * new uid over his NPROC rlimit? We can check this now
389 * cheaply with the new uid cache, so if it matters
390 * we should be checking for it. -DaveM
391 */
392 if (new->user != old->user)
393 atomic_inc(&new->user->processes);
394 rcu_assign_pointer(task->real_cred, new);
395 rcu_assign_pointer(task->cred, new);
396 if (new->user != old->user)
397 atomic_dec(&old->user->processes);
398
399 sched_switch_user(task);
400
401 /* send notifications */
402 if (new->uid != old->uid ||
403 new->euid != old->euid ||
404 new->suid != old->suid ||
405 new->fsuid != old->fsuid)
406 proc_id_connector(task, PROC_EVENT_UID);
407
408 if (new->gid != old->gid ||
409 new->egid != old->egid ||
410 new->sgid != old->sgid ||
411 new->fsgid != old->fsgid)
412 proc_id_connector(task, PROC_EVENT_GID);
413
414 /* release the old obj and subj refs both */
415 put_cred(old);
416 put_cred(old);
417 return 0;
418}
419EXPORT_SYMBOL(commit_creds);
420
421/**
422 * abort_creds - Discard a set of credentials and unlock the current task
423 * @new: The credentials that were going to be applied
424 *
425 * Discard a set of credentials that were under construction and unlock the
426 * current task.
427 */
428void abort_creds(struct cred *new)
429{
430 BUG_ON(atomic_read(&new->usage) < 1);
431 put_cred(new);
432}
433EXPORT_SYMBOL(abort_creds);
434
435/**
436 * override_creds - Override the current process's subjective credentials
437 * @new: The credentials to be assigned
438 *
439 * Install a set of temporary override subjective credentials on the current
440 * process, returning the old set for later reversion.
441 */
442const struct cred *override_creds(const struct cred *new)
443{
444 const struct cred *old = current->cred;
445
446 rcu_assign_pointer(current->cred, get_cred(new));
447 return old;
448}
449EXPORT_SYMBOL(override_creds);
450
451/**
452 * revert_creds - Revert a temporary subjective credentials override
453 * @old: The credentials to be restored
454 *
455 * Revert a temporary set of override subjective credentials to an old set,
456 * discarding the override set.
457 */
458void revert_creds(const struct cred *old)
459{
460 const struct cred *override = current->cred;
461
462 rcu_assign_pointer(current->cred, old);
463 put_cred(override);
464}
465EXPORT_SYMBOL(revert_creds);
466
467/*
468 * initialise the credentials stuff
469 */
470void __init cred_init(void)
471{
472 /* allocate a slab in which we can store credentials */
473 cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
474 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
475}
476
477/**
478 * prepare_kernel_cred - Prepare a set of credentials for a kernel service
479 * @daemon: A userspace daemon to be used as a reference
480 *
481 * Prepare a set of credentials for a kernel service. This can then be used to
482 * override a task's own credentials so that work can be done on behalf of that
483 * task that requires a different subjective context.
484 *
485 * @daemon is used to provide a base for the security record, but can be NULL.
486 * If @daemon is supplied, then the security data will be derived from that;
487 * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
488 *
489 * The caller may change these controls afterwards if desired.
490 *
491 * Returns the new credentials or NULL if out of memory.
492 *
493 * Does not take, and does not return holding current->cred_replace_mutex.
494 */
495struct cred *prepare_kernel_cred(struct task_struct *daemon)
496{
497 const struct cred *old;
498 struct cred *new;
499
500 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
501 if (!new)
502 return NULL;
503
504 if (daemon)
505 old = get_task_cred(daemon);
506 else
507 old = get_cred(&init_cred);
508
509 get_uid(new->user);
510 get_group_info(new->group_info);
511
512#ifdef CONFIG_KEYS
513 atomic_inc(&init_tgcred.usage);
514 new->tgcred = &init_tgcred;
515 new->request_key_auth = NULL;
516 new->thread_keyring = NULL;
517 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
518#endif
519
520#ifdef CONFIG_SECURITY
521 new->security = NULL;
522#endif
523 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
524 goto error;
525
526 atomic_set(&new->usage, 1);
527 put_cred(old);
528 return new;
529
530error:
531 put_cred(new);
532 return NULL;
533}
534EXPORT_SYMBOL(prepare_kernel_cred);
535
536/**
537 * set_security_override - Set the security ID in a set of credentials
538 * @new: The credentials to alter
539 * @secid: The LSM security ID to set
540 *
541 * Set the LSM security ID in a set of credentials so that the subjective
542 * security is overridden when an alternative set of credentials is used.
543 */
544int set_security_override(struct cred *new, u32 secid)
545{
546 return security_kernel_act_as(new, secid);
547}
548EXPORT_SYMBOL(set_security_override);
549
550/**
551 * set_security_override_from_ctx - Set the security ID in a set of credentials
552 * @new: The credentials to alter
553 * @secctx: The LSM security context to generate the security ID from.
554 *
555 * Set the LSM security ID in a set of credentials so that the subjective
556 * security is overridden when an alternative set of credentials is used. The
557 * security ID is specified in string form as a security context to be
558 * interpreted by the LSM.
559 */
560int set_security_override_from_ctx(struct cred *new, const char *secctx)
561{
562 u32 secid;
563 int ret;
564
565 ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
566 if (ret < 0)
567 return ret;
568
569 return set_security_override(new, secid);
570}
571EXPORT_SYMBOL(set_security_override_from_ctx);
572
573/**
574 * set_create_files_as - Set the LSM file create context in a set of credentials
575 * @new: The credentials to alter
576 * @inode: The inode to take the context from
577 *
578 * Change the LSM file creation context in a set of credentials to be the same
579 * as the object context of the specified inode, so that the new inodes have
580 * the same MAC context as that inode.
581 */
582int set_create_files_as(struct cred *new, struct inode *inode)
583{
584 new->fsuid = inode->i_uid;
585 new->fsgid = inode->i_gid;
586 return security_kernel_create_files_as(new, inode);
587}
588EXPORT_SYMBOL(set_create_files_as);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71be..abb6e17505e2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
127 */ 127 */
128 t1 = tsk->sched_info.pcount; 128 t1 = tsk->sched_info.pcount;
129 t2 = tsk->sched_info.run_delay; 129 t2 = tsk->sched_info.run_delay;
130 t3 = tsk->sched_info.cpu_time; 130 t3 = tsk->se.sum_exec_runtime;
131 131
132 d->cpu_count += t1; 132 d->cpu_count += t1;
133 133
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..c9e5a1c14e08 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,12 +46,18 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/init_task.h>
49#include <trace/sched.h> 50#include <trace/sched.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52#include <asm/unistd.h> 53#include <asm/unistd.h>
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
56#include "cred-internals.h"
57
58DEFINE_TRACE(sched_process_free);
59DEFINE_TRACE(sched_process_exit);
60DEFINE_TRACE(sched_process_wait);
55 61
56static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct * tsk);
57 63
@@ -164,7 +170,10 @@ void release_task(struct task_struct * p)
164 int zap_leader; 170 int zap_leader;
165repeat: 171repeat:
166 tracehook_prepare_release_task(p); 172 tracehook_prepare_release_task(p);
167 atomic_dec(&p->user->processes); 173 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials */
175 atomic_dec(&__task_cred(p)->user->processes);
176
168 proc_flush_task(p); 177 proc_flush_task(p);
169 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
170 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
@@ -339,12 +348,12 @@ static void reparent_to_kthreadd(void)
339 /* cpus_allowed? */ 348 /* cpus_allowed? */
340 /* rt_priority? */ 349 /* rt_priority? */
341 /* signals? */ 350 /* signals? */
342 security_task_reparent_to_init(current);
343 memcpy(current->signal->rlim, init_task.signal->rlim, 351 memcpy(current->signal->rlim, init_task.signal->rlim,
344 sizeof(current->signal->rlim)); 352 sizeof(current->signal->rlim));
345 atomic_inc(&(INIT_USER->__count)); 353
354 atomic_inc(&init_cred.usage);
355 commit_creds(&init_cred);
346 write_unlock_irq(&tasklist_lock); 356 write_unlock_irq(&tasklist_lock);
347 switch_uid(INIT_USER);
348} 357}
349 358
350void __set_special_pids(struct pid *pid) 359void __set_special_pids(struct pid *pid)
@@ -1028,8 +1037,6 @@ NORET_TYPE void do_exit(long code)
1028 * task into the wait for ever nirwana as well. 1037 * task into the wait for ever nirwana as well.
1029 */ 1038 */
1030 tsk->flags |= PF_EXITPIDONE; 1039 tsk->flags |= PF_EXITPIDONE;
1031 if (tsk->io_context)
1032 exit_io_context();
1033 set_current_state(TASK_UNINTERRUPTIBLE); 1040 set_current_state(TASK_UNINTERRUPTIBLE);
1034 schedule(); 1041 schedule();
1035 } 1042 }
@@ -1078,7 +1085,6 @@ NORET_TYPE void do_exit(long code)
1078 check_stack_usage(); 1085 check_stack_usage();
1079 exit_thread(); 1086 exit_thread();
1080 cgroup_exit(tsk, 1); 1087 cgroup_exit(tsk, 1);
1081 exit_keys(tsk);
1082 1088
1083 if (group_dead && tsk->signal->leader) 1089 if (group_dead && tsk->signal->leader)
1084 disassociate_ctty(1); 1090 disassociate_ctty(1);
@@ -1123,7 +1129,6 @@ NORET_TYPE void do_exit(long code)
1123 preempt_disable(); 1129 preempt_disable();
1124 /* causes final put_task_struct in finish_task_switch(). */ 1130 /* causes final put_task_struct in finish_task_switch(). */
1125 tsk->state = TASK_DEAD; 1131 tsk->state = TASK_DEAD;
1126
1127 schedule(); 1132 schedule();
1128 BUG(); 1133 BUG();
1129 /* Avoid "noreturn function does return". */ 1134 /* Avoid "noreturn function does return". */
@@ -1263,12 +1268,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1263 unsigned long state; 1268 unsigned long state;
1264 int retval, status, traced; 1269 int retval, status, traced;
1265 pid_t pid = task_pid_vnr(p); 1270 pid_t pid = task_pid_vnr(p);
1271 uid_t uid = __task_cred(p)->uid;
1266 1272
1267 if (!likely(options & WEXITED)) 1273 if (!likely(options & WEXITED))
1268 return 0; 1274 return 0;
1269 1275
1270 if (unlikely(options & WNOWAIT)) { 1276 if (unlikely(options & WNOWAIT)) {
1271 uid_t uid = p->uid;
1272 int exit_code = p->exit_code; 1277 int exit_code = p->exit_code;
1273 int why, status; 1278 int why, status;
1274 1279
@@ -1321,10 +1326,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1321 * group, which consolidates times for all threads in the 1326 * group, which consolidates times for all threads in the
1322 * group including the group leader. 1327 * group including the group leader.
1323 */ 1328 */
1329 thread_group_cputime(p, &cputime);
1324 spin_lock_irq(&p->parent->sighand->siglock); 1330 spin_lock_irq(&p->parent->sighand->siglock);
1325 psig = p->parent->signal; 1331 psig = p->parent->signal;
1326 sig = p->signal; 1332 sig = p->signal;
1327 thread_group_cputime(p, &cputime);
1328 psig->cutime = 1333 psig->cutime =
1329 cputime_add(psig->cutime, 1334 cputime_add(psig->cutime,
1330 cputime_add(cputime.utime, 1335 cputime_add(cputime.utime,
@@ -1389,7 +1394,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1389 if (!retval && infop) 1394 if (!retval && infop)
1390 retval = put_user(pid, &infop->si_pid); 1395 retval = put_user(pid, &infop->si_pid);
1391 if (!retval && infop) 1396 if (!retval && infop)
1392 retval = put_user(p->uid, &infop->si_uid); 1397 retval = put_user(uid, &infop->si_uid);
1393 if (!retval) 1398 if (!retval)
1394 retval = pid; 1399 retval = pid;
1395 1400
@@ -1454,7 +1459,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1454 if (!unlikely(options & WNOWAIT)) 1459 if (!unlikely(options & WNOWAIT))
1455 p->exit_code = 0; 1460 p->exit_code = 0;
1456 1461
1457 uid = p->uid; 1462 /* don't need the RCU readlock here as we're holding a spinlock */
1463 uid = __task_cred(p)->uid;
1458unlock_sig: 1464unlock_sig:
1459 spin_unlock_irq(&p->sighand->siglock); 1465 spin_unlock_irq(&p->sighand->siglock);
1460 if (!exit_code) 1466 if (!exit_code)
@@ -1528,10 +1534,10 @@ static int wait_task_continued(struct task_struct *p, int options,
1528 } 1534 }
1529 if (!unlikely(options & WNOWAIT)) 1535 if (!unlikely(options & WNOWAIT))
1530 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1536 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1537 uid = __task_cred(p)->uid;
1531 spin_unlock_irq(&p->sighand->siglock); 1538 spin_unlock_irq(&p->sighand->siglock);
1532 1539
1533 pid = task_pid_vnr(p); 1540 pid = task_pid_vnr(p);
1534 uid = p->uid;
1535 get_task_struct(p); 1541 get_task_struct(p);
1536 read_unlock(&tasklist_lock); 1542 read_unlock(&tasklist_lock);
1537 1543
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
17*/ 17*/
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/ftrace.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
22 23
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 41 return e;
41} 42}
42 43
43int core_kernel_text(unsigned long addr) 44__notrace_funcgraph int core_kernel_text(unsigned long addr)
44{ 45{
45 if (addr >= (unsigned long)_stext && 46 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 47 addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
53 return 0; 54 return 0;
54} 55}
55 56
56int __kernel_text_address(unsigned long addr) 57__notrace_funcgraph int __kernel_text_address(unsigned long addr)
57{ 58{
58 if (core_kernel_text(addr)) 59 if (core_kernel_text(addr))
59 return 1; 60 return 1;
@@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr)
66 return 1; 67 return 1;
67 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
68} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a372a0e206f..43cbf30669e6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -47,6 +47,7 @@
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/audit.h> 48#include <linux/audit.h>
49#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
50#include <linux/ftrace.h>
50#include <linux/profile.h> 51#include <linux/profile.h>
51#include <linux/rmap.h> 52#include <linux/rmap.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
@@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
80 81
81__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 82__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
82 83
84DEFINE_TRACE(sched_process_fork);
85
83int nr_processes(void) 86int nr_processes(void)
84{ 87{
85 int cpu; 88 int cpu;
@@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk)
137 prop_local_destroy_single(&tsk->dirties); 140 prop_local_destroy_single(&tsk->dirties);
138 free_thread_info(tsk->stack); 141 free_thread_info(tsk->stack);
139 rt_mutex_debug_task_free(tsk); 142 rt_mutex_debug_task_free(tsk);
143 ftrace_graph_exit_task(tsk);
140 free_task_struct(tsk); 144 free_task_struct(tsk);
141} 145}
142EXPORT_SYMBOL(free_task); 146EXPORT_SYMBOL(free_task);
@@ -147,9 +151,8 @@ void __put_task_struct(struct task_struct *tsk)
147 WARN_ON(atomic_read(&tsk->usage)); 151 WARN_ON(atomic_read(&tsk->usage));
148 WARN_ON(tsk == current); 152 WARN_ON(tsk == current);
149 153
150 security_task_free(tsk); 154 put_cred(tsk->real_cred);
151 free_uid(tsk->user); 155 put_cred(tsk->cred);
152 put_group_info(tsk->group_info);
153 delayacct_tsk_free(tsk); 156 delayacct_tsk_free(tsk);
154 157
155 if (!profile_handoff_task(tsk)) 158 if (!profile_handoff_task(tsk))
@@ -315,17 +318,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
315 file = tmp->vm_file; 318 file = tmp->vm_file;
316 if (file) { 319 if (file) {
317 struct inode *inode = file->f_path.dentry->d_inode; 320 struct inode *inode = file->f_path.dentry->d_inode;
321 struct address_space *mapping = file->f_mapping;
322
318 get_file(file); 323 get_file(file);
319 if (tmp->vm_flags & VM_DENYWRITE) 324 if (tmp->vm_flags & VM_DENYWRITE)
320 atomic_dec(&inode->i_writecount); 325 atomic_dec(&inode->i_writecount);
321 326 spin_lock(&mapping->i_mmap_lock);
322 /* insert tmp into the share list, just after mpnt */ 327 if (tmp->vm_flags & VM_SHARED)
323 spin_lock(&file->f_mapping->i_mmap_lock); 328 mapping->i_mmap_writable++;
324 tmp->vm_truncate_count = mpnt->vm_truncate_count; 329 tmp->vm_truncate_count = mpnt->vm_truncate_count;
325 flush_dcache_mmap_lock(file->f_mapping); 330 flush_dcache_mmap_lock(mapping);
331 /* insert tmp into the share list, just after mpnt */
326 vma_prio_tree_add(tmp, mpnt); 332 vma_prio_tree_add(tmp, mpnt);
327 flush_dcache_mmap_unlock(file->f_mapping); 333 flush_dcache_mmap_unlock(mapping);
328 spin_unlock(&file->f_mapping->i_mmap_lock); 334 spin_unlock(&mapping->i_mmap_lock);
329 } 335 }
330 336
331 /* 337 /*
@@ -409,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
409 set_mm_counter(mm, file_rss, 0); 415 set_mm_counter(mm, file_rss, 0);
410 set_mm_counter(mm, anon_rss, 0); 416 set_mm_counter(mm, anon_rss, 0);
411 spin_lock_init(&mm->page_table_lock); 417 spin_lock_init(&mm->page_table_lock);
412 rwlock_init(&mm->ioctx_list_lock); 418 spin_lock_init(&mm->ioctx_lock);
413 mm->ioctx_list = NULL; 419 INIT_HLIST_HEAD(&mm->ioctx_list);
414 mm->free_area_cache = TASK_UNMAPPED_BASE; 420 mm->free_area_cache = TASK_UNMAPPED_BASE;
415 mm->cached_hole_size = ~0UL; 421 mm->cached_hole_size = ~0UL;
416 mm_init_owner(mm, p); 422 mm_init_owner(mm, p);
@@ -815,12 +821,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
815 if (!sig) 821 if (!sig)
816 return -ENOMEM; 822 return -ENOMEM;
817 823
818 ret = copy_thread_group_keys(tsk);
819 if (ret < 0) {
820 kmem_cache_free(signal_cachep, sig);
821 return ret;
822 }
823
824 atomic_set(&sig->count, 1); 824 atomic_set(&sig->count, 1);
825 atomic_set(&sig->live, 1); 825 atomic_set(&sig->live, 1);
826 init_waitqueue_head(&sig->wait_chldexit); 826 init_waitqueue_head(&sig->wait_chldexit);
@@ -865,7 +865,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
865void __cleanup_signal(struct signal_struct *sig) 865void __cleanup_signal(struct signal_struct *sig)
866{ 866{
867 thread_group_cputime_free(sig); 867 thread_group_cputime_free(sig);
868 exit_thread_group_keys(sig);
869 tty_kref_put(sig->tty); 868 tty_kref_put(sig->tty);
870 kmem_cache_free(signal_cachep, sig); 869 kmem_cache_free(signal_cachep, sig);
871} 870}
@@ -981,16 +980,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
981 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 980 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
982#endif 981#endif
983 retval = -EAGAIN; 982 retval = -EAGAIN;
984 if (atomic_read(&p->user->processes) >= 983 if (atomic_read(&p->real_cred->user->processes) >=
985 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 984 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
986 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 985 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
987 p->user != current->nsproxy->user_ns->root_user) 986 p->real_cred->user != INIT_USER)
988 goto bad_fork_free; 987 goto bad_fork_free;
989 } 988 }
990 989
991 atomic_inc(&p->user->__count); 990 retval = copy_creds(p, clone_flags);
992 atomic_inc(&p->user->processes); 991 if (retval < 0)
993 get_group_info(p->group_info); 992 goto bad_fork_free;
994 993
995 /* 994 /*
996 * If multiple threads are within copy_process(), then this check 995 * If multiple threads are within copy_process(), then this check
@@ -1045,10 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 do_posix_clock_monotonic_gettime(&p->start_time); 1044 do_posix_clock_monotonic_gettime(&p->start_time);
1046 p->real_start_time = p->start_time; 1045 p->real_start_time = p->start_time;
1047 monotonic_to_bootbased(&p->real_start_time); 1046 monotonic_to_bootbased(&p->real_start_time);
1048#ifdef CONFIG_SECURITY
1049 p->security = NULL;
1050#endif
1051 p->cap_bset = current->cap_bset;
1052 p->io_context = NULL; 1047 p->io_context = NULL;
1053 p->audit_context = NULL; 1048 p->audit_context = NULL;
1054 cgroup_fork(p); 1049 cgroup_fork(p);
@@ -1089,14 +1084,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1089#ifdef CONFIG_DEBUG_MUTEXES 1084#ifdef CONFIG_DEBUG_MUTEXES
1090 p->blocked_on = NULL; /* not blocked yet */ 1085 p->blocked_on = NULL; /* not blocked yet */
1091#endif 1086#endif
1087 if (unlikely(ptrace_reparented(current)))
1088 ptrace_fork(p, clone_flags);
1092 1089
1093 /* Perform scheduler related setup. Assign this task to a CPU. */ 1090 /* Perform scheduler related setup. Assign this task to a CPU. */
1094 sched_fork(p, clone_flags); 1091 sched_fork(p, clone_flags);
1095 1092
1096 if ((retval = security_task_alloc(p)))
1097 goto bad_fork_cleanup_policy;
1098 if ((retval = audit_alloc(p))) 1093 if ((retval = audit_alloc(p)))
1099 goto bad_fork_cleanup_security; 1094 goto bad_fork_cleanup_policy;
1100 /* copy all the process information */ 1095 /* copy all the process information */
1101 if ((retval = copy_semundo(clone_flags, p))) 1096 if ((retval = copy_semundo(clone_flags, p)))
1102 goto bad_fork_cleanup_audit; 1097 goto bad_fork_cleanup_audit;
@@ -1110,10 +1105,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1110 goto bad_fork_cleanup_sighand; 1105 goto bad_fork_cleanup_sighand;
1111 if ((retval = copy_mm(clone_flags, p))) 1106 if ((retval = copy_mm(clone_flags, p)))
1112 goto bad_fork_cleanup_signal; 1107 goto bad_fork_cleanup_signal;
1113 if ((retval = copy_keys(clone_flags, p)))
1114 goto bad_fork_cleanup_mm;
1115 if ((retval = copy_namespaces(clone_flags, p))) 1108 if ((retval = copy_namespaces(clone_flags, p)))
1116 goto bad_fork_cleanup_keys; 1109 goto bad_fork_cleanup_mm;
1117 if ((retval = copy_io(clone_flags, p))) 1110 if ((retval = copy_io(clone_flags, p)))
1118 goto bad_fork_cleanup_namespaces; 1111 goto bad_fork_cleanup_namespaces;
1119 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1112 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1133,6 +1126,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1133 } 1126 }
1134 } 1127 }
1135 1128
1129 ftrace_graph_init_task(p);
1130
1136 p->pid = pid_nr(pid); 1131 p->pid = pid_nr(pid);
1137 p->tgid = p->pid; 1132 p->tgid = p->pid;
1138 if (clone_flags & CLONE_THREAD) 1133 if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1136,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if (current->nsproxy != p->nsproxy) { 1136 if (current->nsproxy != p->nsproxy) {
1142 retval = ns_cgroup_clone(p, pid); 1137 retval = ns_cgroup_clone(p, pid);
1143 if (retval) 1138 if (retval)
1144 goto bad_fork_free_pid; 1139 goto bad_fork_free_graph;
1145 } 1140 }
1146 1141
1147 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1142 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1234,7 +1229,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1234 spin_unlock(&current->sighand->siglock); 1229 spin_unlock(&current->sighand->siglock);
1235 write_unlock_irq(&tasklist_lock); 1230 write_unlock_irq(&tasklist_lock);
1236 retval = -ERESTARTNOINTR; 1231 retval = -ERESTARTNOINTR;
1237 goto bad_fork_free_pid; 1232 goto bad_fork_free_graph;
1238 } 1233 }
1239 1234
1240 if (clone_flags & CLONE_THREAD) { 1235 if (clone_flags & CLONE_THREAD) {
@@ -1271,6 +1266,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1271 cgroup_post_fork(p); 1266 cgroup_post_fork(p);
1272 return p; 1267 return p;
1273 1268
1269bad_fork_free_graph:
1270 ftrace_graph_exit_task(p);
1274bad_fork_free_pid: 1271bad_fork_free_pid:
1275 if (pid != &init_struct_pid) 1272 if (pid != &init_struct_pid)
1276 free_pid(pid); 1273 free_pid(pid);
@@ -1278,8 +1275,6 @@ bad_fork_cleanup_io:
1278 put_io_context(p->io_context); 1275 put_io_context(p->io_context);
1279bad_fork_cleanup_namespaces: 1276bad_fork_cleanup_namespaces:
1280 exit_task_namespaces(p); 1277 exit_task_namespaces(p);
1281bad_fork_cleanup_keys:
1282 exit_keys(p);
1283bad_fork_cleanup_mm: 1278bad_fork_cleanup_mm:
1284 if (p->mm) 1279 if (p->mm)
1285 mmput(p->mm); 1280 mmput(p->mm);
@@ -1295,8 +1290,6 @@ bad_fork_cleanup_semundo:
1295 exit_sem(p); 1290 exit_sem(p);
1296bad_fork_cleanup_audit: 1291bad_fork_cleanup_audit:
1297 audit_free(p); 1292 audit_free(p);
1298bad_fork_cleanup_security:
1299 security_task_free(p);
1300bad_fork_cleanup_policy: 1293bad_fork_cleanup_policy:
1301#ifdef CONFIG_NUMA 1294#ifdef CONFIG_NUMA
1302 mpol_put(p->mempolicy); 1295 mpol_put(p->mempolicy);
@@ -1309,9 +1302,9 @@ bad_fork_cleanup_cgroup:
1309bad_fork_cleanup_put_domain: 1302bad_fork_cleanup_put_domain:
1310 module_put(task_thread_info(p)->exec_domain->module); 1303 module_put(task_thread_info(p)->exec_domain->module);
1311bad_fork_cleanup_count: 1304bad_fork_cleanup_count:
1312 put_group_info(p->group_info); 1305 atomic_dec(&p->cred->user->processes);
1313 atomic_dec(&p->user->processes); 1306 put_cred(p->real_cred);
1314 free_uid(p->user); 1307 put_cred(p->cred);
1315bad_fork_free: 1308bad_fork_free:
1316 free_task(p); 1309 free_task(p);
1317fork_out: 1310fork_out:
@@ -1355,6 +1348,21 @@ long do_fork(unsigned long clone_flags,
1355 long nr; 1348 long nr;
1356 1349
1357 /* 1350 /*
1351 * Do some preliminary argument and permissions checking before we
1352 * actually start allocating stuff
1353 */
1354 if (clone_flags & CLONE_NEWUSER) {
1355 if (clone_flags & CLONE_THREAD)
1356 return -EINVAL;
1357 /* hopefully this check will go away when userns support is
1358 * complete
1359 */
1360 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1361 !capable(CAP_SETGID))
1362 return -EPERM;
1363 }
1364
1365 /*
1358 * We hope to recycle these flags after 2.6.26 1366 * We hope to recycle these flags after 2.6.26
1359 */ 1367 */
1360 if (unlikely(clone_flags & CLONE_STOPPED)) { 1368 if (unlikely(clone_flags & CLONE_STOPPED)) {
@@ -1398,6 +1406,7 @@ long do_fork(unsigned long clone_flags,
1398 init_completion(&vfork); 1406 init_completion(&vfork);
1399 } 1407 }
1400 1408
1409 audit_finish_fork(p);
1401 tracehook_report_clone(trace, regs, clone_flags, nr, p); 1410 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1402 1411
1403 /* 1412 /*
@@ -1601,8 +1610,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1601 err = -EINVAL; 1610 err = -EINVAL;
1602 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1611 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1603 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1612 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1604 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| 1613 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1605 CLONE_NEWNET))
1606 goto bad_unshare_out; 1614 goto bad_unshare_out;
1607 1615
1608 /* 1616 /*
diff --git a/kernel/futex.c b/kernel/futex.c
index 8af10027514b..7c6cbabe52b3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -123,24 +124,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 125
125/* 126/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 127 * We hash on the keys returned from get_futex_key (see below).
145 */ 128 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 129static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +144,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 144 && key1->both.offset == key2->both.offset);
162} 145}
163 146
147/*
148 * Take a reference to the resource addressed by a key.
149 * Can be called while holding spinlocks.
150 *
151 */
152static void get_futex_key_refs(union futex_key *key)
153{
154 if (!key->both.ptr)
155 return;
156
157 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
158 case FUT_OFF_INODE:
159 atomic_inc(&key->shared.inode->i_count);
160 break;
161 case FUT_OFF_MMSHARED:
162 atomic_inc(&key->private.mm->mm_count);
163 break;
164 }
165}
166
167/*
168 * Drop a reference to the resource addressed by a key.
169 * The hash bucket spinlock must not be held.
170 */
171static void drop_futex_key_refs(union futex_key *key)
172{
173 if (!key->both.ptr)
174 return;
175
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE:
178 iput(key->shared.inode);
179 break;
180 case FUT_OFF_MMSHARED:
181 mmdrop(key->private.mm);
182 break;
183 }
184}
185
164/** 186/**
165 * get_futex_key - Get parameters which are the keys for a futex. 187 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 188 * @uaddr: virtual address of the futex
@@ -179,12 +201,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 201 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 202 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 203 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 204static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 205{
185 unsigned long address = (unsigned long)uaddr; 206 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 207 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 208 struct page *page;
189 int err; 209 int err;
190 210
@@ -208,100 +228,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 228 return -EFAULT;
209 key->private.mm = mm; 229 key->private.mm = mm;
210 key->private.address = address; 230 key->private.address = address;
231 get_futex_key_refs(key);
211 return 0; 232 return 0;
212 } 233 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 234
221 /* 235again:
222 * Permissions. 236 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 237 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 238 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 239
240 lock_page(page);
241 if (!page->mapping) {
242 unlock_page(page);
243 put_page(page);
244 goto again;
245 }
226 246
227 /* 247 /*
228 * Private mappings are handled in a simple way. 248 * Private mappings are handled in a simple way.
229 * 249 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 250 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 251 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 252 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 253 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 254 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 255 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 256 key->private.mm = mm;
239 key->private.address = address; 257 key->private.address = address;
240 return 0; 258 } else {
259 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
260 key->shared.inode = page->mapping->host;
261 key->shared.pgoff = page->index;
241 } 262 }
242 263
243 /* 264 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 265
254 /* 266 unlock_page(page);
255 * We could walk the page table to read the non-linear 267 put_page(page);
256 * pte, and get the page index without fetching the page 268 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 269}
288 270
289/* 271static inline
290 * Drop a reference to the resource addressed by a key. 272void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 273{
295 if (!key->both.ptr) 274 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 275}
306 276
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 277static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +298,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 298
329/* 299/*
330 * Fault handling. 300 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 301 */
333static int futex_handle_fault(unsigned long address, 302static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 303{
336 struct vm_area_struct * vma; 304 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 305 struct mm_struct *mm = current->mm;
@@ -340,8 +308,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 308 if (attempt > 2)
341 return ret; 309 return ret;
342 310
343 if (!fshared) 311 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 312 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 313 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 314 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +328,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 328 current->min_flt++;
362 } 329 }
363 } 330 }
364 if (!fshared) 331 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 332 return ret;
367} 333}
368 334
@@ -385,6 +351,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 351 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 352 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 353 atomic_set(&pi_state->refcount, 1);
354 pi_state->key = FUTEX_KEY_INIT;
388 355
389 current->pi_state_cache = pi_state; 356 current->pi_state_cache = pi_state;
390 357
@@ -439,13 +406,20 @@ static void free_pi_state(struct futex_pi_state *pi_state)
439static struct task_struct * futex_find_get_task(pid_t pid) 406static struct task_struct * futex_find_get_task(pid_t pid)
440{ 407{
441 struct task_struct *p; 408 struct task_struct *p;
409 const struct cred *cred = current_cred(), *pcred;
442 410
443 rcu_read_lock(); 411 rcu_read_lock();
444 p = find_task_by_vpid(pid); 412 p = find_task_by_vpid(pid);
445 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 413 if (!p) {
446 p = ERR_PTR(-ESRCH); 414 p = ERR_PTR(-ESRCH);
447 else 415 } else {
448 get_task_struct(p); 416 pcred = __task_cred(p);
417 if (cred->euid != pcred->euid &&
418 cred->euid != pcred->uid)
419 p = ERR_PTR(-ESRCH);
420 else
421 get_task_struct(p);
422 }
449 423
450 rcu_read_unlock(); 424 rcu_read_unlock();
451 425
@@ -462,7 +436,7 @@ void exit_pi_state_list(struct task_struct *curr)
462 struct list_head *next, *head = &curr->pi_state_list; 436 struct list_head *next, *head = &curr->pi_state_list;
463 struct futex_pi_state *pi_state; 437 struct futex_pi_state *pi_state;
464 struct futex_hash_bucket *hb; 438 struct futex_hash_bucket *hb;
465 union futex_key key; 439 union futex_key key = FUTEX_KEY_INIT;
466 440
467 if (!futex_cmpxchg_enabled) 441 if (!futex_cmpxchg_enabled)
468 return; 442 return;
@@ -607,7 +581,7 @@ static void wake_futex(struct futex_q *q)
607 * The lock in wake_up_all() is a crucial memory barrier after the 581 * The lock in wake_up_all() is a crucial memory barrier after the
608 * plist_del() and also before assigning to q->lock_ptr. 582 * plist_del() and also before assigning to q->lock_ptr.
609 */ 583 */
610 wake_up_all(&q->waiters); 584 wake_up(&q->waiter);
611 /* 585 /*
612 * The waiting task can free the futex_q as soon as this is written, 586 * The waiting task can free the futex_q as soon as this is written,
613 * without taking any locks. This must come last. 587 * without taking any locks. This must come last.
@@ -719,20 +693,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
719 * Wake up all waiters hashed on the physical page that is mapped 693 * Wake up all waiters hashed on the physical page that is mapped
720 * to this virtual address: 694 * to this virtual address:
721 */ 695 */
722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 696static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
723 int nr_wake, u32 bitset)
724{ 697{
725 struct futex_hash_bucket *hb; 698 struct futex_hash_bucket *hb;
726 struct futex_q *this, *next; 699 struct futex_q *this, *next;
727 struct plist_head *head; 700 struct plist_head *head;
728 union futex_key key; 701 union futex_key key = FUTEX_KEY_INIT;
729 int ret; 702 int ret;
730 703
731 if (!bitset) 704 if (!bitset)
732 return -EINVAL; 705 return -EINVAL;
733 706
734 futex_lock_mm(fshared);
735
736 ret = get_futex_key(uaddr, fshared, &key); 707 ret = get_futex_key(uaddr, fshared, &key);
737 if (unlikely(ret != 0)) 708 if (unlikely(ret != 0))
738 goto out; 709 goto out;
@@ -760,7 +731,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
760 731
761 spin_unlock(&hb->lock); 732 spin_unlock(&hb->lock);
762out: 733out:
763 futex_unlock_mm(fshared); 734 put_futex_key(fshared, &key);
764 return ret; 735 return ret;
765} 736}
766 737
@@ -769,19 +740,16 @@ out:
769 * to this virtual address: 740 * to this virtual address:
770 */ 741 */
771static int 742static int
772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 743futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
773 u32 __user *uaddr2,
774 int nr_wake, int nr_wake2, int op) 744 int nr_wake, int nr_wake2, int op)
775{ 745{
776 union futex_key key1, key2; 746 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
777 struct futex_hash_bucket *hb1, *hb2; 747 struct futex_hash_bucket *hb1, *hb2;
778 struct plist_head *head; 748 struct plist_head *head;
779 struct futex_q *this, *next; 749 struct futex_q *this, *next;
780 int ret, op_ret, attempt = 0; 750 int ret, op_ret, attempt = 0;
781 751
782retryfull: 752retryfull:
783 futex_lock_mm(fshared);
784
785 ret = get_futex_key(uaddr1, fshared, &key1); 753 ret = get_futex_key(uaddr1, fshared, &key1);
786 if (unlikely(ret != 0)) 754 if (unlikely(ret != 0))
787 goto out; 755 goto out;
@@ -826,18 +794,12 @@ retry:
826 */ 794 */
827 if (attempt++) { 795 if (attempt++) {
828 ret = futex_handle_fault((unsigned long)uaddr2, 796 ret = futex_handle_fault((unsigned long)uaddr2,
829 fshared, attempt); 797 attempt);
830 if (ret) 798 if (ret)
831 goto out; 799 goto out;
832 goto retry; 800 goto retry;
833 } 801 }
834 802
835 /*
836 * If we would have faulted, release mmap_sem,
837 * fault it in and start all over again.
838 */
839 futex_unlock_mm(fshared);
840
841 ret = get_user(dummy, uaddr2); 803 ret = get_user(dummy, uaddr2);
842 if (ret) 804 if (ret)
843 return ret; 805 return ret;
@@ -873,7 +835,8 @@ retry:
873 if (hb1 != hb2) 835 if (hb1 != hb2)
874 spin_unlock(&hb2->lock); 836 spin_unlock(&hb2->lock);
875out: 837out:
876 futex_unlock_mm(fshared); 838 put_futex_key(fshared, &key2);
839 put_futex_key(fshared, &key1);
877 840
878 return ret; 841 return ret;
879} 842}
@@ -882,19 +845,16 @@ out:
882 * Requeue all waiters hashed on one physical page to another 845 * Requeue all waiters hashed on one physical page to another
883 * physical page. 846 * physical page.
884 */ 847 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 848static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
886 u32 __user *uaddr2,
887 int nr_wake, int nr_requeue, u32 *cmpval) 849 int nr_wake, int nr_requeue, u32 *cmpval)
888{ 850{
889 union futex_key key1, key2; 851 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
890 struct futex_hash_bucket *hb1, *hb2; 852 struct futex_hash_bucket *hb1, *hb2;
891 struct plist_head *head1; 853 struct plist_head *head1;
892 struct futex_q *this, *next; 854 struct futex_q *this, *next;
893 int ret, drop_count = 0; 855 int ret, drop_count = 0;
894 856
895 retry: 857 retry:
896 futex_lock_mm(fshared);
897
898 ret = get_futex_key(uaddr1, fshared, &key1); 858 ret = get_futex_key(uaddr1, fshared, &key1);
899 if (unlikely(ret != 0)) 859 if (unlikely(ret != 0))
900 goto out; 860 goto out;
@@ -917,12 +877,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
917 if (hb1 != hb2) 877 if (hb1 != hb2)
918 spin_unlock(&hb2->lock); 878 spin_unlock(&hb2->lock);
919 879
920 /*
921 * If we would have faulted, release mmap_sem, fault
922 * it in and start all over again.
923 */
924 futex_unlock_mm(fshared);
925
926 ret = get_user(curval, uaddr1); 880 ret = get_user(curval, uaddr1);
927 881
928 if (!ret) 882 if (!ret)
@@ -974,7 +928,8 @@ out_unlock:
974 drop_futex_key_refs(&key1); 928 drop_futex_key_refs(&key1);
975 929
976out: 930out:
977 futex_unlock_mm(fshared); 931 put_futex_key(fshared, &key2);
932 put_futex_key(fshared, &key1);
978 return ret; 933 return ret;
979} 934}
980 935
@@ -983,7 +938,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
983{ 938{
984 struct futex_hash_bucket *hb; 939 struct futex_hash_bucket *hb;
985 940
986 init_waitqueue_head(&q->waiters); 941 init_waitqueue_head(&q->waiter);
987 942
988 get_futex_key_refs(&q->key); 943 get_futex_key_refs(&q->key);
989 hb = hash_futex(&q->key); 944 hb = hash_futex(&q->key);
@@ -1096,8 +1051,7 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1051 * private futexes.
1097 */ 1052 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1053static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner, 1054 struct task_struct *newowner, int fshared)
1100 struct rw_semaphore *fshared)
1101{ 1055{
1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1056 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103 struct futex_pi_state *pi_state = q->pi_state; 1057 struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1130,7 @@ retry:
1176handle_fault: 1130handle_fault:
1177 spin_unlock(q->lock_ptr); 1131 spin_unlock(q->lock_ptr);
1178 1132
1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1133 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1180 1134
1181 spin_lock(q->lock_ptr); 1135 spin_lock(q->lock_ptr);
1182 1136
@@ -1196,12 +1150,13 @@ handle_fault:
1196 * In case we must use restart_block to restart a futex_wait, 1150 * In case we must use restart_block to restart a futex_wait,
1197 * we encode in the 'flags' shared capability 1151 * we encode in the 'flags' shared capability
1198 */ 1152 */
1199#define FLAGS_SHARED 1 1153#define FLAGS_SHARED 0x01
1154#define FLAGS_CLOCKRT 0x02
1200 1155
1201static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1202 1157
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1158static int futex_wait(u32 __user *uaddr, int fshared,
1204 u32 val, ktime_t *abs_time, u32 bitset) 1159 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1205{ 1160{
1206 struct task_struct *curr = current; 1161 struct task_struct *curr = current;
1207 DECLARE_WAITQUEUE(wait, curr); 1162 DECLARE_WAITQUEUE(wait, curr);
@@ -1218,8 +1173,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1218 q.pi_state = NULL; 1173 q.pi_state = NULL;
1219 q.bitset = bitset; 1174 q.bitset = bitset;
1220 retry: 1175 retry:
1221 futex_lock_mm(fshared); 1176 q.key = FUTEX_KEY_INIT;
1222
1223 ret = get_futex_key(uaddr, fshared, &q.key); 1177 ret = get_futex_key(uaddr, fshared, &q.key);
1224 if (unlikely(ret != 0)) 1178 if (unlikely(ret != 0))
1225 goto out_release_sem; 1179 goto out_release_sem;
@@ -1251,12 +1205,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1251 if (unlikely(ret)) { 1205 if (unlikely(ret)) {
1252 queue_unlock(&q, hb); 1206 queue_unlock(&q, hb);
1253 1207
1254 /*
1255 * If we would have faulted, release mmap_sem, fault it in and
1256 * start all over again.
1257 */
1258 futex_unlock_mm(fshared);
1259
1260 ret = get_user(uval, uaddr); 1208 ret = get_user(uval, uaddr);
1261 1209
1262 if (!ret) 1210 if (!ret)
@@ -1271,12 +1219,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1271 queue_me(&q, hb); 1219 queue_me(&q, hb);
1272 1220
1273 /* 1221 /*
1274 * Now the futex is queued and we have checked the data, we
1275 * don't want to hold mmap_sem while we sleep.
1276 */
1277 futex_unlock_mm(fshared);
1278
1279 /*
1280 * There might have been scheduling since the queue_me(), as we 1222 * There might have been scheduling since the queue_me(), as we
1281 * cannot hold a spinlock across the get_user() in case it 1223 * cannot hold a spinlock across the get_user() in case it
1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1224 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1287,7 +1229,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1287 1229
1288 /* add_wait_queue is the barrier after __set_current_state. */ 1230 /* add_wait_queue is the barrier after __set_current_state. */
1289 __set_current_state(TASK_INTERRUPTIBLE); 1231 __set_current_state(TASK_INTERRUPTIBLE);
1290 add_wait_queue(&q.waiters, &wait); 1232 add_wait_queue(&q.waiter, &wait);
1291 /* 1233 /*
1292 * !plist_node_empty() is safe here without any lock. 1234 * !plist_node_empty() is safe here without any lock.
1293 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1235 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1300,8 +1242,10 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1300 slack = current->timer_slack_ns; 1242 slack = current->timer_slack_ns;
1301 if (rt_task(current)) 1243 if (rt_task(current))
1302 slack = 0; 1244 slack = 0;
1303 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1245 hrtimer_init_on_stack(&t.timer,
1304 HRTIMER_MODE_ABS); 1246 clockrt ? CLOCK_REALTIME :
1247 CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1305 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1306 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1250 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1307 1251
@@ -1356,6 +1300,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1356 1300
1357 if (fshared) 1301 if (fshared)
1358 restart->futex.flags |= FLAGS_SHARED; 1302 restart->futex.flags |= FLAGS_SHARED;
1303 if (clockrt)
1304 restart->futex.flags |= FLAGS_CLOCKRT;
1359 return -ERESTART_RESTARTBLOCK; 1305 return -ERESTART_RESTARTBLOCK;
1360 } 1306 }
1361 1307
@@ -1363,7 +1309,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 queue_unlock(&q, hb); 1309 queue_unlock(&q, hb);
1364 1310
1365 out_release_sem: 1311 out_release_sem:
1366 futex_unlock_mm(fshared); 1312 put_futex_key(fshared, &q.key);
1367 return ret; 1313 return ret;
1368} 1314}
1369 1315
@@ -1371,15 +1317,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1371static long futex_wait_restart(struct restart_block *restart) 1317static long futex_wait_restart(struct restart_block *restart)
1372{ 1318{
1373 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1319 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1374 struct rw_semaphore *fshared = NULL; 1320 int fshared = 0;
1375 ktime_t t; 1321 ktime_t t;
1376 1322
1377 t.tv64 = restart->futex.time; 1323 t.tv64 = restart->futex.time;
1378 restart->fn = do_no_restart_syscall; 1324 restart->fn = do_no_restart_syscall;
1379 if (restart->futex.flags & FLAGS_SHARED) 1325 if (restart->futex.flags & FLAGS_SHARED)
1380 fshared = &current->mm->mmap_sem; 1326 fshared = 1;
1381 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1327 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1382 restart->futex.bitset); 1328 restart->futex.bitset,
1329 restart->futex.flags & FLAGS_CLOCKRT);
1383} 1330}
1384 1331
1385 1332
@@ -1389,7 +1336,7 @@ static long futex_wait_restart(struct restart_block *restart)
1389 * if there are waiters then it will block, it does PI, etc. (Due to 1336 * if there are waiters then it will block, it does PI, etc. (Due to
1390 * races the kernel might see a 0 value of the futex too.) 1337 * races the kernel might see a 0 value of the futex too.)
1391 */ 1338 */
1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1339static int futex_lock_pi(u32 __user *uaddr, int fshared,
1393 int detect, ktime_t *time, int trylock) 1340 int detect, ktime_t *time, int trylock)
1394{ 1341{
1395 struct hrtimer_sleeper timeout, *to = NULL; 1342 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1412,8 +1359,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1412 1359
1413 q.pi_state = NULL; 1360 q.pi_state = NULL;
1414 retry: 1361 retry:
1415 futex_lock_mm(fshared); 1362 q.key = FUTEX_KEY_INIT;
1416
1417 ret = get_futex_key(uaddr, fshared, &q.key); 1363 ret = get_futex_key(uaddr, fshared, &q.key);
1418 if (unlikely(ret != 0)) 1364 if (unlikely(ret != 0))
1419 goto out_release_sem; 1365 goto out_release_sem;
@@ -1502,7 +1448,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1502 * exit to complete. 1448 * exit to complete.
1503 */ 1449 */
1504 queue_unlock(&q, hb); 1450 queue_unlock(&q, hb);
1505 futex_unlock_mm(fshared);
1506 cond_resched(); 1451 cond_resched();
1507 goto retry; 1452 goto retry;
1508 1453
@@ -1534,12 +1479,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1534 */ 1479 */
1535 queue_me(&q, hb); 1480 queue_me(&q, hb);
1536 1481
1537 /*
1538 * Now the futex is queued and we have checked the data, we
1539 * don't want to hold mmap_sem while we sleep.
1540 */
1541 futex_unlock_mm(fshared);
1542
1543 WARN_ON(!q.pi_state); 1482 WARN_ON(!q.pi_state);
1544 /* 1483 /*
1545 * Block on the PI mutex: 1484 * Block on the PI mutex:
@@ -1552,7 +1491,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1552 ret = ret ? 0 : -EWOULDBLOCK; 1491 ret = ret ? 0 : -EWOULDBLOCK;
1553 } 1492 }
1554 1493
1555 futex_lock_mm(fshared);
1556 spin_lock(q.lock_ptr); 1494 spin_lock(q.lock_ptr);
1557 1495
1558 if (!ret) { 1496 if (!ret) {
@@ -1618,7 +1556,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1618 1556
1619 /* Unqueue and drop the lock */ 1557 /* Unqueue and drop the lock */
1620 unqueue_me_pi(&q); 1558 unqueue_me_pi(&q);
1621 futex_unlock_mm(fshared);
1622 1559
1623 if (to) 1560 if (to)
1624 destroy_hrtimer_on_stack(&to->timer); 1561 destroy_hrtimer_on_stack(&to->timer);
@@ -1628,34 +1565,30 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1628 queue_unlock(&q, hb); 1565 queue_unlock(&q, hb);
1629 1566
1630 out_release_sem: 1567 out_release_sem:
1631 futex_unlock_mm(fshared); 1568 put_futex_key(fshared, &q.key);
1632 if (to) 1569 if (to)
1633 destroy_hrtimer_on_stack(&to->timer); 1570 destroy_hrtimer_on_stack(&to->timer);
1634 return ret; 1571 return ret;
1635 1572
1636 uaddr_faulted: 1573 uaddr_faulted:
1637 /* 1574 /*
1638 * We have to r/w *(int __user *)uaddr, but we can't modify it 1575 * We have to r/w *(int __user *)uaddr, and we have to modify it
1639 * non-atomically. Therefore, if get_user below is not 1576 * atomically. Therefore, if we continue to fault after get_user()
1640 * enough, we need to handle the fault ourselves, while 1577 * below, we need to handle the fault ourselves, while still holding
1641 * still holding the mmap_sem. 1578 * the mmap_sem. This can occur if the uaddr is under contention as
1642 * 1579 * we have to drop the mmap_sem in order to call get_user().
1643 * ... and hb->lock. :-) --ANK
1644 */ 1580 */
1645 queue_unlock(&q, hb); 1581 queue_unlock(&q, hb);
1646 1582
1647 if (attempt++) { 1583 if (attempt++) {
1648 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1584 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1649 attempt);
1650 if (ret) 1585 if (ret)
1651 goto out_release_sem; 1586 goto out_release_sem;
1652 goto retry_unlocked; 1587 goto retry_unlocked;
1653 } 1588 }
1654 1589
1655 futex_unlock_mm(fshared);
1656
1657 ret = get_user(uval, uaddr); 1590 ret = get_user(uval, uaddr);
1658 if (!ret && (uval != -EFAULT)) 1591 if (!ret)
1659 goto retry; 1592 goto retry;
1660 1593
1661 if (to) 1594 if (to)
@@ -1668,13 +1601,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1668 * This is the in-kernel slowpath: we look up the PI state (if any), 1601 * This is the in-kernel slowpath: we look up the PI state (if any),
1669 * and do the rt-mutex unlock. 1602 * and do the rt-mutex unlock.
1670 */ 1603 */
1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1604static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1672{ 1605{
1673 struct futex_hash_bucket *hb; 1606 struct futex_hash_bucket *hb;
1674 struct futex_q *this, *next; 1607 struct futex_q *this, *next;
1675 u32 uval; 1608 u32 uval;
1676 struct plist_head *head; 1609 struct plist_head *head;
1677 union futex_key key; 1610 union futex_key key = FUTEX_KEY_INIT;
1678 int ret, attempt = 0; 1611 int ret, attempt = 0;
1679 1612
1680retry: 1613retry:
@@ -1685,10 +1618,6 @@ retry:
1685 */ 1618 */
1686 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1619 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1687 return -EPERM; 1620 return -EPERM;
1688 /*
1689 * First take all the futex related locks:
1690 */
1691 futex_lock_mm(fshared);
1692 1621
1693 ret = get_futex_key(uaddr, fshared, &key); 1622 ret = get_futex_key(uaddr, fshared, &key);
1694 if (unlikely(ret != 0)) 1623 if (unlikely(ret != 0))
@@ -1747,34 +1676,30 @@ retry_unlocked:
1747out_unlock: 1676out_unlock:
1748 spin_unlock(&hb->lock); 1677 spin_unlock(&hb->lock);
1749out: 1678out:
1750 futex_unlock_mm(fshared); 1679 put_futex_key(fshared, &key);
1751 1680
1752 return ret; 1681 return ret;
1753 1682
1754pi_faulted: 1683pi_faulted:
1755 /* 1684 /*
1756 * We have to r/w *(int __user *)uaddr, but we can't modify it 1685 * We have to r/w *(int __user *)uaddr, and we have to modify it
1757 * non-atomically. Therefore, if get_user below is not 1686 * atomically. Therefore, if we continue to fault after get_user()
1758 * enough, we need to handle the fault ourselves, while 1687 * below, we need to handle the fault ourselves, while still holding
1759 * still holding the mmap_sem. 1688 * the mmap_sem. This can occur if the uaddr is under contention as
1760 * 1689 * we have to drop the mmap_sem in order to call get_user().
1761 * ... and hb->lock. --ANK
1762 */ 1690 */
1763 spin_unlock(&hb->lock); 1691 spin_unlock(&hb->lock);
1764 1692
1765 if (attempt++) { 1693 if (attempt++) {
1766 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1694 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1767 attempt);
1768 if (ret) 1695 if (ret)
1769 goto out; 1696 goto out;
1770 uval = 0; 1697 uval = 0;
1771 goto retry_unlocked; 1698 goto retry_unlocked;
1772 } 1699 }
1773 1700
1774 futex_unlock_mm(fshared);
1775
1776 ret = get_user(uval, uaddr); 1701 ret = get_user(uval, uaddr);
1777 if (!ret && (uval != -EFAULT)) 1702 if (!ret)
1778 goto retry; 1703 goto retry;
1779 1704
1780 return ret; 1705 return ret;
@@ -1829,6 +1754,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1829{ 1754{
1830 struct robust_list_head __user *head; 1755 struct robust_list_head __user *head;
1831 unsigned long ret; 1756 unsigned long ret;
1757 const struct cred *cred = current_cred(), *pcred;
1832 1758
1833 if (!futex_cmpxchg_enabled) 1759 if (!futex_cmpxchg_enabled)
1834 return -ENOSYS; 1760 return -ENOSYS;
@@ -1844,8 +1770,10 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1844 if (!p) 1770 if (!p)
1845 goto err_unlock; 1771 goto err_unlock;
1846 ret = -EPERM; 1772 ret = -EPERM;
1847 if ((current->euid != p->euid) && (current->euid != p->uid) && 1773 pcred = __task_cred(p);
1848 !capable(CAP_SYS_PTRACE)) 1774 if (cred->euid != pcred->euid &&
1775 cred->euid != pcred->uid &&
1776 !capable(CAP_SYS_PTRACE))
1849 goto err_unlock; 1777 goto err_unlock;
1850 head = p->robust_list; 1778 head = p->robust_list;
1851 rcu_read_unlock(); 1779 rcu_read_unlock();
@@ -1898,8 +1826,7 @@ retry:
1898 * PI futexes happens in exit_pi_state(): 1826 * PI futexes happens in exit_pi_state():
1899 */ 1827 */
1900 if (!pi && (uval & FUTEX_WAITERS)) 1828 if (!pi && (uval & FUTEX_WAITERS))
1901 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1829 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1902 FUTEX_BITSET_MATCH_ANY);
1903 } 1830 }
1904 return 0; 1831 return 0;
1905} 1832}
@@ -1993,18 +1920,22 @@ void exit_robust_list(struct task_struct *curr)
1993long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1920long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1994 u32 __user *uaddr2, u32 val2, u32 val3) 1921 u32 __user *uaddr2, u32 val2, u32 val3)
1995{ 1922{
1996 int ret = -ENOSYS; 1923 int clockrt, ret = -ENOSYS;
1997 int cmd = op & FUTEX_CMD_MASK; 1924 int cmd = op & FUTEX_CMD_MASK;
1998 struct rw_semaphore *fshared = NULL; 1925 int fshared = 0;
1999 1926
2000 if (!(op & FUTEX_PRIVATE_FLAG)) 1927 if (!(op & FUTEX_PRIVATE_FLAG))
2001 fshared = &current->mm->mmap_sem; 1928 fshared = 1;
1929
1930 clockrt = op & FUTEX_CLOCK_REALTIME;
1931 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1932 return -ENOSYS;
2002 1933
2003 switch (cmd) { 1934 switch (cmd) {
2004 case FUTEX_WAIT: 1935 case FUTEX_WAIT:
2005 val3 = FUTEX_BITSET_MATCH_ANY; 1936 val3 = FUTEX_BITSET_MATCH_ANY;
2006 case FUTEX_WAIT_BITSET: 1937 case FUTEX_WAIT_BITSET:
2007 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1938 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
2008 break; 1939 break;
2009 case FUTEX_WAKE: 1940 case FUTEX_WAKE:
2010 val3 = FUTEX_BITSET_MATCH_ANY; 1941 val3 = FUTEX_BITSET_MATCH_ANY;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 04ac3a9e42cf..d607a5b9ee29 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
135{ 135{
136 struct compat_robust_list_head __user *head; 136 struct compat_robust_list_head __user *head;
137 unsigned long ret; 137 unsigned long ret;
138 const struct cred *cred = current_cred(), *pcred;
138 139
139 if (!futex_cmpxchg_enabled) 140 if (!futex_cmpxchg_enabled)
140 return -ENOSYS; 141 return -ENOSYS;
@@ -150,8 +151,10 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
150 if (!p) 151 if (!p)
151 goto err_unlock; 152 goto err_unlock;
152 ret = -EPERM; 153 ret = -EPERM;
153 if ((current->euid != p->euid) && (current->euid != p->uid) && 154 pcred = __task_cred(p);
154 !capable(CAP_SYS_PTRACE)) 155 if (cred->euid != pcred->euid &&
156 cred->euid != pcred->uid &&
157 !capable(CAP_SYS_PTRACE))
155 goto err_unlock; 158 goto err_unlock;
156 head = p->compat_robust_list; 159 head = p->compat_robust_list;
157 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b2..bda9cb924276 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
443#endif 443#endif
444 444
445/*
446 * Check, whether the timer is on the callback pending list
447 */
448static inline int hrtimer_cb_pending(const struct hrtimer *timer)
449{
450 return timer->state & HRTIMER_STATE_PENDING;
451}
452
453/*
454 * Remove a timer from the callback pending list
455 */
456static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
457{
458 list_del_init(&timer->cb_entry);
459}
460
461/* High resolution timer related functions */ 445/* High resolution timer related functions */
462#ifdef CONFIG_HIGH_RES_TIMERS 446#ifdef CONFIG_HIGH_RES_TIMERS
463 447
@@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651{ 635{
652} 636}
653 637
638static void __run_hrtimer(struct hrtimer *timer);
639
654/* 640/*
655 * When High resolution timers are active, try to reprogram. Note, that in case 641 * When High resolution timers are active, try to reprogram. Note, that in case
656 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 642 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,31 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
661 struct hrtimer_clock_base *base) 647 struct hrtimer_clock_base *base)
662{ 648{
663 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 649 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
664 650 /*
665 /* Timer is expired, act upon the callback mode */ 651 * XXX: recursion check?
666 switch(timer->cb_mode) { 652 * hrtimer_forward() should round up with timer granularity
667 case HRTIMER_CB_IRQSAFE_PERCPU: 653 * so that we never get into inf recursion here,
668 case HRTIMER_CB_IRQSAFE_UNLOCKED: 654 * it doesn't do that though
669 /* 655 */
670 * This is solely for the sched tick emulation with 656 __run_hrtimer(timer);
671 * dynamic tick support to ensure that we do not 657 return 1;
672 * restart the tick right on the edge and end up with
673 * the tick timer in the softirq ! The calling site
674 * takes care of this. Also used for hrtimer sleeper !
675 */
676 debug_hrtimer_deactivate(timer);
677 return 1;
678 case HRTIMER_CB_SOFTIRQ:
679 /*
680 * Move everything else into the softirq pending list !
681 */
682 list_add_tail(&timer->cb_entry,
683 &base->cpu_base->cb_pending);
684 timer->state = HRTIMER_STATE_PENDING;
685 return 1;
686 default:
687 BUG();
688 }
689 } 658 }
690 return 0; 659 return 0;
691} 660}
@@ -724,11 +693,6 @@ static int hrtimer_switch_to_hres(void)
724 return 1; 693 return 1;
725} 694}
726 695
727static inline void hrtimer_raise_softirq(void)
728{
729 raise_softirq(HRTIMER_SOFTIRQ);
730}
731
732#else 696#else
733 697
734static inline int hrtimer_hres_active(void) { return 0; } 698static inline int hrtimer_hres_active(void) { return 0; }
@@ -747,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
747{ 711{
748 return 0; 712 return 0;
749} 713}
750static inline void hrtimer_raise_softirq(void) { }
751 714
752#endif /* CONFIG_HIGH_RES_TIMERS */ 715#endif /* CONFIG_HIGH_RES_TIMERS */
753 716
@@ -890,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
890 struct hrtimer_clock_base *base, 853 struct hrtimer_clock_base *base,
891 unsigned long newstate, int reprogram) 854 unsigned long newstate, int reprogram)
892{ 855{
893 /* High res. callback list. NOP for !HIGHRES */ 856 if (timer->state & HRTIMER_STATE_ENQUEUED) {
894 if (hrtimer_cb_pending(timer))
895 hrtimer_remove_cb_pending(timer);
896 else {
897 /* 857 /*
898 * Remove the timer from the rbtree and replace the 858 * Remove the timer from the rbtree and replace the
899 * first entry pointer if necessary. 859 * first entry pointer if necessary.
@@ -953,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
953{ 913{
954 struct hrtimer_clock_base *base, *new_base; 914 struct hrtimer_clock_base *base, *new_base;
955 unsigned long flags; 915 unsigned long flags;
956 int ret, raise; 916 int ret;
957 917
958 base = lock_hrtimer_base(timer, &flags); 918 base = lock_hrtimer_base(timer, &flags);
959 919
@@ -988,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
988 enqueue_hrtimer(timer, new_base, 948 enqueue_hrtimer(timer, new_base,
989 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 949 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
990 950
991 /*
992 * The timer may be expired and moved to the cb_pending
993 * list. We can not raise the softirq with base lock held due
994 * to a possible deadlock with runqueue lock.
995 */
996 raise = timer->state == HRTIMER_STATE_PENDING;
997
998 /*
999 * We use preempt_disable to prevent this task from migrating after
1000 * setting up the softirq and raising it. Otherwise, if me migrate
1001 * we will raise the softirq on the wrong CPU.
1002 */
1003 preempt_disable();
1004
1005 unlock_hrtimer_base(timer, &flags); 951 unlock_hrtimer_base(timer, &flags);
1006 952
1007 if (raise)
1008 hrtimer_raise_softirq();
1009 preempt_enable();
1010
1011 return ret; 953 return ret;
1012} 954}
1013EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 955EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1192,75 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1192} 1134}
1193EXPORT_SYMBOL_GPL(hrtimer_get_res); 1135EXPORT_SYMBOL_GPL(hrtimer_get_res);
1194 1136
1195static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1196{
1197 spin_lock_irq(&cpu_base->lock);
1198
1199 while (!list_empty(&cpu_base->cb_pending)) {
1200 enum hrtimer_restart (*fn)(struct hrtimer *);
1201 struct hrtimer *timer;
1202 int restart;
1203 int emulate_hardirq_ctx = 0;
1204
1205 timer = list_entry(cpu_base->cb_pending.next,
1206 struct hrtimer, cb_entry);
1207
1208 debug_hrtimer_deactivate(timer);
1209 timer_stats_account_hrtimer(timer);
1210
1211 fn = timer->function;
1212 /*
1213 * A timer might have been added to the cb_pending list
1214 * when it was migrated during a cpu-offline operation.
1215 * Emulate hardirq context for such timers.
1216 */
1217 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1218 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
1219 emulate_hardirq_ctx = 1;
1220
1221 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1222 spin_unlock_irq(&cpu_base->lock);
1223
1224 if (unlikely(emulate_hardirq_ctx)) {
1225 local_irq_disable();
1226 restart = fn(timer);
1227 local_irq_enable();
1228 } else
1229 restart = fn(timer);
1230
1231 spin_lock_irq(&cpu_base->lock);
1232
1233 timer->state &= ~HRTIMER_STATE_CALLBACK;
1234 if (restart == HRTIMER_RESTART) {
1235 BUG_ON(hrtimer_active(timer));
1236 /*
1237 * Enqueue the timer, allow reprogramming of the event
1238 * device
1239 */
1240 enqueue_hrtimer(timer, timer->base, 1);
1241 } else if (hrtimer_active(timer)) {
1242 /*
1243 * If the timer was rearmed on another CPU, reprogram
1244 * the event device.
1245 */
1246 struct hrtimer_clock_base *base = timer->base;
1247
1248 if (base->first == &timer->node &&
1249 hrtimer_reprogram(timer, base)) {
1250 /*
1251 * Timer is expired. Thus move it from tree to
1252 * pending list again.
1253 */
1254 __remove_hrtimer(timer, base,
1255 HRTIMER_STATE_PENDING, 0);
1256 list_add_tail(&timer->cb_entry,
1257 &base->cpu_base->cb_pending);
1258 }
1259 }
1260 }
1261 spin_unlock_irq(&cpu_base->lock);
1262}
1263
1264static void __run_hrtimer(struct hrtimer *timer) 1137static void __run_hrtimer(struct hrtimer *timer)
1265{ 1138{
1266 struct hrtimer_clock_base *base = timer->base; 1139 struct hrtimer_clock_base *base = timer->base;
@@ -1268,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer)
1268 enum hrtimer_restart (*fn)(struct hrtimer *); 1141 enum hrtimer_restart (*fn)(struct hrtimer *);
1269 int restart; 1142 int restart;
1270 1143
1144 WARN_ON(!irqs_disabled());
1145
1271 debug_hrtimer_deactivate(timer); 1146 debug_hrtimer_deactivate(timer);
1272 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1147 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1273 timer_stats_account_hrtimer(timer); 1148 timer_stats_account_hrtimer(timer);
1274
1275 fn = timer->function; 1149 fn = timer->function;
1276 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || 1150
1277 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { 1151 /*
1278 /* 1152 * Because we run timers from hardirq context, there is no chance
1279 * Used for scheduler timers, avoid lock inversion with 1153 * they get migrated to another cpu, therefore its safe to unlock
1280 * rq->lock and tasklist_lock. 1154 * the timer base.
1281 * 1155 */
1282 * These timers are required to deal with enqueue expiry 1156 spin_unlock(&cpu_base->lock);
1283 * themselves and are not allowed to migrate. 1157 restart = fn(timer);
1284 */ 1158 spin_lock(&cpu_base->lock);
1285 spin_unlock(&cpu_base->lock);
1286 restart = fn(timer);
1287 spin_lock(&cpu_base->lock);
1288 } else
1289 restart = fn(timer);
1290 1159
1291 /* 1160 /*
1292 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1161 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
@@ -1311,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1311 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1180 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1312 struct hrtimer_clock_base *base; 1181 struct hrtimer_clock_base *base;
1313 ktime_t expires_next, now; 1182 ktime_t expires_next, now;
1314 int i, raise = 0; 1183 int i;
1315 1184
1316 BUG_ON(!cpu_base->hres_active); 1185 BUG_ON(!cpu_base->hres_active);
1317 cpu_base->nr_events++; 1186 cpu_base->nr_events++;
@@ -1360,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1360 break; 1229 break;
1361 } 1230 }
1362 1231
1363 /* Move softirq callbacks to the pending list */
1364 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1365 __remove_hrtimer(timer, base,
1366 HRTIMER_STATE_PENDING, 0);
1367 list_add_tail(&timer->cb_entry,
1368 &base->cpu_base->cb_pending);
1369 raise = 1;
1370 continue;
1371 }
1372
1373 __run_hrtimer(timer); 1232 __run_hrtimer(timer);
1374 } 1233 }
1375 spin_unlock(&cpu_base->lock); 1234 spin_unlock(&cpu_base->lock);
@@ -1383,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1383 if (tick_program_event(expires_next, 0)) 1242 if (tick_program_event(expires_next, 0))
1384 goto retry; 1243 goto retry;
1385 } 1244 }
1386
1387 /* Raise softirq ? */
1388 if (raise)
1389 raise_softirq(HRTIMER_SOFTIRQ);
1390} 1245}
1391 1246
1392/** 1247/**
@@ -1413,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void)
1413 local_irq_restore(flags); 1268 local_irq_restore(flags);
1414} 1269}
1415 1270
1416static void run_hrtimer_softirq(struct softirq_action *h)
1417{
1418 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1419}
1420
1421#endif /* CONFIG_HIGH_RES_TIMERS */ 1271#endif /* CONFIG_HIGH_RES_TIMERS */
1422 1272
1423/* 1273/*
@@ -1429,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
1429 */ 1279 */
1430void hrtimer_run_pending(void) 1280void hrtimer_run_pending(void)
1431{ 1281{
1432 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1433
1434 if (hrtimer_hres_active()) 1282 if (hrtimer_hres_active())
1435 return; 1283 return;
1436 1284
@@ -1444,8 +1292,6 @@ void hrtimer_run_pending(void)
1444 */ 1292 */
1445 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) 1293 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1446 hrtimer_switch_to_hres(); 1294 hrtimer_switch_to_hres();
1447
1448 run_hrtimer_pending(cpu_base);
1449} 1295}
1450 1296
1451/* 1297/*
@@ -1482,14 +1328,6 @@ void hrtimer_run_queues(void)
1482 hrtimer_get_expires_tv64(timer)) 1328 hrtimer_get_expires_tv64(timer))
1483 break; 1329 break;
1484 1330
1485 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1486 __remove_hrtimer(timer, base,
1487 HRTIMER_STATE_PENDING, 0);
1488 list_add_tail(&timer->cb_entry,
1489 &base->cpu_base->cb_pending);
1490 continue;
1491 }
1492
1493 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1494 } 1332 }
1495 spin_unlock(&cpu_base->lock); 1333 spin_unlock(&cpu_base->lock);
@@ -1516,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1516{ 1354{
1517 sl->timer.function = hrtimer_wakeup; 1355 sl->timer.function = hrtimer_wakeup;
1518 sl->task = task; 1356 sl->task = task;
1519#ifdef CONFIG_HIGH_RES_TIMERS
1520 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1521#endif
1522} 1357}
1523 1358
1524static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1359static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1655,18 +1490,16 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1490 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1656 cpu_base->clock_base[i].cpu_base = cpu_base; 1491 cpu_base->clock_base[i].cpu_base = cpu_base;
1657 1492
1658 INIT_LIST_HEAD(&cpu_base->cb_pending);
1659 hrtimer_init_hres(cpu_base); 1493 hrtimer_init_hres(cpu_base);
1660} 1494}
1661 1495
1662#ifdef CONFIG_HOTPLUG_CPU 1496#ifdef CONFIG_HOTPLUG_CPU
1663 1497
1664static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1498static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1665 struct hrtimer_clock_base *new_base, int dcpu) 1499 struct hrtimer_clock_base *new_base)
1666{ 1500{
1667 struct hrtimer *timer; 1501 struct hrtimer *timer;
1668 struct rb_node *node; 1502 struct rb_node *node;
1669 int raise = 0;
1670 1503
1671 while ((node = rb_first(&old_base->active))) { 1504 while ((node = rb_first(&old_base->active))) {
1672 timer = rb_entry(node, struct hrtimer, node); 1505 timer = rb_entry(node, struct hrtimer, node);
@@ -1674,18 +1507,6 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1674 debug_hrtimer_deactivate(timer); 1507 debug_hrtimer_deactivate(timer);
1675 1508
1676 /* 1509 /*
1677 * Should not happen. Per CPU timers should be
1678 * canceled _before_ the migration code is called
1679 */
1680 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1681 __remove_hrtimer(timer, old_base,
1682 HRTIMER_STATE_INACTIVE, 0);
1683 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1684 timer, timer->function, dcpu);
1685 continue;
1686 }
1687
1688 /*
1689 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1510 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1690 * timer could be seen as !active and just vanish away 1511 * timer could be seen as !active and just vanish away
1691 * under us on another CPU 1512 * under us on another CPU
@@ -1693,69 +1514,34 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1693 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1514 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1694 timer->base = new_base; 1515 timer->base = new_base;
1695 /* 1516 /*
1696 * Enqueue the timer. Allow reprogramming of the event device 1517 * Enqueue the timers on the new cpu, but do not reprogram
1518 * the timer as that would enable a deadlock between
1519 * hrtimer_enqueue_reprogramm() running the timer and us still
1520 * holding a nested base lock.
1521 *
1522 * Instead we tickle the hrtimer interrupt after the migration
1523 * is done, which will run all expired timers and re-programm
1524 * the timer device.
1697 */ 1525 */
1698 enqueue_hrtimer(timer, new_base, 1); 1526 enqueue_hrtimer(timer, new_base, 0);
1699 1527
1700#ifdef CONFIG_HIGH_RES_TIMERS
1701 /*
1702 * Happens with high res enabled when the timer was
1703 * already expired and the callback mode is
1704 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1705 * enqueue code does not move them to the soft irq
1706 * pending list for performance/latency reasons, but
1707 * in the migration state, we need to do that
1708 * otherwise we end up with a stale timer.
1709 */
1710 if (timer->state == HRTIMER_STATE_MIGRATE) {
1711 timer->state = HRTIMER_STATE_PENDING;
1712 list_add_tail(&timer->cb_entry,
1713 &new_base->cpu_base->cb_pending);
1714 raise = 1;
1715 }
1716#endif
1717 /* Clear the migration state bit */ 1528 /* Clear the migration state bit */
1718 timer->state &= ~HRTIMER_STATE_MIGRATE; 1529 timer->state &= ~HRTIMER_STATE_MIGRATE;
1719 } 1530 }
1720 return raise;
1721}
1722
1723#ifdef CONFIG_HIGH_RES_TIMERS
1724static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1725 struct hrtimer_cpu_base *new_base)
1726{
1727 struct hrtimer *timer;
1728 int raise = 0;
1729
1730 while (!list_empty(&old_base->cb_pending)) {
1731 timer = list_entry(old_base->cb_pending.next,
1732 struct hrtimer, cb_entry);
1733
1734 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1735 timer->base = &new_base->clock_base[timer->base->index];
1736 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1737 raise = 1;
1738 }
1739 return raise;
1740}
1741#else
1742static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1743 struct hrtimer_cpu_base *new_base)
1744{
1745 return 0;
1746} 1531}
1747#endif
1748 1532
1749static void migrate_hrtimers(int cpu) 1533static int migrate_hrtimers(int scpu)
1750{ 1534{
1751 struct hrtimer_cpu_base *old_base, *new_base; 1535 struct hrtimer_cpu_base *old_base, *new_base;
1752 int i, raise = 0; 1536 int dcpu, i;
1753 1537
1754 BUG_ON(cpu_online(cpu)); 1538 BUG_ON(cpu_online(scpu));
1755 old_base = &per_cpu(hrtimer_bases, cpu); 1539 old_base = &per_cpu(hrtimer_bases, scpu);
1756 new_base = &get_cpu_var(hrtimer_bases); 1540 new_base = &get_cpu_var(hrtimer_bases);
1757 1541
1758 tick_cancel_sched_timer(cpu); 1542 dcpu = smp_processor_id();
1543
1544 tick_cancel_sched_timer(scpu);
1759 /* 1545 /*
1760 * The caller is globally serialized and nobody else 1546 * The caller is globally serialized and nobody else
1761 * takes two locks at once, deadlock is not possible. 1547 * takes two locks at once, deadlock is not possible.
@@ -1764,41 +1550,47 @@ static void migrate_hrtimers(int cpu)
1764 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1550 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1765 1551
1766 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1552 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1767 if (migrate_hrtimer_list(&old_base->clock_base[i], 1553 migrate_hrtimer_list(&old_base->clock_base[i],
1768 &new_base->clock_base[i], cpu)) 1554 &new_base->clock_base[i]);
1769 raise = 1;
1770 } 1555 }
1771 1556
1772 if (migrate_hrtimer_pending(old_base, new_base))
1773 raise = 1;
1774
1775 spin_unlock(&old_base->lock); 1557 spin_unlock(&old_base->lock);
1776 spin_unlock_irq(&new_base->lock); 1558 spin_unlock_irq(&new_base->lock);
1777 put_cpu_var(hrtimer_bases); 1559 put_cpu_var(hrtimer_bases);
1778 1560
1779 if (raise) 1561 return dcpu;
1780 hrtimer_raise_softirq(); 1562}
1563
1564static void tickle_timers(void *arg)
1565{
1566 hrtimer_peek_ahead_timers();
1781} 1567}
1568
1782#endif /* CONFIG_HOTPLUG_CPU */ 1569#endif /* CONFIG_HOTPLUG_CPU */
1783 1570
1784static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1571static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1785 unsigned long action, void *hcpu) 1572 unsigned long action, void *hcpu)
1786{ 1573{
1787 unsigned int cpu = (long)hcpu; 1574 int scpu = (long)hcpu;
1788 1575
1789 switch (action) { 1576 switch (action) {
1790 1577
1791 case CPU_UP_PREPARE: 1578 case CPU_UP_PREPARE:
1792 case CPU_UP_PREPARE_FROZEN: 1579 case CPU_UP_PREPARE_FROZEN:
1793 init_hrtimers_cpu(cpu); 1580 init_hrtimers_cpu(scpu);
1794 break; 1581 break;
1795 1582
1796#ifdef CONFIG_HOTPLUG_CPU 1583#ifdef CONFIG_HOTPLUG_CPU
1797 case CPU_DEAD: 1584 case CPU_DEAD:
1798 case CPU_DEAD_FROZEN: 1585 case CPU_DEAD_FROZEN:
1799 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1586 {
1800 migrate_hrtimers(cpu); 1587 int dcpu;
1588
1589 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1590 dcpu = migrate_hrtimers(scpu);
1591 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1801 break; 1592 break;
1593 }
1802#endif 1594#endif
1803 1595
1804 default: 1596 default:
@@ -1817,9 +1609,6 @@ void __init hrtimers_init(void)
1817 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1609 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1818 (void *)(long)smp_processor_id()); 1610 (void *)(long)smp_processor_id());
1819 register_cpu_notifier(&hrtimers_nb); 1611 register_cpu_notifier(&hrtimers_nb);
1820#ifdef CONFIG_HIGH_RES_TIMERS
1821 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1822#endif
1823} 1612}
1824 1613
1825/** 1614/**
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 681c52dbfe22..4dd5b1edac98 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..650ce4102a63 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,6 +40,9 @@ unsigned long probe_irq_on(void)
40 * flush such a longstanding irq before considering it as spurious. 40 * flush such a longstanding irq before considering it as spurious.
41 */ 41 */
42 for_each_irq_desc_reverse(i, desc) { 42 for_each_irq_desc_reverse(i, desc) {
43 if (!desc)
44 continue;
45
43 spin_lock_irq(&desc->lock); 46 spin_lock_irq(&desc->lock);
44 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 47 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
45 /* 48 /*
@@ -68,6 +71,9 @@ unsigned long probe_irq_on(void)
68 * happened in the previous stage, it may have masked itself) 71 * happened in the previous stage, it may have masked itself)
69 */ 72 */
70 for_each_irq_desc_reverse(i, desc) { 73 for_each_irq_desc_reverse(i, desc) {
74 if (!desc)
75 continue;
76
71 spin_lock_irq(&desc->lock); 77 spin_lock_irq(&desc->lock);
72 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
73 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -86,6 +92,9 @@ unsigned long probe_irq_on(void)
86 * Now filter out any obviously spurious interrupts 92 * Now filter out any obviously spurious interrupts
87 */ 93 */
88 for_each_irq_desc(i, desc) { 94 for_each_irq_desc(i, desc) {
95 if (!desc)
96 continue;
97
89 spin_lock_irq(&desc->lock); 98 spin_lock_irq(&desc->lock);
90 status = desc->status; 99 status = desc->status;
91 100
@@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val)
124 int i; 133 int i;
125 134
126 for_each_irq_desc(i, desc) { 135 for_each_irq_desc(i, desc) {
136 if (!desc)
137 continue;
138
127 spin_lock_irq(&desc->lock); 139 spin_lock_irq(&desc->lock);
128 status = desc->status; 140 status = desc->status;
129 141
@@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val)
166 unsigned int status; 178 unsigned int status;
167 179
168 for_each_irq_desc(i, desc) { 180 for_each_irq_desc(i, desc) {
181 if (!desc)
182 continue;
183
169 spin_lock_irq(&desc->lock); 184 spin_lock_irq(&desc->lock);
170 status = desc->status; 185 status = desc->status;
171 186
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bfe..6eb3c7952b64 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,9 +24,10 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc = irq_to_desc(irq); 27 struct irq_desc *desc;
28 unsigned long flags; 28 unsigned long flags;
29 29
30 desc = irq_to_desc(irq);
30 if (!desc) { 31 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 32 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 33 return;
@@ -124,6 +125,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
124 return -ENODEV; 125 return -ENODEV;
125 } 126 }
126 127
128 type &= IRQ_TYPE_SENSE_MASK;
127 if (type == IRQ_TYPE_NONE) 129 if (type == IRQ_TYPE_NONE)
128 return 0; 130 return 0;
129 131
@@ -352,6 +354,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
352 354
353 spin_lock(&desc->lock); 355 spin_lock(&desc->lock);
354 mask_ack_irq(desc, irq); 356 mask_ack_irq(desc, irq);
357 desc = irq_remap_to_desc(irq, desc);
355 358
356 if (unlikely(desc->status & IRQ_INPROGRESS)) 359 if (unlikely(desc->status & IRQ_INPROGRESS))
357 goto out_unlock; 360 goto out_unlock;
@@ -429,6 +432,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
429 desc->status &= ~IRQ_INPROGRESS; 432 desc->status &= ~IRQ_INPROGRESS;
430out: 433out:
431 desc->chip->eoi(irq); 434 desc->chip->eoi(irq);
435 desc = irq_remap_to_desc(irq, desc);
432 436
433 spin_unlock(&desc->lock); 437 spin_unlock(&desc->lock);
434} 438}
@@ -465,12 +469,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
465 !desc->action)) { 469 !desc->action)) {
466 desc->status |= (IRQ_PENDING | IRQ_MASKED); 470 desc->status |= (IRQ_PENDING | IRQ_MASKED);
467 mask_ack_irq(desc, irq); 471 mask_ack_irq(desc, irq);
472 desc = irq_remap_to_desc(irq, desc);
468 goto out_unlock; 473 goto out_unlock;
469 } 474 }
470 kstat_incr_irqs_this_cpu(irq, desc); 475 kstat_incr_irqs_this_cpu(irq, desc);
471 476
472 /* Start handling the irq */ 477 /* Start handling the irq */
473 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 desc = irq_remap_to_desc(irq, desc);
474 480
475 /* Mark the IRQ currently in progress.*/ 481 /* Mark the IRQ currently in progress.*/
476 desc->status |= IRQ_INPROGRESS; 482 desc->status |= IRQ_INPROGRESS;
@@ -531,8 +537,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
531 if (!noirqdebug) 537 if (!noirqdebug)
532 note_interrupt(irq, desc, action_ret); 538 note_interrupt(irq, desc, action_ret);
533 539
534 if (desc->chip->eoi) 540 if (desc->chip->eoi) {
535 desc->chip->eoi(irq); 541 desc->chip->eoi(irq);
542 desc = irq_remap_to_desc(irq, desc);
543 }
536} 544}
537 545
538void 546void
@@ -567,8 +575,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
567 575
568 /* Uninstall? */ 576 /* Uninstall? */
569 if (handle == handle_bad_irq) { 577 if (handle == handle_bad_irq) {
570 if (desc->chip != &no_irq_chip) 578 if (desc->chip != &no_irq_chip) {
571 mask_ack_irq(desc, irq); 579 mask_ack_irq(desc, irq);
580 desc = irq_remap_to_desc(irq, desc);
581 }
572 desc->status |= IRQ_DISABLED; 582 desc->status |= IRQ_DISABLED;
573 desc->depth = 1; 583 desc->depth = 1;
574 } 584 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c815b42d0f5b..6492400cb50d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h>
19#include <linux/hash.h>
18 20
19#include "internals.h" 21#include "internals.h"
20 22
23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */
26struct lock_class_key irq_desc_lock_class;
27
21/** 28/**
22 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 30 * @irq: the interrupt number
@@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
49int nr_irqs = NR_IRQS; 56int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs); 57EXPORT_SYMBOL_GPL(nr_irqs);
51 58
59void __init __attribute__((weak)) arch_early_irq_init(void)
60{
61}
62
63#ifdef CONFIG_SPARSE_IRQ
64static struct irq_desc irq_desc_init = {
65 .irq = -1,
66 .status = IRQ_DISABLED,
67 .chip = &no_irq_chip,
68 .handle_irq = handle_bad_irq,
69 .depth = 1,
70 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
71#ifdef CONFIG_SMP
72 .affinity = CPU_MASK_ALL
73#endif
74};
75
76void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
77{
78 unsigned long bytes;
79 char *ptr;
80 int node;
81
82 /* Compute how many bytes we need per irq and allocate them */
83 bytes = nr * sizeof(unsigned int);
84
85 node = cpu_to_node(cpu);
86 ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
87 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
88
89 if (ptr)
90 desc->kstat_irqs = (unsigned int *)ptr;
91}
92
93void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
94{
95}
96
97static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
98{
99 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
100 desc->irq = irq;
101#ifdef CONFIG_SMP
102 desc->cpu = cpu;
103#endif
104 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
105 init_kstat_irqs(desc, cpu, nr_cpu_ids);
106 if (!desc->kstat_irqs) {
107 printk(KERN_ERR "can not alloc kstat_irqs\n");
108 BUG_ON(1);
109 }
110 arch_init_chip_data(desc, cpu);
111}
112
113/*
114 * Protect the sparse_irqs:
115 */
116DEFINE_SPINLOCK(sparse_irq_lock);
117
118struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
119
120static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
121 [0 ... NR_IRQS_LEGACY-1] = {
122 .irq = -1,
123 .status = IRQ_DISABLED,
124 .chip = &no_irq_chip,
125 .handle_irq = handle_bad_irq,
126 .depth = 1,
127 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
128#ifdef CONFIG_SMP
129 .affinity = CPU_MASK_ALL
130#endif
131 }
132};
133
134/* FIXME: use bootmem alloc ...*/
135static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
136
137void __init early_irq_init(void)
138{
139 struct irq_desc *desc;
140 int legacy_count;
141 int i;
142
143 desc = irq_desc_legacy;
144 legacy_count = ARRAY_SIZE(irq_desc_legacy);
145
146 for (i = 0; i < legacy_count; i++) {
147 desc[i].irq = i;
148 desc[i].kstat_irqs = kstat_irqs_legacy[i];
149
150 irq_desc_ptrs[i] = desc + i;
151 }
152
153 for (i = legacy_count; i < NR_IRQS; i++)
154 irq_desc_ptrs[i] = NULL;
155
156 arch_early_irq_init();
157}
158
159struct irq_desc *irq_to_desc(unsigned int irq)
160{
161 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
162}
163
164struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
165{
166 struct irq_desc *desc;
167 unsigned long flags;
168 int node;
169
170 if (irq >= NR_IRQS) {
171 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
172 irq, NR_IRQS);
173 WARN_ON(1);
174 return NULL;
175 }
176
177 desc = irq_desc_ptrs[irq];
178 if (desc)
179 return desc;
180
181 spin_lock_irqsave(&sparse_irq_lock, flags);
182
183 /* We have to check it to avoid races with another CPU */
184 desc = irq_desc_ptrs[irq];
185 if (desc)
186 goto out_unlock;
187
188 node = cpu_to_node(cpu);
189 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
190 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
191 irq, cpu, node);
192 if (!desc) {
193 printk(KERN_ERR "can not alloc irq_desc\n");
194 BUG_ON(1);
195 }
196 init_one_irq_desc(irq, desc, cpu);
197
198 irq_desc_ptrs[irq] = desc;
199
200out_unlock:
201 spin_unlock_irqrestore(&sparse_irq_lock, flags);
202
203 return desc;
204}
205
206#else
207
52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 208struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
53 [0 ... NR_IRQS-1] = { 209 [0 ... NR_IRQS-1] = {
54 .status = IRQ_DISABLED, 210 .status = IRQ_DISABLED,
@@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
62 } 218 }
63}; 219};
64 220
221#endif
222
65/* 223/*
66 * What should we do if we get a hw irq event on an illegal vector? 224 * What should we do if we get a hw irq event on an illegal vector?
67 * Each architecture has to answer this themself. 225 * Each architecture has to answer this themself.
@@ -179,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq)
179 /* 337 /*
180 * No locking required for CPU-local interrupts: 338 * No locking required for CPU-local interrupts:
181 */ 339 */
182 if (desc->chip->ack) 340 if (desc->chip->ack) {
183 desc->chip->ack(irq); 341 desc->chip->ack(irq);
342 /* get new one */
343 desc = irq_remap_to_desc(irq, desc);
344 }
184 if (likely(!(desc->status & IRQ_DISABLED))) { 345 if (likely(!(desc->status & IRQ_DISABLED))) {
185 action_ret = handle_IRQ_event(irq, desc->action); 346 action_ret = handle_IRQ_event(irq, desc->action);
186 if (!noirqdebug) 347 if (!noirqdebug)
@@ -191,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
191 } 352 }
192 353
193 spin_lock(&desc->lock); 354 spin_lock(&desc->lock);
194 if (desc->chip->ack) 355 if (desc->chip->ack) {
195 desc->chip->ack(irq); 356 desc->chip->ack(irq);
357 desc = irq_remap_to_desc(irq, desc);
358 }
196 /* 359 /*
197 * REPLAY is when Linux resends an IRQ that was dropped earlier 360 * REPLAY is when Linux resends an IRQ that was dropped earlier
198 * WAITING is used by probe to mark irqs that are being tested 361 * WAITING is used by probe to mark irqs that are being tested
@@ -259,19 +422,25 @@ out:
259} 422}
260#endif 423#endif
261 424
262
263#ifdef CONFIG_TRACE_IRQFLAGS
264/*
265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
266 */
267static struct lock_class_key irq_desc_lock_class;
268
269void early_init_irq_lock_class(void) 425void early_init_irq_lock_class(void)
270{ 426{
271 struct irq_desc *desc; 427 struct irq_desc *desc;
272 int i; 428 int i;
273 429
274 for_each_irq_desc(i, desc) 430 for_each_irq_desc(i, desc) {
431 if (!desc)
432 continue;
433
275 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 434 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
435 }
436}
437
438#ifdef CONFIG_SPARSE_IRQ
439unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
440{
441 struct irq_desc *desc = irq_to_desc(irq);
442 return desc->kstat_irqs[cpu];
276} 443}
277#endif 444#endif
445EXPORT_SYMBOL(kstat_irqs_cpu);
446
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 64c1c7253dae..e6d0a43cc125 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15 15
16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock;
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
20
16#ifdef CONFIG_PROC_FS 21#ifdef CONFIG_PROC_FS
17extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
18extern void register_handler_proc(unsigned int irq, struct irqaction *action); 23extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c43..540f6c49f3fa 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -370,16 +370,18 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
370 return 0; 370 return 0;
371 } 371 }
372 372
373 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); 373 /* caller masked out all except trigger mode flags */
374 ret = chip->set_type(irq, flags);
374 375
375 if (ret) 376 if (ret)
376 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 377 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
377 (int)(flags & IRQF_TRIGGER_MASK), 378 (int)flags, irq, chip->set_type);
378 irq, chip->set_type);
379 else { 379 else {
380 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
381 flags |= IRQ_LEVEL;
380 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 382 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
381 desc->status &= ~IRQ_TYPE_SENSE_MASK; 383 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
382 desc->status |= flags & IRQ_TYPE_SENSE_MASK; 384 desc->status |= flags;
383 } 385 }
384 386
385 return ret; 387 return ret;
@@ -459,7 +461,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
459 461
460 /* Setup the type (level, edge polarity) if configured: */ 462 /* Setup the type (level, edge polarity) if configured: */
461 if (new->flags & IRQF_TRIGGER_MASK) { 463 if (new->flags & IRQF_TRIGGER_MASK) {
462 ret = __irq_set_trigger(desc, irq, new->flags); 464 ret = __irq_set_trigger(desc, irq,
465 new->flags & IRQF_TRIGGER_MASK);
463 466
464 if (ret) { 467 if (ret) {
465 spin_unlock_irqrestore(&desc->lock, flags); 468 spin_unlock_irqrestore(&desc->lock, flags);
@@ -673,6 +676,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
673 struct irq_desc *desc; 676 struct irq_desc *desc;
674 int retval; 677 int retval;
675 678
679 /*
680 * handle_IRQ_event() always ignores IRQF_DISABLED except for
681 * the _first_ irqaction (sigh). That can cause oopsing, but
682 * the behavior is classified as "will not fix" so we need to
683 * start nudging drivers away from using that idiom.
684 */
685 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
686 == (IRQF_SHARED|IRQF_DISABLED))
687 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
688 "guaranteed on shared IRQs\n",
689 irq, devname);
690
676#ifdef CONFIG_LOCKDEP 691#ifdef CONFIG_LOCKDEP
677 /* 692 /*
678 * Lockdep wants atomic interrupt handlers: 693 * Lockdep wants atomic interrupt handlers:
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644
index 000000000000..089c3746358a
--- /dev/null
+++ b/kernel/irq/numa_migrate.c
@@ -0,0 +1,122 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/module.h>
10#include <linux/random.h>
11#include <linux/interrupt.h>
12#include <linux/kernel_stat.h>
13
14#include "internals.h"
15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc,
18 int cpu, int nr)
19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr);
23
24 if (desc->kstat_irqs != old_desc->kstat_irqs) {
25 /* Compute how many bytes we need per irq and allocate them */
26 bytes = nr * sizeof(unsigned int);
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30}
31
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
33{
34 if (old_desc->kstat_irqs == desc->kstat_irqs)
35 return;
36
37 kfree(old_desc->kstat_irqs);
38 old_desc->kstat_irqs = NULL;
39}
40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu)
43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 desc->cpu = cpu;
46 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
47 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
48 arch_init_copy_chip_data(old_desc, desc, cpu);
49}
50
51static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
52{
53 free_kstat_irqs(old_desc, desc);
54 arch_free_chip_data(old_desc, desc);
55}
56
57static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
58 int cpu)
59{
60 struct irq_desc *desc;
61 unsigned int irq;
62 unsigned long flags;
63 int node;
64
65 irq = old_desc->irq;
66
67 spin_lock_irqsave(&sparse_irq_lock, flags);
68
69 /* We have to check it to avoid races with another CPU */
70 desc = irq_desc_ptrs[irq];
71
72 if (desc && old_desc != desc)
73 goto out_unlock;
74
75 node = cpu_to_node(cpu);
76 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
77 printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n",
78 irq, cpu, node);
79 if (!desc) {
80 printk(KERN_ERR "can not get new irq_desc for moving\n");
81 /* still use old one */
82 desc = old_desc;
83 goto out_unlock;
84 }
85 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
86
87 irq_desc_ptrs[irq] = desc;
88
89 /* free the old one */
90 free_one_irq_desc(old_desc, desc);
91 kfree(old_desc);
92
93out_unlock:
94 spin_unlock_irqrestore(&sparse_irq_lock, flags);
95
96 return desc;
97}
98
99struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
100{
101 int old_cpu;
102 int node, old_node;
103
104 /* those all static, do move them */
105 if (desc->irq < NR_IRQS_LEGACY)
106 return desc;
107
108 old_cpu = desc->cpu;
109 printk(KERN_DEBUG
110 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
111 if (old_cpu != cpu) {
112 node = cpu_to_node(cpu);
113 old_node = cpu_to_node(old_cpu);
114 if (old_node != node)
115 desc = __real_move_irq_desc(desc, cpu);
116 else
117 desc->cpu = cpu;
118 }
119
120 return desc;
121}
122
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d257e7d6a8a4..f6b3440f05bc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -243,7 +243,11 @@ void init_irq_proc(void)
243 /* 243 /*
244 * Create entries for all existing IRQs. 244 * Create entries for all existing IRQs.
245 */ 245 */
246 for_each_irq_desc(irq, desc) 246 for_each_irq_desc(irq, desc) {
247 if (!desc)
248 continue;
249
247 register_irq_proc(irq, desc); 250 register_irq_proc(irq, desc);
251 }
248} 252}
249 253
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..3738107531fd 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,6 +91,9 @@ static int misrouted_irq(int irq)
91 int i, ok = 0; 91 int i, ok = 0;
92 92
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 if (!desc)
95 continue;
96
94 if (!i) 97 if (!i)
95 continue; 98 continue;
96 99
@@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy)
112 for_each_irq_desc(i, desc) { 115 for_each_irq_desc(i, desc) {
113 unsigned int status; 116 unsigned int status;
114 117
118 if (!desc)
119 continue;
115 if (!i) 120 if (!i)
116 continue; 121 continue;
117 122
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..e694afa0eb8c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,20 +30,19 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33extern const unsigned long kallsyms_addresses[];
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const u8 kallsyms_names[];
35extern const u8 kallsyms_names[] __attribute__((weak));
36 35
37/* tell the compiler that the count isn't in the small data section if the arch 36/* tell the compiler that the count isn't in the small data section if the arch
38 * has one (eg: FRV) 37 * has one (eg: FRV)
39 */ 38 */
40extern const unsigned long kallsyms_num_syms 39extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 40 __attribute__((__section__(".rodata")));
42 41
43extern const u8 kallsyms_token_table[] __attribute__((weak)); 42extern const u8 kallsyms_token_table[];
44extern const u16 kallsyms_token_index[] __attribute__((weak)); 43extern const u16 kallsyms_token_index[];
45 44
46extern const unsigned long kallsyms_markers[] __attribute__((weak)); 45extern const unsigned long kallsyms_markers[];
47 46
48static inline int is_kernel_inittext(unsigned long addr) 47static inline int is_kernel_inittext(unsigned long addr)
49{ 48{
@@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr,
168 unsigned long symbol_start = 0, symbol_end = 0; 167 unsigned long symbol_start = 0, symbol_end = 0;
169 unsigned long i, low, high, mid; 168 unsigned long i, low, high, mid;
170 169
171 /* This kernel should never had been booted. */
172 BUG_ON(!kallsyms_addresses);
173
174 /* do a binary search on the sorted kallsyms_addresses array */ 170 /* do a binary search on the sorted kallsyms_addresses array */
175 low = 0; 171 low = 0;
176 high = kallsyms_num_syms; 172 high = kallsyms_num_syms;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3d3c3ea3a023..b46dbb908669 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module);
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
120 struct completion *complete; 120 struct completion *complete;
121 struct cred *cred;
121 char *path; 122 char *path;
122 char **argv; 123 char **argv;
123 char **envp; 124 char **envp;
124 struct key *ring;
125 enum umh_wait wait; 125 enum umh_wait wait;
126 int retval; 126 int retval;
127 struct file *stdin; 127 struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
134static int ____call_usermodehelper(void *data) 134static int ____call_usermodehelper(void *data)
135{ 135{
136 struct subprocess_info *sub_info = data; 136 struct subprocess_info *sub_info = data;
137 struct key *new_session, *old_session;
138 int retval; 137 int retval;
139 138
140 /* Unblock all signals and set the session keyring. */ 139 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
141 new_session = key_get(sub_info->ring); 140
141 /* Unblock all signals */
142 spin_lock_irq(&current->sighand->siglock); 142 spin_lock_irq(&current->sighand->siglock);
143 old_session = __install_session_keyring(current, new_session);
144 flush_signal_handlers(current, 1); 143 flush_signal_handlers(current, 1);
145 sigemptyset(&current->blocked); 144 sigemptyset(&current->blocked);
146 recalc_sigpending(); 145 recalc_sigpending();
147 spin_unlock_irq(&current->sighand->siglock); 146 spin_unlock_irq(&current->sighand->siglock);
148 147
149 key_put(old_session); 148 /* Install the credentials */
149 commit_creds(sub_info->cred);
150 sub_info->cred = NULL;
150 151
151 /* Install input pipe when needed */ 152 /* Install input pipe when needed */
152 if (sub_info->stdin) { 153 if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{ 186{
186 if (info->cleanup) 187 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp); 188 (*info->cleanup)(info->argv, info->envp);
189 if (info->cred)
190 put_cred(info->cred);
188 kfree(info); 191 kfree(info);
189} 192}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo); 193EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
240 pid_t pid; 243 pid_t pid;
241 enum umh_wait wait = sub_info->wait; 244 enum umh_wait wait = sub_info->wait;
242 245
246 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
247
243 /* CLONE_VFORK: wait until the usermode helper has execve'd 248 /* CLONE_VFORK: wait until the usermode helper has execve'd
244 * successfully We need the data structures to stay around 249 * successfully We need the data structures to stay around
245 * until that is done. */ 250 * until that is done. */
@@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
362 sub_info->path = path; 367 sub_info->path = path;
363 sub_info->argv = argv; 368 sub_info->argv = argv;
364 sub_info->envp = envp; 369 sub_info->envp = envp;
370 sub_info->cred = prepare_usermodehelper_creds();
371 if (!sub_info->cred)
372 return NULL;
365 373
366 out: 374 out:
367 return sub_info; 375 return sub_info;
@@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
376void call_usermodehelper_setkeys(struct subprocess_info *info, 384void call_usermodehelper_setkeys(struct subprocess_info *info,
377 struct key *session_keyring) 385 struct key *session_keyring)
378{ 386{
379 info->ring = session_keyring; 387#ifdef CONFIG_KEYS
388 struct thread_group_cred *tgcred = info->cred->tgcred;
389 key_put(tgcred->session_keyring);
390 tgcred->session_keyring = key_get(session_keyring);
391#else
392 BUG();
393#endif
380} 394}
381EXPORT_SYMBOL(call_usermodehelper_setkeys); 395EXPORT_SYMBOL(call_usermodehelper_setkeys);
382 396
@@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
444 DECLARE_COMPLETION_ONSTACK(done); 458 DECLARE_COMPLETION_ONSTACK(done);
445 int retval = 0; 459 int retval = 0;
446 460
461 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
462
447 helper_lock(); 463 helper_lock();
448 if (sub_info->path[0] == '\0') 464 if (sub_info->path[0] == '\0')
449 goto out; 465 goto out;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0a..4fbc456f393d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
23 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
24struct kthread_create_info 27struct kthread_create_info
25{ 28{
26 /* Information passed to kthread() from kthreadd. */ 29 /* Information passed to kthread() from kthreadd. */
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c56923..449db466bdbc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
191 latency_record[i].time, 191 latency_record[i].time,
192 latency_record[i].max); 192 latency_record[i].max);
193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
194 char sym[KSYM_NAME_LEN]; 194 char sym[KSYM_SYMBOL_LEN];
195 char *c; 195 char *c;
196 if (!latency_record[i].backtrace[q]) 196 if (!latency_record[i].backtrace[q])
197 break; 197 break;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 46a404173db2..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -25,6 +25,7 @@
25 * Thanks to Arjan van de Ven for coming up with the initial idea of 25 * Thanks to Arjan van de Ven for coming up with the initial idea of
26 * mapping lock dependencies runtime. 26 * mapping lock dependencies runtime.
27 */ 27 */
28#define DISABLE_BRANCH_PROFILING
28#include <linux/mutex.h> 29#include <linux/mutex.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/delay.h> 31#include <linux/delay.h>
@@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
136#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
138 139
139static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
140{ 141{
141 int i; 142 int i;
142 143
143 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
144 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
145 class->contention_point[i] = ip; 146 points[i] = ip;
146 break; 147 break;
147 } 148 }
148 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
149 break; 150 break;
150 } 151 }
151 152
@@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
186 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
187 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
188 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
189 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
190 194
@@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
209 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
210 } 214 }
211 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
212} 217}
213 218
214static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -287,14 +292,12 @@ void lockdep_off(void)
287{ 292{
288 current->lockdep_recursion++; 293 current->lockdep_recursion++;
289} 294}
290
291EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
292 296
293void lockdep_on(void) 297void lockdep_on(void)
294{ 298{
295 current->lockdep_recursion--; 299 current->lockdep_recursion--;
296} 300}
297
298EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
299 302
300/* 303/*
@@ -576,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
576/* 579/*
577 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
578 */ 581 */
579static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
580{ 584{
581 struct lock_list *entry; 585 struct lock_list *entry;
582 586
@@ -2508,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2508 if (subclass) 2512 if (subclass)
2509 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2510} 2514}
2511
2512EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2513 2516
2514/* 2517/*
@@ -2689,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2689} 2692}
2690 2693
2691static int 2694static int
2692__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2693 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2694{ 2698{
2695 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2696 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2717,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2717 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2718 2722
2719found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2720 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2721 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2722 2727
@@ -2901,9 +2906,9 @@ static void check_flags(unsigned long flags)
2901#endif 2906#endif
2902} 2907}
2903 2908
2904void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2905lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2906 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2907{ 2912{
2908 unsigned long flags; 2913 unsigned long flags;
2909 2914
@@ -2913,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2913 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2914 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2915 check_flags(flags); 2920 check_flags(flags);
2916 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2917 check_chain_key(current); 2922 check_chain_key(current);
2918 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2919 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2920} 2925}
2921 2926EXPORT_SYMBOL_GPL(lock_set_class);
2922EXPORT_SYMBOL_GPL(lock_set_subclass);
2923 2927
2924/* 2928/*
2925 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2943,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2943 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2944 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2945} 2949}
2946
2947EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2948 2951
2949void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2961,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2961 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2963} 2966}
2964
2965EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2966 2968
2967#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -2999,7 +3001,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2999 struct held_lock *hlock, *prev_hlock; 3001 struct held_lock *hlock, *prev_hlock;
3000 struct lock_class_stats *stats; 3002 struct lock_class_stats *stats;
3001 unsigned int depth; 3003 unsigned int depth;
3002 int i, point; 3004 int i, contention_point, contending_point;
3003 3005
3004 depth = curr->lockdep_depth; 3006 depth = curr->lockdep_depth;
3005 if (DEBUG_LOCKS_WARN_ON(!depth)) 3007 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3023,18 +3025,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3023found_it: 3025found_it:
3024 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
3025 3027
3026 point = lock_contention_point(hlock_class(hlock), ip); 3028 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3029 contending_point = lock_point(hlock_class(hlock)->contending_point,
3030 lock->ip);
3027 3031
3028 stats = get_lock_stats(hlock_class(hlock)); 3032 stats = get_lock_stats(hlock_class(hlock));
3029 if (point < ARRAY_SIZE(stats->contention_point)) 3033 if (contention_point < LOCKSTAT_POINTS)
3030 stats->contention_point[point]++; 3034 stats->contention_point[contention_point]++;
3035 if (contending_point < LOCKSTAT_POINTS)
3036 stats->contending_point[contending_point]++;
3031 if (lock->cpu != smp_processor_id()) 3037 if (lock->cpu != smp_processor_id())
3032 stats->bounces[bounce_contended + !!hlock->read]++; 3038 stats->bounces[bounce_contended + !!hlock->read]++;
3033 put_lock_stats(stats); 3039 put_lock_stats(stats);
3034} 3040}
3035 3041
3036static void 3042static void
3037__lock_acquired(struct lockdep_map *lock) 3043__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3038{ 3044{
3039 struct task_struct *curr = current; 3045 struct task_struct *curr = current;
3040 struct held_lock *hlock, *prev_hlock; 3046 struct held_lock *hlock, *prev_hlock;
@@ -3083,6 +3089,7 @@ found_it:
3083 put_lock_stats(stats); 3089 put_lock_stats(stats);
3084 3090
3085 lock->cpu = cpu; 3091 lock->cpu = cpu;
3092 lock->ip = ip;
3086} 3093}
3087 3094
3088void lock_contended(struct lockdep_map *lock, unsigned long ip) 3095void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3104,7 +3111,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3104} 3111}
3105EXPORT_SYMBOL_GPL(lock_contended); 3112EXPORT_SYMBOL_GPL(lock_contended);
3106 3113
3107void lock_acquired(struct lockdep_map *lock) 3114void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3108{ 3115{
3109 unsigned long flags; 3116 unsigned long flags;
3110 3117
@@ -3117,7 +3124,7 @@ void lock_acquired(struct lockdep_map *lock)
3117 raw_local_irq_save(flags); 3124 raw_local_irq_save(flags);
3118 check_flags(flags); 3125 check_flags(flags);
3119 current->lockdep_recursion = 1; 3126 current->lockdep_recursion = 1;
3120 __lock_acquired(lock); 3127 __lock_acquired(lock, ip);
3121 current->lockdep_recursion = 0; 3128 current->lockdep_recursion = 0;
3122 raw_local_irq_restore(flags); 3129 raw_local_irq_restore(flags);
3123} 3130}
@@ -3441,7 +3448,6 @@ retry:
3441 if (unlock) 3448 if (unlock)
3442 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3443} 3450}
3444
3445EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3446 3452
3447/* 3453/*
@@ -3462,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3462{ 3468{
3463 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3464} 3470}
3465
3466EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3467 3472
3468void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc9400..ea54f2647868 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
43 */ 43 */
44#define MARKER_HASH_BITS 6 44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
64 void *oldptr; 65 void *oldptr;
65 int rcu_pending; 66 int rcu_pending;
66 unsigned char ptype:1; 67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 69 char name[0]; /* Contains name'\0'format'\0' */
68}; 70};
69 71
70static struct hlist_head marker_table[MARKER_TABLE_SIZE];
71
72/** 72/**
73 * __mark_empty_function - Empty probe callback 73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data 74 * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
81 * though the function pointer change and the marker enabling are two distinct 81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code. 82 * operations that modifies the execution flow of preemptible code.
83 */ 83 */
84void __mark_empty_function(void *probe_private, void *call_private, 84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args) 85 const char *fmt, va_list *args)
86{ 86{
87} 87}
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
97 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
99 */ 99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
101{ 102{
102 va_list args; 103 va_list args;
103 char ptype; 104 char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
107 * sure the teardown of the callbacks can be done correctly when they 108 * sure the teardown of the callbacks can be done correctly when they
108 * are in modules and they insure RCU read coherency. 109 * are in modules and they insure RCU read coherency.
109 */ 110 */
110 rcu_read_lock_sched(); 111 rcu_read_lock_sched_notrace();
111 ptype = mdata->ptype; 112 ptype = mdata->ptype;
112 if (likely(!ptype)) { 113 if (likely(!ptype)) {
113 marker_probe_func *func; 114 marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 146 va_end(args);
146 } 147 }
147 } 148 }
148 rcu_read_unlock_sched(); 149 rcu_read_unlock_sched_notrace();
149} 150}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 151EXPORT_SYMBOL_GPL(marker_probe_cb);
151 152
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
157 * 158 *
158 * Should be connected to markers "MARK_NOARGS". 159 * Should be connected to markers "MARK_NOARGS".
159 */ 160 */
160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
161{ 163{
162 va_list args; /* not initialized */ 164 va_list args; /* not initialized */
163 char ptype; 165 char ptype;
164 166
165 rcu_read_lock_sched(); 167 rcu_read_lock_sched_notrace();
166 ptype = mdata->ptype; 168 ptype = mdata->ptype;
167 if (likely(!ptype)) { 169 if (likely(!ptype)) {
168 marker_probe_func *func; 170 marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 197 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 198 mdata->format, &args);
197 } 199 }
198 rcu_read_unlock_sched(); 200 rcu_read_unlock_sched_notrace();
199} 201}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 202
202static void free_old_closure(struct rcu_head *head) 203static void free_old_closure(struct rcu_head *head)
203{ 204{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
416 e->single.probe_private = NULL; 417 e->single.probe_private = NULL;
417 e->multi = NULL; 418 e->multi = NULL;
418 e->ptype = 0; 419 e->ptype = 0;
420 e->format_allocated = 0;
419 e->refcount = 0; 421 e->refcount = 0;
420 e->rcu_pending = 0; 422 e->rcu_pending = 0;
421 hlist_add_head(&e->hlist, head); 423 hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
447 if (e->single.func != __mark_empty_function) 449 if (e->single.func != __mark_empty_function)
448 return -EBUSY; 450 return -EBUSY;
449 hlist_del(&e->hlist); 451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
450 /* Make sure the call_rcu has been executed */ 454 /* Make sure the call_rcu has been executed */
451 if (e->rcu_pending) 455 if (e->rcu_pending)
452 rcu_barrier_sched(); 456 rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
457/* 461/*
458 * Set the mark_entry format to the format found in the element. 462 * Set the mark_entry format to the format found in the element.
459 */ 463 */
460static int marker_set_format(struct marker_entry **entry, const char *format) 464static int marker_set_format(struct marker_entry *entry, const char *format)
461{ 465{
462 struct marker_entry *e; 466 entry->format = kstrdup(format, GFP_KERNEL);
463 size_t name_len = strlen((*entry)->name) + 1; 467 if (!entry->format)
464 size_t format_len = strlen(format) + 1;
465
466
467 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
468 GFP_KERNEL);
469 if (!e)
470 return -ENOMEM; 468 return -ENOMEM;
471 memcpy(&e->name[0], (*entry)->name, name_len); 469 entry->format_allocated = 1;
472 e->format = &e->name[name_len]; 470
473 memcpy(e->format, format, format_len);
474 if (strcmp(e->format, MARK_NOARGS) == 0)
475 e->call = marker_probe_cb_noarg;
476 else
477 e->call = marker_probe_cb;
478 e->single = (*entry)->single;
479 e->multi = (*entry)->multi;
480 e->ptype = (*entry)->ptype;
481 e->refcount = (*entry)->refcount;
482 e->rcu_pending = 0;
483 hlist_add_before(&e->hlist, &(*entry)->hlist);
484 hlist_del(&(*entry)->hlist);
485 /* Make sure the call_rcu has been executed */
486 if ((*entry)->rcu_pending)
487 rcu_barrier_sched();
488 kfree(*entry);
489 *entry = e;
490 trace_mark(core_marker_format, "name %s format %s", 471 trace_mark(core_marker_format, "name %s format %s",
491 e->name, e->format); 472 entry->name, entry->format);
492 return 0; 473 return 0;
493} 474}
494 475
495/* 476/*
496 * Sets the probe callback corresponding to one marker. 477 * Sets the probe callback corresponding to one marker.
497 */ 478 */
498static int set_marker(struct marker_entry **entry, struct marker *elem, 479static int set_marker(struct marker_entry *entry, struct marker *elem,
499 int active) 480 int active)
500{ 481{
501 int ret; 482 int ret = 0;
502 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 483 WARN_ON(strcmp(entry->name, elem->name) != 0);
503 484
504 if ((*entry)->format) { 485 if (entry->format) {
505 if (strcmp((*entry)->format, elem->format) != 0) { 486 if (strcmp(entry->format, elem->format) != 0) {
506 printk(KERN_NOTICE 487 printk(KERN_NOTICE
507 "Format mismatch for probe %s " 488 "Format mismatch for probe %s "
508 "(%s), marker (%s)\n", 489 "(%s), marker (%s)\n",
509 (*entry)->name, 490 entry->name,
510 (*entry)->format, 491 entry->format,
511 elem->format); 492 elem->format);
512 return -EPERM; 493 return -EPERM;
513 } 494 }
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
523 * pass from a "safe" callback (with argument) to an "unsafe" 504 * pass from a "safe" callback (with argument) to an "unsafe"
524 * callback (does not set arguments). 505 * callback (does not set arguments).
525 */ 506 */
526 elem->call = (*entry)->call; 507 elem->call = entry->call;
527 /* 508 /*
528 * Sanity check : 509 * Sanity check :
529 * We only update the single probe private data when the ptr is 510 * We only update the single probe private data when the ptr is
530 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
531 */ 512 */
532 WARN_ON(elem->single.func != __mark_empty_function 513 WARN_ON(elem->single.func != __mark_empty_function
533 && elem->single.probe_private 514 && elem->single.probe_private != entry->single.probe_private
534 != (*entry)->single.probe_private && 515 && !elem->ptype);
535 !elem->ptype); 516 elem->single.probe_private = entry->single.probe_private;
536 elem->single.probe_private = (*entry)->single.probe_private;
537 /* 517 /*
538 * Make sure the private data is valid when we update the 518 * Make sure the private data is valid when we update the
539 * single probe ptr. 519 * single probe ptr.
540 */ 520 */
541 smp_wmb(); 521 smp_wmb();
542 elem->single.func = (*entry)->single.func; 522 elem->single.func = entry->single.func;
543 /* 523 /*
544 * We also make sure that the new probe callbacks array is consistent 524 * We also make sure that the new probe callbacks array is consistent
545 * before setting a pointer to it. 525 * before setting a pointer to it.
546 */ 526 */
547 rcu_assign_pointer(elem->multi, (*entry)->multi); 527 rcu_assign_pointer(elem->multi, entry->multi);
548 /* 528 /*
549 * Update the function or multi probe array pointer before setting the 529 * Update the function or multi probe array pointer before setting the
550 * ptype. 530 * ptype.
551 */ 531 */
552 smp_wmb(); 532 smp_wmb();
553 elem->ptype = (*entry)->ptype; 533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
554 elem->state = active; 565 elem->state = active;
555 566
556 return 0; 567 return ret;
557} 568}
558 569
559/* 570/*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
564 */ 575 */
565static void disable_marker(struct marker *elem) 576static void disable_marker(struct marker *elem)
566{ 577{
578 int ret;
579
567 /* leave "call" as is. It is known statically. */ 580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
568 elem->state = 0; 596 elem->state = 0;
569 elem->single.func = __mark_empty_function; 597 elem->single.func = __mark_empty_function;
570 /* Update the function before setting the ptype */ 598 /* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
594 for (iter = begin; iter < end; iter++) { 622 for (iter = begin; iter < end; iter++) {
595 mark_entry = get_marker(iter->name); 623 mark_entry = get_marker(iter->name);
596 if (mark_entry) { 624 if (mark_entry) {
597 set_marker(&mark_entry, iter, 625 set_marker(mark_entry, iter, !!mark_entry->refcount);
598 !!mark_entry->refcount);
599 /* 626 /*
600 * ignore error, continue 627 * ignore error, continue
601 */ 628 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
629 marker_update_probe_range(__start___markers, __stop___markers); 656 marker_update_probe_range(__start___markers, __stop___markers);
630 /* Markers in modules. */ 657 /* Markers in modules. */
631 module_update_markers(); 658 module_update_markers();
659 tracepoint_probe_update_all();
632} 660}
633 661
634/** 662/**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
657 ret = PTR_ERR(entry); 685 ret = PTR_ERR(entry);
658 } else if (format) { 686 } else if (format) {
659 if (!entry->format) 687 if (!entry->format)
660 ret = marker_set_format(&entry, format); 688 ret = marker_set_format(entry, format);
661 else if (strcmp(entry->format, format)) 689 else if (strcmp(entry->format, format))
662 ret = -EPERM; 690 ret = -EPERM;
663 } 691 }
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
676 goto end; 704 goto end;
677 } 705 }
678 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
679 marker_update_probes(); /* may update entry */ 707 marker_update_probes();
680 mutex_lock(&markers_mutex); 708 mutex_lock(&markers_mutex);
681 entry = get_marker(name); 709 entry = get_marker(name);
682 WARN_ON(!entry); 710 if (!entry)
711 goto end;
683 if (entry->rcu_pending) 712 if (entry->rcu_pending)
684 rcu_barrier_sched(); 713 rcu_barrier_sched();
685 entry->oldptr = old; 714 entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
720 rcu_barrier_sched(); 749 rcu_barrier_sched();
721 old = marker_entry_remove_probe(entry, probe, probe_private); 750 old = marker_entry_remove_probe(entry, probe, probe_private);
722 mutex_unlock(&markers_mutex); 751 mutex_unlock(&markers_mutex);
723 marker_update_probes(); /* may update entry */ 752 marker_update_probes();
724 mutex_lock(&markers_mutex); 753 mutex_lock(&markers_mutex);
725 entry = get_marker(name); 754 entry = get_marker(name);
726 if (!entry) 755 if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
801 rcu_barrier_sched(); 830 rcu_barrier_sched();
802 old = marker_entry_remove_probe(entry, NULL, probe_private); 831 old = marker_entry_remove_probe(entry, NULL, probe_private);
803 mutex_unlock(&markers_mutex); 832 mutex_unlock(&markers_mutex);
804 marker_update_probes(); /* may update entry */ 833 marker_update_probes();
805 mutex_lock(&markers_mutex); 834 mutex_lock(&markers_mutex);
806 entry = get_marker_from_private_data(probe, probe_private); 835 entry = get_marker_from_private_data(probe, probe_private);
807 WARN_ON(!entry); 836 if (!entry)
837 goto end;
808 if (entry->rcu_pending) 838 if (entry->rcu_pending)
809 rcu_barrier_sched(); 839 rcu_barrier_sched();
810 entry->oldptr = old; 840 entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
848 if (!e->ptype) { 878 if (!e->ptype) {
849 if (num == 0 && e->single.func == probe) 879 if (num == 0 && e->single.func == probe)
850 return e->single.probe_private; 880 return e->single.probe_private;
851 else
852 break;
853 } else { 881 } else {
854 struct marker_probe_closure *closure; 882 struct marker_probe_closure *closure;
855 int match = 0; 883 int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
861 return closure[i].probe_private; 889 return closure[i].probe_private;
862 } 890 }
863 } 891 }
892 break;
864 } 893 }
865 } 894 }
866 return ERR_PTR(-ENOENT); 895 return ERR_PTR(-ENOENT);
867} 896}
868EXPORT_SYMBOL_GPL(marker_get_private_data); 897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c20..dd2a54155b54 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod,
2184 struct mod_debug *debug; 2184 struct mod_debug *debug;
2185 unsigned int num_debug; 2185 unsigned int num_debug;
2186 2186
2187#ifdef CONFIG_MARKERS
2188 marker_update_probe_range(mod->markers,
2189 mod->markers + mod->num_markers);
2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2187 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug); 2188 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug); 2189 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 } 2190 }
2200 2191
2201 /* sechdrs[0].sh_size is always zero */ 2192 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", 2193 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount); 2194 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount); 2195 ftrace_init_module(mod, mseg, mseg + num_mcount);
2205 2196
2206 err = module_finalize(hdr, sechdrs, mod); 2197 err = module_finalize(hdr, sechdrs, mod);
2207 if (err < 0) 2198 if (err < 0)
@@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr)
2713 2704
2714 2705
2715/* Is this a valid kernel address? */ 2706/* Is this a valid kernel address? */
2716struct module *__module_text_address(unsigned long addr) 2707__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2717{ 2708{
2718 struct module *mod; 2709 struct module *mod;
2719 2710
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29a2583..63598dca2d0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
80 goto out_pid; 80 goto out_pid;
81 } 81 }
82 82
83 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
84 if (IS_ERR(new_nsp->user_ns)) {
85 err = PTR_ERR(new_nsp->user_ns);
86 goto out_user;
87 }
88
89 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 83 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
90 if (IS_ERR(new_nsp->net_ns)) { 84 if (IS_ERR(new_nsp->net_ns)) {
91 err = PTR_ERR(new_nsp->net_ns); 85 err = PTR_ERR(new_nsp->net_ns);
@@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
95 return new_nsp; 89 return new_nsp;
96 90
97out_net: 91out_net:
98 if (new_nsp->user_ns)
99 put_user_ns(new_nsp->user_ns);
100out_user:
101 if (new_nsp->pid_ns) 92 if (new_nsp->pid_ns)
102 put_pid_ns(new_nsp->pid_ns); 93 put_pid_ns(new_nsp->pid_ns);
103out_pid: 94out_pid:
@@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
130 get_nsproxy(old_ns); 121 get_nsproxy(old_ns);
131 122
132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 123 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) 124 CLONE_NEWPID | CLONE_NEWNET)))
134 return 0; 125 return 0;
135 126
136 if (!capable(CAP_SYS_ADMIN)) { 127 if (!capable(CAP_SYS_ADMIN)) {
@@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns)
173 put_ipc_ns(ns->ipc_ns); 164 put_ipc_ns(ns->ipc_ns);
174 if (ns->pid_ns) 165 if (ns->pid_ns)
175 put_pid_ns(ns->pid_ns); 166 put_pid_ns(ns->pid_ns);
176 if (ns->user_ns)
177 put_user_ns(ns->user_ns);
178 put_net(ns->net_ns); 167 put_net(ns->net_ns);
179 kmem_cache_free(nsproxy_cachep, ns); 168 kmem_cache_free(nsproxy_cachep, ns);
180} 169}
@@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
189 int err = 0; 178 int err = 0;
190 179
191 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 180 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
192 CLONE_NEWUSER | CLONE_NEWNET))) 181 CLONE_NEWNET)))
193 return 0; 182 return 0;
194 183
195 if (!capable(CAP_SYS_ADMIN)) 184 if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d5088355bfe..13f06349a786 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26static unsigned long tainted_mask; 27static unsigned long tainted_mask;
@@ -321,36 +322,27 @@ void oops_exit(void)
321} 322}
322 323
323#ifdef WANT_WARN_ON_SLOWPATH 324#ifdef WANT_WARN_ON_SLOWPATH
324void warn_on_slowpath(const char *file, int line)
325{
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long) __builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 print_modules();
334 dump_stack();
335 print_oops_end_marker();
336 add_taint(TAINT_WARN);
337}
338EXPORT_SYMBOL(warn_on_slowpath);
339
340
341void warn_slowpath(const char *file, int line, const char *fmt, ...) 325void warn_slowpath(const char *file, int line, const char *fmt, ...)
342{ 326{
343 va_list args; 327 va_list args;
344 char function[KSYM_SYMBOL_LEN]; 328 char function[KSYM_SYMBOL_LEN];
345 unsigned long caller = (unsigned long)__builtin_return_address(0); 329 unsigned long caller = (unsigned long)__builtin_return_address(0);
330 const char *board;
331
346 sprint_symbol(function, caller); 332 sprint_symbol(function, caller);
347 333
348 printk(KERN_WARNING "------------[ cut here ]------------\n"); 334 printk(KERN_WARNING "------------[ cut here ]------------\n");
349 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 335 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
350 line, function); 336 line, function);
351 va_start(args, fmt); 337 board = dmi_get_system_info(DMI_PRODUCT_NAME);
352 vprintk(fmt, args); 338 if (board)
353 va_end(args); 339 printk(KERN_WARNING "Hardware name: %s\n", board);
340
341 if (fmt) {
342 va_start(args, fmt);
343 vprintk(fmt, args);
344 va_end(args);
345 }
354 346
355 print_modules(); 347 print_modules();
356 dump_stack(); 348 dump_stack();
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 895337b16a24..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
@@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
311 struct task_cputime cputime; 311 struct task_cputime cputime;
312 312
313 thread_group_cputime(p, &cputime); 313 thread_group_cputime(p, &cputime);
314 switch (which_clock) { 314 switch (CPUCLOCK_WHICH(which_clock)) {
315 default: 315 default:
316 return -EINVAL; 316 return -EINVAL;
317 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5e79c662294b..887c63787de6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
116 * must supply functions here, even if the function just returns 116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the 117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the 118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_process 119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code. 120 * fields are not modified by timer code.
121 * 121 *
122 * At this time all functions EXCEPT clock_nanosleep can be 122 * At this time all functions EXCEPT clock_nanosleep can be
@@ -197,6 +197,11 @@ static int common_timer_create(struct k_itimer *new_timer)
197 return 0; 197 return 0;
198} 198}
199 199
200static int no_timer_create(struct k_itimer *new_timer)
201{
202 return -EOPNOTSUPP;
203}
204
200/* 205/*
201 * Return nonzero if we know a priori this clockid_t value is bogus. 206 * Return nonzero if we know a priori this clockid_t value is bogus.
202 */ 207 */
@@ -248,6 +253,7 @@ static __init int init_posix_timers(void)
248 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
249 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
250 .clock_set = do_posix_clock_nosettime, 255 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create,
251 }; 257 };
252 258
253 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 259 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -313,7 +319,8 @@ void do_schedule_next_timer(struct siginfo *info)
313 319
314int posix_timer_event(struct k_itimer *timr, int si_private) 320int posix_timer_event(struct k_itimer *timr, int si_private)
315{ 321{
316 int shared, ret; 322 struct task_struct *task;
323 int shared, ret = -1;
317 /* 324 /*
318 * FIXME: if ->sigq is queued we can race with 325 * FIXME: if ->sigq is queued we can race with
319 * dequeue_signal()->do_schedule_next_timer(). 326 * dequeue_signal()->do_schedule_next_timer().
@@ -327,8 +334,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
327 */ 334 */
328 timr->sigq->info.si_sys_private = si_private; 335 timr->sigq->info.si_sys_private = si_private;
329 336
330 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); 337 rcu_read_lock();
331 ret = send_sigqueue(timr->sigq, timr->it_process, shared); 338 task = pid_task(timr->it_pid, PIDTYPE_PID);
339 if (task) {
340 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
341 ret = send_sigqueue(timr->sigq, task, shared);
342 }
343 rcu_read_unlock();
332 /* If we failed to send the signal the timer stops. */ 344 /* If we failed to send the signal the timer stops. */
333 return ret > 0; 345 return ret > 0;
334} 346}
@@ -405,7 +417,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
405 return ret; 417 return ret;
406} 418}
407 419
408static struct task_struct * good_sigevent(sigevent_t * event) 420static struct pid *good_sigevent(sigevent_t * event)
409{ 421{
410 struct task_struct *rtn = current->group_leader; 422 struct task_struct *rtn = current->group_leader;
411 423
@@ -419,7 +431,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
419 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) 431 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
420 return NULL; 432 return NULL;
421 433
422 return rtn; 434 return task_pid(rtn);
423} 435}
424 436
425void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 437void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -458,6 +470,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
458 idr_remove(&posix_timers_id, tmr->it_id); 470 idr_remove(&posix_timers_id, tmr->it_id);
459 spin_unlock_irqrestore(&idr_lock, flags); 471 spin_unlock_irqrestore(&idr_lock, flags);
460 } 472 }
473 put_pid(tmr->it_pid);
461 sigqueue_free(tmr->sigq); 474 sigqueue_free(tmr->sigq);
462 kmem_cache_free(posix_timers_cache, tmr); 475 kmem_cache_free(posix_timers_cache, tmr);
463} 476}
@@ -471,7 +484,6 @@ sys_timer_create(const clockid_t which_clock,
471{ 484{
472 struct k_itimer *new_timer; 485 struct k_itimer *new_timer;
473 int error, new_timer_id; 486 int error, new_timer_id;
474 struct task_struct *process;
475 sigevent_t event; 487 sigevent_t event;
476 int it_id_set = IT_ID_NOT_SET; 488 int it_id_set = IT_ID_NOT_SET;
477 489
@@ -525,11 +537,9 @@ sys_timer_create(const clockid_t which_clock,
525 goto out; 537 goto out;
526 } 538 }
527 rcu_read_lock(); 539 rcu_read_lock();
528 process = good_sigevent(&event); 540 new_timer->it_pid = get_pid(good_sigevent(&event));
529 if (process)
530 get_task_struct(process);
531 rcu_read_unlock(); 541 rcu_read_unlock();
532 if (!process) { 542 if (!new_timer->it_pid) {
533 error = -EINVAL; 543 error = -EINVAL;
534 goto out; 544 goto out;
535 } 545 }
@@ -537,8 +547,7 @@ sys_timer_create(const clockid_t which_clock,
537 event.sigev_notify = SIGEV_SIGNAL; 547 event.sigev_notify = SIGEV_SIGNAL;
538 event.sigev_signo = SIGALRM; 548 event.sigev_signo = SIGALRM;
539 event.sigev_value.sival_int = new_timer->it_id; 549 event.sigev_value.sival_int = new_timer->it_id;
540 process = current->group_leader; 550 new_timer->it_pid = get_pid(task_tgid(current));
541 get_task_struct(process);
542 } 551 }
543 552
544 new_timer->it_sigev_notify = event.sigev_notify; 553 new_timer->it_sigev_notify = event.sigev_notify;
@@ -548,7 +557,7 @@ sys_timer_create(const clockid_t which_clock,
548 new_timer->sigq->info.si_code = SI_TIMER; 557 new_timer->sigq->info.si_code = SI_TIMER;
549 558
550 spin_lock_irq(&current->sighand->siglock); 559 spin_lock_irq(&current->sighand->siglock);
551 new_timer->it_process = process; 560 new_timer->it_signal = current->signal;
552 list_add(&new_timer->list, &current->signal->posix_timers); 561 list_add(&new_timer->list, &current->signal->posix_timers);
553 spin_unlock_irq(&current->sighand->siglock); 562 spin_unlock_irq(&current->sighand->siglock);
554 563
@@ -583,8 +592,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
583 timr = idr_find(&posix_timers_id, (int)timer_id); 592 timr = idr_find(&posix_timers_id, (int)timer_id);
584 if (timr) { 593 if (timr) {
585 spin_lock(&timr->it_lock); 594 spin_lock(&timr->it_lock);
586 if (timr->it_process && 595 if (timr->it_signal == current->signal) {
587 same_thread_group(timr->it_process, current)) {
588 spin_unlock(&idr_lock); 596 spin_unlock(&idr_lock);
589 return timr; 597 return timr;
590 } 598 }
@@ -831,8 +839,7 @@ retry_delete:
831 * This keeps any tasks waiting on the spin lock from thinking 839 * This keeps any tasks waiting on the spin lock from thinking
832 * they got something (see the lock code above). 840 * they got something (see the lock code above).
833 */ 841 */
834 put_task_struct(timer->it_process); 842 timer->it_signal = NULL;
835 timer->it_process = NULL;
836 843
837 unlock_timer(timer, flags); 844 unlock_timer(timer, flags);
838 release_posix_timer(timer, IT_ID_SET); 845 release_posix_timer(timer, IT_ID_SET);
@@ -858,8 +865,7 @@ retry_delete:
858 * This keeps any tasks waiting on the spin lock from thinking 865 * This keeps any tasks waiting on the spin lock from thinking
859 * they got something (see the lock code above). 866 * they got something (see the lock code above).
860 */ 867 */
861 put_task_struct(timer->it_process); 868 timer->it_signal = NULL;
862 timer->it_process = NULL;
863 869
864 unlock_timer(timer, flags); 870 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 871 release_posix_timer(timer, IT_ID_SET);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746f..f77d3819ef57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
257 256
258int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
259{ 258{
260 int error, ftrace_save; 259 int error;
261 260
262 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
263 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
269 goto Close; 268 goto Close;
270 269
271 suspend_console(); 270 suspend_console();
272 ftrace_save = __ftrace_enabled_save();
273 error = device_suspend(PMSG_FREEZE); 271 error = device_suspend(PMSG_FREEZE);
274 if (error) 272 if (error)
275 goto Recover_platform; 273 goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
299 Resume_devices: 297 Resume_devices:
300 device_resume(in_suspend ? 298 device_resume(in_suspend ?
301 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
302 __ftrace_enabled_restore(ftrace_save);
303 resume_console(); 300 resume_console();
304 Close: 301 Close:
305 platform_end(platform_mode); 302 platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
370 367
371int hibernation_restore(int platform_mode) 368int hibernation_restore(int platform_mode)
372{ 369{
373 int error, ftrace_save; 370 int error;
374 371
375 pm_prepare_console(); 372 pm_prepare_console();
376 suspend_console(); 373 suspend_console();
377 ftrace_save = __ftrace_enabled_save();
378 error = device_suspend(PMSG_QUIESCE); 374 error = device_suspend(PMSG_QUIESCE);
379 if (error) 375 if (error)
380 goto Finish; 376 goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
389 platform_restore_cleanup(platform_mode); 385 platform_restore_cleanup(platform_mode);
390 device_resume(PMSG_RECOVER); 386 device_resume(PMSG_RECOVER);
391 Finish: 387 Finish:
392 __ftrace_enabled_restore(ftrace_save);
393 resume_console(); 388 resume_console();
394 pm_restore_console(); 389 pm_restore_console();
395 return error; 390 return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
402 397
403int hibernation_platform_enter(void) 398int hibernation_platform_enter(void)
404{ 399{
405 int error, ftrace_save; 400 int error;
406 401
407 if (!hibernation_ops) 402 if (!hibernation_ops)
408 return -ENOSYS; 403 return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
417 goto Close; 412 goto Close;
418 413
419 suspend_console(); 414 suspend_console();
420 ftrace_save = __ftrace_enabled_save();
421 error = device_suspend(PMSG_HIBERNATE); 415 error = device_suspend(PMSG_HIBERNATE);
422 if (error) { 416 if (error) {
423 if (hibernation_ops->recover) 417 if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
452 hibernation_ops->finish(); 446 hibernation_ops->finish();
453 Resume_devices: 447 Resume_devices:
454 device_resume(PMSG_RESTORE); 448 device_resume(PMSG_RESTORE);
455 __ftrace_enabled_restore(ftrace_save);
456 resume_console(); 449 resume_console();
457 Close: 450 Close:
458 hibernation_ops->end(); 451 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b8f7ce9473e8..613f16941b85 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/vmstat.h> 23#include <linux/vmstat.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
317 */ 316 */
318int suspend_devices_and_enter(suspend_state_t state) 317int suspend_devices_and_enter(suspend_state_t state)
319{ 318{
320 int error, ftrace_save; 319 int error;
321 320
322 if (!suspend_ops) 321 if (!suspend_ops)
323 return -ENOSYS; 322 return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
328 goto Close; 327 goto Close;
329 } 328 }
330 suspend_console(); 329 suspend_console();
331 ftrace_save = __ftrace_enabled_save();
332 suspend_test_start(); 330 suspend_test_start();
333 error = device_suspend(PMSG_SUSPEND); 331 error = device_suspend(PMSG_SUSPEND);
334 if (error) { 332 if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
360 suspend_test_start(); 358 suspend_test_start();
361 device_resume(PMSG_RESUME); 359 device_resume(PMSG_RESUME);
362 suspend_test_finish("resume devices"); 360 suspend_test_finish("resume devices");
363 __ftrace_enabled_restore(ftrace_save);
364 resume_console(); 361 resume_console();
365 Close: 362 Close:
366 if (suspend_ops->end) 363 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b7713b53d07a..6da14358537c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode)
633 return; 633 return;
634 } 634 }
635 635
636 blkdev_put(resume_bdev, mode); /* move up */ 636 blkdev_put(resume_bdev, mode);
637} 637}
638 638
639static int swsusp_header_init(void) 639static int swsusp_header_init(void)
diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d77..e651ab05655f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
662 if (recursion_bug) { 662 if (recursion_bug) {
663 recursion_bug = 0; 663 recursion_bug = 0;
664 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
665 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
diff --git a/kernel/profile.c b/kernel/profile.c
index dc41827fbfee..60adefb59b5e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
544}; 544};
545 545
546#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
547static inline void profile_nop(void *unused) 547static void profile_nop(void *unused)
548{ 548{
549} 549}
550 550
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4c8bcd7dd8e0..29dc700e198c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,17 @@
25#include <asm/pgtable.h> 25#include <asm/pgtable.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28
29/*
30 * Initialize a new task whose father had been ptraced.
31 *
32 * Called from copy_process().
33 */
34void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
35{
36 arch_ptrace_fork(child, clone_flags);
37}
38
28/* 39/*
29 * ptrace a task: make the debugger its new parent and 40 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 41 * move it to the ptrace list.
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
72 child->parent = child->real_parent; 83 child->parent = child->real_parent;
73 list_del_init(&child->ptrace_entry); 84 list_del_init(&child->ptrace_entry);
74 85
86 arch_ptrace_untrace(child);
75 if (task_is_traced(child)) 87 if (task_is_traced(child))
76 ptrace_untrace(child); 88 ptrace_untrace(child);
77} 89}
@@ -115,6 +127,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
115 127
116int __ptrace_may_access(struct task_struct *task, unsigned int mode) 128int __ptrace_may_access(struct task_struct *task, unsigned int mode)
117{ 129{
130 const struct cred *cred = current_cred(), *tcred;
131
118 /* May we inspect the given task? 132 /* May we inspect the given task?
119 * This check is used both for attaching with ptrace 133 * This check is used both for attaching with ptrace
120 * and for allowing access to sensitive information in /proc. 134 * and for allowing access to sensitive information in /proc.
@@ -127,13 +141,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
127 /* Don't let security modules deny introspection */ 141 /* Don't let security modules deny introspection */
128 if (task == current) 142 if (task == current)
129 return 0; 143 return 0;
130 if (((current->uid != task->euid) || 144 rcu_read_lock();
131 (current->uid != task->suid) || 145 tcred = __task_cred(task);
132 (current->uid != task->uid) || 146 if ((cred->uid != tcred->euid ||
133 (current->gid != task->egid) || 147 cred->uid != tcred->suid ||
134 (current->gid != task->sgid) || 148 cred->uid != tcred->uid ||
135 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 149 cred->gid != tcred->egid ||
150 cred->gid != tcred->sgid ||
151 cred->gid != tcred->gid) &&
152 !capable(CAP_SYS_PTRACE)) {
153 rcu_read_unlock();
136 return -EPERM; 154 return -EPERM;
155 }
156 rcu_read_unlock();
137 smp_rmb(); 157 smp_rmb();
138 if (task->mm) 158 if (task->mm)
139 dumpable = get_dumpable(task->mm); 159 dumpable = get_dumpable(task->mm);
@@ -163,6 +183,14 @@ int ptrace_attach(struct task_struct *task)
163 if (same_thread_group(task, current)) 183 if (same_thread_group(task, current))
164 goto out; 184 goto out;
165 185
186 /* Protect exec's credential calculations against our interference;
187 * SUID, SGID and LSM creds get determined differently under ptrace.
188 */
189 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
190 if (retval < 0)
191 goto out;
192
193 retval = -EPERM;
166repeat: 194repeat:
167 /* 195 /*
168 * Nasty, nasty. 196 * Nasty, nasty.
@@ -202,6 +230,7 @@ repeat:
202bad: 230bad:
203 write_unlock_irqrestore(&tasklist_lock, flags); 231 write_unlock_irqrestore(&tasklist_lock, flags);
204 task_unlock(task); 232 task_unlock(task);
233 mutex_unlock(&current->cred_exec_mutex);
205out: 234out:
206 return retval; 235 return retval;
207} 236}
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..e503a002f330 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..04982659875a 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
551 } 551 }
552} 552}
553 553
554void rcu_nmi_enter(void)
555{
556 rcu_irq_enter();
557}
558
559void rcu_nmi_exit(void)
560{
561 rcu_irq_exit();
562}
563
554static void dyntick_save_progress_counter(int cpu) 564static void dyntick_save_progress_counter(int cpu)
555{ 565{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 566 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a55..b31065522104 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,30 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 if (signal_pending(current)) {
155 mutex_lock(&fullstop_mutex);
156 if (!ACCESS_ONCE(fullstop))
157 fullstop = FULLSTOP_SIGNALED;
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE;
161}
162
139/* 163/*
140 * Allocate an element from the rcu_tortures pool. 164 * Allocate an element from the rcu_tortures pool.
141 */ 165 */
@@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 223static void
200rcu_stutter_wait(void) 224rcu_stutter_wait(void)
201{ 225{
202 while (stutter_pause_test || !rcutorture_runnable) 226 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 227 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 228 schedule_timeout_interruptible(1);
205 else 229 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 230 schedule_timeout_interruptible(round_jiffies_relative(HZ));
231 }
207} 232}
208 233
209/* 234/*
@@ -599,7 +624,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 624 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 625 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
603 schedule_timeout_uninterruptible(1); 628 schedule_timeout_uninterruptible(1);
604 return 0; 629 return 0;
605} 630}
@@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 649 } while (!kthread_should_stop() && !fullstop);
625 650
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
628 schedule_timeout_uninterruptible(1); 653 schedule_timeout_uninterruptible(1);
629 return 0; 654 return 0;
630} 655}
@@ -734,7 +759,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 760 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 761 del_timer_sync(&t);
737 while (!kthread_should_stop()) 762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
738 schedule_timeout_uninterruptible(1); 763 schedule_timeout_uninterruptible(1);
739 return 0; 764 return 0;
740} 765}
@@ -831,7 +856,7 @@ rcu_torture_stats(void *arg)
831 do { 856 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 857 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 858 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 859 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 860 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 861 return 0;
837} 862}
@@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg)
899 do { 924 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 925 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 926 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 927 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 928 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 929 return 0;
905} 930}
@@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg)
914 do { 939 do {
915 schedule_timeout_interruptible(stutter * HZ); 940 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 941 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 942 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 943 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 944 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 945 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 946 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 947 return 0;
923} 948}
@@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 959 stutter, irqreader);
935} 960}
936 961
962static struct notifier_block rcutorture_nb = {
963 .notifier_call = rcutorture_shutdown_notify,
964};
965
937static void 966static void
938rcu_torture_cleanup(void) 967rcu_torture_cleanup(void)
939{ 968{
940 int i; 969 int i;
941 970
942 fullstop = 1; 971 mutex_lock(&fullstop_mutex);
972 if (!fullstop) {
973 /* If being signaled, let it happen, then exit. */
974 mutex_unlock(&fullstop_mutex);
975 schedule_timeout_interruptible(10 * HZ);
976 if (cur_ops->cb_barrier != NULL)
977 cur_ops->cb_barrier();
978 return;
979 }
980 fullstop = FULLSTOP_CLEANUP;
981 mutex_unlock(&fullstop_mutex);
982 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 983 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 984 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 985 kthread_stop(stutter_task);
@@ -1015,6 +1055,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1055 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1056 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1057
1058 mutex_lock(&fullstop_mutex);
1059
1018 /* Process args and tell the world that the torturer is on the job. */ 1060 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1061 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1062 cur_ops = torture_ops[i];
@@ -1024,6 +1066,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1066 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1067 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1068 torture_type);
1069 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1070 return (-EINVAL);
1028 } 1071 }
1029 if (cur_ops->init) 1072 if (cur_ops->init)
@@ -1146,9 +1189,12 @@ rcu_torture_init(void)
1146 goto unwind; 1189 goto unwind;
1147 } 1190 }
1148 } 1191 }
1192 register_reboot_notifier(&rcutorture_nb);
1193 mutex_unlock(&fullstop_mutex);
1149 return 0; 1194 return 0;
1150 1195
1151unwind: 1196unwind:
1197 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1198 rcu_torture_cleanup();
1153 return firsterr; 1199 return firsterr;
1154} 1200}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..a342b032112c
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
83#endif /* #ifdef CONFIG_NO_HZ */
84
85static int blimit = 10; /* Maximum callbacks per softirq. */
86static int qhimark = 10000; /* If this many pending, ignore blimit. */
87static int qlowmark = 100; /* Once only this many pending, use blimit. */
88
89static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
90
91/*
92 * Return the number of RCU batches processed thus far for debug & stats.
93 */
94long rcu_batches_completed(void)
95{
96 return rcu_state.completed;
97}
98EXPORT_SYMBOL_GPL(rcu_batches_completed);
99
100/*
101 * Return the number of RCU BH batches processed thus far for debug & stats.
102 */
103long rcu_batches_completed_bh(void)
104{
105 return rcu_bh_state.completed;
106}
107EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
108
109/*
110 * Does the CPU have callbacks ready to be invoked?
111 */
112static int
113cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
114{
115 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
116}
117
118/*
119 * Does the current CPU require a yet-as-unscheduled grace period?
120 */
121static int
122cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
123{
124 /* ACCESS_ONCE() because we are accessing outside of lock. */
125 return *rdp->nxttail[RCU_DONE_TAIL] &&
126 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
127}
128
129/*
130 * Return the root node of the specified rcu_state structure.
131 */
132static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
133{
134 return &rsp->node[0];
135}
136
137#ifdef CONFIG_SMP
138
139/*
140 * If the specified CPU is offline, tell the caller that it is in
141 * a quiescent state. Otherwise, whack it with a reschedule IPI.
142 * Grace periods can end up waiting on an offline CPU when that
143 * CPU is in the process of coming online -- it will be added to the
144 * rcu_node bitmasks before it actually makes it online. The same thing
145 * can happen while a CPU is in the process of coming online. Because this
146 * race is quite rare, we check for it after detecting that the grace
147 * period has been delayed rather than checking each and every CPU
148 * each and every time we start a new grace period.
149 */
150static int rcu_implicit_offline_qs(struct rcu_data *rdp)
151{
152 /*
153 * If the CPU is offline, it is in a quiescent state. We can
154 * trust its state not to change because interrupts are disabled.
155 */
156 if (cpu_is_offline(rdp->cpu)) {
157 rdp->offline_fqs++;
158 return 1;
159 }
160
161 /* The CPU is online, so send it a reschedule IPI. */
162 if (rdp->cpu != smp_processor_id())
163 smp_send_reschedule(rdp->cpu);
164 else
165 set_need_resched();
166 rdp->resched_ipi++;
167 return 0;
168}
169
170#endif /* #ifdef CONFIG_SMP */
171
172#ifdef CONFIG_NO_HZ
173static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
174
175/**
176 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
177 *
178 * Enter nohz mode, in other words, -leave- the mode in which RCU
179 * read-side critical sections can occur. (Though RCU read-side
180 * critical sections can occur in irq handlers in nohz mode, a possibility
181 * handled by rcu_irq_enter() and rcu_irq_exit()).
182 */
183void rcu_enter_nohz(void)
184{
185 unsigned long flags;
186 struct rcu_dynticks *rdtp;
187
188 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
189 local_irq_save(flags);
190 rdtp = &__get_cpu_var(rcu_dynticks);
191 rdtp->dynticks++;
192 rdtp->dynticks_nesting--;
193 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
194 local_irq_restore(flags);
195}
196
197/*
198 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
199 *
200 * Exit nohz mode, in other words, -enter- the mode in which RCU
201 * read-side critical sections normally occur.
202 */
203void rcu_exit_nohz(void)
204{
205 unsigned long flags;
206 struct rcu_dynticks *rdtp;
207
208 local_irq_save(flags);
209 rdtp = &__get_cpu_var(rcu_dynticks);
210 rdtp->dynticks++;
211 rdtp->dynticks_nesting++;
212 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
213 local_irq_restore(flags);
214 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
215}
216
217/**
218 * rcu_nmi_enter - inform RCU of entry to NMI context
219 *
220 * If the CPU was idle with dynamic ticks active, and there is no
221 * irq handler running, this updates rdtp->dynticks_nmi to let the
222 * RCU grace-period handling know that the CPU is active.
223 */
224void rcu_nmi_enter(void)
225{
226 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
227
228 if (rdtp->dynticks & 0x1)
229 return;
230 rdtp->dynticks_nmi++;
231 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
232 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
233}
234
235/**
236 * rcu_nmi_exit - inform RCU of exit from NMI context
237 *
238 * If the CPU was idle with dynamic ticks active, and there is no
239 * irq handler running, this updates rdtp->dynticks_nmi to let the
240 * RCU grace-period handling know that the CPU is no longer active.
241 */
242void rcu_nmi_exit(void)
243{
244 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
245
246 if (rdtp->dynticks & 0x1)
247 return;
248 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
249 rdtp->dynticks_nmi++;
250 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
251}
252
253/**
254 * rcu_irq_enter - inform RCU of entry to hard irq context
255 *
256 * If the CPU was idle with dynamic ticks active, this updates the
257 * rdtp->dynticks to let the RCU handling know that the CPU is active.
258 */
259void rcu_irq_enter(void)
260{
261 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
262
263 if (rdtp->dynticks_nesting++)
264 return;
265 rdtp->dynticks++;
266 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
267 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
268}
269
270/**
271 * rcu_irq_exit - inform RCU of exit from hard irq context
272 *
273 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
274 * to put let the RCU handling be aware that the CPU is going back to idle
275 * with no ticks.
276 */
277void rcu_irq_exit(void)
278{
279 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
280
281 if (--rdtp->dynticks_nesting)
282 return;
283 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
284 rdtp->dynticks++;
285 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
286
287 /* If the interrupt queued a callback, get out of dyntick mode. */
288 if (__get_cpu_var(rcu_data).nxtlist ||
289 __get_cpu_var(rcu_bh_data).nxtlist)
290 set_need_resched();
291}
292
293/*
294 * Record the specified "completed" value, which is later used to validate
295 * dynticks counter manipulations. Specify "rsp->completed - 1" to
296 * unconditionally invalidate any future dynticks manipulations (which is
297 * useful at the beginning of a grace period).
298 */
299static void dyntick_record_completed(struct rcu_state *rsp, long comp)
300{
301 rsp->dynticks_completed = comp;
302}
303
304#ifdef CONFIG_SMP
305
306/*
307 * Recall the previously recorded value of the completion for dynticks.
308 */
309static long dyntick_recall_completed(struct rcu_state *rsp)
310{
311 return rsp->dynticks_completed;
312}
313
314/*
315 * Snapshot the specified CPU's dynticks counter so that we can later
316 * credit them with an implicit quiescent state. Return 1 if this CPU
317 * is already in a quiescent state courtesy of dynticks idle mode.
318 */
319static int dyntick_save_progress_counter(struct rcu_data *rdp)
320{
321 int ret;
322 int snap;
323 int snap_nmi;
324
325 snap = rdp->dynticks->dynticks;
326 snap_nmi = rdp->dynticks->dynticks_nmi;
327 smp_mb(); /* Order sampling of snap with end of grace period. */
328 rdp->dynticks_snap = snap;
329 rdp->dynticks_nmi_snap = snap_nmi;
330 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
331 if (ret)
332 rdp->dynticks_fqs++;
333 return ret;
334}
335
336/*
337 * Return true if the specified CPU has passed through a quiescent
338 * state by virtue of being in or having passed through an dynticks
339 * idle state since the last call to dyntick_save_progress_counter()
340 * for this same CPU.
341 */
342static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
343{
344 long curr;
345 long curr_nmi;
346 long snap;
347 long snap_nmi;
348
349 curr = rdp->dynticks->dynticks;
350 snap = rdp->dynticks_snap;
351 curr_nmi = rdp->dynticks->dynticks_nmi;
352 snap_nmi = rdp->dynticks_nmi_snap;
353 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
354
355 /*
356 * If the CPU passed through or entered a dynticks idle phase with
357 * no active irq/NMI handlers, then we can safely pretend that the CPU
358 * already acknowledged the request to pass through a quiescent
359 * state. Either way, that CPU cannot possibly be in an RCU
360 * read-side critical section that started before the beginning
361 * of the current RCU grace period.
362 */
363 if ((curr != snap || (curr & 0x1) == 0) &&
364 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
365 rdp->dynticks_fqs++;
366 return 1;
367 }
368
369 /* Go check for the CPU being offline. */
370 return rcu_implicit_offline_qs(rdp);
371}
372
373#endif /* #ifdef CONFIG_SMP */
374
375#else /* #ifdef CONFIG_NO_HZ */
376
377static void dyntick_record_completed(struct rcu_state *rsp, long comp)
378{
379}
380
381#ifdef CONFIG_SMP
382
383/*
384 * If there are no dynticks, then the only way that a CPU can passively
385 * be in a quiescent state is to be offline. Unlike dynticks idle, which
386 * is a point in time during the prior (already finished) grace period,
387 * an offline CPU is always in a quiescent state, and thus can be
388 * unconditionally applied. So just return the current value of completed.
389 */
390static long dyntick_recall_completed(struct rcu_state *rsp)
391{
392 return rsp->completed;
393}
394
395static int dyntick_save_progress_counter(struct rcu_data *rdp)
396{
397 return 0;
398}
399
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{
402 return rcu_implicit_offline_qs(rdp);
403}
404
405#endif /* #ifdef CONFIG_SMP */
406
407#endif /* #else #ifdef CONFIG_NO_HZ */
408
409#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
410
411static void record_gp_stall_check_time(struct rcu_state *rsp)
412{
413 rsp->gp_start = jiffies;
414 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
415}
416
417static void print_other_cpu_stall(struct rcu_state *rsp)
418{
419 int cpu;
420 long delta;
421 unsigned long flags;
422 struct rcu_node *rnp = rcu_get_root(rsp);
423 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
424 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
425
426 /* Only let one CPU complain about others per time interval. */
427
428 spin_lock_irqsave(&rnp->lock, flags);
429 delta = jiffies - rsp->jiffies_stall;
430 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
431 spin_unlock_irqrestore(&rnp->lock, flags);
432 return;
433 }
434 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
435 spin_unlock_irqrestore(&rnp->lock, flags);
436
437 /* OK, time to rat on our buddy... */
438
439 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
440 for (; rnp_cur < rnp_end; rnp_cur++) {
441 if (rnp_cur->qsmask == 0)
442 continue;
443 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
444 if (rnp_cur->qsmask & (1UL << cpu))
445 printk(" %d", rnp_cur->grplo + cpu);
446 }
447 printk(" (detected by %d, t=%ld jiffies)\n",
448 smp_processor_id(), (long)(jiffies - rsp->gp_start));
449 force_quiescent_state(rsp, 0); /* Kick them all. */
450}
451
452static void print_cpu_stall(struct rcu_state *rsp)
453{
454 unsigned long flags;
455 struct rcu_node *rnp = rcu_get_root(rsp);
456
457 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
458 smp_processor_id(), jiffies - rsp->gp_start);
459 dump_stack();
460 spin_lock_irqsave(&rnp->lock, flags);
461 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
462 rsp->jiffies_stall =
463 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
464 spin_unlock_irqrestore(&rnp->lock, flags);
465 set_need_resched(); /* kick ourselves to get things going. */
466}
467
468static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
469{
470 long delta;
471 struct rcu_node *rnp;
472
473 delta = jiffies - rsp->jiffies_stall;
474 rnp = rdp->mynode;
475 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
476
477 /* We haven't checked in, so go dump stack. */
478 print_cpu_stall(rsp);
479
480 } else if (rsp->gpnum != rsp->completed &&
481 delta >= RCU_STALL_RAT_DELAY) {
482
483 /* They had two time units to dump stack, so complain. */
484 print_other_cpu_stall(rsp);
485 }
486}
487
488#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
489
490static void record_gp_stall_check_time(struct rcu_state *rsp)
491{
492}
493
494static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495{
496}
497
498#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
499
500/*
501 * Update CPU-local rcu_data state to record the newly noticed grace period.
502 * This is used both when we started the grace period and when we notice
503 * that someone else started the grace period.
504 */
505static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
506{
507 rdp->qs_pending = 1;
508 rdp->passed_quiesc = 0;
509 rdp->gpnum = rsp->gpnum;
510 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
511 RCU_JIFFIES_TILL_FORCE_QS;
512}
513
514/*
515 * Did someone else start a new RCU grace period start since we last
516 * checked? Update local state appropriately if so. Must be called
517 * on the CPU corresponding to rdp.
518 */
519static int
520check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
521{
522 unsigned long flags;
523 int ret = 0;
524
525 local_irq_save(flags);
526 if (rdp->gpnum != rsp->gpnum) {
527 note_new_gpnum(rsp, rdp);
528 ret = 1;
529 }
530 local_irq_restore(flags);
531 return ret;
532}
533
534/*
535 * Start a new RCU grace period if warranted, re-initializing the hierarchy
536 * in preparation for detecting the next grace period. The caller must hold
537 * the root node's ->lock, which is released before return. Hard irqs must
538 * be disabled.
539 */
540static void
541rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
542 __releases(rcu_get_root(rsp)->lock)
543{
544 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
545 struct rcu_node *rnp = rcu_get_root(rsp);
546 struct rcu_node *rnp_cur;
547 struct rcu_node *rnp_end;
548
549 if (!cpu_needs_another_gp(rsp, rdp)) {
550 spin_unlock_irqrestore(&rnp->lock, flags);
551 return;
552 }
553
554 /* Advance to a new grace period and initialize state. */
555 rsp->gpnum++;
556 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
557 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
558 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
559 RCU_JIFFIES_TILL_FORCE_QS;
560 record_gp_stall_check_time(rsp);
561 dyntick_record_completed(rsp, rsp->completed - 1);
562 note_new_gpnum(rsp, rdp);
563
564 /*
565 * Because we are first, we know that all our callbacks will
566 * be covered by this upcoming grace period, even the ones
567 * that were registered arbitrarily recently.
568 */
569 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
570 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
571
572 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit;
575 spin_unlock_irqrestore(&rnp->lock, flags);
576 return;
577 }
578
579 spin_unlock(&rnp->lock); /* leave irqs disabled. */
580
581
582 /* Exclude any concurrent CPU-hotplug operations. */
583 spin_lock(&rsp->onofflock); /* irqs already disabled. */
584
585 /*
586 * Set the quiescent-state-needed bits in all the non-leaf RCU
587 * nodes for all currently online CPUs. This operation relies
588 * on the layout of the hierarchy within the rsp->node[] array.
589 * Note that other CPUs will access only the leaves of the
590 * hierarchy, which still indicate that no grace period is in
591 * progress. In addition, we have excluded CPU-hotplug operations.
592 *
593 * We therefore do not need to hold any locks. Any required
594 * memory barriers will be supplied by the locks guarding the
595 * leaf rcu_nodes in the hierarchy.
596 */
597
598 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
599 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
600 rnp_cur->qsmask = rnp_cur->qsmaskinit;
601
602 /*
603 * Now set up the leaf nodes. Here we must be careful. First,
604 * we need to hold the lock in order to exclude other CPUs, which
605 * might be contending for the leaf nodes' locks. Second, as
606 * soon as we initialize a given leaf node, its CPUs might run
607 * up the rest of the hierarchy. We must therefore acquire locks
608 * for each node that we touch during this stage. (But we still
609 * are excluding CPU-hotplug operations.)
610 *
611 * Note that the grace period cannot complete until we finish
612 * the initialization process, as there will be at least one
613 * qsmask bit set in the root node until that time, namely the
614 * one corresponding to this CPU.
615 */
616 rnp_end = &rsp->node[NUM_RCU_NODES];
617 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
618 for (; rnp_cur < rnp_end; rnp_cur++) {
619 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
622 }
623
624 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
625 spin_unlock_irqrestore(&rsp->onofflock, flags);
626}
627
628/*
629 * Advance this CPU's callbacks, but only if the current grace period
630 * has ended. This may be called only from the CPU to whom the rdp
631 * belongs.
632 */
633static void
634rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
635{
636 long completed_snap;
637 unsigned long flags;
638
639 local_irq_save(flags);
640 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
641
642 /* Did another grace period end? */
643 if (rdp->completed != completed_snap) {
644
645 /* Advance callbacks. No harm if list empty. */
646 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
647 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
648 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
649
650 /* Remember that we saw this grace-period completion. */
651 rdp->completed = completed_snap;
652 }
653 local_irq_restore(flags);
654}
655
656/*
657 * Similar to cpu_quiet(), for which it is a helper function. Allows
658 * a group of CPUs to be quieted at one go, though all the CPUs in the
659 * group must be represented by the same leaf rcu_node structure.
660 * That structure's lock must be held upon entry, and it is released
661 * before return.
662 */
663static void
664cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
665 unsigned long flags)
666 __releases(rnp->lock)
667{
668 /* Walk up the rcu_node hierarchy. */
669 for (;;) {
670 if (!(rnp->qsmask & mask)) {
671
672 /* Our bit has already been cleared, so done. */
673 spin_unlock_irqrestore(&rnp->lock, flags);
674 return;
675 }
676 rnp->qsmask &= ~mask;
677 if (rnp->qsmask != 0) {
678
679 /* Other bits still set at this level, so done. */
680 spin_unlock_irqrestore(&rnp->lock, flags);
681 return;
682 }
683 mask = rnp->grpmask;
684 if (rnp->parent == NULL) {
685
686 /* No more levels. Exit loop holding root lock. */
687
688 break;
689 }
690 spin_unlock_irqrestore(&rnp->lock, flags);
691 rnp = rnp->parent;
692 spin_lock_irqsave(&rnp->lock, flags);
693 }
694
695 /*
696 * Get here if we are the last CPU to pass through a quiescent
697 * state for this grace period. Clean up and let rcu_start_gp()
698 * start up the next grace period if one is needed. Note that
699 * we still hold rnp->lock, as required by rcu_start_gp(), which
700 * will release it.
701 */
702 rsp->completed = rsp->gpnum;
703 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
704 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
705}
706
707/*
708 * Record a quiescent state for the specified CPU, which must either be
709 * the current CPU or an offline CPU. The lastcomp argument is used to
710 * make sure we are still in the grace period of interest. We don't want
711 * to end the current grace period based on quiescent states detected in
712 * an earlier grace period!
713 */
714static void
715cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
716{
717 unsigned long flags;
718 unsigned long mask;
719 struct rcu_node *rnp;
720
721 rnp = rdp->mynode;
722 spin_lock_irqsave(&rnp->lock, flags);
723 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
724
725 /*
726 * Someone beat us to it for this grace period, so leave.
727 * The race with GP start is resolved by the fact that we
728 * hold the leaf rcu_node lock, so that the per-CPU bits
729 * cannot yet be initialized -- so we would simply find our
730 * CPU's bit already cleared in cpu_quiet_msk() if this race
731 * occurred.
732 */
733 rdp->passed_quiesc = 0; /* try again later! */
734 spin_unlock_irqrestore(&rnp->lock, flags);
735 return;
736 }
737 mask = rdp->grpmask;
738 if ((rnp->qsmask & mask) == 0) {
739 spin_unlock_irqrestore(&rnp->lock, flags);
740 } else {
741 rdp->qs_pending = 0;
742
743 /*
744 * This GP can't end until cpu checks in, so all of our
745 * callbacks can be processed during the next GP.
746 */
747 rdp = rsp->rda[smp_processor_id()];
748 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
749
750 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
751 }
752}
753
754/*
755 * Check to see if there is a new grace period of which this CPU
756 * is not yet aware, and if so, set up local rcu_data state for it.
757 * Otherwise, see if this CPU has just passed through its first
758 * quiescent state for this grace period, and record that fact if so.
759 */
760static void
761rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
762{
763 /* If there is now a new grace period, record and return. */
764 if (check_for_new_grace_period(rsp, rdp))
765 return;
766
767 /*
768 * Does this CPU still need to do its part for current grace period?
769 * If no, return and let the other CPUs do their part as well.
770 */
771 if (!rdp->qs_pending)
772 return;
773
774 /*
775 * Was there a quiescent state since the beginning of the grace
776 * period? If no, then exit and wait for the next call.
777 */
778 if (!rdp->passed_quiesc)
779 return;
780
781 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
782 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
783}
784
785#ifdef CONFIG_HOTPLUG_CPU
786
787/*
788 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
789 * and move all callbacks from the outgoing CPU to the current one.
790 */
791static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
792{
793 int i;
794 unsigned long flags;
795 long lastcomp;
796 unsigned long mask;
797 struct rcu_data *rdp = rsp->rda[cpu];
798 struct rcu_data *rdp_me;
799 struct rcu_node *rnp;
800
801 /* Exclude any attempts to start a new grace period. */
802 spin_lock_irqsave(&rsp->onofflock, flags);
803
804 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
805 rnp = rdp->mynode;
806 mask = rdp->grpmask; /* rnp->grplo is constant. */
807 do {
808 spin_lock(&rnp->lock); /* irqs already disabled. */
809 rnp->qsmaskinit &= ~mask;
810 if (rnp->qsmaskinit != 0) {
811 spin_unlock(&rnp->lock); /* irqs already disabled. */
812 break;
813 }
814 mask = rnp->grpmask;
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 rnp = rnp->parent;
817 } while (rnp != NULL);
818 lastcomp = rsp->completed;
819
820 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
821
822 /* Being offline is a quiescent state, so go record it. */
823 cpu_quiet(cpu, rsp, rdp, lastcomp);
824
825 /*
826 * Move callbacks from the outgoing CPU to the running CPU.
827 * Note that the outgoing CPU is now quiscent, so it is now
828 * (uncharacteristically) safe to access it rcu_data structure.
829 * Note also that we must carefully retain the order of the
830 * outgoing CPU's callbacks in order for rcu_barrier() to work
831 * correctly. Finally, note that we start all the callbacks
832 * afresh, even those that have passed through a grace period
833 * and are therefore ready to invoke. The theory is that hotplug
834 * events are rare, and that if they are frequent enough to
835 * indefinitely delay callbacks, you have far worse things to
836 * be worrying about.
837 */
838 rdp_me = rsp->rda[smp_processor_id()];
839 if (rdp->nxtlist != NULL) {
840 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
841 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
842 rdp->nxtlist = NULL;
843 for (i = 0; i < RCU_NEXT_SIZE; i++)
844 rdp->nxttail[i] = &rdp->nxtlist;
845 rdp_me->qlen += rdp->qlen;
846 rdp->qlen = 0;
847 }
848 local_irq_restore(flags);
849}
850
851/*
852 * Remove the specified CPU from the RCU hierarchy and move any pending
853 * callbacks that it might have to the current CPU. This code assumes
854 * that at least one CPU in the system will remain running at all times.
855 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
856 */
857static void rcu_offline_cpu(int cpu)
858{
859 __rcu_offline_cpu(cpu, &rcu_state);
860 __rcu_offline_cpu(cpu, &rcu_bh_state);
861}
862
863#else /* #ifdef CONFIG_HOTPLUG_CPU */
864
865static void rcu_offline_cpu(int cpu)
866{
867}
868
869#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
870
871/*
872 * Invoke any RCU callbacks that have made it to the end of their grace
873 * period. Thottle as specified by rdp->blimit.
874 */
875static void rcu_do_batch(struct rcu_data *rdp)
876{
877 unsigned long flags;
878 struct rcu_head *next, *list, **tail;
879 int count;
880
881 /* If no callbacks are ready, just return.*/
882 if (!cpu_has_callbacks_ready_to_invoke(rdp))
883 return;
884
885 /*
886 * Extract the list of ready callbacks, disabling to prevent
887 * races with call_rcu() from interrupt handlers.
888 */
889 local_irq_save(flags);
890 list = rdp->nxtlist;
891 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
892 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
893 tail = rdp->nxttail[RCU_DONE_TAIL];
894 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
895 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
896 rdp->nxttail[count] = &rdp->nxtlist;
897 local_irq_restore(flags);
898
899 /* Invoke callbacks. */
900 count = 0;
901 while (list) {
902 next = list->next;
903 prefetch(next);
904 list->func(list);
905 list = next;
906 if (++count >= rdp->blimit)
907 break;
908 }
909
910 local_irq_save(flags);
911
912 /* Update count, and requeue any remaining callbacks. */
913 rdp->qlen -= count;
914 if (list != NULL) {
915 *tail = rdp->nxtlist;
916 rdp->nxtlist = list;
917 for (count = 0; count < RCU_NEXT_SIZE; count++)
918 if (&rdp->nxtlist == rdp->nxttail[count])
919 rdp->nxttail[count] = tail;
920 else
921 break;
922 }
923
924 /* Reinstate batch limit if we have worked down the excess. */
925 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
926 rdp->blimit = blimit;
927
928 local_irq_restore(flags);
929
930 /* Re-raise the RCU softirq if there are callbacks remaining. */
931 if (cpu_has_callbacks_ready_to_invoke(rdp))
932 raise_softirq(RCU_SOFTIRQ);
933}
934
935/*
936 * Check to see if this CPU is in a non-context-switch quiescent state
937 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
938 * Also schedule the RCU softirq handler.
939 *
940 * This function must be called with hardirqs disabled. It is normally
941 * invoked from the scheduling-clock interrupt. If rcu_pending returns
942 * false, there is no point in invoking rcu_check_callbacks().
943 */
944void rcu_check_callbacks(int cpu, int user)
945{
946 if (user ||
947 (idle_cpu(cpu) && !in_softirq() &&
948 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
949
950 /*
951 * Get here if this CPU took its interrupt from user
952 * mode or from the idle loop, and if this is not a
953 * nested interrupt. In this case, the CPU is in
954 * a quiescent state, so count it.
955 *
956 * No memory barrier is required here because both
957 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
958 * only CPU-local variables that other CPUs neither
959 * access nor modify, at least not while the corresponding
960 * CPU is online.
961 */
962
963 rcu_qsctr_inc(cpu);
964 rcu_bh_qsctr_inc(cpu);
965
966 } else if (!in_softirq()) {
967
968 /*
969 * Get here if this CPU did not take its interrupt from
970 * softirq, in other words, if it is not interrupting
971 * a rcu_bh read-side critical section. This is an _bh
972 * critical section, so count it.
973 */
974
975 rcu_bh_qsctr_inc(cpu);
976 }
977 raise_softirq(RCU_SOFTIRQ);
978}
979
980#ifdef CONFIG_SMP
981
982/*
983 * Scan the leaf rcu_node structures, processing dyntick state for any that
984 * have not yet encountered a quiescent state, using the function specified.
985 * Returns 1 if the current grace period ends while scanning (possibly
986 * because we made it end).
987 */
988static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
989 int (*f)(struct rcu_data *))
990{
991 unsigned long bit;
992 int cpu;
993 unsigned long flags;
994 unsigned long mask;
995 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
996 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
997
998 for (; rnp_cur < rnp_end; rnp_cur++) {
999 mask = 0;
1000 spin_lock_irqsave(&rnp_cur->lock, flags);
1001 if (rsp->completed != lastcomp) {
1002 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1003 return 1;
1004 }
1005 if (rnp_cur->qsmask == 0) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 continue;
1008 }
1009 cpu = rnp_cur->grplo;
1010 bit = 1;
1011 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1012 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1013 mask |= bit;
1014 }
1015 if (mask != 0 && rsp->completed == lastcomp) {
1016
1017 /* cpu_quiet_msk() releases rnp_cur->lock. */
1018 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1019 continue;
1020 }
1021 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1022 }
1023 return 0;
1024}
1025
1026/*
1027 * Force quiescent states on reluctant CPUs, and also detect which
1028 * CPUs are in dyntick-idle mode.
1029 */
1030static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1031{
1032 unsigned long flags;
1033 long lastcomp;
1034 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1035 struct rcu_node *rnp = rcu_get_root(rsp);
1036 u8 signaled;
1037
1038 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1039 return; /* No grace period in progress, nothing to force. */
1040 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1041 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1042 return; /* Someone else is already on the job. */
1043 }
1044 if (relaxed &&
1045 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1046 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1047 goto unlock_ret; /* no emergency and done recently. */
1048 rsp->n_force_qs++;
1049 spin_lock(&rnp->lock);
1050 lastcomp = rsp->completed;
1051 signaled = rsp->signaled;
1052 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1053 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1054 RCU_JIFFIES_TILL_FORCE_QS;
1055 if (lastcomp == rsp->gpnum) {
1056 rsp->n_force_qs_ngp++;
1057 spin_unlock(&rnp->lock);
1058 goto unlock_ret; /* no GP in progress, time updated. */
1059 }
1060 spin_unlock(&rnp->lock);
1061 switch (signaled) {
1062 case RCU_GP_INIT:
1063
1064 break; /* grace period still initializing, ignore. */
1065
1066 case RCU_SAVE_DYNTICK:
1067
1068 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1069 break; /* So gcc recognizes the dead code. */
1070
1071 /* Record dyntick-idle state. */
1072 if (rcu_process_dyntick(rsp, lastcomp,
1073 dyntick_save_progress_counter))
1074 goto unlock_ret;
1075
1076 /* Update state, record completion counter. */
1077 spin_lock(&rnp->lock);
1078 if (lastcomp == rsp->completed) {
1079 rsp->signaled = RCU_FORCE_QS;
1080 dyntick_record_completed(rsp, lastcomp);
1081 }
1082 spin_unlock(&rnp->lock);
1083 break;
1084
1085 case RCU_FORCE_QS:
1086
1087 /* Check dyntick-idle state, send IPI to laggarts. */
1088 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1089 rcu_implicit_dynticks_qs))
1090 goto unlock_ret;
1091
1092 /* Leave state in case more forcing is required. */
1093
1094 break;
1095 }
1096unlock_ret:
1097 spin_unlock_irqrestore(&rsp->fqslock, flags);
1098}
1099
1100#else /* #ifdef CONFIG_SMP */
1101
1102static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103{
1104 set_need_resched();
1105}
1106
1107#endif /* #else #ifdef CONFIG_SMP */
1108
1109/*
1110 * This does the RCU processing work from softirq context for the
1111 * specified rcu_state and rcu_data structures. This may be called
1112 * only from the CPU to whom the rdp belongs.
1113 */
1114static void
1115__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1116{
1117 unsigned long flags;
1118
1119 /*
1120 * If an RCU GP has gone long enough, go check for dyntick
1121 * idle CPUs and, if needed, send resched IPIs.
1122 */
1123 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1124 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1125 force_quiescent_state(rsp, 1);
1126
1127 /*
1128 * Advance callbacks in response to end of earlier grace
1129 * period that some other CPU ended.
1130 */
1131 rcu_process_gp_end(rsp, rdp);
1132
1133 /* Update RCU state based on any recent quiescent states. */
1134 rcu_check_quiescent_state(rsp, rdp);
1135
1136 /* Does this CPU require a not-yet-started grace period? */
1137 if (cpu_needs_another_gp(rsp, rdp)) {
1138 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1139 rcu_start_gp(rsp, flags); /* releases above lock */
1140 }
1141
1142 /* If there are callbacks ready, invoke them. */
1143 rcu_do_batch(rdp);
1144}
1145
1146/*
1147 * Do softirq processing for the current CPU.
1148 */
1149static void rcu_process_callbacks(struct softirq_action *unused)
1150{
1151 /*
1152 * Memory references from any prior RCU read-side critical sections
1153 * executed by the interrupted code must be seen before any RCU
1154 * grace-period manipulations below.
1155 */
1156 smp_mb(); /* See above block comment. */
1157
1158 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1159 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1160
1161 /*
1162 * Memory references from any later RCU read-side critical sections
1163 * executed by the interrupted code must be seen after any RCU
1164 * grace-period manipulations above.
1165 */
1166 smp_mb(); /* See above block comment. */
1167}
1168
1169static void
1170__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1171 struct rcu_state *rsp)
1172{
1173 unsigned long flags;
1174 struct rcu_data *rdp;
1175
1176 head->func = func;
1177 head->next = NULL;
1178
1179 smp_mb(); /* Ensure RCU update seen before callback registry. */
1180
1181 /*
1182 * Opportunistically note grace-period endings and beginnings.
1183 * Note that we might see a beginning right after we see an
1184 * end, but never vice versa, since this CPU has to pass through
1185 * a quiescent state betweentimes.
1186 */
1187 local_irq_save(flags);
1188 rdp = rsp->rda[smp_processor_id()];
1189 rcu_process_gp_end(rsp, rdp);
1190 check_for_new_grace_period(rsp, rdp);
1191
1192 /* Add the callback to our list. */
1193 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1194 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1195
1196 /* Start a new grace period if one not already started. */
1197 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1198 unsigned long nestflag;
1199 struct rcu_node *rnp_root = rcu_get_root(rsp);
1200
1201 spin_lock_irqsave(&rnp_root->lock, nestflag);
1202 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1203 }
1204
1205 /* Force the grace period if too many callbacks or too long waiting. */
1206 if (unlikely(++rdp->qlen > qhimark)) {
1207 rdp->blimit = LONG_MAX;
1208 force_quiescent_state(rsp, 0);
1209 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1210 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1211 force_quiescent_state(rsp, 1);
1212 local_irq_restore(flags);
1213}
1214
1215/*
1216 * Queue an RCU callback for invocation after a grace period.
1217 */
1218void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1219{
1220 __call_rcu(head, func, &rcu_state);
1221}
1222EXPORT_SYMBOL_GPL(call_rcu);
1223
1224/*
1225 * Queue an RCU for invocation after a quicker grace period.
1226 */
1227void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1228{
1229 __call_rcu(head, func, &rcu_bh_state);
1230}
1231EXPORT_SYMBOL_GPL(call_rcu_bh);
1232
1233/*
1234 * Check to see if there is any immediate RCU-related work to be done
1235 * by the current CPU, for the specified type of RCU, returning 1 if so.
1236 * The checks are in order of increasing expense: checks that can be
1237 * carried out against CPU-local state are performed first. However,
1238 * we must check for CPU stalls first, else we might not get a chance.
1239 */
1240static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1241{
1242 rdp->n_rcu_pending++;
1243
1244 /* Check for CPU stalls, if enabled. */
1245 check_cpu_stall(rsp, rdp);
1246
1247 /* Is the RCU core waiting for a quiescent state from this CPU? */
1248 if (rdp->qs_pending)
1249 return 1;
1250
1251 /* Does this CPU have callbacks ready to invoke? */
1252 if (cpu_has_callbacks_ready_to_invoke(rdp))
1253 return 1;
1254
1255 /* Has RCU gone idle with this CPU needing another grace period? */
1256 if (cpu_needs_another_gp(rsp, rdp))
1257 return 1;
1258
1259 /* Has another RCU grace period completed? */
1260 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1261 return 1;
1262
1263 /* Has a new RCU grace period started? */
1264 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1265 return 1;
1266
1267 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1268 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1269 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1270 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1271 return 1;
1272
1273 /* nothing to do */
1274 return 0;
1275}
1276
1277/*
1278 * Check to see if there is any immediate RCU-related work to be done
1279 * by the current CPU, returning 1 if so. This function is part of the
1280 * RCU implementation; it is -not- an exported member of the RCU API.
1281 */
1282int rcu_pending(int cpu)
1283{
1284 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1285 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1286}
1287
1288/*
1289 * Check to see if any future RCU-related work will need to be done
1290 * by the current CPU, even if none need be done immediately, returning
1291 * 1 if so. This function is part of the RCU implementation; it is -not-
1292 * an exported member of the RCU API.
1293 */
1294int rcu_needs_cpu(int cpu)
1295{
1296 /* RCU callbacks either ready or pending? */
1297 return per_cpu(rcu_data, cpu).nxtlist ||
1298 per_cpu(rcu_bh_data, cpu).nxtlist;
1299}
1300
1301/*
1302 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1303 * approach so that we don't have to worry about how long the CPU has
1304 * been gone, or whether it ever was online previously. We do trust the
1305 * ->mynode field, as it is constant for a given struct rcu_data and
1306 * initialized during early boot.
1307 *
1308 * Note that only one online or offline event can be happening at a given
1309 * time. Note also that we can accept some slop in the rsp->completed
1310 * access due to the fact that this CPU cannot possibly have any RCU
1311 * callbacks in flight yet.
1312 */
1313static void
1314rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1315{
1316 unsigned long flags;
1317 int i;
1318 long lastcomp;
1319 unsigned long mask;
1320 struct rcu_data *rdp = rsp->rda[cpu];
1321 struct rcu_node *rnp = rcu_get_root(rsp);
1322
1323 /* Set up local state, ensuring consistent view of global state. */
1324 spin_lock_irqsave(&rnp->lock, flags);
1325 lastcomp = rsp->completed;
1326 rdp->completed = lastcomp;
1327 rdp->gpnum = lastcomp;
1328 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1329 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1330 rdp->beenonline = 1; /* We have now been online. */
1331 rdp->passed_quiesc_completed = lastcomp - 1;
1332 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1333 rdp->nxtlist = NULL;
1334 for (i = 0; i < RCU_NEXT_SIZE; i++)
1335 rdp->nxttail[i] = &rdp->nxtlist;
1336 rdp->qlen = 0;
1337 rdp->blimit = blimit;
1338#ifdef CONFIG_NO_HZ
1339 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1340#endif /* #ifdef CONFIG_NO_HZ */
1341 rdp->cpu = cpu;
1342 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1343
1344 /*
1345 * A new grace period might start here. If so, we won't be part
1346 * of it, but that is OK, as we are currently in a quiescent state.
1347 */
1348
1349 /* Exclude any attempts to start a new GP on large systems. */
1350 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1351
1352 /* Add CPU to rcu_node bitmasks. */
1353 rnp = rdp->mynode;
1354 mask = rdp->grpmask;
1355 do {
1356 /* Exclude any attempts to start a new GP on small systems. */
1357 spin_lock(&rnp->lock); /* irqs already disabled. */
1358 rnp->qsmaskinit |= mask;
1359 mask = rnp->grpmask;
1360 spin_unlock(&rnp->lock); /* irqs already disabled. */
1361 rnp = rnp->parent;
1362 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1363
1364 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1365
1366 /*
1367 * A new grace period might start here. If so, we will be part of
1368 * it, and its gpnum will be greater than ours, so we will
1369 * participate. It is also possible for the gpnum to have been
1370 * incremented before this function was called, and the bitmasks
1371 * to not be filled out until now, in which case we will also
1372 * participate due to our gpnum being behind.
1373 */
1374
1375 /* Since it is coming online, the CPU is in a quiescent state. */
1376 cpu_quiet(cpu, rsp, rdp, lastcomp);
1377 local_irq_restore(flags);
1378}
1379
1380static void __cpuinit rcu_online_cpu(int cpu)
1381{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1392}
1393
1394/*
1395 * Handle CPU online/offline notifcation events.
1396 */
1397static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1398 unsigned long action, void *hcpu)
1399{
1400 long cpu = (long)hcpu;
1401
1402 switch (action) {
1403 case CPU_UP_PREPARE:
1404 case CPU_UP_PREPARE_FROZEN:
1405 rcu_online_cpu(cpu);
1406 break;
1407 case CPU_DEAD:
1408 case CPU_DEAD_FROZEN:
1409 case CPU_UP_CANCELED:
1410 case CPU_UP_CANCELED_FROZEN:
1411 rcu_offline_cpu(cpu);
1412 break;
1413 default:
1414 break;
1415 }
1416 return NOTIFY_OK;
1417}
1418
1419/*
1420 * Compute the per-level fanout, either using the exact fanout specified
1421 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1422 */
1423#ifdef CONFIG_RCU_FANOUT_EXACT
1424static void __init rcu_init_levelspread(struct rcu_state *rsp)
1425{
1426 int i;
1427
1428 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1429 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1430}
1431#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1432static void __init rcu_init_levelspread(struct rcu_state *rsp)
1433{
1434 int ccur;
1435 int cprv;
1436 int i;
1437
1438 cprv = NR_CPUS;
1439 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1440 ccur = rsp->levelcnt[i];
1441 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1442 cprv = ccur;
1443 }
1444}
1445#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1446
1447/*
1448 * Helper function for rcu_init() that initializes one rcu_state structure.
1449 */
1450static void __init rcu_init_one(struct rcu_state *rsp)
1451{
1452 int cpustride = 1;
1453 int i;
1454 int j;
1455 struct rcu_node *rnp;
1456
1457 /* Initialize the level-tracking arrays. */
1458
1459 for (i = 1; i < NUM_RCU_LVLS; i++)
1460 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1461 rcu_init_levelspread(rsp);
1462
1463 /* Initialize the elements themselves, starting from the leaves. */
1464
1465 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1466 cpustride *= rsp->levelspread[i];
1467 rnp = rsp->level[i];
1468 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1469 spin_lock_init(&rnp->lock);
1470 rnp->qsmask = 0;
1471 rnp->qsmaskinit = 0;
1472 rnp->grplo = j * cpustride;
1473 rnp->grphi = (j + 1) * cpustride - 1;
1474 if (rnp->grphi >= NR_CPUS)
1475 rnp->grphi = NR_CPUS - 1;
1476 if (i == 0) {
1477 rnp->grpnum = 0;
1478 rnp->grpmask = 0;
1479 rnp->parent = NULL;
1480 } else {
1481 rnp->grpnum = j % rsp->levelspread[i - 1];
1482 rnp->grpmask = 1UL << rnp->grpnum;
1483 rnp->parent = rsp->level[i - 1] +
1484 j / rsp->levelspread[i - 1];
1485 }
1486 rnp->level = i;
1487 }
1488 }
1489}
1490
1491/*
1492 * Helper macro for __rcu_init(). To be used nowhere else!
1493 * Assigns leaf node pointers into each CPU's rcu_data structure.
1494 */
1495#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1496do { \
1497 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1498 j = 0; \
1499 for_each_possible_cpu(i) { \
1500 if (i > rnp[j].grphi) \
1501 j++; \
1502 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1503 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1504 } \
1505} while (0)
1506
1507static struct notifier_block __cpuinitdata rcu_nb = {
1508 .notifier_call = rcu_cpu_notify,
1509};
1510
1511void __init __rcu_init(void)
1512{
1513 int i; /* All used by RCU_DATA_PTR_INIT(). */
1514 int j;
1515 struct rcu_node *rnp;
1516
1517 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1518#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1519 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1520#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1521 rcu_init_one(&rcu_state);
1522 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1523 rcu_init_one(&rcu_bh_state);
1524 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1525
1526 for_each_online_cpu(i)
1527 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1528 /* Register notifier for non-boot CPUs */
1529 register_cpu_notifier(&rcu_nb);
1530 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1531}
1532
1533module_param(blimit, int, 0);
1534module_param(qhimark, int, 0);
1535module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for classic implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
diff --git a/kernel/relay.c b/kernel/relay.c
index 32b0befdcb6a..09ac2008f77b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1317,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in,
1317 if (ret < 0) 1317 if (ret < 0)
1318 break; 1318 break;
1319 else if (!ret) { 1319 else if (!ret) {
1320 if (spliced) 1320 if (flags & SPLICE_F_NONBLOCK)
1321 break;
1322 if (flags & SPLICE_F_NONBLOCK) {
1323 ret = -EAGAIN; 1321 ret = -EAGAIN;
1324 break; 1322 break;
1325 }
1326 } 1323 }
1327 1324
1328 *ppos += ret; 1325 *ppos += ret;
diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663ef..e633106b12f6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && 853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) 854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue; 855 continue;
856 /*
857 * if a resource is "BUSY", it's not a hardware resource
858 * but a driver mapping of such a resource; we don't want
859 * to warn for those; some drivers legitimately map only
860 * partial hardware resources. (example: vesafb)
861 */
862 if (p->flags & IORESOURCE_BUSY)
863 continue;
864
856 printk(KERN_WARNING "resource map sanity check conflict: " 865 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n", 866 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr, 867 (unsigned long long)addr,
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb5c3dc..fff1c4a20b65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
118 */ 118 */
119#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
120 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
121#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
122/* 128/*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -203,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
203 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
204 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207} 212}
208 213
209static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -261,6 +266,10 @@ struct task_group {
261 struct cgroup_subsys_state css; 266 struct cgroup_subsys_state css;
262#endif 267#endif
263 268
269#ifdef CONFIG_USER_SCHED
270 uid_t uid;
271#endif
272
264#ifdef CONFIG_FAIR_GROUP_SCHED 273#ifdef CONFIG_FAIR_GROUP_SCHED
265 /* schedulable entities of this group on each cpu */ 274 /* schedulable entities of this group on each cpu */
266 struct sched_entity **se; 275 struct sched_entity **se;
@@ -286,6 +295,12 @@ struct task_group {
286 295
287#ifdef CONFIG_USER_SCHED 296#ifdef CONFIG_USER_SCHED
288 297
298/* Helper function to pass uid information to create_sched_user() */
299void set_tg_uid(struct user_struct *user)
300{
301 user->tg->uid = user->uid;
302}
303
289/* 304/*
290 * Root task group. 305 * Root task group.
291 * Every UID task group (including init_task_group aka UID-0) will 306 * Every UID task group (including init_task_group aka UID-0) will
@@ -345,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
345 struct task_group *tg; 360 struct task_group *tg;
346 361
347#ifdef CONFIG_USER_SCHED 362#ifdef CONFIG_USER_SCHED
348 tg = p->user->tg; 363 rcu_read_lock();
364 tg = __task_cred(p)->user->tg;
365 rcu_read_unlock();
349#elif defined(CONFIG_CGROUP_SCHED) 366#elif defined(CONFIG_CGROUP_SCHED)
350 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 367 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
351 struct task_group, css); 368 struct task_group, css);
@@ -586,6 +603,8 @@ struct rq {
586#ifdef CONFIG_SCHEDSTATS 603#ifdef CONFIG_SCHEDSTATS
587 /* latency stats */ 604 /* latency stats */
588 struct sched_info rq_sched_info; 605 struct sched_info rq_sched_info;
606 unsigned long long rq_cpu_time;
607 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
589 608
590 /* sys_sched_yield() stats */ 609 /* sys_sched_yield() stats */
591 unsigned int yld_exp_empty; 610 unsigned int yld_exp_empty;
@@ -703,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
703 722
704#undef SCHED_FEAT 723#undef SCHED_FEAT
705 724
706static int sched_feat_open(struct inode *inode, struct file *filp) 725static int sched_feat_show(struct seq_file *m, void *v)
707{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{ 726{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 727 int i;
720 728
721 for (i = 0; sched_feat_names[i]; i++) { 729 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 730 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 731 seq_puts(m, "NO_");
724 } 732 seq_printf(m, "%s ", sched_feat_names[i]);
725
726 buf = kmalloc(len + 2, GFP_KERNEL);
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 } 733 }
734 seq_puts(m, "\n");
736 735
737 r += sprintf(buf + r, "\n"); 736 return 0;
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 737}
746 738
747static ssize_t 739static ssize_t
@@ -786,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 778 return cnt;
787} 779}
788 780
781static int sched_feat_open(struct inode *inode, struct file *filp)
782{
783 return single_open(filp, sched_feat_show, NULL);
784}
785
789static struct file_operations sched_feat_fops = { 786static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 787 .open = sched_feat_open,
791 .read = sched_feat_read, 788 .write = sched_feat_write,
792 .write = sched_feat_write, 789 .read = seq_read,
790 .llseek = seq_lseek,
791 .release = single_release,
793}; 792};
794 793
795static __init int sched_init_debug(void) 794static __init int sched_init_debug(void)
@@ -1139,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
1139 1138
1140 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1139 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1141 rq->hrtick_timer.function = hrtick; 1140 rq->hrtick_timer.function = hrtick;
1142 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1143} 1141}
1144#else /* CONFIG_SCHED_HRTICK */ 1142#else /* CONFIG_SCHED_HRTICK */
1145static inline void hrtick_clear(struct rq *rq) 1143static inline void hrtick_clear(struct rq *rq)
@@ -1474,27 +1472,13 @@ static void
1474update_group_shares_cpu(struct task_group *tg, int cpu, 1472update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight) 1473 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{ 1474{
1477 int boost = 0;
1478 unsigned long shares; 1475 unsigned long shares;
1479 unsigned long rq_weight; 1476 unsigned long rq_weight;
1480 1477
1481 if (!tg->se[cpu]) 1478 if (!tg->se[cpu])
1482 return; 1479 return;
1483 1480
1484 rq_weight = tg->cfs_rq[cpu]->load.weight; 1481 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1485
1486 /*
1487 * If there are currently no tasks on the cpu pretend there is one of
1488 * average load so that when a new task gets to run here it will not
1489 * get delayed by group starvation.
1490 */
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498 1482
1499 /* 1483 /*
1500 * \Sum shares * rq_weight 1484 * \Sum shares * rq_weight
@@ -1502,7 +1486,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1502 * \Sum rq_weight 1486 * \Sum rq_weight
1503 * 1487 *
1504 */ 1488 */
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1489 shares = (sd_shares * rq_weight) / sd_rq_weight;
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1490 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507 1491
1508 if (abs(shares - tg->se[cpu]->load.weight) > 1492 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1495,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1511 unsigned long flags; 1495 unsigned long flags;
1512 1496
1513 spin_lock_irqsave(&rq->lock, flags); 1497 spin_lock_irqsave(&rq->lock, flags);
1514 /* 1498 tg->cfs_rq[cpu]->shares = shares;
1515 * record the actual number of shares, not the boosted amount.
1516 */
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519 1499
1520 __set_se_shares(tg->se[cpu], shares); 1500 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags); 1501 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1509,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1529 */ 1509 */
1530static int tg_shares_up(struct task_group *tg, void *data) 1510static int tg_shares_up(struct task_group *tg, void *data)
1531{ 1511{
1532 unsigned long rq_weight = 0; 1512 unsigned long weight, rq_weight = 0;
1533 unsigned long shares = 0; 1513 unsigned long shares = 0;
1534 struct sched_domain *sd = data; 1514 struct sched_domain *sd = data;
1535 int i; 1515 int i;
1536 1516
1537 for_each_cpu_mask(i, sd->span) { 1517 for_each_cpu_mask(i, sd->span) {
1538 rq_weight += tg->cfs_rq[i]->load.weight; 1518 /*
1519 * If there are currently no tasks on the cpu pretend there
1520 * is one of average load so that when a new task gets to
1521 * run here it will not get delayed by group starvation.
1522 */
1523 weight = tg->cfs_rq[i]->load.weight;
1524 if (!weight)
1525 weight = NICE_0_LOAD;
1526
1527 tg->cfs_rq[i]->rq_weight = weight;
1528 rq_weight += weight;
1539 shares += tg->cfs_rq[i]->shares; 1529 shares += tg->cfs_rq[i]->shares;
1540 } 1530 }
1541 1531
@@ -1545,9 +1535,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1535 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares; 1536 shares = tg->shares;
1547 1537
1548 if (!rq_weight)
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span) 1538 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight); 1539 update_group_shares_cpu(tg, i, shares, rq_weight);
1553 1540
@@ -1612,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1612 1599
1613#endif 1600#endif
1614 1601
1602/*
1603 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1604 */
1605static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1606 __releases(this_rq->lock)
1607 __acquires(busiest->lock)
1608 __acquires(this_rq->lock)
1609{
1610 int ret = 0;
1611
1612 if (unlikely(!irqs_disabled())) {
1613 /* printk() doesn't work good under rq->lock */
1614 spin_unlock(&this_rq->lock);
1615 BUG_ON(1);
1616 }
1617 if (unlikely(!spin_trylock(&busiest->lock))) {
1618 if (busiest < this_rq) {
1619 spin_unlock(&this_rq->lock);
1620 spin_lock(&busiest->lock);
1621 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1622 ret = 1;
1623 } else
1624 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1625 }
1626 return ret;
1627}
1628
1629static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1630 __releases(busiest->lock)
1631{
1632 spin_unlock(&busiest->lock);
1633 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1634}
1615#endif 1635#endif
1616 1636
1617#ifdef CONFIG_FAIR_GROUP_SCHED 1637#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1845,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1845 1865
1846 clock_offset = old_rq->clock - new_rq->clock; 1866 clock_offset = old_rq->clock - new_rq->clock;
1847 1867
1868 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1869
1848#ifdef CONFIG_SCHEDSTATS 1870#ifdef CONFIG_SCHEDSTATS
1849 if (p->se.wait_start) 1871 if (p->se.wait_start)
1850 p->se.wait_start -= clock_offset; 1872 p->se.wait_start -= clock_offset;
@@ -2254,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2254 2276
2255 smp_wmb(); 2277 smp_wmb();
2256 rq = task_rq_lock(p, &flags); 2278 rq = task_rq_lock(p, &flags);
2279 update_rq_clock(rq);
2257 old_state = p->state; 2280 old_state = p->state;
2258 if (!(old_state & state)) 2281 if (!(old_state & state))
2259 goto out; 2282 goto out;
@@ -2311,12 +2334,11 @@ out_activate:
2311 schedstat_inc(p, se.nr_wakeups_local); 2334 schedstat_inc(p, se.nr_wakeups_local);
2312 else 2335 else
2313 schedstat_inc(p, se.nr_wakeups_remote); 2336 schedstat_inc(p, se.nr_wakeups_remote);
2314 update_rq_clock(rq);
2315 activate_task(rq, p, 1); 2337 activate_task(rq, p, 1);
2316 success = 1; 2338 success = 1;
2317 2339
2318out_running: 2340out_running:
2319 trace_sched_wakeup(rq, p); 2341 trace_sched_wakeup(rq, p, success);
2320 check_preempt_curr(rq, p, sync); 2342 check_preempt_curr(rq, p, sync);
2321 2343
2322 p->state = TASK_RUNNING; 2344 p->state = TASK_RUNNING;
@@ -2449,7 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2449 p->sched_class->task_new(rq, p); 2471 p->sched_class->task_new(rq, p);
2450 inc_nr_running(rq); 2472 inc_nr_running(rq);
2451 } 2473 }
2452 trace_sched_wakeup_new(rq, p); 2474 trace_sched_wakeup_new(rq, p, 1);
2453 check_preempt_curr(rq, p, 0); 2475 check_preempt_curr(rq, p, 0);
2454#ifdef CONFIG_SMP 2476#ifdef CONFIG_SMP
2455 if (p->sched_class->task_wake_up) 2477 if (p->sched_class->task_wake_up)
@@ -2812,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2812} 2834}
2813 2835
2814/* 2836/*
2815 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2816 */
2817static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2818 __releases(this_rq->lock)
2819 __acquires(busiest->lock)
2820 __acquires(this_rq->lock)
2821{
2822 int ret = 0;
2823
2824 if (unlikely(!irqs_disabled())) {
2825 /* printk() doesn't work good under rq->lock */
2826 spin_unlock(&this_rq->lock);
2827 BUG_ON(1);
2828 }
2829 if (unlikely(!spin_trylock(&busiest->lock))) {
2830 if (busiest < this_rq) {
2831 spin_unlock(&this_rq->lock);
2832 spin_lock(&busiest->lock);
2833 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2834 ret = 1;
2835 } else
2836 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2837 }
2838 return ret;
2839}
2840
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock)
2843{
2844 spin_unlock(&busiest->lock);
2845 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2846}
2847
2848/*
2849 * If dest_cpu is allowed for this process, migrate the task to it. 2837 * If dest_cpu is allowed for this process, migrate the task to it.
2850 * This is accomplished by forcing the cpu_allowed mask to only 2838 * This is accomplished by forcing the cpu_allowed mask to only
2851 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2839 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2862,7 +2850,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2862 || unlikely(!cpu_active(dest_cpu))) 2850 || unlikely(!cpu_active(dest_cpu)))
2863 goto out; 2851 goto out;
2864 2852
2865 trace_sched_migrate_task(rq, p, dest_cpu);
2866 /* force the process onto the specified CPU */ 2853 /* force the process onto the specified CPU */
2867 if (migrate_task(p, dest_cpu, &req)) { 2854 if (migrate_task(p, dest_cpu, &req)) {
2868 /* Need to wait for migration thread (might exit: take ref). */ 2855 /* Need to wait for migration thread (might exit: take ref). */
@@ -3707,7 +3694,7 @@ out_balanced:
3707static void idle_balance(int this_cpu, struct rq *this_rq) 3694static void idle_balance(int this_cpu, struct rq *this_rq)
3708{ 3695{
3709 struct sched_domain *sd; 3696 struct sched_domain *sd;
3710 int pulled_task = -1; 3697 int pulled_task = 0;
3711 unsigned long next_balance = jiffies + HZ; 3698 unsigned long next_balance = jiffies + HZ;
3712 cpumask_t tmpmask; 3699 cpumask_t tmpmask;
3713 3700
@@ -4203,7 +4190,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4203 4190
4204 if (p == rq->idle) { 4191 if (p == rq->idle) {
4205 p->stime = cputime_add(p->stime, steal); 4192 p->stime = cputime_add(p->stime, steal);
4206 account_group_system_time(p, steal);
4207 if (atomic_read(&rq->nr_iowait) > 0) 4193 if (atomic_read(&rq->nr_iowait) > 0)
4208 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4194 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4209 else 4195 else
@@ -4339,7 +4325,7 @@ void __kprobes sub_preempt_count(int val)
4339 /* 4325 /*
4340 * Underflow? 4326 * Underflow?
4341 */ 4327 */
4342 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4328 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4343 return; 4329 return;
4344 /* 4330 /*
4345 * Is the spinlock portion underflowing? 4331 * Is the spinlock portion underflowing?
@@ -5134,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5134 set_load_weight(p); 5120 set_load_weight(p);
5135} 5121}
5136 5122
5123/*
5124 * check the target process has a UID that matches the current process's
5125 */
5126static bool check_same_owner(struct task_struct *p)
5127{
5128 const struct cred *cred = current_cred(), *pcred;
5129 bool match;
5130
5131 rcu_read_lock();
5132 pcred = __task_cred(p);
5133 match = (cred->euid == pcred->euid ||
5134 cred->euid == pcred->uid);
5135 rcu_read_unlock();
5136 return match;
5137}
5138
5137static int __sched_setscheduler(struct task_struct *p, int policy, 5139static int __sched_setscheduler(struct task_struct *p, int policy,
5138 struct sched_param *param, bool user) 5140 struct sched_param *param, bool user)
5139{ 5141{
@@ -5193,8 +5195,7 @@ recheck:
5193 return -EPERM; 5195 return -EPERM;
5194 5196
5195 /* can't change other user's priorities */ 5197 /* can't change other user's priorities */
5196 if ((current->euid != p->euid) && 5198 if (!check_same_owner(p))
5197 (current->euid != p->uid))
5198 return -EPERM; 5199 return -EPERM;
5199 } 5200 }
5200 5201
@@ -5426,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5426 read_unlock(&tasklist_lock); 5427 read_unlock(&tasklist_lock);
5427 5428
5428 retval = -EPERM; 5429 retval = -EPERM;
5429 if ((current->euid != p->euid) && (current->euid != p->uid) && 5430 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5430 !capable(CAP_SYS_NICE))
5431 goto out_unlock; 5431 goto out_unlock;
5432 5432
5433 retval = security_task_setscheduler(p, 0, NULL); 5433 retval = security_task_setscheduler(p, 0, NULL);
@@ -5896,6 +5896,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5896 * The idle tasks have their own, simple scheduling class: 5896 * The idle tasks have their own, simple scheduling class:
5897 */ 5897 */
5898 idle->sched_class = &idle_sched_class; 5898 idle->sched_class = &idle_sched_class;
5899 ftrace_graph_init_task(idle);
5899} 5900}
5900 5901
5901/* 5902/*
@@ -6126,7 +6127,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6126 6127
6127/* 6128/*
6128 * Figure out where task on dead CPU should go, use force if necessary. 6129 * Figure out where task on dead CPU should go, use force if necessary.
6129 * NOTE: interrupts should be disabled by the caller
6130 */ 6130 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6132{
@@ -6587,7 +6587,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6587 req = list_entry(rq->migration_queue.next, 6587 req = list_entry(rq->migration_queue.next,
6588 struct migration_req, list); 6588 struct migration_req, list);
6589 list_del_init(&req->list); 6589 list_del_init(&req->list);
6590 spin_unlock_irq(&rq->lock);
6590 complete(&req->done); 6591 complete(&req->done);
6592 spin_lock_irq(&rq->lock);
6591 } 6593 }
6592 spin_unlock_irq(&rq->lock); 6594 spin_unlock_irq(&rq->lock);
6593 break; 6595 break;
@@ -6636,28 +6638,6 @@ early_initcall(migration_init);
6636 6638
6637#ifdef CONFIG_SCHED_DEBUG 6639#ifdef CONFIG_SCHED_DEBUG
6638 6640
6639static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6640{
6641 switch (lvl) {
6642 case SD_LV_NONE:
6643 return "NONE";
6644 case SD_LV_SIBLING:
6645 return "SIBLING";
6646 case SD_LV_MC:
6647 return "MC";
6648 case SD_LV_CPU:
6649 return "CPU";
6650 case SD_LV_NODE:
6651 return "NODE";
6652 case SD_LV_ALLNODES:
6653 return "ALLNODES";
6654 case SD_LV_MAX:
6655 return "MAX";
6656
6657 }
6658 return "MAX";
6659}
6660
6661static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6641static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6662 cpumask_t *groupmask) 6642 cpumask_t *groupmask)
6663{ 6643{
@@ -6677,8 +6657,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6677 return -1; 6657 return -1;
6678 } 6658 }
6679 6659
6680 printk(KERN_CONT "span %s level %s\n", 6660 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6681 str, sd_level_to_string(sd->level));
6682 6661
6683 if (!cpu_isset(cpu, sd->span)) { 6662 if (!cpu_isset(cpu, sd->span)) {
6684 printk(KERN_ERR "ERROR: domain->span does not contain " 6663 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6814,6 +6793,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6814 SD_BALANCE_EXEC | 6793 SD_BALANCE_EXEC |
6815 SD_SHARE_CPUPOWER | 6794 SD_SHARE_CPUPOWER |
6816 SD_SHARE_PKG_RESOURCES); 6795 SD_SHARE_PKG_RESOURCES);
6796 if (nr_node_ids == 1)
6797 pflags &= ~SD_SERIALIZE;
6817 } 6798 }
6818 if (~cflags & pflags) 6799 if (~cflags & pflags)
6819 return 0; 6800 return 0;
@@ -7334,13 +7315,21 @@ struct allmasks {
7334}; 7315};
7335 7316
7336#if NR_CPUS > 128 7317#if NR_CPUS > 128
7337#define SCHED_CPUMASK_ALLOC 1 7318#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7338#define SCHED_CPUMASK_FREE(v) kfree(v) 7319static inline void sched_cpumask_alloc(struct allmasks **masks)
7339#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v 7320{
7321 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7322}
7323static inline void sched_cpumask_free(struct allmasks *masks)
7324{
7325 kfree(masks);
7326}
7340#else 7327#else
7341#define SCHED_CPUMASK_ALLOC 0 7328#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7342#define SCHED_CPUMASK_FREE(v) 7329static inline void sched_cpumask_alloc(struct allmasks **masks)
7343#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v 7330{ }
7331static inline void sched_cpumask_free(struct allmasks *masks)
7332{ }
7344#endif 7333#endif
7345 7334
7346#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ 7335#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
@@ -7416,9 +7405,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7416 return -ENOMEM; 7405 return -ENOMEM;
7417 } 7406 }
7418 7407
7419#if SCHED_CPUMASK_ALLOC
7420 /* get space for all scratch cpumask variables */ 7408 /* get space for all scratch cpumask variables */
7421 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); 7409 sched_cpumask_alloc(&allmasks);
7422 if (!allmasks) { 7410 if (!allmasks) {
7423 printk(KERN_WARNING "Cannot alloc cpumask array\n"); 7411 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7424 kfree(rd); 7412 kfree(rd);
@@ -7427,7 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7427#endif 7415#endif
7428 return -ENOMEM; 7416 return -ENOMEM;
7429 } 7417 }
7430#endif 7418
7431 tmpmask = (cpumask_t *)allmasks; 7419 tmpmask = (cpumask_t *)allmasks;
7432 7420
7433 7421
@@ -7681,13 +7669,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7681 cpu_attach_domain(sd, rd, i); 7669 cpu_attach_domain(sd, rd, i);
7682 } 7670 }
7683 7671
7684 SCHED_CPUMASK_FREE((void *)allmasks); 7672 sched_cpumask_free(allmasks);
7685 return 0; 7673 return 0;
7686 7674
7687#ifdef CONFIG_NUMA 7675#ifdef CONFIG_NUMA
7688error: 7676error:
7689 free_sched_groups(cpu_map, tmpmask); 7677 free_sched_groups(cpu_map, tmpmask);
7690 SCHED_CPUMASK_FREE((void *)allmasks); 7678 sched_cpumask_free(allmasks);
7691 kfree(rd); 7679 kfree(rd);
7692 return -ENOMEM; 7680 return -ENOMEM;
7693#endif 7681#endif
@@ -7710,8 +7698,14 @@ static struct sched_domain_attr *dattr_cur;
7710 */ 7698 */
7711static cpumask_t fallback_doms; 7699static cpumask_t fallback_doms;
7712 7700
7713void __attribute__((weak)) arch_update_cpu_topology(void) 7701/*
7702 * arch_update_cpu_topology lets virtualized architectures update the
7703 * cpu core maps. It is supposed to return 1 if the topology changed
7704 * or 0 if it stayed the same.
7705 */
7706int __attribute__((weak)) arch_update_cpu_topology(void)
7714{ 7707{
7708 return 0;
7715} 7709}
7716 7710
7717/* 7711/*
@@ -7751,8 +7745,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7751 cpumask_t tmpmask; 7745 cpumask_t tmpmask;
7752 int i; 7746 int i;
7753 7747
7754 unregister_sched_domain_sysctl();
7755
7756 for_each_cpu_mask_nr(i, *cpu_map) 7748 for_each_cpu_mask_nr(i, *cpu_map)
7757 cpu_attach_domain(NULL, &def_root_domain, i); 7749 cpu_attach_domain(NULL, &def_root_domain, i);
7758 synchronize_sched(); 7750 synchronize_sched();
@@ -7805,17 +7797,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7805 struct sched_domain_attr *dattr_new) 7797 struct sched_domain_attr *dattr_new)
7806{ 7798{
7807 int i, j, n; 7799 int i, j, n;
7800 int new_topology;
7808 7801
7809 mutex_lock(&sched_domains_mutex); 7802 mutex_lock(&sched_domains_mutex);
7810 7803
7811 /* always unregister in case we don't destroy any domains */ 7804 /* always unregister in case we don't destroy any domains */
7812 unregister_sched_domain_sysctl(); 7805 unregister_sched_domain_sysctl();
7813 7806
7807 /* Let architecture update cpu core mappings. */
7808 new_topology = arch_update_cpu_topology();
7809
7814 n = doms_new ? ndoms_new : 0; 7810 n = doms_new ? ndoms_new : 0;
7815 7811
7816 /* Destroy deleted domains */ 7812 /* Destroy deleted domains */
7817 for (i = 0; i < ndoms_cur; i++) { 7813 for (i = 0; i < ndoms_cur; i++) {
7818 for (j = 0; j < n; j++) { 7814 for (j = 0; j < n && !new_topology; j++) {
7819 if (cpus_equal(doms_cur[i], doms_new[j]) 7815 if (cpus_equal(doms_cur[i], doms_new[j])
7820 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7816 && dattrs_equal(dattr_cur, i, dattr_new, j))
7821 goto match1; 7817 goto match1;
@@ -7830,12 +7826,12 @@ match1:
7830 ndoms_cur = 0; 7826 ndoms_cur = 0;
7831 doms_new = &fallback_doms; 7827 doms_new = &fallback_doms;
7832 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7828 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7833 dattr_new = NULL; 7829 WARN_ON_ONCE(dattr_new);
7834 } 7830 }
7835 7831
7836 /* Build new domains */ 7832 /* Build new domains */
7837 for (i = 0; i < ndoms_new; i++) { 7833 for (i = 0; i < ndoms_new; i++) {
7838 for (j = 0; j < ndoms_cur; j++) { 7834 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7839 if (cpus_equal(doms_new[i], doms_cur[j]) 7835 if (cpus_equal(doms_new[i], doms_cur[j])
7840 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7836 && dattrs_equal(dattr_new, i, dattr_cur, j))
7841 goto match2; 7837 goto match2;
@@ -8490,7 +8486,7 @@ static
8490int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8486int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8491{ 8487{
8492 struct cfs_rq *cfs_rq; 8488 struct cfs_rq *cfs_rq;
8493 struct sched_entity *se, *parent_se; 8489 struct sched_entity *se;
8494 struct rq *rq; 8490 struct rq *rq;
8495 int i; 8491 int i;
8496 8492
@@ -8506,18 +8502,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8506 for_each_possible_cpu(i) { 8502 for_each_possible_cpu(i) {
8507 rq = cpu_rq(i); 8503 rq = cpu_rq(i);
8508 8504
8509 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8505 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8510 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8506 GFP_KERNEL, cpu_to_node(i));
8511 if (!cfs_rq) 8507 if (!cfs_rq)
8512 goto err; 8508 goto err;
8513 8509
8514 se = kmalloc_node(sizeof(struct sched_entity), 8510 se = kzalloc_node(sizeof(struct sched_entity),
8515 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8511 GFP_KERNEL, cpu_to_node(i));
8516 if (!se) 8512 if (!se)
8517 goto err; 8513 goto err;
8518 8514
8519 parent_se = parent ? parent->se[i] : NULL; 8515 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8520 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8521 } 8516 }
8522 8517
8523 return 1; 8518 return 1;
@@ -8578,7 +8573,7 @@ static
8578int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8573int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8579{ 8574{
8580 struct rt_rq *rt_rq; 8575 struct rt_rq *rt_rq;
8581 struct sched_rt_entity *rt_se, *parent_se; 8576 struct sched_rt_entity *rt_se;
8582 struct rq *rq; 8577 struct rq *rq;
8583 int i; 8578 int i;
8584 8579
@@ -8595,18 +8590,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8595 for_each_possible_cpu(i) { 8590 for_each_possible_cpu(i) {
8596 rq = cpu_rq(i); 8591 rq = cpu_rq(i);
8597 8592
8598 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8593 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8599 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8594 GFP_KERNEL, cpu_to_node(i));
8600 if (!rt_rq) 8595 if (!rt_rq)
8601 goto err; 8596 goto err;
8602 8597
8603 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8598 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8604 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8599 GFP_KERNEL, cpu_to_node(i));
8605 if (!rt_se) 8600 if (!rt_se)
8606 goto err; 8601 goto err;
8607 8602
8608 parent_se = parent ? parent->rt_se[i] : NULL; 8603 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8609 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8610 } 8604 }
8611 8605
8612 return 1; 8606 return 1;
@@ -9249,11 +9243,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9249 * (balbir@in.ibm.com). 9243 * (balbir@in.ibm.com).
9250 */ 9244 */
9251 9245
9252/* track cpu usage of a group of tasks */ 9246/* track cpu usage of a group of tasks and its child groups */
9253struct cpuacct { 9247struct cpuacct {
9254 struct cgroup_subsys_state css; 9248 struct cgroup_subsys_state css;
9255 /* cpuusage holds pointer to a u64-type object on every cpu */ 9249 /* cpuusage holds pointer to a u64-type object on every cpu */
9256 u64 *cpuusage; 9250 u64 *cpuusage;
9251 struct cpuacct *parent;
9257}; 9252};
9258 9253
9259struct cgroup_subsys cpuacct_subsys; 9254struct cgroup_subsys cpuacct_subsys;
@@ -9287,6 +9282,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9287 return ERR_PTR(-ENOMEM); 9282 return ERR_PTR(-ENOMEM);
9288 } 9283 }
9289 9284
9285 if (cgrp->parent)
9286 ca->parent = cgroup_ca(cgrp->parent);
9287
9290 return &ca->css; 9288 return &ca->css;
9291} 9289}
9292 9290
@@ -9300,6 +9298,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9300 kfree(ca); 9298 kfree(ca);
9301} 9299}
9302 9300
9301static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9302{
9303 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9304 u64 data;
9305
9306#ifndef CONFIG_64BIT
9307 /*
9308 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9309 */
9310 spin_lock_irq(&cpu_rq(cpu)->lock);
9311 data = *cpuusage;
9312 spin_unlock_irq(&cpu_rq(cpu)->lock);
9313#else
9314 data = *cpuusage;
9315#endif
9316
9317 return data;
9318}
9319
9320static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9321{
9322 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9323
9324#ifndef CONFIG_64BIT
9325 /*
9326 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9327 */
9328 spin_lock_irq(&cpu_rq(cpu)->lock);
9329 *cpuusage = val;
9330 spin_unlock_irq(&cpu_rq(cpu)->lock);
9331#else
9332 *cpuusage = val;
9333#endif
9334}
9335
9303/* return total cpu usage (in nanoseconds) of a group */ 9336/* return total cpu usage (in nanoseconds) of a group */
9304static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9337static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9305{ 9338{
@@ -9307,17 +9340,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9307 u64 totalcpuusage = 0; 9340 u64 totalcpuusage = 0;
9308 int i; 9341 int i;
9309 9342
9310 for_each_possible_cpu(i) { 9343 for_each_present_cpu(i)
9311 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9344 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9312
9313 /*
9314 * Take rq->lock to make 64-bit addition safe on 32-bit
9315 * platforms.
9316 */
9317 spin_lock_irq(&cpu_rq(i)->lock);
9318 totalcpuusage += *cpuusage;
9319 spin_unlock_irq(&cpu_rq(i)->lock);
9320 }
9321 9345
9322 return totalcpuusage; 9346 return totalcpuusage;
9323} 9347}
@@ -9334,23 +9358,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9334 goto out; 9358 goto out;
9335 } 9359 }
9336 9360
9337 for_each_possible_cpu(i) { 9361 for_each_present_cpu(i)
9338 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9362 cpuacct_cpuusage_write(ca, i, 0);
9339 9363
9340 spin_lock_irq(&cpu_rq(i)->lock);
9341 *cpuusage = 0;
9342 spin_unlock_irq(&cpu_rq(i)->lock);
9343 }
9344out: 9364out:
9345 return err; 9365 return err;
9346} 9366}
9347 9367
9368static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9369 struct seq_file *m)
9370{
9371 struct cpuacct *ca = cgroup_ca(cgroup);
9372 u64 percpu;
9373 int i;
9374
9375 for_each_present_cpu(i) {
9376 percpu = cpuacct_cpuusage_read(ca, i);
9377 seq_printf(m, "%llu ", (unsigned long long) percpu);
9378 }
9379 seq_printf(m, "\n");
9380 return 0;
9381}
9382
9348static struct cftype files[] = { 9383static struct cftype files[] = {
9349 { 9384 {
9350 .name = "usage", 9385 .name = "usage",
9351 .read_u64 = cpuusage_read, 9386 .read_u64 = cpuusage_read,
9352 .write_u64 = cpuusage_write, 9387 .write_u64 = cpuusage_write,
9353 }, 9388 },
9389 {
9390 .name = "usage_percpu",
9391 .read_seq_string = cpuacct_percpu_seq_read,
9392 },
9393
9354}; 9394};
9355 9395
9356static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9396static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9366,14 +9406,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9366static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9406static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9367{ 9407{
9368 struct cpuacct *ca; 9408 struct cpuacct *ca;
9409 int cpu;
9369 9410
9370 if (!cpuacct_subsys.active) 9411 if (!cpuacct_subsys.active)
9371 return; 9412 return;
9372 9413
9414 cpu = task_cpu(tsk);
9373 ca = task_ca(tsk); 9415 ca = task_ca(tsk);
9374 if (ca) {
9375 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9376 9416
9417 for (; ca; ca = ca->parent) {
9418 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9377 *cpuusage += cputime; 9419 *cpuusage += cputime;
9378 } 9420 }
9379} 9421}
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 81787248b60f..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
118 118
119 /* 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
122 * max(scd->clock, scd->tick_gtod + TICK_NSEC)); 122 * scd->tick_gtod + TICK_NSEC);
123 */ 123 */
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 127 max_clock = scd->tick_gtod + TICK_NSEC;
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 26ed8e3d1c15..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
53 53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg)
59{
60 struct sched_entity *se = tg->se[cpu];
61 if (!se)
62 return;
63
64#define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66#define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68
69 PN(se->exec_start);
70 PN(se->vruntime);
71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start);
74 PN(se->sleep_start);
75 PN(se->block_start);
76 PN(se->sleep_max);
77 PN(se->block_max);
78 PN(se->exec_max);
79 PN(se->slice_max);
80 PN(se->wait_max);
81 PN(se->wait_sum);
82 P(se->wait_count);
83#endif
84 P(se->load.weight);
85#undef PN
86#undef P
87}
88#endif
89
56static void 90static void
57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
58{ 92{
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
121 155
122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 char path[128] = ""; 157 char path[128] = "";
124 struct cgroup *cgroup = NULL;
125 struct task_group *tg = cfs_rq->tg; 158 struct task_group *tg = cfs_rq->tg;
126 159
127 if (tg) 160 cgroup_path(tg->css.cgroup, path, sizeof(path));
128 cgroup = tg->css.cgroup;
129
130 if (cgroup)
131 cgroup_path(cgroup, path, sizeof(path));
132 161
133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
163#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
164 {
165 uid_t uid = cfs_rq->tg->uid;
166 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
167 }
134#else 168#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 169 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 170#endif
137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 171 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
139 SPLIT_NS(cfs_rq->exec_clock)); 172 SPLIT_NS(cfs_rq->exec_clock));
140 173
@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168#ifdef CONFIG_SMP 201#ifdef CONFIG_SMP
169 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 202 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
170#endif 203#endif
204 print_cfs_group_stats(m, cpu, cfs_rq->tg);
171#endif 205#endif
172} 206}
173 207
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
175{ 209{
176#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 210#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
177 char path[128] = ""; 211 char path[128] = "";
178 struct cgroup *cgroup = NULL;
179 struct task_group *tg = rt_rq->tg; 212 struct task_group *tg = rt_rq->tg;
180 213
181 if (tg) 214 cgroup_path(tg->css.cgroup, path, sizeof(path));
182 cgroup = tg->css.cgroup;
183
184 if (cgroup)
185 cgroup_path(cgroup, path, sizeof(path));
186 215
187 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 216 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
188#else 217#else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
272 u64 now = ktime_to_ns(ktime_get()); 301 u64 now = ktime_to_ns(ktime_get());
273 int cpu; 302 int cpu;
274 303
275 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", 304 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
276 init_utsname()->release, 305 init_utsname()->release,
277 (int)strcspn(init_utsname()->version, " "), 306 (int)strcspn(init_utsname()->version, " "),
278 init_utsname()->version); 307 init_utsname()->version);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b059..5ad4440f0fc4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
492 * overflow on 32 bits): 492 * overflow on 32 bits):
493 */ 493 */
494 delta_exec = (unsigned long)(now - curr->exec_start); 494 delta_exec = (unsigned long)(now - curr->exec_start);
495 if (!delta_exec)
496 return;
495 497
496 __update_curr(cfs_rq, curr, delta_exec); 498 __update_curr(cfs_rq, curr, delta_exec);
497 curr->exec_start = now; 499 curr->exec_start = now;
@@ -1345,12 +1347,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1345{ 1347{
1346 struct task_struct *curr = rq->curr; 1348 struct task_struct *curr = rq->curr;
1347 struct sched_entity *se = &curr->se, *pse = &p->se; 1349 struct sched_entity *se = &curr->se, *pse = &p->se;
1350 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1348 1351
1349 if (unlikely(rt_prio(p->prio))) { 1352 update_curr(cfs_rq);
1350 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1351 1353
1352 update_rq_clock(rq); 1354 if (unlikely(rt_prio(p->prio))) {
1353 update_curr(cfs_rq);
1354 resched_task(curr); 1355 resched_task(curr);
1355 return; 1356 return;
1356 } 1357 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..51d2af3e6191 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
77} 77}
78 78
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 79#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 81
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{ 83{
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
537 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
538 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
539 539
540 spin_lock(&rt_rq->rt_runtime_lock);
541 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 540 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
541 spin_lock(&rt_rq->rt_runtime_lock);
542 rt_rq->rt_time += delta_exec; 542 rt_rq->rt_time += delta_exec;
543 if (sched_rt_runtime_exceeded(rt_rq)) 543 if (sched_rt_runtime_exceeded(rt_rq))
544 resched_task(curr); 544 resched_task(curr);
545 spin_unlock(&rt_rq->rt_runtime_lock);
545 } 546 }
546 spin_unlock(&rt_rq->rt_runtime_lock);
547 } 547 }
548} 548}
549 549
@@ -909,9 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
909/* Only try algorithms three times */ 909/* Only try algorithms three times */
910#define RT_MAX_TRIES 3 910#define RT_MAX_TRIES 3
911 911
912static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
913static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
914
915static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
916 913
917static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 914static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02c..3b01098164c8 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, 31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 32 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 33 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_sched_info.cpu_time, 34 rq->rq_cpu_time,
35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
36 36
37 seq_printf(seq, "\n"); 37 seq_printf(seq, "\n");
@@ -123,7 +123,7 @@ static inline void
123rq_sched_info_depart(struct rq *rq, unsigned long long delta) 123rq_sched_info_depart(struct rq *rq, unsigned long long delta)
124{ 124{
125 if (rq) 125 if (rq)
126 rq->rq_sched_info.cpu_time += delta; 126 rq->rq_cpu_time += delta;
127} 127}
128 128
129static inline void 129static inline void
@@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
236 unsigned long long delta = task_rq(t)->clock - 236 unsigned long long delta = task_rq(t)->clock -
237 t->sched_info.last_arrival; 237 t->sched_info.last_arrival;
238 238
239 t->sched_info.cpu_time += delta;
240 rq_sched_info_depart(task_rq(t), delta); 239 rq_sched_info_depart(task_rq(t), delta);
241 240
242 if (t->state == TASK_RUNNING) 241 if (t->state == TASK_RUNNING)
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc654455..8e95855ff3cf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
44static void __user *sig_handler(struct task_struct *t, int sig) 46static void __user *sig_handler(struct task_struct *t, int sig)
45{ 47{
46 return t->sighand->action[sig - 1].sa.sa_handler; 48 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -177,6 +179,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
177 return sig; 179 return sig;
178} 180}
179 181
182/*
183 * allocate a new signal queue record
184 * - this may be called without locks if and only if t == current, otherwise an
185 * appopriate lock must be held to stop the target task from exiting
186 */
180static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 187static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
181 int override_rlimit) 188 int override_rlimit)
182{ 189{
@@ -184,11 +191,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
184 struct user_struct *user; 191 struct user_struct *user;
185 192
186 /* 193 /*
187 * In order to avoid problems with "switch_user()", we want to make 194 * We won't get problems with the target's UID changing under us
188 * sure that the compiler doesn't re-load "t->user" 195 * because changing it requires RCU be used, and if t != current, the
196 * caller must be holding the RCU readlock (by way of a spinlock) and
197 * we use RCU protection here
189 */ 198 */
190 user = t->user; 199 user = get_uid(__task_cred(t)->user);
191 barrier();
192 atomic_inc(&user->sigpending); 200 atomic_inc(&user->sigpending);
193 if (override_rlimit || 201 if (override_rlimit ||
194 atomic_read(&user->sigpending) <= 202 atomic_read(&user->sigpending) <=
@@ -196,12 +204,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
196 q = kmem_cache_alloc(sigqueue_cachep, flags); 204 q = kmem_cache_alloc(sigqueue_cachep, flags);
197 if (unlikely(q == NULL)) { 205 if (unlikely(q == NULL)) {
198 atomic_dec(&user->sigpending); 206 atomic_dec(&user->sigpending);
207 free_uid(user);
199 } else { 208 } else {
200 INIT_LIST_HEAD(&q->list); 209 INIT_LIST_HEAD(&q->list);
201 q->flags = 0; 210 q->flags = 0;
202 q->user = get_uid(user); 211 q->user = user;
203 } 212 }
204 return(q); 213
214 return q;
205} 215}
206 216
207static void __sigqueue_free(struct sigqueue *q) 217static void __sigqueue_free(struct sigqueue *q)
@@ -562,10 +572,12 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
562 572
563/* 573/*
564 * Bad permissions for sending the signal 574 * Bad permissions for sending the signal
575 * - the caller must hold at least the RCU read lock
565 */ 576 */
566static int check_kill_permission(int sig, struct siginfo *info, 577static int check_kill_permission(int sig, struct siginfo *info,
567 struct task_struct *t) 578 struct task_struct *t)
568{ 579{
580 const struct cred *cred = current_cred(), *tcred;
569 struct pid *sid; 581 struct pid *sid;
570 int error; 582 int error;
571 583
@@ -579,8 +591,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
579 if (error) 591 if (error)
580 return error; 592 return error;
581 593
582 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && 594 tcred = __task_cred(t);
583 (current->uid ^ t->suid) && (current->uid ^ t->uid) && 595 if ((cred->euid ^ tcred->suid) &&
596 (cred->euid ^ tcred->uid) &&
597 (cred->uid ^ tcred->suid) &&
598 (cred->uid ^ tcred->uid) &&
584 !capable(CAP_KILL)) { 599 !capable(CAP_KILL)) {
585 switch (sig) { 600 switch (sig) {
586 case SIGCONT: 601 case SIGCONT:
@@ -844,7 +859,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
844 q->info.si_errno = 0; 859 q->info.si_errno = 0;
845 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
846 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_pid_vnr(current);
847 q->info.si_uid = current->uid; 862 q->info.si_uid = current_uid();
848 break; 863 break;
849 case (unsigned long) SEND_SIG_PRIV: 864 case (unsigned long) SEND_SIG_PRIV:
850 q->info.si_signo = sig; 865 q->info.si_signo = sig;
@@ -1008,6 +1023,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1008 return sighand; 1023 return sighand;
1009} 1024}
1010 1025
1026/*
1027 * send signal info to all the members of a group
1028 * - the caller must hold the RCU read lock at least
1029 */
1011int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1030int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1012{ 1031{
1013 unsigned long flags; 1032 unsigned long flags;
@@ -1029,8 +1048,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1029/* 1048/*
1030 * __kill_pgrp_info() sends a signal to a process group: this is what the tty 1049 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
1031 * control characters do (^C, ^Z etc) 1050 * control characters do (^C, ^Z etc)
1051 * - the caller must hold at least a readlock on tasklist_lock
1032 */ 1052 */
1033
1034int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) 1053int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1035{ 1054{
1036 struct task_struct *p = NULL; 1055 struct task_struct *p = NULL;
@@ -1086,6 +1105,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1086{ 1105{
1087 int ret = -EINVAL; 1106 int ret = -EINVAL;
1088 struct task_struct *p; 1107 struct task_struct *p;
1108 const struct cred *pcred;
1089 1109
1090 if (!valid_signal(sig)) 1110 if (!valid_signal(sig))
1091 return ret; 1111 return ret;
@@ -1096,9 +1116,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1096 ret = -ESRCH; 1116 ret = -ESRCH;
1097 goto out_unlock; 1117 goto out_unlock;
1098 } 1118 }
1099 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 1119 pcred = __task_cred(p);
1100 && (euid != p->suid) && (euid != p->uid) 1120 if ((info == SEND_SIG_NOINFO ||
1101 && (uid != p->suid) && (uid != p->uid)) { 1121 (!is_si_special(info) && SI_FROMUSER(info))) &&
1122 euid != pcred->suid && euid != pcred->uid &&
1123 uid != pcred->suid && uid != pcred->uid) {
1102 ret = -EPERM; 1124 ret = -EPERM;
1103 goto out_unlock; 1125 goto out_unlock;
1104 } 1126 }
@@ -1369,10 +1391,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1369 */ 1391 */
1370 rcu_read_lock(); 1392 rcu_read_lock();
1371 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1393 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1394 info.si_uid = __task_cred(tsk)->uid;
1372 rcu_read_unlock(); 1395 rcu_read_unlock();
1373 1396
1374 info.si_uid = tsk->uid;
1375
1376 thread_group_cputime(tsk, &cputime); 1397 thread_group_cputime(tsk, &cputime);
1377 info.si_utime = cputime_to_jiffies(cputime.utime); 1398 info.si_utime = cputime_to_jiffies(cputime.utime);
1378 info.si_stime = cputime_to_jiffies(cputime.stime); 1399 info.si_stime = cputime_to_jiffies(cputime.stime);
@@ -1440,10 +1461,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1440 */ 1461 */
1441 rcu_read_lock(); 1462 rcu_read_lock();
1442 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1463 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1464 info.si_uid = __task_cred(tsk)->uid;
1443 rcu_read_unlock(); 1465 rcu_read_unlock();
1444 1466
1445 info.si_uid = tsk->uid;
1446
1447 info.si_utime = cputime_to_clock_t(tsk->utime); 1467 info.si_utime = cputime_to_clock_t(tsk->utime);
1448 info.si_stime = cputime_to_clock_t(tsk->stime); 1468 info.si_stime = cputime_to_clock_t(tsk->stime);
1449 1469
@@ -1598,7 +1618,7 @@ void ptrace_notify(int exit_code)
1598 info.si_signo = SIGTRAP; 1618 info.si_signo = SIGTRAP;
1599 info.si_code = exit_code; 1619 info.si_code = exit_code;
1600 info.si_pid = task_pid_vnr(current); 1620 info.si_pid = task_pid_vnr(current);
1601 info.si_uid = current->uid; 1621 info.si_uid = current_uid();
1602 1622
1603 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1604 spin_lock_irq(&current->sighand->siglock); 1624 spin_lock_irq(&current->sighand->siglock);
@@ -1710,7 +1730,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1710 info->si_errno = 0; 1730 info->si_errno = 0;
1711 info->si_code = SI_USER; 1731 info->si_code = SI_USER;
1712 info->si_pid = task_pid_vnr(current->parent); 1732 info->si_pid = task_pid_vnr(current->parent);
1713 info->si_uid = current->parent->uid; 1733 info->si_uid = task_uid(current->parent);
1714 } 1734 }
1715 1735
1716 /* If the (new) signal is now blocked, requeue it. */ 1736 /* If the (new) signal is now blocked, requeue it. */
@@ -2211,7 +2231,7 @@ sys_kill(pid_t pid, int sig)
2211 info.si_errno = 0; 2231 info.si_errno = 0;
2212 info.si_code = SI_USER; 2232 info.si_code = SI_USER;
2213 info.si_pid = task_tgid_vnr(current); 2233 info.si_pid = task_tgid_vnr(current);
2214 info.si_uid = current->uid; 2234 info.si_uid = current_uid();
2215 2235
2216 return kill_something_info(sig, &info, pid); 2236 return kill_something_info(sig, &info, pid);
2217} 2237}
@@ -2228,7 +2248,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2228 info.si_errno = 0; 2248 info.si_errno = 0;
2229 info.si_code = SI_TKILL; 2249 info.si_code = SI_TKILL;
2230 info.si_pid = task_tgid_vnr(current); 2250 info.si_pid = task_tgid_vnr(current);
2231 info.si_uid = current->uid; 2251 info.si_uid = current_uid();
2232 2252
2233 rcu_read_lock(); 2253 rcu_read_lock();
2234 p = find_task_by_vpid(pid); 2254 p = find_task_by_vpid(pid);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..466e75ce271a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
102 102
103EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
104 104
105void __local_bh_enable(void)
106{
107 WARN_ON_ONCE(in_irq());
108
109 /*
110 * softirqs should never be enabled by __local_bh_enable(),
111 * it always nests inside local_bh_enable() sections:
112 */
113 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
114
115 sub_preempt_count(SOFTIRQ_OFFSET);
116}
117EXPORT_SYMBOL_GPL(__local_bh_enable);
118
119/* 105/*
120 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
121 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -269,6 +255,7 @@ void irq_enter(void)
269{ 255{
270 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
271 257
258 rcu_irq_enter();
272 if (idle_cpu(cpu) && !in_interrupt()) { 259 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter(); 260 __irq_enter();
274 tick_check_idle(cpu); 261 tick_check_idle(cpu);
@@ -295,9 +282,9 @@ void irq_exit(void)
295 282
296#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
297 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
298 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
299 tick_nohz_stop_sched_tick(0);
300 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
301#endif 288#endif
302 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
303} 290}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 3953e4aed733..1ab790c67b17 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
@@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
188 if ((long)(now - t->last_switch_timestamp) < 188 if ((long)(now - t->last_switch_timestamp) <
189 sysctl_hung_task_timeout_secs) 189 sysctl_hung_task_timeout_secs)
190 return; 190 return;
191 if (sysctl_hung_task_warnings < 0) 191 if (!sysctl_hung_task_warnings)
192 return; 192 return;
193 sysctl_hung_task_warnings--; 193 sysctl_hung_task_warnings--;
194 194
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d16..d356d79e84ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -112,12 +112,17 @@ EXPORT_SYMBOL(cad_pid);
112 112
113void (*pm_power_off_prepare)(void); 113void (*pm_power_off_prepare)(void);
114 114
115/*
116 * set the priority of a task
117 * - the caller must hold the RCU read lock
118 */
115static int set_one_prio(struct task_struct *p, int niceval, int error) 119static int set_one_prio(struct task_struct *p, int niceval, int error)
116{ 120{
121 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
117 int no_nice; 122 int no_nice;
118 123
119 if (p->uid != current->euid && 124 if (pcred->uid != cred->euid &&
120 p->euid != current->euid && !capable(CAP_SYS_NICE)) { 125 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
121 error = -EPERM; 126 error = -EPERM;
122 goto out; 127 goto out;
123 } 128 }
@@ -141,6 +146,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
141{ 146{
142 struct task_struct *g, *p; 147 struct task_struct *g, *p;
143 struct user_struct *user; 148 struct user_struct *user;
149 const struct cred *cred = current_cred();
144 int error = -EINVAL; 150 int error = -EINVAL;
145 struct pid *pgrp; 151 struct pid *pgrp;
146 152
@@ -174,18 +180,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 180 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 181 break;
176 case PRIO_USER: 182 case PRIO_USER:
177 user = current->user; 183 user = (struct user_struct *) cred->user;
178 if (!who) 184 if (!who)
179 who = current->uid; 185 who = cred->uid;
180 else 186 else if ((who != cred->uid) &&
181 if ((who != current->uid) && !(user = find_user(who))) 187 !(user = find_user(who)))
182 goto out_unlock; /* No processes for this user */ 188 goto out_unlock; /* No processes for this user */
183 189
184 do_each_thread(g, p) 190 do_each_thread(g, p)
185 if (p->uid == who) 191 if (__task_cred(p)->uid == who)
186 error = set_one_prio(p, niceval, error); 192 error = set_one_prio(p, niceval, error);
187 while_each_thread(g, p); 193 while_each_thread(g, p);
188 if (who != current->uid) 194 if (who != cred->uid)
189 free_uid(user); /* For find_user() */ 195 free_uid(user); /* For find_user() */
190 break; 196 break;
191 } 197 }
@@ -205,6 +211,7 @@ asmlinkage long sys_getpriority(int which, int who)
205{ 211{
206 struct task_struct *g, *p; 212 struct task_struct *g, *p;
207 struct user_struct *user; 213 struct user_struct *user;
214 const struct cred *cred = current_cred();
208 long niceval, retval = -ESRCH; 215 long niceval, retval = -ESRCH;
209 struct pid *pgrp; 216 struct pid *pgrp;
210 217
@@ -236,21 +243,21 @@ asmlinkage long sys_getpriority(int which, int who)
236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 243 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 244 break;
238 case PRIO_USER: 245 case PRIO_USER:
239 user = current->user; 246 user = (struct user_struct *) cred->user;
240 if (!who) 247 if (!who)
241 who = current->uid; 248 who = cred->uid;
242 else 249 else if ((who != cred->uid) &&
243 if ((who != current->uid) && !(user = find_user(who))) 250 !(user = find_user(who)))
244 goto out_unlock; /* No processes for this user */ 251 goto out_unlock; /* No processes for this user */
245 252
246 do_each_thread(g, p) 253 do_each_thread(g, p)
247 if (p->uid == who) { 254 if (__task_cred(p)->uid == who) {
248 niceval = 20 - task_nice(p); 255 niceval = 20 - task_nice(p);
249 if (niceval > retval) 256 if (niceval > retval)
250 retval = niceval; 257 retval = niceval;
251 } 258 }
252 while_each_thread(g, p); 259 while_each_thread(g, p);
253 if (who != current->uid) 260 if (who != cred->uid)
254 free_uid(user); /* for find_user() */ 261 free_uid(user); /* for find_user() */
255 break; 262 break;
256 } 263 }
@@ -472,46 +479,48 @@ void ctrl_alt_del(void)
472 */ 479 */
473asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 480asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
474{ 481{
475 int old_rgid = current->gid; 482 const struct cred *old;
476 int old_egid = current->egid; 483 struct cred *new;
477 int new_rgid = old_rgid;
478 int new_egid = old_egid;
479 int retval; 484 int retval;
480 485
486 new = prepare_creds();
487 if (!new)
488 return -ENOMEM;
489 old = current_cred();
490
481 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); 491 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
482 if (retval) 492 if (retval)
483 return retval; 493 goto error;
484 494
495 retval = -EPERM;
485 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
486 if ((old_rgid == rgid) || 497 if (old->gid == rgid ||
487 (current->egid==rgid) || 498 old->egid == rgid ||
488 capable(CAP_SETGID)) 499 capable(CAP_SETGID))
489 new_rgid = rgid; 500 new->gid = rgid;
490 else 501 else
491 return -EPERM; 502 goto error;
492 } 503 }
493 if (egid != (gid_t) -1) { 504 if (egid != (gid_t) -1) {
494 if ((old_rgid == egid) || 505 if (old->gid == egid ||
495 (current->egid == egid) || 506 old->egid == egid ||
496 (current->sgid == egid) || 507 old->sgid == egid ||
497 capable(CAP_SETGID)) 508 capable(CAP_SETGID))
498 new_egid = egid; 509 new->egid = egid;
499 else 510 else
500 return -EPERM; 511 goto error;
501 }
502 if (new_egid != old_egid) {
503 set_dumpable(current->mm, suid_dumpable);
504 smp_wmb();
505 } 512 }
513
506 if (rgid != (gid_t) -1 || 514 if (rgid != (gid_t) -1 ||
507 (egid != (gid_t) -1 && egid != old_rgid)) 515 (egid != (gid_t) -1 && egid != old->gid))
508 current->sgid = new_egid; 516 new->sgid = new->egid;
509 current->fsgid = new_egid; 517 new->fsgid = new->egid;
510 current->egid = new_egid; 518
511 current->gid = new_rgid; 519 return commit_creds(new);
512 key_fsgid_changed(current); 520
513 proc_id_connector(current, PROC_EVENT_GID); 521error:
514 return 0; 522 abort_creds(new);
523 return retval;
515} 524}
516 525
517/* 526/*
@@ -521,56 +530,54 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
521 */ 530 */
522asmlinkage long sys_setgid(gid_t gid) 531asmlinkage long sys_setgid(gid_t gid)
523{ 532{
524 int old_egid = current->egid; 533 const struct cred *old;
534 struct cred *new;
525 int retval; 535 int retval;
526 536
537 new = prepare_creds();
538 if (!new)
539 return -ENOMEM;
540 old = current_cred();
541
527 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); 542 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
528 if (retval) 543 if (retval)
529 return retval; 544 goto error;
530 545
531 if (capable(CAP_SETGID)) { 546 retval = -EPERM;
532 if (old_egid != gid) { 547 if (capable(CAP_SETGID))
533 set_dumpable(current->mm, suid_dumpable); 548 new->gid = new->egid = new->sgid = new->fsgid = gid;
534 smp_wmb(); 549 else if (gid == old->gid || gid == old->sgid)
535 } 550 new->egid = new->fsgid = gid;
536 current->gid = current->egid = current->sgid = current->fsgid = gid;
537 } else if ((gid == current->gid) || (gid == current->sgid)) {
538 if (old_egid != gid) {
539 set_dumpable(current->mm, suid_dumpable);
540 smp_wmb();
541 }
542 current->egid = current->fsgid = gid;
543 }
544 else 551 else
545 return -EPERM; 552 goto error;
546 553
547 key_fsgid_changed(current); 554 return commit_creds(new);
548 proc_id_connector(current, PROC_EVENT_GID); 555
549 return 0; 556error:
557 abort_creds(new);
558 return retval;
550} 559}
551 560
552static int set_user(uid_t new_ruid, int dumpclear) 561/*
562 * change the user struct in a credentials set to match the new UID
563 */
564static int set_user(struct cred *new)
553{ 565{
554 struct user_struct *new_user; 566 struct user_struct *new_user;
555 567
556 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); 568 new_user = alloc_uid(current_user_ns(), new->uid);
557 if (!new_user) 569 if (!new_user)
558 return -EAGAIN; 570 return -EAGAIN;
559 571
560 if (atomic_read(&new_user->processes) >= 572 if (atomic_read(&new_user->processes) >=
561 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 573 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
562 new_user != current->nsproxy->user_ns->root_user) { 574 new_user != INIT_USER) {
563 free_uid(new_user); 575 free_uid(new_user);
564 return -EAGAIN; 576 return -EAGAIN;
565 } 577 }
566 578
567 switch_uid(new_user); 579 free_uid(new->user);
568 580 new->user = new_user;
569 if (dumpclear) {
570 set_dumpable(current->mm, suid_dumpable);
571 smp_wmb();
572 }
573 current->uid = new_ruid;
574 return 0; 581 return 0;
575} 582}
576 583
@@ -591,54 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
591 */ 598 */
592asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 599asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
593{ 600{
594 int old_ruid, old_euid, old_suid, new_ruid, new_euid; 601 const struct cred *old;
602 struct cred *new;
595 int retval; 603 int retval;
596 604
605 new = prepare_creds();
606 if (!new)
607 return -ENOMEM;
608 old = current_cred();
609
597 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); 610 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
598 if (retval) 611 if (retval)
599 return retval; 612 goto error;
600
601 new_ruid = old_ruid = current->uid;
602 new_euid = old_euid = current->euid;
603 old_suid = current->suid;
604 613
614 retval = -EPERM;
605 if (ruid != (uid_t) -1) { 615 if (ruid != (uid_t) -1) {
606 new_ruid = ruid; 616 new->uid = ruid;
607 if ((old_ruid != ruid) && 617 if (old->uid != ruid &&
608 (current->euid != ruid) && 618 old->euid != ruid &&
609 !capable(CAP_SETUID)) 619 !capable(CAP_SETUID))
610 return -EPERM; 620 goto error;
611 } 621 }
612 622
613 if (euid != (uid_t) -1) { 623 if (euid != (uid_t) -1) {
614 new_euid = euid; 624 new->euid = euid;
615 if ((old_ruid != euid) && 625 if (old->uid != euid &&
616 (current->euid != euid) && 626 old->euid != euid &&
617 (current->suid != euid) && 627 old->suid != euid &&
618 !capable(CAP_SETUID)) 628 !capable(CAP_SETUID))
619 return -EPERM; 629 goto error;
620 } 630 }
621 631
622 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) 632 retval = -EAGAIN;
623 return -EAGAIN; 633 if (new->uid != old->uid && set_user(new) < 0)
634 goto error;
624 635
625 if (new_euid != old_euid) {
626 set_dumpable(current->mm, suid_dumpable);
627 smp_wmb();
628 }
629 current->fsuid = current->euid = new_euid;
630 if (ruid != (uid_t) -1 || 636 if (ruid != (uid_t) -1 ||
631 (euid != (uid_t) -1 && euid != old_ruid)) 637 (euid != (uid_t) -1 && euid != old->uid))
632 current->suid = current->euid; 638 new->suid = new->euid;
633 current->fsuid = current->euid; 639 new->fsuid = new->euid;
634 640
635 key_fsuid_changed(current); 641 retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
636 proc_id_connector(current, PROC_EVENT_UID); 642 if (retval < 0)
637 643 goto error;
638 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
639}
640 644
645 return commit_creds(new);
641 646
647error:
648 abort_creds(new);
649 return retval;
650}
642 651
643/* 652/*
644 * setuid() is implemented like SysV with SAVED_IDS 653 * setuid() is implemented like SysV with SAVED_IDS
@@ -653,36 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 */ 662 */
654asmlinkage long sys_setuid(uid_t uid) 663asmlinkage long sys_setuid(uid_t uid)
655{ 664{
656 int old_euid = current->euid; 665 const struct cred *old;
657 int old_ruid, old_suid, new_suid; 666 struct cred *new;
658 int retval; 667 int retval;
659 668
669 new = prepare_creds();
670 if (!new)
671 return -ENOMEM;
672 old = current_cred();
673
660 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 674 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
661 if (retval) 675 if (retval)
662 return retval; 676 goto error;
663 677
664 old_ruid = current->uid; 678 retval = -EPERM;
665 old_suid = current->suid;
666 new_suid = old_suid;
667
668 if (capable(CAP_SETUID)) { 679 if (capable(CAP_SETUID)) {
669 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) 680 new->suid = new->uid = uid;
670 return -EAGAIN; 681 if (uid != old->uid && set_user(new) < 0) {
671 new_suid = uid; 682 retval = -EAGAIN;
672 } else if ((uid != current->uid) && (uid != new_suid)) 683 goto error;
673 return -EPERM; 684 }
674 685 } else if (uid != old->uid && uid != new->suid) {
675 if (old_euid != uid) { 686 goto error;
676 set_dumpable(current->mm, suid_dumpable);
677 smp_wmb();
678 } 687 }
679 current->fsuid = current->euid = uid;
680 current->suid = new_suid;
681 688
682 key_fsuid_changed(current); 689 new->fsuid = new->euid = uid;
683 proc_id_connector(current, PROC_EVENT_UID); 690
691 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
692 if (retval < 0)
693 goto error;
684 694
685 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); 695 return commit_creds(new);
696
697error:
698 abort_creds(new);
699 return retval;
686} 700}
687 701
688 702
@@ -692,54 +706,63 @@ asmlinkage long sys_setuid(uid_t uid)
692 */ 706 */
693asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 707asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
694{ 708{
695 int old_ruid = current->uid; 709 const struct cred *old;
696 int old_euid = current->euid; 710 struct cred *new;
697 int old_suid = current->suid;
698 int retval; 711 int retval;
699 712
713 new = prepare_creds();
714 if (!new)
715 return -ENOMEM;
716
700 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); 717 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
701 if (retval) 718 if (retval)
702 return retval; 719 goto error;
720 old = current_cred();
703 721
722 retval = -EPERM;
704 if (!capable(CAP_SETUID)) { 723 if (!capable(CAP_SETUID)) {
705 if ((ruid != (uid_t) -1) && (ruid != current->uid) && 724 if (ruid != (uid_t) -1 && ruid != old->uid &&
706 (ruid != current->euid) && (ruid != current->suid)) 725 ruid != old->euid && ruid != old->suid)
707 return -EPERM; 726 goto error;
708 if ((euid != (uid_t) -1) && (euid != current->uid) && 727 if (euid != (uid_t) -1 && euid != old->uid &&
709 (euid != current->euid) && (euid != current->suid)) 728 euid != old->euid && euid != old->suid)
710 return -EPERM; 729 goto error;
711 if ((suid != (uid_t) -1) && (suid != current->uid) && 730 if (suid != (uid_t) -1 && suid != old->uid &&
712 (suid != current->euid) && (suid != current->suid)) 731 suid != old->euid && suid != old->suid)
713 return -EPERM; 732 goto error;
714 } 733 }
734
735 retval = -EAGAIN;
715 if (ruid != (uid_t) -1) { 736 if (ruid != (uid_t) -1) {
716 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) 737 new->uid = ruid;
717 return -EAGAIN; 738 if (ruid != old->uid && set_user(new) < 0)
739 goto error;
718 } 740 }
719 if (euid != (uid_t) -1) { 741 if (euid != (uid_t) -1)
720 if (euid != current->euid) { 742 new->euid = euid;
721 set_dumpable(current->mm, suid_dumpable);
722 smp_wmb();
723 }
724 current->euid = euid;
725 }
726 current->fsuid = current->euid;
727 if (suid != (uid_t) -1) 743 if (suid != (uid_t) -1)
728 current->suid = suid; 744 new->suid = suid;
745 new->fsuid = new->euid;
746
747 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
748 if (retval < 0)
749 goto error;
729 750
730 key_fsuid_changed(current); 751 return commit_creds(new);
731 proc_id_connector(current, PROC_EVENT_UID);
732 752
733 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); 753error:
754 abort_creds(new);
755 return retval;
734} 756}
735 757
736asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 758asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
737{ 759{
760 const struct cred *cred = current_cred();
738 int retval; 761 int retval;
739 762
740 if (!(retval = put_user(current->uid, ruid)) && 763 if (!(retval = put_user(cred->uid, ruid)) &&
741 !(retval = put_user(current->euid, euid))) 764 !(retval = put_user(cred->euid, euid)))
742 retval = put_user(current->suid, suid); 765 retval = put_user(cred->suid, suid);
743 766
744 return retval; 767 return retval;
745} 768}
@@ -749,48 +772,55 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
749 */ 772 */
750asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 773asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
751{ 774{
775 const struct cred *old;
776 struct cred *new;
752 int retval; 777 int retval;
753 778
779 new = prepare_creds();
780 if (!new)
781 return -ENOMEM;
782 old = current_cred();
783
754 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); 784 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
755 if (retval) 785 if (retval)
756 return retval; 786 goto error;
757 787
788 retval = -EPERM;
758 if (!capable(CAP_SETGID)) { 789 if (!capable(CAP_SETGID)) {
759 if ((rgid != (gid_t) -1) && (rgid != current->gid) && 790 if (rgid != (gid_t) -1 && rgid != old->gid &&
760 (rgid != current->egid) && (rgid != current->sgid)) 791 rgid != old->egid && rgid != old->sgid)
761 return -EPERM; 792 goto error;
762 if ((egid != (gid_t) -1) && (egid != current->gid) && 793 if (egid != (gid_t) -1 && egid != old->gid &&
763 (egid != current->egid) && (egid != current->sgid)) 794 egid != old->egid && egid != old->sgid)
764 return -EPERM; 795 goto error;
765 if ((sgid != (gid_t) -1) && (sgid != current->gid) && 796 if (sgid != (gid_t) -1 && sgid != old->gid &&
766 (sgid != current->egid) && (sgid != current->sgid)) 797 sgid != old->egid && sgid != old->sgid)
767 return -EPERM; 798 goto error;
768 }
769 if (egid != (gid_t) -1) {
770 if (egid != current->egid) {
771 set_dumpable(current->mm, suid_dumpable);
772 smp_wmb();
773 }
774 current->egid = egid;
775 } 799 }
776 current->fsgid = current->egid; 800
777 if (rgid != (gid_t) -1) 801 if (rgid != (gid_t) -1)
778 current->gid = rgid; 802 new->gid = rgid;
803 if (egid != (gid_t) -1)
804 new->egid = egid;
779 if (sgid != (gid_t) -1) 805 if (sgid != (gid_t) -1)
780 current->sgid = sgid; 806 new->sgid = sgid;
807 new->fsgid = new->egid;
781 808
782 key_fsgid_changed(current); 809 return commit_creds(new);
783 proc_id_connector(current, PROC_EVENT_GID); 810
784 return 0; 811error:
812 abort_creds(new);
813 return retval;
785} 814}
786 815
787asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 816asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
788{ 817{
818 const struct cred *cred = current_cred();
789 int retval; 819 int retval;
790 820
791 if (!(retval = put_user(current->gid, rgid)) && 821 if (!(retval = put_user(cred->gid, rgid)) &&
792 !(retval = put_user(current->egid, egid))) 822 !(retval = put_user(cred->egid, egid)))
793 retval = put_user(current->sgid, sgid); 823 retval = put_user(cred->sgid, sgid);
794 824
795 return retval; 825 return retval;
796} 826}
@@ -804,27 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
804 */ 834 */
805asmlinkage long sys_setfsuid(uid_t uid) 835asmlinkage long sys_setfsuid(uid_t uid)
806{ 836{
807 int old_fsuid; 837 const struct cred *old;
838 struct cred *new;
839 uid_t old_fsuid;
840
841 new = prepare_creds();
842 if (!new)
843 return current_fsuid();
844 old = current_cred();
845 old_fsuid = old->fsuid;
808 846
809 old_fsuid = current->fsuid; 847 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
810 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) 848 goto error;
811 return old_fsuid;
812 849
813 if (uid == current->uid || uid == current->euid || 850 if (uid == old->uid || uid == old->euid ||
814 uid == current->suid || uid == current->fsuid || 851 uid == old->suid || uid == old->fsuid ||
815 capable(CAP_SETUID)) { 852 capable(CAP_SETUID)) {
816 if (uid != old_fsuid) { 853 if (uid != old_fsuid) {
817 set_dumpable(current->mm, suid_dumpable); 854 new->fsuid = uid;
818 smp_wmb(); 855 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
856 goto change_okay;
819 } 857 }
820 current->fsuid = uid;
821 } 858 }
822 859
823 key_fsuid_changed(current); 860error:
824 proc_id_connector(current, PROC_EVENT_UID); 861 abort_creds(new);
825 862 return old_fsuid;
826 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
827 863
864change_okay:
865 commit_creds(new);
828 return old_fsuid; 866 return old_fsuid;
829} 867}
830 868
@@ -833,23 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid)
833 */ 871 */
834asmlinkage long sys_setfsgid(gid_t gid) 872asmlinkage long sys_setfsgid(gid_t gid)
835{ 873{
836 int old_fsgid; 874 const struct cred *old;
875 struct cred *new;
876 gid_t old_fsgid;
877
878 new = prepare_creds();
879 if (!new)
880 return current_fsgid();
881 old = current_cred();
882 old_fsgid = old->fsgid;
837 883
838 old_fsgid = current->fsgid;
839 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) 884 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
840 return old_fsgid; 885 goto error;
841 886
842 if (gid == current->gid || gid == current->egid || 887 if (gid == old->gid || gid == old->egid ||
843 gid == current->sgid || gid == current->fsgid || 888 gid == old->sgid || gid == old->fsgid ||
844 capable(CAP_SETGID)) { 889 capable(CAP_SETGID)) {
845 if (gid != old_fsgid) { 890 if (gid != old_fsgid) {
846 set_dumpable(current->mm, suid_dumpable); 891 new->fsgid = gid;
847 smp_wmb(); 892 goto change_okay;
848 } 893 }
849 current->fsgid = gid;
850 key_fsgid_changed(current);
851 proc_id_connector(current, PROC_EVENT_GID);
852 } 894 }
895
896error:
897 abort_creds(new);
898 return old_fsgid;
899
900change_okay:
901 commit_creds(new);
853 return old_fsgid; 902 return old_fsgid;
854} 903}
855 904
@@ -858,8 +907,8 @@ void do_sys_times(struct tms *tms)
858 struct task_cputime cputime; 907 struct task_cputime cputime;
859 cputime_t cutime, cstime; 908 cputime_t cutime, cstime;
860 909
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime); 910 thread_group_cputime(current, &cputime);
911 spin_lock_irq(&current->sighand->siglock);
863 cutime = current->signal->cutime; 912 cutime = current->signal->cutime;
864 cstime = current->signal->cstime; 913 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock); 914 spin_unlock_irq(&current->sighand->siglock);
@@ -1118,7 +1167,7 @@ EXPORT_SYMBOL(groups_free);
1118 1167
1119/* export the group_info to a user-space array */ 1168/* export the group_info to a user-space array */
1120static int groups_to_user(gid_t __user *grouplist, 1169static int groups_to_user(gid_t __user *grouplist,
1121 struct group_info *group_info) 1170 const struct group_info *group_info)
1122{ 1171{
1123 int i; 1172 int i;
1124 unsigned int count = group_info->ngroups; 1173 unsigned int count = group_info->ngroups;
@@ -1186,7 +1235,7 @@ static void groups_sort(struct group_info *group_info)
1186} 1235}
1187 1236
1188/* a simple bsearch */ 1237/* a simple bsearch */
1189int groups_search(struct group_info *group_info, gid_t grp) 1238int groups_search(const struct group_info *group_info, gid_t grp)
1190{ 1239{
1191 unsigned int left, right; 1240 unsigned int left, right;
1192 1241
@@ -1208,51 +1257,74 @@ int groups_search(struct group_info *group_info, gid_t grp)
1208 return 0; 1257 return 0;
1209} 1258}
1210 1259
1211/* validate and set current->group_info */ 1260/**
1212int set_current_groups(struct group_info *group_info) 1261 * set_groups - Change a group subscription in a set of credentials
1262 * @new: The newly prepared set of credentials to alter
1263 * @group_info: The group list to install
1264 *
1265 * Validate a group subscription and, if valid, insert it into a set
1266 * of credentials.
1267 */
1268int set_groups(struct cred *new, struct group_info *group_info)
1213{ 1269{
1214 int retval; 1270 int retval;
1215 struct group_info *old_info;
1216 1271
1217 retval = security_task_setgroups(group_info); 1272 retval = security_task_setgroups(group_info);
1218 if (retval) 1273 if (retval)
1219 return retval; 1274 return retval;
1220 1275
1276 put_group_info(new->group_info);
1221 groups_sort(group_info); 1277 groups_sort(group_info);
1222 get_group_info(group_info); 1278 get_group_info(group_info);
1279 new->group_info = group_info;
1280 return 0;
1281}
1223 1282
1224 task_lock(current); 1283EXPORT_SYMBOL(set_groups);
1225 old_info = current->group_info;
1226 current->group_info = group_info;
1227 task_unlock(current);
1228 1284
1229 put_group_info(old_info); 1285/**
1286 * set_current_groups - Change current's group subscription
1287 * @group_info: The group list to impose
1288 *
1289 * Validate a group subscription and, if valid, impose it upon current's task
1290 * security record.
1291 */
1292int set_current_groups(struct group_info *group_info)
1293{
1294 struct cred *new;
1295 int ret;
1230 1296
1231 return 0; 1297 new = prepare_creds();
1298 if (!new)
1299 return -ENOMEM;
1300
1301 ret = set_groups(new, group_info);
1302 if (ret < 0) {
1303 abort_creds(new);
1304 return ret;
1305 }
1306
1307 return commit_creds(new);
1232} 1308}
1233 1309
1234EXPORT_SYMBOL(set_current_groups); 1310EXPORT_SYMBOL(set_current_groups);
1235 1311
1236asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1312asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1237{ 1313{
1238 int i = 0; 1314 const struct cred *cred = current_cred();
1239 1315 int i;
1240 /*
1241 * SMP: Nobody else can change our grouplist. Thus we are
1242 * safe.
1243 */
1244 1316
1245 if (gidsetsize < 0) 1317 if (gidsetsize < 0)
1246 return -EINVAL; 1318 return -EINVAL;
1247 1319
1248 /* no need to grab task_lock here; it cannot change */ 1320 /* no need to grab task_lock here; it cannot change */
1249 i = current->group_info->ngroups; 1321 i = cred->group_info->ngroups;
1250 if (gidsetsize) { 1322 if (gidsetsize) {
1251 if (i > gidsetsize) { 1323 if (i > gidsetsize) {
1252 i = -EINVAL; 1324 i = -EINVAL;
1253 goto out; 1325 goto out;
1254 } 1326 }
1255 if (groups_to_user(grouplist, current->group_info)) { 1327 if (groups_to_user(grouplist, cred->group_info)) {
1256 i = -EFAULT; 1328 i = -EFAULT;
1257 goto out; 1329 goto out;
1258 } 1330 }
@@ -1296,9 +1368,11 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1296 */ 1368 */
1297int in_group_p(gid_t grp) 1369int in_group_p(gid_t grp)
1298{ 1370{
1371 const struct cred *cred = current_cred();
1299 int retval = 1; 1372 int retval = 1;
1300 if (grp != current->fsgid) 1373
1301 retval = groups_search(current->group_info, grp); 1374 if (grp != cred->fsgid)
1375 retval = groups_search(cred->group_info, grp);
1302 return retval; 1376 return retval;
1303} 1377}
1304 1378
@@ -1306,9 +1380,11 @@ EXPORT_SYMBOL(in_group_p);
1306 1380
1307int in_egroup_p(gid_t grp) 1381int in_egroup_p(gid_t grp)
1308{ 1382{
1383 const struct cred *cred = current_cred();
1309 int retval = 1; 1384 int retval = 1;
1310 if (grp != current->egid) 1385
1311 retval = groups_search(current->group_info, grp); 1386 if (grp != cred->egid)
1387 retval = groups_search(cred->group_info, grp);
1312 return retval; 1388 return retval;
1313} 1389}
1314 1390
@@ -1624,50 +1700,56 @@ asmlinkage long sys_umask(int mask)
1624asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1700asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1625 unsigned long arg4, unsigned long arg5) 1701 unsigned long arg4, unsigned long arg5)
1626{ 1702{
1627 long error = 0; 1703 struct task_struct *me = current;
1704 unsigned char comm[sizeof(me->comm)];
1705 long error;
1628 1706
1629 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) 1707 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1708 if (error != -ENOSYS)
1630 return error; 1709 return error;
1631 1710
1711 error = 0;
1632 switch (option) { 1712 switch (option) {
1633 case PR_SET_PDEATHSIG: 1713 case PR_SET_PDEATHSIG:
1634 if (!valid_signal(arg2)) { 1714 if (!valid_signal(arg2)) {
1635 error = -EINVAL; 1715 error = -EINVAL;
1636 break; 1716 break;
1637 } 1717 }
1638 current->pdeath_signal = arg2; 1718 me->pdeath_signal = arg2;
1719 error = 0;
1639 break; 1720 break;
1640 case PR_GET_PDEATHSIG: 1721 case PR_GET_PDEATHSIG:
1641 error = put_user(current->pdeath_signal, (int __user *)arg2); 1722 error = put_user(me->pdeath_signal, (int __user *)arg2);
1642 break; 1723 break;
1643 case PR_GET_DUMPABLE: 1724 case PR_GET_DUMPABLE:
1644 error = get_dumpable(current->mm); 1725 error = get_dumpable(me->mm);
1645 break; 1726 break;
1646 case PR_SET_DUMPABLE: 1727 case PR_SET_DUMPABLE:
1647 if (arg2 < 0 || arg2 > 1) { 1728 if (arg2 < 0 || arg2 > 1) {
1648 error = -EINVAL; 1729 error = -EINVAL;
1649 break; 1730 break;
1650 } 1731 }
1651 set_dumpable(current->mm, arg2); 1732 set_dumpable(me->mm, arg2);
1733 error = 0;
1652 break; 1734 break;
1653 1735
1654 case PR_SET_UNALIGN: 1736 case PR_SET_UNALIGN:
1655 error = SET_UNALIGN_CTL(current, arg2); 1737 error = SET_UNALIGN_CTL(me, arg2);
1656 break; 1738 break;
1657 case PR_GET_UNALIGN: 1739 case PR_GET_UNALIGN:
1658 error = GET_UNALIGN_CTL(current, arg2); 1740 error = GET_UNALIGN_CTL(me, arg2);
1659 break; 1741 break;
1660 case PR_SET_FPEMU: 1742 case PR_SET_FPEMU:
1661 error = SET_FPEMU_CTL(current, arg2); 1743 error = SET_FPEMU_CTL(me, arg2);
1662 break; 1744 break;
1663 case PR_GET_FPEMU: 1745 case PR_GET_FPEMU:
1664 error = GET_FPEMU_CTL(current, arg2); 1746 error = GET_FPEMU_CTL(me, arg2);
1665 break; 1747 break;
1666 case PR_SET_FPEXC: 1748 case PR_SET_FPEXC:
1667 error = SET_FPEXC_CTL(current, arg2); 1749 error = SET_FPEXC_CTL(me, arg2);
1668 break; 1750 break;
1669 case PR_GET_FPEXC: 1751 case PR_GET_FPEXC:
1670 error = GET_FPEXC_CTL(current, arg2); 1752 error = GET_FPEXC_CTL(me, arg2);
1671 break; 1753 break;
1672 case PR_GET_TIMING: 1754 case PR_GET_TIMING:
1673 error = PR_TIMING_STATISTICAL; 1755 error = PR_TIMING_STATISTICAL;
@@ -1675,33 +1757,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1675 case PR_SET_TIMING: 1757 case PR_SET_TIMING:
1676 if (arg2 != PR_TIMING_STATISTICAL) 1758 if (arg2 != PR_TIMING_STATISTICAL)
1677 error = -EINVAL; 1759 error = -EINVAL;
1760 else
1761 error = 0;
1678 break; 1762 break;
1679 1763
1680 case PR_SET_NAME: { 1764 case PR_SET_NAME:
1681 struct task_struct *me = current; 1765 comm[sizeof(me->comm)-1] = 0;
1682 unsigned char ncomm[sizeof(me->comm)]; 1766 if (strncpy_from_user(comm, (char __user *)arg2,
1683 1767 sizeof(me->comm) - 1) < 0)
1684 ncomm[sizeof(me->comm)-1] = 0;
1685 if (strncpy_from_user(ncomm, (char __user *)arg2,
1686 sizeof(me->comm)-1) < 0)
1687 return -EFAULT; 1768 return -EFAULT;
1688 set_task_comm(me, ncomm); 1769 set_task_comm(me, comm);
1689 return 0; 1770 return 0;
1690 } 1771 case PR_GET_NAME:
1691 case PR_GET_NAME: { 1772 get_task_comm(comm, me);
1692 struct task_struct *me = current; 1773 if (copy_to_user((char __user *)arg2, comm,
1693 unsigned char tcomm[sizeof(me->comm)]; 1774 sizeof(comm)))
1694
1695 get_task_comm(tcomm, me);
1696 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
1697 return -EFAULT; 1775 return -EFAULT;
1698 return 0; 1776 return 0;
1699 }
1700 case PR_GET_ENDIAN: 1777 case PR_GET_ENDIAN:
1701 error = GET_ENDIAN(current, arg2); 1778 error = GET_ENDIAN(me, arg2);
1702 break; 1779 break;
1703 case PR_SET_ENDIAN: 1780 case PR_SET_ENDIAN:
1704 error = SET_ENDIAN(current, arg2); 1781 error = SET_ENDIAN(me, arg2);
1705 break; 1782 break;
1706 1783
1707 case PR_GET_SECCOMP: 1784 case PR_GET_SECCOMP:
@@ -1725,6 +1802,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1725 current->default_timer_slack_ns; 1802 current->default_timer_slack_ns;
1726 else 1803 else
1727 current->timer_slack_ns = arg2; 1804 current->timer_slack_ns = arg2;
1805 error = 0;
1728 break; 1806 break;
1729 default: 1807 default:
1730 error = -EINVAL; 1808 error = -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7570da..ff6d45c7626f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -121,6 +121,10 @@ extern int sg_big_buff;
121#include <asm/system.h> 121#include <asm/system.h>
122#endif 122#endif
123 123
124#ifdef CONFIG_SPARC64
125extern int sysctl_tsb_ratio;
126#endif
127
124#ifdef __hppa__ 128#ifdef __hppa__
125extern int pwrsw_enabled; 129extern int pwrsw_enabled;
126extern int unaligned_enabled; 130extern int unaligned_enabled;
@@ -451,6 +455,16 @@ static struct ctl_table kern_table[] = {
451 .proc_handler = &proc_dointvec, 455 .proc_handler = &proc_dointvec,
452 }, 456 },
453#endif 457#endif
458#ifdef CONFIG_SPARC64
459 {
460 .ctl_name = CTL_UNNUMBERED,
461 .procname = "tsb-ratio",
462 .data = &sysctl_tsb_ratio,
463 .maxlen = sizeof (int),
464 .mode = 0644,
465 .proc_handler = &proc_dointvec,
466 },
467#endif
454#ifdef __hppa__ 468#ifdef __hppa__
455 { 469 {
456 .ctl_name = KERN_HPPA_PWRSW, 470 .ctl_name = KERN_HPPA_PWRSW,
@@ -487,6 +501,26 @@ static struct ctl_table kern_table[] = {
487 .proc_handler = &ftrace_enable_sysctl, 501 .proc_handler = &ftrace_enable_sysctl,
488 }, 502 },
489#endif 503#endif
504#ifdef CONFIG_STACK_TRACER
505 {
506 .ctl_name = CTL_UNNUMBERED,
507 .procname = "stack_tracer_enabled",
508 .data = &stack_tracer_enabled,
509 .maxlen = sizeof(int),
510 .mode = 0644,
511 .proc_handler = &stack_trace_sysctl,
512 },
513#endif
514#ifdef CONFIG_TRACING
515 {
516 .ctl_name = CTL_UNNUMBERED,
517 .procname = "ftrace_dump_on_oops",
518 .data = &ftrace_dump_on_oops,
519 .maxlen = sizeof(int),
520 .mode = 0644,
521 .proc_handler = &proc_dointvec,
522 },
523#endif
490#ifdef CONFIG_MODULES 524#ifdef CONFIG_MODULES
491 { 525 {
492 .ctl_name = KERN_MODPROBE, 526 .ctl_name = KERN_MODPROBE,
@@ -1651,7 +1685,7 @@ out:
1651 1685
1652static int test_perm(int mode, int op) 1686static int test_perm(int mode, int op)
1653{ 1687{
1654 if (!current->euid) 1688 if (!current_euid())
1655 mode >>= 6; 1689 mode >>= 6;
1656 else if (in_egroup_p(0)) 1690 else if (in_egroup_p(0))
1657 mode >>= 3; 1691 mode >>= 3;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486b..f5f793d92415 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
131{ 131{
132 enum hrtimer_restart res = HRTIMER_NORESTART; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
133 133
134 write_seqlock_irq(&xtime_lock); 134 write_seqlock(&xtime_lock);
135 135
136 switch (time_state) { 136 switch (time_state) {
137 case TIME_OK: 137 case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
164 } 164 }
165 update_vsyscall(&xtime, clock); 165 update_vsyscall(&xtime, clock);
166 166
167 write_sequnlock_irq(&xtime_lock); 167 write_sequnlock(&xtime_lock);
168 168
169 return res; 169 return res;
170} 170}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab46..8f3fc2582d38 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
247 if (need_resched()) 247 if (need_resched())
248 goto end; 248 goto end;
249 249
250 if (unlikely(local_softirq_pending())) { 250 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
251 static int ratelimit; 251 static int ratelimit;
252 252
253 if (ratelimit < 10) { 253 if (ratelimit < 10) {
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
282 /* Schedule the tick, if we are at least one jiffie off */ 282 /* Schedule the tick, if we are at least one jiffie off */
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 /*
286 * calculate the expiry time for the next timer wheel
287 * timer
288 */
289 expires = ktime_add_ns(last_update, tick_period.tv64 *
290 delta_jiffies);
291
292 /*
293 * If this cpu is the one which updates jiffies, then
294 * give up the assignment and let it be taken by the
295 * cpu which runs the tick timer next, which might be
296 * this cpu as well. If we don't drop this here the
297 * jiffies might be stale and do_timer() never
298 * invoked.
299 */
300 if (cpu == tick_do_timer_cpu)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302
285 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 304 cpu_set(cpu, nohz_cpu_mask);
305
306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
308 goto out;
309
287 /* 310 /*
288 * nohz_stop_sched_tick can be called several times before 311 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 312 * the nohz_restart_sched_tick is called. This happens when
@@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle)
306 rcu_enter_nohz(); 329 rcu_enter_nohz();
307 } 330 }
308 331
309 /*
310 * If this cpu is the one which updates jiffies, then
311 * give up the assignment and let it be taken by the
312 * cpu which runs the tick timer next, which might be
313 * this cpu as well. If we don't drop this here the
314 * jiffies might be stale and do_timer() never
315 * invoked.
316 */
317 if (cpu == tick_do_timer_cpu)
318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
319
320 ts->idle_sleeps++; 332 ts->idle_sleeps++;
321 333
322 /* 334 /*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
332 goto out; 344 goto out;
333 } 345 }
334 346
335 /* 347 /* Mark expiries */
336 * calculate the expiry time for the next timer wheel
337 * timer
338 */
339 expires = ktime_add_ns(last_update, tick_period.tv64 *
340 delta_jiffies);
341 ts->idle_expires = expires; 348 ts->idle_expires = expires;
342 349
343 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@ -681,7 +688,6 @@ void tick_setup_sched_timer(void)
681 */ 688 */
682 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 689 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
683 ts->sched_timer.function = tick_sched_timer; 690 ts->sched_timer.function = tick_sched_timer;
684 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
685 691
686 /* Get the next period (per cpu) */ 692 /* Get the next period (per cpu) */
687 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 693 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e7acfb482a68..fa05e88aa76f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -518,6 +518,28 @@ void update_wall_time(void)
518 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
519 clocksource_adjust(offset); 519 clocksource_adjust(offset);
520 520
521 /*
522 * Since in the loop above, we accumulate any amount of time
523 * in xtime_nsec over a second into xtime.tv_sec, its possible for
524 * xtime_nsec to be fairly small after the loop. Further, if we're
525 * slightly speeding the clocksource up in clocksource_adjust(),
526 * its possible the required corrective factor to xtime_nsec could
527 * cause it to underflow.
528 *
529 * Now, we cannot simply roll the accumulated second back, since
530 * the NTP subsystem has been notified via second_overflow. So
531 * instead we push xtime_nsec forward by the amount we underflowed,
532 * and add that amount into the error.
533 *
534 * We'll correct this error next time through this function, when
535 * xtime_nsec is not as small.
536 */
537 if (unlikely((s64)clock->xtime_nsec < 0)) {
538 s64 neg = -(s64)clock->xtime_nsec;
539 clock->xtime_nsec = 0;
540 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
541 }
542
521 /* store full nanoseconds into xtime after rounding it up and 543 /* store full nanoseconds into xtime after rounding it up and
522 * add the remainder to the error difference. 544 * add the remainder to the error difference.
523 */ 545 */
diff --git a/kernel/timer.c b/kernel/timer.c
index dbd50fabe4c7..566257d1dc10 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1192,25 +1192,25 @@ asmlinkage long sys_getppid(void)
1192asmlinkage long sys_getuid(void) 1192asmlinkage long sys_getuid(void)
1193{ 1193{
1194 /* Only we change this so SMP safe */ 1194 /* Only we change this so SMP safe */
1195 return current->uid; 1195 return current_uid();
1196} 1196}
1197 1197
1198asmlinkage long sys_geteuid(void) 1198asmlinkage long sys_geteuid(void)
1199{ 1199{
1200 /* Only we change this so SMP safe */ 1200 /* Only we change this so SMP safe */
1201 return current->euid; 1201 return current_euid();
1202} 1202}
1203 1203
1204asmlinkage long sys_getgid(void) 1204asmlinkage long sys_getgid(void)
1205{ 1205{
1206 /* Only we change this so SMP safe */ 1206 /* Only we change this so SMP safe */
1207 return current->gid; 1207 return current_gid();
1208} 1208}
1209 1209
1210asmlinkage long sys_getegid(void) 1210asmlinkage long sys_getegid(void)
1211{ 1211{
1212 /* Only we change this so SMP safe */ 1212 /* Only we change this so SMP safe */
1213 return current->egid; 1213 return current_egid();
1214} 1214}
1215 1215
1216#endif 1216#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e8..e2a4ff6fc3a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,18 +3,34 @@
3# select HAVE_FUNCTION_TRACER: 3# select HAVE_FUNCTION_TRACER:
4# 4#
5 5
6config USER_STACKTRACE_SUPPORT
7 bool
8
6config NOP_TRACER 9config NOP_TRACER
7 bool 10 bool
8 11
9config HAVE_FUNCTION_TRACER 12config HAVE_FUNCTION_TRACER
10 bool 13 bool
11 14
15config HAVE_FUNCTION_GRAPH_TRACER
16 bool
17
18config HAVE_FUNCTION_TRACE_MCOUNT_TEST
19 bool
20 help
21 This gets selected when the arch tests the function_trace_stop
22 variable at the mcount call site. Otherwise, this variable
23 is tested by the called function.
24
12config HAVE_DYNAMIC_FTRACE 25config HAVE_DYNAMIC_FTRACE
13 bool 26 bool
14 27
15config HAVE_FTRACE_MCOUNT_RECORD 28config HAVE_FTRACE_MCOUNT_RECORD
16 bool 29 bool
17 30
31config HAVE_HW_BRANCH_TRACER
32 bool
33
18config TRACER_MAX_TRACE 34config TRACER_MAX_TRACE
19 bool 35 bool
20 36
@@ -47,6 +63,20 @@ config FUNCTION_TRACER
47 (the bootup default), then the overhead of the instructions is very 63 (the bootup default), then the overhead of the instructions is very
48 small and not measurable even in micro-benchmarks. 64 small and not measurable even in micro-benchmarks.
49 65
66config FUNCTION_GRAPH_TRACER
67 bool "Kernel Function Graph Tracer"
68 depends on HAVE_FUNCTION_GRAPH_TRACER
69 depends on FUNCTION_TRACER
70 default y
71 help
72 Enable the kernel to trace a function at both its return
73 and its entry.
74 It's first purpose is to trace the duration of functions and
75 draw a call graph for each thread with some informations like
76 the return value.
77 This is done by setting the current return address on the current
78 task structure into a stack of calls.
79
50config IRQSOFF_TRACER 80config IRQSOFF_TRACER
51 bool "Interrupts-off Latency Tracer" 81 bool "Interrupts-off Latency Tracer"
52 default n 82 default n
@@ -138,6 +168,70 @@ config BOOT_TRACER
138 selected, because the self-tests are an initcall as well and that 168 selected, because the self-tests are an initcall as well and that
139 would invalidate the boot trace. ) 169 would invalidate the boot trace. )
140 170
171config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING
175 help
176 This tracer profiles all the the likely and unlikely macros
177 in the kernel. It will display the results in:
178
179 /debugfs/tracing/profile_annotated_branch
180
181 Note: this will add a significant overhead, only turn this
182 on if you need to profile the system's use of these macros.
183
184 Say N if unsure.
185
186config PROFILE_ALL_BRANCHES
187 bool "Profile all if conditionals"
188 depends on TRACE_BRANCH_PROFILING
189 help
190 This tracer profiles all branch conditions. Every if ()
191 taken in the kernel is recorded whether it hit or miss.
192 The results will be displayed in:
193
194 /debugfs/tracing/profile_branch
195
196 This configuration, when enabled, will impose a great overhead
197 on the system. This should only be enabled when the system
198 is to be analyzed
199
200 Say N if unsure.
201
202config TRACING_BRANCHES
203 bool
204 help
205 Selected by tracers that will trace the likely and unlikely
206 conditions. This prevents the tracers themselves from being
207 profiled. Profiling the tracing infrastructure can only happen
208 when the likelys and unlikelys are not being traced.
209
210config BRANCH_TRACER
211 bool "Trace likely/unlikely instances"
212 depends on TRACE_BRANCH_PROFILING
213 select TRACING_BRANCHES
214 help
215 This traces the events of likely and unlikely condition
216 calls in the kernel. The difference between this and the
217 "Trace likely/unlikely profiler" is that this is not a
218 histogram of the callers, but actually places the calling
219 events into a running trace buffer to see when and where the
220 events happened, as well as their results.
221
222 Say N if unsure.
223
224config POWER_TRACER
225 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86
228 select TRACING
229 help
230 This tracer helps developers to analyze and optimize the kernels
231 power management decisions, specifically the C-state and P-state
232 behavior.
233
234
141config STACK_TRACER 235config STACK_TRACER
142 bool "Trace max stack" 236 bool "Trace max stack"
143 depends on HAVE_FUNCTION_TRACER 237 depends on HAVE_FUNCTION_TRACER
@@ -150,13 +244,26 @@ config STACK_TRACER
150 244
151 This tracer works by hooking into every function call that the 245 This tracer works by hooking into every function call that the
152 kernel executes, and keeping a maximum stack depth value and 246 kernel executes, and keeping a maximum stack depth value and
153 stack-trace saved. Because this logic has to execute in every 247 stack-trace saved. If this is configured with DYNAMIC_FTRACE
154 kernel function, all the time, this option can slow down the 248 then it will not have any overhead while the stack tracer
155 kernel measurably and is generally intended for kernel 249 is disabled.
156 developers only. 250
251 To enable the stack tracer on bootup, pass in 'stacktrace'
252 on the kernel command line.
253
254 The stack tracer can also be enabled or disabled via the
255 sysctl kernel.stack_tracer_enabled
157 256
158 Say N if unsure. 257 Say N if unsure.
159 258
259config HW_BRANCH_TRACER
260 depends on HAVE_HW_BRANCH_TRACER
261 bool "Trace hw branches"
262 select TRACING
263 help
264 This tracer records all branches on the system in a circular
265 buffer giving access to the last N branches for each cpu.
266
160config DYNAMIC_FTRACE 267config DYNAMIC_FTRACE
161 bool "enable/disable ftrace tracepoints dynamically" 268 bool "enable/disable ftrace tracepoints dynamically"
162 depends on FUNCTION_TRACER 269 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1a49e9..349d5a93653f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif
17
13obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
15 20
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o 29obj-$(CONFIG_STACK_TRACER) += trace_stack.o
25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 30obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o
27 36
28libftrace-y := ftrace.o 37libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 78db083390f0..2f32969c09df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,13 @@
47int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
48static int last_ftrace_enabled; 48static int last_ftrace_enabled;
49 49
50/* set when tracing only a pid */
51struct pid *ftrace_pid_trace;
52static struct pid * const ftrace_swapper_pid = &init_struct_pid;
53
54/* Quick disabling of function tracer. */
55int function_trace_stop;
56
50/* 57/*
51 * ftrace_disabled is set when an anomaly is discovered. 58 * ftrace_disabled is set when an anomaly is discovered.
52 * ftrace_disabled is much stronger than ftrace_enabled. 59 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly;
55 62
56static DEFINE_SPINLOCK(ftrace_lock); 63static DEFINE_SPINLOCK(ftrace_lock);
57static DEFINE_MUTEX(ftrace_sysctl_lock); 64static DEFINE_MUTEX(ftrace_sysctl_lock);
65static DEFINE_MUTEX(ftrace_start_lock);
58 66
59static struct ftrace_ops ftrace_list_end __read_mostly = 67static struct ftrace_ops ftrace_list_end __read_mostly =
60{ 68{
@@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
63 71
64static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 72static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
65ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 73ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
74ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
75ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
66 76
67static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 77static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
68{ 78{
@@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
79 }; 89 };
80} 90}
81 91
92static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
93{
94 if (!test_tsk_trace_trace(current))
95 return;
96
97 ftrace_pid_function(ip, parent_ip);
98}
99
100static void set_ftrace_pid_function(ftrace_func_t func)
101{
102 /* do not set ftrace_pid_function to itself! */
103 if (func != ftrace_pid_func)
104 ftrace_pid_function = func;
105}
106
82/** 107/**
83 * clear_ftrace_function - reset the ftrace function 108 * clear_ftrace_function - reset the ftrace function
84 * 109 *
@@ -88,7 +113,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
88void clear_ftrace_function(void) 113void clear_ftrace_function(void)
89{ 114{
90 ftrace_trace_function = ftrace_stub; 115 ftrace_trace_function = ftrace_stub;
116 __ftrace_trace_function = ftrace_stub;
117 ftrace_pid_function = ftrace_stub;
118}
119
120#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
121/*
122 * For those archs that do not test ftrace_trace_stop in their
123 * mcount call site, we need to do it from C.
124 */
125static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
126{
127 if (function_trace_stop)
128 return;
129
130 __ftrace_trace_function(ip, parent_ip);
91} 131}
132#endif
92 133
93static int __register_ftrace_function(struct ftrace_ops *ops) 134static int __register_ftrace_function(struct ftrace_ops *ops)
94{ 135{
@@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
106 ftrace_list = ops; 147 ftrace_list = ops;
107 148
108 if (ftrace_enabled) { 149 if (ftrace_enabled) {
150 ftrace_func_t func;
151
152 if (ops->next == &ftrace_list_end)
153 func = ops->func;
154 else
155 func = ftrace_list_func;
156
157 if (ftrace_pid_trace) {
158 set_ftrace_pid_function(func);
159 func = ftrace_pid_func;
160 }
161
109 /* 162 /*
110 * For one func, simply call it directly. 163 * For one func, simply call it directly.
111 * For more than one func, call the chain. 164 * For more than one func, call the chain.
112 */ 165 */
113 if (ops->next == &ftrace_list_end) 166#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
114 ftrace_trace_function = ops->func; 167 ftrace_trace_function = func;
115 else 168#else
116 ftrace_trace_function = ftrace_list_func; 169 __ftrace_trace_function = func;
170 ftrace_trace_function = ftrace_test_stop_func;
171#endif
117 } 172 }
118 173
119 spin_unlock(&ftrace_lock); 174 spin_unlock(&ftrace_lock);
@@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
152 207
153 if (ftrace_enabled) { 208 if (ftrace_enabled) {
154 /* If we only have one func left, then call that directly */ 209 /* If we only have one func left, then call that directly */
155 if (ftrace_list == &ftrace_list_end || 210 if (ftrace_list->next == &ftrace_list_end) {
156 ftrace_list->next == &ftrace_list_end) 211 ftrace_func_t func = ftrace_list->func;
157 ftrace_trace_function = ftrace_list->func; 212
213 if (ftrace_pid_trace) {
214 set_ftrace_pid_function(func);
215 func = ftrace_pid_func;
216 }
217#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
218 ftrace_trace_function = func;
219#else
220 __ftrace_trace_function = func;
221#endif
222 }
158 } 223 }
159 224
160 out: 225 out:
@@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
163 return ret; 228 return ret;
164} 229}
165 230
231static void ftrace_update_pid_func(void)
232{
233 ftrace_func_t func;
234
235 /* should not be called from interrupt context */
236 spin_lock(&ftrace_lock);
237
238 if (ftrace_trace_function == ftrace_stub)
239 goto out;
240
241 func = ftrace_trace_function;
242
243 if (ftrace_pid_trace) {
244 set_ftrace_pid_function(func);
245 func = ftrace_pid_func;
246 } else {
247 if (func == ftrace_pid_func)
248 func = ftrace_pid_function;
249 }
250
251#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
252 ftrace_trace_function = func;
253#else
254 __ftrace_trace_function = func;
255#endif
256
257 out:
258 spin_unlock(&ftrace_lock);
259}
260
166#ifdef CONFIG_DYNAMIC_FTRACE 261#ifdef CONFIG_DYNAMIC_FTRACE
167#ifndef CONFIG_FTRACE_MCOUNT_RECORD 262#ifndef CONFIG_FTRACE_MCOUNT_RECORD
168# error Dynamic ftrace depends on MCOUNT_RECORD 263# error Dynamic ftrace depends on MCOUNT_RECORD
@@ -182,6 +277,8 @@ enum {
182 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 277 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
183 FTRACE_ENABLE_MCOUNT = (1 << 3), 278 FTRACE_ENABLE_MCOUNT = (1 << 3),
184 FTRACE_DISABLE_MCOUNT = (1 << 4), 279 FTRACE_DISABLE_MCOUNT = (1 << 4),
280 FTRACE_START_FUNC_RET = (1 << 5),
281 FTRACE_STOP_FUNC_RET = (1 << 6),
185}; 282};
186 283
187static int ftrace_filtered; 284static int ftrace_filtered;
@@ -308,7 +405,7 @@ ftrace_record_ip(unsigned long ip)
308{ 405{
309 struct dyn_ftrace *rec; 406 struct dyn_ftrace *rec;
310 407
311 if (!ftrace_enabled || ftrace_disabled) 408 if (ftrace_disabled)
312 return NULL; 409 return NULL;
313 410
314 rec = ftrace_alloc_dyn_node(ip); 411 rec = ftrace_alloc_dyn_node(ip);
@@ -322,14 +419,51 @@ ftrace_record_ip(unsigned long ip)
322 return rec; 419 return rec;
323} 420}
324 421
325#define FTRACE_ADDR ((long)(ftrace_caller)) 422static void print_ip_ins(const char *fmt, unsigned char *p)
423{
424 int i;
425
426 printk(KERN_CONT "%s", fmt);
427
428 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
429 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
430}
431
432static void ftrace_bug(int failed, unsigned long ip)
433{
434 switch (failed) {
435 case -EFAULT:
436 FTRACE_WARN_ON_ONCE(1);
437 pr_info("ftrace faulted on modifying ");
438 print_ip_sym(ip);
439 break;
440 case -EINVAL:
441 FTRACE_WARN_ON_ONCE(1);
442 pr_info("ftrace failed to modify ");
443 print_ip_sym(ip);
444 print_ip_ins(" actual: ", (unsigned char *)ip);
445 printk(KERN_CONT "\n");
446 break;
447 case -EPERM:
448 FTRACE_WARN_ON_ONCE(1);
449 pr_info("ftrace faulted on writing ");
450 print_ip_sym(ip);
451 break;
452 default:
453 FTRACE_WARN_ON_ONCE(1);
454 pr_info("ftrace faulted on unknown error ");
455 print_ip_sym(ip);
456 }
457}
458
326 459
327static int 460static int
328__ftrace_replace_code(struct dyn_ftrace *rec, 461__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
329 unsigned char *nop, int enable)
330{ 462{
331 unsigned long ip, fl; 463 unsigned long ip, fl;
332 unsigned char *call, *old, *new; 464 unsigned long ftrace_addr;
465
466 ftrace_addr = (unsigned long)ftrace_caller;
333 467
334 ip = rec->ip; 468 ip = rec->ip;
335 469
@@ -388,34 +522,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
388 } 522 }
389 } 523 }
390 524
391 call = ftrace_call_replace(ip, FTRACE_ADDR); 525 if (rec->flags & FTRACE_FL_ENABLED)
392 526 return ftrace_make_call(rec, ftrace_addr);
393 if (rec->flags & FTRACE_FL_ENABLED) { 527 else
394 old = nop; 528 return ftrace_make_nop(NULL, rec, ftrace_addr);
395 new = call;
396 } else {
397 old = call;
398 new = nop;
399 }
400
401 return ftrace_modify_code(ip, old, new);
402} 529}
403 530
404static void ftrace_replace_code(int enable) 531static void ftrace_replace_code(int enable)
405{ 532{
406 int i, failed; 533 int i, failed;
407 unsigned char *nop = NULL;
408 struct dyn_ftrace *rec; 534 struct dyn_ftrace *rec;
409 struct ftrace_page *pg; 535 struct ftrace_page *pg;
410 536
411 nop = ftrace_nop_replace();
412
413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 537 for (pg = ftrace_pages_start; pg; pg = pg->next) {
414 for (i = 0; i < pg->index; i++) { 538 for (i = 0; i < pg->index; i++) {
415 rec = &pg->records[i]; 539 rec = &pg->records[i];
416 540
417 /* don't modify code that has already faulted */ 541 /*
418 if (rec->flags & FTRACE_FL_FAILED) 542 * Skip over free records and records that have
543 * failed.
544 */
545 if (rec->flags & FTRACE_FL_FREE ||
546 rec->flags & FTRACE_FL_FAILED)
419 continue; 547 continue;
420 548
421 /* ignore updates to this record's mcount site */ 549 /* ignore updates to this record's mcount site */
@@ -426,68 +554,30 @@ static void ftrace_replace_code(int enable)
426 unfreeze_record(rec); 554 unfreeze_record(rec);
427 } 555 }
428 556
429 failed = __ftrace_replace_code(rec, nop, enable); 557 failed = __ftrace_replace_code(rec, enable);
430 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 558 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
431 rec->flags |= FTRACE_FL_FAILED; 559 rec->flags |= FTRACE_FL_FAILED;
432 if ((system_state == SYSTEM_BOOTING) || 560 if ((system_state == SYSTEM_BOOTING) ||
433 !core_kernel_text(rec->ip)) { 561 !core_kernel_text(rec->ip)) {
434 ftrace_free_rec(rec); 562 ftrace_free_rec(rec);
435 } 563 } else
564 ftrace_bug(failed, rec->ip);
436 } 565 }
437 } 566 }
438 } 567 }
439} 568}
440 569
441static void print_ip_ins(const char *fmt, unsigned char *p)
442{
443 int i;
444
445 printk(KERN_CONT "%s", fmt);
446
447 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
448 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
449}
450
451static int 570static int
452ftrace_code_disable(struct dyn_ftrace *rec) 571ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
453{ 572{
454 unsigned long ip; 573 unsigned long ip;
455 unsigned char *nop, *call;
456 int ret; 574 int ret;
457 575
458 ip = rec->ip; 576 ip = rec->ip;
459 577
460 nop = ftrace_nop_replace(); 578 ret = ftrace_make_nop(mod, rec, mcount_addr);
461 call = ftrace_call_replace(ip, mcount_addr);
462
463 ret = ftrace_modify_code(ip, call, nop);
464 if (ret) { 579 if (ret) {
465 switch (ret) { 580 ftrace_bug(ret, ip);
466 case -EFAULT:
467 FTRACE_WARN_ON_ONCE(1);
468 pr_info("ftrace faulted on modifying ");
469 print_ip_sym(ip);
470 break;
471 case -EINVAL:
472 FTRACE_WARN_ON_ONCE(1);
473 pr_info("ftrace failed to modify ");
474 print_ip_sym(ip);
475 print_ip_ins(" expected: ", call);
476 print_ip_ins(" actual: ", (unsigned char *)ip);
477 print_ip_ins(" replace: ", nop);
478 printk(KERN_CONT "\n");
479 break;
480 case -EPERM:
481 FTRACE_WARN_ON_ONCE(1);
482 pr_info("ftrace faulted on writing ");
483 print_ip_sym(ip);
484 break;
485 default:
486 FTRACE_WARN_ON_ONCE(1);
487 pr_info("ftrace faulted on unknown error ");
488 print_ip_sym(ip);
489 }
490
491 rec->flags |= FTRACE_FL_FAILED; 581 rec->flags |= FTRACE_FL_FAILED;
492 return 0; 582 return 0;
493 } 583 }
@@ -506,6 +596,11 @@ static int __ftrace_modify_code(void *data)
506 if (*command & FTRACE_UPDATE_TRACE_FUNC) 596 if (*command & FTRACE_UPDATE_TRACE_FUNC)
507 ftrace_update_ftrace_func(ftrace_trace_function); 597 ftrace_update_ftrace_func(ftrace_trace_function);
508 598
599 if (*command & FTRACE_START_FUNC_RET)
600 ftrace_enable_ftrace_graph_caller();
601 else if (*command & FTRACE_STOP_FUNC_RET)
602 ftrace_disable_ftrace_graph_caller();
603
509 return 0; 604 return 0;
510} 605}
511 606
@@ -515,43 +610,43 @@ static void ftrace_run_update_code(int command)
515} 610}
516 611
517static ftrace_func_t saved_ftrace_func; 612static ftrace_func_t saved_ftrace_func;
518static int ftrace_start; 613static int ftrace_start_up;
519static DEFINE_MUTEX(ftrace_start_lock);
520 614
521static void ftrace_startup(void) 615static void ftrace_startup_enable(int command)
522{ 616{
523 int command = 0;
524
525 if (unlikely(ftrace_disabled))
526 return;
527
528 mutex_lock(&ftrace_start_lock);
529 ftrace_start++;
530 command |= FTRACE_ENABLE_CALLS;
531
532 if (saved_ftrace_func != ftrace_trace_function) { 617 if (saved_ftrace_func != ftrace_trace_function) {
533 saved_ftrace_func = ftrace_trace_function; 618 saved_ftrace_func = ftrace_trace_function;
534 command |= FTRACE_UPDATE_TRACE_FUNC; 619 command |= FTRACE_UPDATE_TRACE_FUNC;
535 } 620 }
536 621
537 if (!command || !ftrace_enabled) 622 if (!command || !ftrace_enabled)
538 goto out; 623 return;
539 624
540 ftrace_run_update_code(command); 625 ftrace_run_update_code(command);
541 out:
542 mutex_unlock(&ftrace_start_lock);
543} 626}
544 627
545static void ftrace_shutdown(void) 628static void ftrace_startup(int command)
546{ 629{
547 int command = 0; 630 if (unlikely(ftrace_disabled))
631 return;
632
633 mutex_lock(&ftrace_start_lock);
634 ftrace_start_up++;
635 command |= FTRACE_ENABLE_CALLS;
548 636
637 ftrace_startup_enable(command);
638
639 mutex_unlock(&ftrace_start_lock);
640}
641
642static void ftrace_shutdown(int command)
643{
549 if (unlikely(ftrace_disabled)) 644 if (unlikely(ftrace_disabled))
550 return; 645 return;
551 646
552 mutex_lock(&ftrace_start_lock); 647 mutex_lock(&ftrace_start_lock);
553 ftrace_start--; 648 ftrace_start_up--;
554 if (!ftrace_start) 649 if (!ftrace_start_up)
555 command |= FTRACE_DISABLE_CALLS; 650 command |= FTRACE_DISABLE_CALLS;
556 651
557 if (saved_ftrace_func != ftrace_trace_function) { 652 if (saved_ftrace_func != ftrace_trace_function) {
@@ -577,8 +672,8 @@ static void ftrace_startup_sysctl(void)
577 mutex_lock(&ftrace_start_lock); 672 mutex_lock(&ftrace_start_lock);
578 /* Force update next time */ 673 /* Force update next time */
579 saved_ftrace_func = NULL; 674 saved_ftrace_func = NULL;
580 /* ftrace_start is true if we want ftrace running */ 675 /* ftrace_start_up is true if we want ftrace running */
581 if (ftrace_start) 676 if (ftrace_start_up)
582 command |= FTRACE_ENABLE_CALLS; 677 command |= FTRACE_ENABLE_CALLS;
583 678
584 ftrace_run_update_code(command); 679 ftrace_run_update_code(command);
@@ -593,8 +688,8 @@ static void ftrace_shutdown_sysctl(void)
593 return; 688 return;
594 689
595 mutex_lock(&ftrace_start_lock); 690 mutex_lock(&ftrace_start_lock);
596 /* ftrace_start is true if ftrace is running */ 691 /* ftrace_start_up is true if ftrace is running */
597 if (ftrace_start) 692 if (ftrace_start_up)
598 command |= FTRACE_DISABLE_CALLS; 693 command |= FTRACE_DISABLE_CALLS;
599 694
600 ftrace_run_update_code(command); 695 ftrace_run_update_code(command);
@@ -605,7 +700,7 @@ static cycle_t ftrace_update_time;
605static unsigned long ftrace_update_cnt; 700static unsigned long ftrace_update_cnt;
606unsigned long ftrace_update_tot_cnt; 701unsigned long ftrace_update_tot_cnt;
607 702
608static int ftrace_update_code(void) 703static int ftrace_update_code(struct module *mod)
609{ 704{
610 struct dyn_ftrace *p, *t; 705 struct dyn_ftrace *p, *t;
611 cycle_t start, stop; 706 cycle_t start, stop;
@@ -622,7 +717,7 @@ static int ftrace_update_code(void)
622 list_del_init(&p->list); 717 list_del_init(&p->list);
623 718
624 /* convert record (i.e, patch mcount-call with NOP) */ 719 /* convert record (i.e, patch mcount-call with NOP) */
625 if (ftrace_code_disable(p)) { 720 if (ftrace_code_disable(mod, p)) {
626 p->flags |= FTRACE_FL_CONVERTED; 721 p->flags |= FTRACE_FL_CONVERTED;
627 ftrace_update_cnt++; 722 ftrace_update_cnt++;
628 } else 723 } else
@@ -690,7 +785,6 @@ enum {
690#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 785#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
691 786
692struct ftrace_iterator { 787struct ftrace_iterator {
693 loff_t pos;
694 struct ftrace_page *pg; 788 struct ftrace_page *pg;
695 unsigned idx; 789 unsigned idx;
696 unsigned flags; 790 unsigned flags;
@@ -715,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
715 iter->pg = iter->pg->next; 809 iter->pg = iter->pg->next;
716 iter->idx = 0; 810 iter->idx = 0;
717 goto retry; 811 goto retry;
812 } else {
813 iter->idx = -1;
718 } 814 }
719 } else { 815 } else {
720 rec = &iter->pg->records[iter->idx++]; 816 rec = &iter->pg->records[iter->idx++];
@@ -737,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
737 } 833 }
738 spin_unlock(&ftrace_lock); 834 spin_unlock(&ftrace_lock);
739 835
740 iter->pos = *pos;
741
742 return rec; 836 return rec;
743} 837}
744 838
@@ -746,13 +840,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
746{ 840{
747 struct ftrace_iterator *iter = m->private; 841 struct ftrace_iterator *iter = m->private;
748 void *p = NULL; 842 void *p = NULL;
749 loff_t l = -1;
750 843
751 if (*pos > iter->pos) 844 if (*pos > 0) {
752 *pos = iter->pos; 845 if (iter->idx < 0)
846 return p;
847 (*pos)--;
848 iter->idx--;
849 }
753 850
754 l = *pos; 851 p = t_next(m, p, pos);
755 p = t_next(m, p, &l);
756 852
757 return p; 853 return p;
758} 854}
@@ -763,21 +859,15 @@ static void t_stop(struct seq_file *m, void *p)
763 859
764static int t_show(struct seq_file *m, void *v) 860static int t_show(struct seq_file *m, void *v)
765{ 861{
766 struct ftrace_iterator *iter = m->private;
767 struct dyn_ftrace *rec = v; 862 struct dyn_ftrace *rec = v;
768 char str[KSYM_SYMBOL_LEN]; 863 char str[KSYM_SYMBOL_LEN];
769 int ret = 0;
770 864
771 if (!rec) 865 if (!rec)
772 return 0; 866 return 0;
773 867
774 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 868 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
775 869
776 ret = seq_printf(m, "%s\n", str); 870 seq_printf(m, "%s\n", str);
777 if (ret < 0) {
778 iter->pos--;
779 iter->idx--;
780 }
781 871
782 return 0; 872 return 0;
783} 873}
@@ -803,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
803 return -ENOMEM; 893 return -ENOMEM;
804 894
805 iter->pg = ftrace_pages_start; 895 iter->pg = ftrace_pages_start;
806 iter->pos = 0;
807 896
808 ret = seq_open(file, &show_ftrace_seq_ops); 897 ret = seq_open(file, &show_ftrace_seq_ops);
809 if (!ret) { 898 if (!ret) {
@@ -890,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
890 979
891 if (file->f_mode & FMODE_READ) { 980 if (file->f_mode & FMODE_READ) {
892 iter->pg = ftrace_pages_start; 981 iter->pg = ftrace_pages_start;
893 iter->pos = 0;
894 iter->flags = enable ? FTRACE_ITER_FILTER : 982 iter->flags = enable ? FTRACE_ITER_FILTER :
895 FTRACE_ITER_NOTRACE; 983 FTRACE_ITER_NOTRACE;
896 984
@@ -959,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable)
959 int type = MATCH_FULL; 1047 int type = MATCH_FULL;
960 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1048 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
961 unsigned i, match = 0, search_len = 0; 1049 unsigned i, match = 0, search_len = 0;
1050 int not = 0;
1051
1052 if (buff[0] == '!') {
1053 not = 1;
1054 buff++;
1055 len--;
1056 }
962 1057
963 for (i = 0; i < len; i++) { 1058 for (i = 0; i < len; i++) {
964 if (buff[i] == '*') { 1059 if (buff[i] == '*') {
@@ -1012,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable)
1012 matched = 1; 1107 matched = 1;
1013 break; 1108 break;
1014 } 1109 }
1015 if (matched) 1110 if (matched) {
1016 rec->flags |= flag; 1111 if (not)
1112 rec->flags &= ~flag;
1113 else
1114 rec->flags |= flag;
1115 }
1017 } 1116 }
1018 pg = pg->next; 1117 pg = pg->next;
1019 } 1118 }
@@ -1181,7 +1280,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1181 1280
1182 mutex_lock(&ftrace_sysctl_lock); 1281 mutex_lock(&ftrace_sysctl_lock);
1183 mutex_lock(&ftrace_start_lock); 1282 mutex_lock(&ftrace_start_lock);
1184 if (ftrace_start && ftrace_enabled) 1283 if (ftrace_start_up && ftrace_enabled)
1185 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1284 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1186 mutex_unlock(&ftrace_start_lock); 1285 mutex_unlock(&ftrace_start_lock);
1187 mutex_unlock(&ftrace_sysctl_lock); 1286 mutex_unlock(&ftrace_sysctl_lock);
@@ -1233,12 +1332,233 @@ static struct file_operations ftrace_notrace_fops = {
1233 .release = ftrace_notrace_release, 1332 .release = ftrace_notrace_release,
1234}; 1333};
1235 1334
1236static __init int ftrace_init_debugfs(void) 1335#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1336
1337static DEFINE_MUTEX(graph_lock);
1338
1339int ftrace_graph_count;
1340unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1341
1342static void *
1343g_next(struct seq_file *m, void *v, loff_t *pos)
1237{ 1344{
1238 struct dentry *d_tracer; 1345 unsigned long *array = m->private;
1239 struct dentry *entry; 1346 int index = *pos;
1240 1347
1241 d_tracer = tracing_init_dentry(); 1348 (*pos)++;
1349
1350 if (index >= ftrace_graph_count)
1351 return NULL;
1352
1353 return &array[index];
1354}
1355
1356static void *g_start(struct seq_file *m, loff_t *pos)
1357{
1358 void *p = NULL;
1359
1360 mutex_lock(&graph_lock);
1361
1362 p = g_next(m, p, pos);
1363
1364 return p;
1365}
1366
1367static void g_stop(struct seq_file *m, void *p)
1368{
1369 mutex_unlock(&graph_lock);
1370}
1371
1372static int g_show(struct seq_file *m, void *v)
1373{
1374 unsigned long *ptr = v;
1375 char str[KSYM_SYMBOL_LEN];
1376
1377 if (!ptr)
1378 return 0;
1379
1380 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1381
1382 seq_printf(m, "%s\n", str);
1383
1384 return 0;
1385}
1386
1387static struct seq_operations ftrace_graph_seq_ops = {
1388 .start = g_start,
1389 .next = g_next,
1390 .stop = g_stop,
1391 .show = g_show,
1392};
1393
1394static int
1395ftrace_graph_open(struct inode *inode, struct file *file)
1396{
1397 int ret = 0;
1398
1399 if (unlikely(ftrace_disabled))
1400 return -ENODEV;
1401
1402 mutex_lock(&graph_lock);
1403 if ((file->f_mode & FMODE_WRITE) &&
1404 !(file->f_flags & O_APPEND)) {
1405 ftrace_graph_count = 0;
1406 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1407 }
1408
1409 if (file->f_mode & FMODE_READ) {
1410 ret = seq_open(file, &ftrace_graph_seq_ops);
1411 if (!ret) {
1412 struct seq_file *m = file->private_data;
1413 m->private = ftrace_graph_funcs;
1414 }
1415 } else
1416 file->private_data = ftrace_graph_funcs;
1417 mutex_unlock(&graph_lock);
1418
1419 return ret;
1420}
1421
1422static ssize_t
1423ftrace_graph_read(struct file *file, char __user *ubuf,
1424 size_t cnt, loff_t *ppos)
1425{
1426 if (file->f_mode & FMODE_READ)
1427 return seq_read(file, ubuf, cnt, ppos);
1428 else
1429 return -EPERM;
1430}
1431
1432static int
1433ftrace_set_func(unsigned long *array, int idx, char *buffer)
1434{
1435 char str[KSYM_SYMBOL_LEN];
1436 struct dyn_ftrace *rec;
1437 struct ftrace_page *pg;
1438 int found = 0;
1439 int i, j;
1440
1441 if (ftrace_disabled)
1442 return -ENODEV;
1443
1444 /* should not be called from interrupt context */
1445 spin_lock(&ftrace_lock);
1446
1447 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1448 for (i = 0; i < pg->index; i++) {
1449 rec = &pg->records[i];
1450
1451 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1452 continue;
1453
1454 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1455 if (strcmp(str, buffer) == 0) {
1456 found = 1;
1457 for (j = 0; j < idx; j++)
1458 if (array[j] == rec->ip) {
1459 found = 0;
1460 break;
1461 }
1462 if (found)
1463 array[idx] = rec->ip;
1464 break;
1465 }
1466 }
1467 }
1468 spin_unlock(&ftrace_lock);
1469
1470 return found ? 0 : -EINVAL;
1471}
1472
1473static ssize_t
1474ftrace_graph_write(struct file *file, const char __user *ubuf,
1475 size_t cnt, loff_t *ppos)
1476{
1477 unsigned char buffer[FTRACE_BUFF_MAX+1];
1478 unsigned long *array;
1479 size_t read = 0;
1480 ssize_t ret;
1481 int index = 0;
1482 char ch;
1483
1484 if (!cnt || cnt < 0)
1485 return 0;
1486
1487 mutex_lock(&graph_lock);
1488
1489 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
1490 ret = -EBUSY;
1491 goto out;
1492 }
1493
1494 if (file->f_mode & FMODE_READ) {
1495 struct seq_file *m = file->private_data;
1496 array = m->private;
1497 } else
1498 array = file->private_data;
1499
1500 ret = get_user(ch, ubuf++);
1501 if (ret)
1502 goto out;
1503 read++;
1504 cnt--;
1505
1506 /* skip white space */
1507 while (cnt && isspace(ch)) {
1508 ret = get_user(ch, ubuf++);
1509 if (ret)
1510 goto out;
1511 read++;
1512 cnt--;
1513 }
1514
1515 if (isspace(ch)) {
1516 *ppos += read;
1517 ret = read;
1518 goto out;
1519 }
1520
1521 while (cnt && !isspace(ch)) {
1522 if (index < FTRACE_BUFF_MAX)
1523 buffer[index++] = ch;
1524 else {
1525 ret = -EINVAL;
1526 goto out;
1527 }
1528 ret = get_user(ch, ubuf++);
1529 if (ret)
1530 goto out;
1531 read++;
1532 cnt--;
1533 }
1534 buffer[index] = 0;
1535
1536 /* we allow only one at a time */
1537 ret = ftrace_set_func(array, ftrace_graph_count, buffer);
1538 if (ret)
1539 goto out;
1540
1541 ftrace_graph_count++;
1542
1543 file->f_pos += read;
1544
1545 ret = read;
1546 out:
1547 mutex_unlock(&graph_lock);
1548
1549 return ret;
1550}
1551
1552static const struct file_operations ftrace_graph_fops = {
1553 .open = ftrace_graph_open,
1554 .read = ftrace_graph_read,
1555 .write = ftrace_graph_write,
1556};
1557#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1558
1559static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
1560{
1561 struct dentry *entry;
1242 1562
1243 entry = debugfs_create_file("available_filter_functions", 0444, 1563 entry = debugfs_create_file("available_filter_functions", 0444,
1244 d_tracer, NULL, &ftrace_avail_fops); 1564 d_tracer, NULL, &ftrace_avail_fops);
@@ -1263,12 +1583,20 @@ static __init int ftrace_init_debugfs(void)
1263 pr_warning("Could not create debugfs " 1583 pr_warning("Could not create debugfs "
1264 "'set_ftrace_notrace' entry\n"); 1584 "'set_ftrace_notrace' entry\n");
1265 1585
1586#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1587 entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
1588 NULL,
1589 &ftrace_graph_fops);
1590 if (!entry)
1591 pr_warning("Could not create debugfs "
1592 "'set_graph_function' entry\n");
1593#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1594
1266 return 0; 1595 return 0;
1267} 1596}
1268 1597
1269fs_initcall(ftrace_init_debugfs); 1598static int ftrace_convert_nops(struct module *mod,
1270 1599 unsigned long *start,
1271static int ftrace_convert_nops(unsigned long *start,
1272 unsigned long *end) 1600 unsigned long *end)
1273{ 1601{
1274 unsigned long *p; 1602 unsigned long *p;
@@ -1279,23 +1607,32 @@ static int ftrace_convert_nops(unsigned long *start,
1279 p = start; 1607 p = start;
1280 while (p < end) { 1608 while (p < end) {
1281 addr = ftrace_call_adjust(*p++); 1609 addr = ftrace_call_adjust(*p++);
1610 /*
1611 * Some architecture linkers will pad between
1612 * the different mcount_loc sections of different
1613 * object files to satisfy alignments.
1614 * Skip any NULL pointers.
1615 */
1616 if (!addr)
1617 continue;
1282 ftrace_record_ip(addr); 1618 ftrace_record_ip(addr);
1283 } 1619 }
1284 1620
1285 /* disable interrupts to prevent kstop machine */ 1621 /* disable interrupts to prevent kstop machine */
1286 local_irq_save(flags); 1622 local_irq_save(flags);
1287 ftrace_update_code(); 1623 ftrace_update_code(mod);
1288 local_irq_restore(flags); 1624 local_irq_restore(flags);
1289 mutex_unlock(&ftrace_start_lock); 1625 mutex_unlock(&ftrace_start_lock);
1290 1626
1291 return 0; 1627 return 0;
1292} 1628}
1293 1629
1294void ftrace_init_module(unsigned long *start, unsigned long *end) 1630void ftrace_init_module(struct module *mod,
1631 unsigned long *start, unsigned long *end)
1295{ 1632{
1296 if (ftrace_disabled || start == end) 1633 if (ftrace_disabled || start == end)
1297 return; 1634 return;
1298 ftrace_convert_nops(start, end); 1635 ftrace_convert_nops(mod, start, end);
1299} 1636}
1300 1637
1301extern unsigned long __start_mcount_loc[]; 1638extern unsigned long __start_mcount_loc[];
@@ -1325,7 +1662,8 @@ void __init ftrace_init(void)
1325 1662
1326 last_ftrace_enabled = ftrace_enabled = 1; 1663 last_ftrace_enabled = ftrace_enabled = 1;
1327 1664
1328 ret = ftrace_convert_nops(__start_mcount_loc, 1665 ret = ftrace_convert_nops(NULL,
1666 __start_mcount_loc,
1329 __stop_mcount_loc); 1667 __stop_mcount_loc);
1330 1668
1331 return; 1669 return;
@@ -1342,12 +1680,186 @@ static int __init ftrace_nodyn_init(void)
1342} 1680}
1343device_initcall(ftrace_nodyn_init); 1681device_initcall(ftrace_nodyn_init);
1344 1682
1345# define ftrace_startup() do { } while (0) 1683static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
1346# define ftrace_shutdown() do { } while (0) 1684static inline void ftrace_startup_enable(int command) { }
1685/* Keep as macros so we do not need to define the commands */
1686# define ftrace_startup(command) do { } while (0)
1687# define ftrace_shutdown(command) do { } while (0)
1347# define ftrace_startup_sysctl() do { } while (0) 1688# define ftrace_startup_sysctl() do { } while (0)
1348# define ftrace_shutdown_sysctl() do { } while (0) 1689# define ftrace_shutdown_sysctl() do { } while (0)
1349#endif /* CONFIG_DYNAMIC_FTRACE */ 1690#endif /* CONFIG_DYNAMIC_FTRACE */
1350 1691
1692static ssize_t
1693ftrace_pid_read(struct file *file, char __user *ubuf,
1694 size_t cnt, loff_t *ppos)
1695{
1696 char buf[64];
1697 int r;
1698
1699 if (ftrace_pid_trace == ftrace_swapper_pid)
1700 r = sprintf(buf, "swapper tasks\n");
1701 else if (ftrace_pid_trace)
1702 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
1703 else
1704 r = sprintf(buf, "no pid\n");
1705
1706 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1707}
1708
1709static void clear_ftrace_swapper(void)
1710{
1711 struct task_struct *p;
1712 int cpu;
1713
1714 get_online_cpus();
1715 for_each_online_cpu(cpu) {
1716 p = idle_task(cpu);
1717 clear_tsk_trace_trace(p);
1718 }
1719 put_online_cpus();
1720}
1721
1722static void set_ftrace_swapper(void)
1723{
1724 struct task_struct *p;
1725 int cpu;
1726
1727 get_online_cpus();
1728 for_each_online_cpu(cpu) {
1729 p = idle_task(cpu);
1730 set_tsk_trace_trace(p);
1731 }
1732 put_online_cpus();
1733}
1734
1735static void clear_ftrace_pid(struct pid *pid)
1736{
1737 struct task_struct *p;
1738
1739 do_each_pid_task(pid, PIDTYPE_PID, p) {
1740 clear_tsk_trace_trace(p);
1741 } while_each_pid_task(pid, PIDTYPE_PID, p);
1742 put_pid(pid);
1743}
1744
1745static void set_ftrace_pid(struct pid *pid)
1746{
1747 struct task_struct *p;
1748
1749 do_each_pid_task(pid, PIDTYPE_PID, p) {
1750 set_tsk_trace_trace(p);
1751 } while_each_pid_task(pid, PIDTYPE_PID, p);
1752}
1753
1754static void clear_ftrace_pid_task(struct pid **pid)
1755{
1756 if (*pid == ftrace_swapper_pid)
1757 clear_ftrace_swapper();
1758 else
1759 clear_ftrace_pid(*pid);
1760
1761 *pid = NULL;
1762}
1763
1764static void set_ftrace_pid_task(struct pid *pid)
1765{
1766 if (pid == ftrace_swapper_pid)
1767 set_ftrace_swapper();
1768 else
1769 set_ftrace_pid(pid);
1770}
1771
1772static ssize_t
1773ftrace_pid_write(struct file *filp, const char __user *ubuf,
1774 size_t cnt, loff_t *ppos)
1775{
1776 struct pid *pid;
1777 char buf[64];
1778 long val;
1779 int ret;
1780
1781 if (cnt >= sizeof(buf))
1782 return -EINVAL;
1783
1784 if (copy_from_user(&buf, ubuf, cnt))
1785 return -EFAULT;
1786
1787 buf[cnt] = 0;
1788
1789 ret = strict_strtol(buf, 10, &val);
1790 if (ret < 0)
1791 return ret;
1792
1793 mutex_lock(&ftrace_start_lock);
1794 if (val < 0) {
1795 /* disable pid tracing */
1796 if (!ftrace_pid_trace)
1797 goto out;
1798
1799 clear_ftrace_pid_task(&ftrace_pid_trace);
1800
1801 } else {
1802 /* swapper task is special */
1803 if (!val) {
1804 pid = ftrace_swapper_pid;
1805 if (pid == ftrace_pid_trace)
1806 goto out;
1807 } else {
1808 pid = find_get_pid(val);
1809
1810 if (pid == ftrace_pid_trace) {
1811 put_pid(pid);
1812 goto out;
1813 }
1814 }
1815
1816 if (ftrace_pid_trace)
1817 clear_ftrace_pid_task(&ftrace_pid_trace);
1818
1819 if (!pid)
1820 goto out;
1821
1822 ftrace_pid_trace = pid;
1823
1824 set_ftrace_pid_task(ftrace_pid_trace);
1825 }
1826
1827 /* update the function call */
1828 ftrace_update_pid_func();
1829 ftrace_startup_enable(0);
1830
1831 out:
1832 mutex_unlock(&ftrace_start_lock);
1833
1834 return cnt;
1835}
1836
1837static struct file_operations ftrace_pid_fops = {
1838 .read = ftrace_pid_read,
1839 .write = ftrace_pid_write,
1840};
1841
1842static __init int ftrace_init_debugfs(void)
1843{
1844 struct dentry *d_tracer;
1845 struct dentry *entry;
1846
1847 d_tracer = tracing_init_dentry();
1848 if (!d_tracer)
1849 return 0;
1850
1851 ftrace_init_dyn_debugfs(d_tracer);
1852
1853 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
1854 NULL, &ftrace_pid_fops);
1855 if (!entry)
1856 pr_warning("Could not create debugfs "
1857 "'set_ftrace_pid' entry\n");
1858 return 0;
1859}
1860
1861fs_initcall(ftrace_init_debugfs);
1862
1351/** 1863/**
1352 * ftrace_kill - kill ftrace 1864 * ftrace_kill - kill ftrace
1353 * 1865 *
@@ -1381,10 +1893,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
1381 return -1; 1893 return -1;
1382 1894
1383 mutex_lock(&ftrace_sysctl_lock); 1895 mutex_lock(&ftrace_sysctl_lock);
1896
1384 ret = __register_ftrace_function(ops); 1897 ret = __register_ftrace_function(ops);
1385 ftrace_startup(); 1898 ftrace_startup(0);
1386 mutex_unlock(&ftrace_sysctl_lock);
1387 1899
1900 mutex_unlock(&ftrace_sysctl_lock);
1388 return ret; 1901 return ret;
1389} 1902}
1390 1903
@@ -1400,7 +1913,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1400 1913
1401 mutex_lock(&ftrace_sysctl_lock); 1914 mutex_lock(&ftrace_sysctl_lock);
1402 ret = __unregister_ftrace_function(ops); 1915 ret = __unregister_ftrace_function(ops);
1403 ftrace_shutdown(); 1916 ftrace_shutdown(0);
1404 mutex_unlock(&ftrace_sysctl_lock); 1917 mutex_unlock(&ftrace_sysctl_lock);
1405 1918
1406 return ret; 1919 return ret;
@@ -1449,3 +1962,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1449 return ret; 1962 return ret;
1450} 1963}
1451 1964
1965#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1966
1967static atomic_t ftrace_graph_active;
1968
1969int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1970{
1971 return 0;
1972}
1973
1974/* The callbacks that hook a function */
1975trace_func_graph_ret_t ftrace_graph_return =
1976 (trace_func_graph_ret_t)ftrace_stub;
1977trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
1978
1979/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
1980static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
1981{
1982 int i;
1983 int ret = 0;
1984 unsigned long flags;
1985 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
1986 struct task_struct *g, *t;
1987
1988 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
1989 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
1990 * sizeof(struct ftrace_ret_stack),
1991 GFP_KERNEL);
1992 if (!ret_stack_list[i]) {
1993 start = 0;
1994 end = i;
1995 ret = -ENOMEM;
1996 goto free;
1997 }
1998 }
1999
2000 read_lock_irqsave(&tasklist_lock, flags);
2001 do_each_thread(g, t) {
2002 if (start == end) {
2003 ret = -EAGAIN;
2004 goto unlock;
2005 }
2006
2007 if (t->ret_stack == NULL) {
2008 t->curr_ret_stack = -1;
2009 /* Make sure IRQs see the -1 first: */
2010 barrier();
2011 t->ret_stack = ret_stack_list[start++];
2012 atomic_set(&t->tracing_graph_pause, 0);
2013 atomic_set(&t->trace_overrun, 0);
2014 }
2015 } while_each_thread(g, t);
2016
2017unlock:
2018 read_unlock_irqrestore(&tasklist_lock, flags);
2019free:
2020 for (i = start; i < end; i++)
2021 kfree(ret_stack_list[i]);
2022 return ret;
2023}
2024
2025/* Allocate a return stack for each task */
2026static int start_graph_tracing(void)
2027{
2028 struct ftrace_ret_stack **ret_stack_list;
2029 int ret;
2030
2031 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2032 sizeof(struct ftrace_ret_stack *),
2033 GFP_KERNEL);
2034
2035 if (!ret_stack_list)
2036 return -ENOMEM;
2037
2038 do {
2039 ret = alloc_retstack_tasklist(ret_stack_list);
2040 } while (ret == -EAGAIN);
2041
2042 kfree(ret_stack_list);
2043 return ret;
2044}
2045
2046int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2047 trace_func_graph_ent_t entryfunc)
2048{
2049 int ret = 0;
2050
2051 mutex_lock(&ftrace_sysctl_lock);
2052
2053 atomic_inc(&ftrace_graph_active);
2054 ret = start_graph_tracing();
2055 if (ret) {
2056 atomic_dec(&ftrace_graph_active);
2057 goto out;
2058 }
2059
2060 ftrace_graph_return = retfunc;
2061 ftrace_graph_entry = entryfunc;
2062
2063 ftrace_startup(FTRACE_START_FUNC_RET);
2064
2065out:
2066 mutex_unlock(&ftrace_sysctl_lock);
2067 return ret;
2068}
2069
2070void unregister_ftrace_graph(void)
2071{
2072 mutex_lock(&ftrace_sysctl_lock);
2073
2074 atomic_dec(&ftrace_graph_active);
2075 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2076 ftrace_graph_entry = ftrace_graph_entry_stub;
2077 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2078
2079 mutex_unlock(&ftrace_sysctl_lock);
2080}
2081
2082/* Allocate a return stack for newly created task */
2083void ftrace_graph_init_task(struct task_struct *t)
2084{
2085 if (atomic_read(&ftrace_graph_active)) {
2086 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2087 * sizeof(struct ftrace_ret_stack),
2088 GFP_KERNEL);
2089 if (!t->ret_stack)
2090 return;
2091 t->curr_ret_stack = -1;
2092 atomic_set(&t->tracing_graph_pause, 0);
2093 atomic_set(&t->trace_overrun, 0);
2094 } else
2095 t->ret_stack = NULL;
2096}
2097
2098void ftrace_graph_exit_task(struct task_struct *t)
2099{
2100 struct ftrace_ret_stack *ret_stack = t->ret_stack;
2101
2102 t->ret_stack = NULL;
2103 /* NULL must become visible to IRQs before we free it: */
2104 barrier();
2105
2106 kfree(ret_stack);
2107}
2108
2109void ftrace_graph_stop(void)
2110{
2111 ftrace_stop();
2112}
2113#endif
2114
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 30d57dd01a85..1d601a7c4587 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -18,8 +18,46 @@
18 18
19#include "trace.h" 19#include "trace.h"
20 20
21/* Global flag to disable all recording to ring buffers */ 21/*
22static int ring_buffers_off __read_mostly; 22 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to.
25 * Turning this switch on, makes it OK to write to the
26 * ring buffer, if the ring buffer is enabled itself.
27 *
28 * There's three layers that must be on in order to write
29 * to the ring buffer.
30 *
31 * 1) This global flag must be set.
32 * 2) The ring buffer must be enabled for recording.
33 * 3) The per cpu buffer must be enabled for recording.
34 *
35 * In case of an anomaly, this global flag has a bit set that
36 * will permantly disable all ring buffers.
37 */
38
39/*
40 * Global flag to disable all recording to ring buffers
41 * This has two bits: ON, DISABLED
42 *
43 * ON DISABLED
44 * ---- ----------
45 * 0 0 : ring buffers are off
46 * 1 0 : ring buffers are on
47 * X 1 : ring buffers are permanently disabled
48 */
49
50enum {
51 RB_BUFFERS_ON_BIT = 0,
52 RB_BUFFERS_DISABLED_BIT = 1,
53};
54
55enum {
56 RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58};
59
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
23 61
24/** 62/**
25 * tracing_on - enable all tracing buffers 63 * tracing_on - enable all tracing buffers
@@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;
29 */ 67 */
30void tracing_on(void) 68void tracing_on(void)
31{ 69{
32 ring_buffers_off = 0; 70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
33} 71}
34EXPORT_SYMBOL_GPL(tracing_on); 72EXPORT_SYMBOL_GPL(tracing_on);
35 73
@@ -43,10 +81,23 @@ EXPORT_SYMBOL_GPL(tracing_on);
43 */ 81 */
44void tracing_off(void) 82void tracing_off(void)
45{ 83{
46 ring_buffers_off = 1; 84 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
47} 85}
48EXPORT_SYMBOL_GPL(tracing_off); 86EXPORT_SYMBOL_GPL(tracing_off);
49 87
88/**
89 * tracing_off_permanent - permanently disable ring buffers
90 *
91 * This function, once called, will disable all ring buffers
92 * permanenty.
93 */
94void tracing_off_permanent(void)
95{
96 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
97}
98
99#include "trace.h"
100
50/* Up this if you want to test the TIME_EXTENTS and normalization */ 101/* Up this if you want to test the TIME_EXTENTS and normalization */
51#define DEBUG_SHIFT 0 102#define DEBUG_SHIFT 0
52 103
@@ -58,7 +109,7 @@ u64 ring_buffer_time_stamp(int cpu)
58 preempt_disable_notrace(); 109 preempt_disable_notrace();
59 /* shift to debug/test normalization and TIME_EXTENTS */ 110 /* shift to debug/test normalization and TIME_EXTENTS */
60 time = sched_clock() << DEBUG_SHIFT; 111 time = sched_clock() << DEBUG_SHIFT;
61 preempt_enable_notrace(); 112 preempt_enable_no_resched_notrace();
62 113
63 return time; 114 return time;
64} 115}
@@ -150,20 +201,24 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
150#define TS_MASK ((1ULL << TS_SHIFT) - 1) 201#define TS_MASK ((1ULL << TS_SHIFT) - 1)
151#define TS_DELTA_TEST (~TS_MASK) 202#define TS_DELTA_TEST (~TS_MASK)
152 203
153/* 204struct buffer_data_page {
154 * This hack stolen from mm/slob.c.
155 * We can store per page timing information in the page frame of the page.
156 * Thanks to Peter Zijlstra for suggesting this idea.
157 */
158struct buffer_page {
159 u64 time_stamp; /* page time stamp */ 205 u64 time_stamp; /* page time stamp */
160 local_t write; /* index for next write */
161 local_t commit; /* write commited index */ 206 local_t commit; /* write commited index */
207 unsigned char data[]; /* data of buffer page */
208};
209
210struct buffer_page {
211 local_t write; /* index for next write */
162 unsigned read; /* index for next read */ 212 unsigned read; /* index for next read */
163 struct list_head list; /* list of free pages */ 213 struct list_head list; /* list of free pages */
164 void *page; /* Actual data page */ 214 struct buffer_data_page *page; /* Actual data page */
165}; 215};
166 216
217static void rb_init_page(struct buffer_data_page *bpage)
218{
219 local_set(&bpage->commit, 0);
220}
221
167/* 222/*
168 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 223 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
169 * this issue out. 224 * this issue out.
@@ -185,7 +240,7 @@ static inline int test_time_stamp(u64 delta)
185 return 0; 240 return 0;
186} 241}
187 242
188#define BUF_PAGE_SIZE PAGE_SIZE 243#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
189 244
190/* 245/*
191 * head_page == tail_page && head == tail then buffer is empty. 246 * head_page == tail_page && head == tail then buffer is empty.
@@ -193,7 +248,8 @@ static inline int test_time_stamp(u64 delta)
193struct ring_buffer_per_cpu { 248struct ring_buffer_per_cpu {
194 int cpu; 249 int cpu;
195 struct ring_buffer *buffer; 250 struct ring_buffer *buffer;
196 spinlock_t lock; 251 spinlock_t reader_lock; /* serialize readers */
252 raw_spinlock_t lock;
197 struct lock_class_key lock_key; 253 struct lock_class_key lock_key;
198 struct list_head pages; 254 struct list_head pages;
199 struct buffer_page *head_page; /* read from head */ 255 struct buffer_page *head_page; /* read from head */
@@ -208,7 +264,6 @@ struct ring_buffer_per_cpu {
208}; 264};
209 265
210struct ring_buffer { 266struct ring_buffer {
211 unsigned long size;
212 unsigned pages; 267 unsigned pages;
213 unsigned flags; 268 unsigned flags;
214 int cpus; 269 int cpus;
@@ -227,32 +282,16 @@ struct ring_buffer_iter {
227 u64 read_stamp; 282 u64 read_stamp;
228}; 283};
229 284
285/* buffer may be either ring_buffer or ring_buffer_per_cpu */
230#define RB_WARN_ON(buffer, cond) \ 286#define RB_WARN_ON(buffer, cond) \
231 do { \ 287 ({ \
232 if (unlikely(cond)) { \ 288 int _____ret = unlikely(cond); \
233 atomic_inc(&buffer->record_disabled); \ 289 if (_____ret) { \
234 WARN_ON(1); \
235 } \
236 } while (0)
237
238#define RB_WARN_ON_RET(buffer, cond) \
239 do { \
240 if (unlikely(cond)) { \
241 atomic_inc(&buffer->record_disabled); \
242 WARN_ON(1); \
243 return -1; \
244 } \
245 } while (0)
246
247#define RB_WARN_ON_ONCE(buffer, cond) \
248 do { \
249 static int once; \
250 if (unlikely(cond) && !once) { \
251 once++; \
252 atomic_inc(&buffer->record_disabled); \ 290 atomic_inc(&buffer->record_disabled); \
253 WARN_ON(1); \ 291 WARN_ON(1); \
254 } \ 292 } \
255 } while (0) 293 _____ret; \
294 })
256 295
257/** 296/**
258 * check_pages - integrity check of buffer pages 297 * check_pages - integrity check of buffer pages
@@ -264,16 +303,20 @@ struct ring_buffer_iter {
264static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 303static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
265{ 304{
266 struct list_head *head = &cpu_buffer->pages; 305 struct list_head *head = &cpu_buffer->pages;
267 struct buffer_page *page, *tmp; 306 struct buffer_page *bpage, *tmp;
268 307
269 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); 308 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
270 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); 309 return -1;
310 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
311 return -1;
271 312
272 list_for_each_entry_safe(page, tmp, head, list) { 313 list_for_each_entry_safe(bpage, tmp, head, list) {
273 RB_WARN_ON_RET(cpu_buffer, 314 if (RB_WARN_ON(cpu_buffer,
274 page->list.next->prev != &page->list); 315 bpage->list.next->prev != &bpage->list))
275 RB_WARN_ON_RET(cpu_buffer, 316 return -1;
276 page->list.prev->next != &page->list); 317 if (RB_WARN_ON(cpu_buffer,
318 bpage->list.prev->next != &bpage->list))
319 return -1;
277 } 320 }
278 321
279 return 0; 322 return 0;
@@ -283,22 +326,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
283 unsigned nr_pages) 326 unsigned nr_pages)
284{ 327{
285 struct list_head *head = &cpu_buffer->pages; 328 struct list_head *head = &cpu_buffer->pages;
286 struct buffer_page *page, *tmp; 329 struct buffer_page *bpage, *tmp;
287 unsigned long addr; 330 unsigned long addr;
288 LIST_HEAD(pages); 331 LIST_HEAD(pages);
289 unsigned i; 332 unsigned i;
290 333
291 for (i = 0; i < nr_pages; i++) { 334 for (i = 0; i < nr_pages; i++) {
292 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 335 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
293 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 336 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
294 if (!page) 337 if (!bpage)
295 goto free_pages; 338 goto free_pages;
296 list_add(&page->list, &pages); 339 list_add(&bpage->list, &pages);
297 340
298 addr = __get_free_page(GFP_KERNEL); 341 addr = __get_free_page(GFP_KERNEL);
299 if (!addr) 342 if (!addr)
300 goto free_pages; 343 goto free_pages;
301 page->page = (void *)addr; 344 bpage->page = (void *)addr;
345 rb_init_page(bpage->page);
302 } 346 }
303 347
304 list_splice(&pages, head); 348 list_splice(&pages, head);
@@ -308,9 +352,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
308 return 0; 352 return 0;
309 353
310 free_pages: 354 free_pages:
311 list_for_each_entry_safe(page, tmp, &pages, list) { 355 list_for_each_entry_safe(bpage, tmp, &pages, list) {
312 list_del_init(&page->list); 356 list_del_init(&bpage->list);
313 free_buffer_page(page); 357 free_buffer_page(bpage);
314 } 358 }
315 return -ENOMEM; 359 return -ENOMEM;
316} 360}
@@ -319,7 +363,7 @@ static struct ring_buffer_per_cpu *
319rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 363rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
320{ 364{
321 struct ring_buffer_per_cpu *cpu_buffer; 365 struct ring_buffer_per_cpu *cpu_buffer;
322 struct buffer_page *page; 366 struct buffer_page *bpage;
323 unsigned long addr; 367 unsigned long addr;
324 int ret; 368 int ret;
325 369
@@ -330,19 +374,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
330 374
331 cpu_buffer->cpu = cpu; 375 cpu_buffer->cpu = cpu;
332 cpu_buffer->buffer = buffer; 376 cpu_buffer->buffer = buffer;
333 spin_lock_init(&cpu_buffer->lock); 377 spin_lock_init(&cpu_buffer->reader_lock);
378 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
334 INIT_LIST_HEAD(&cpu_buffer->pages); 379 INIT_LIST_HEAD(&cpu_buffer->pages);
335 380
336 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 381 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
337 GFP_KERNEL, cpu_to_node(cpu)); 382 GFP_KERNEL, cpu_to_node(cpu));
338 if (!page) 383 if (!bpage)
339 goto fail_free_buffer; 384 goto fail_free_buffer;
340 385
341 cpu_buffer->reader_page = page; 386 cpu_buffer->reader_page = bpage;
342 addr = __get_free_page(GFP_KERNEL); 387 addr = __get_free_page(GFP_KERNEL);
343 if (!addr) 388 if (!addr)
344 goto fail_free_reader; 389 goto fail_free_reader;
345 page->page = (void *)addr; 390 bpage->page = (void *)addr;
391 rb_init_page(bpage->page);
346 392
347 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 393 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
348 394
@@ -367,14 +413,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
367static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 413static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
368{ 414{
369 struct list_head *head = &cpu_buffer->pages; 415 struct list_head *head = &cpu_buffer->pages;
370 struct buffer_page *page, *tmp; 416 struct buffer_page *bpage, *tmp;
371 417
372 list_del_init(&cpu_buffer->reader_page->list); 418 list_del_init(&cpu_buffer->reader_page->list);
373 free_buffer_page(cpu_buffer->reader_page); 419 free_buffer_page(cpu_buffer->reader_page);
374 420
375 list_for_each_entry_safe(page, tmp, head, list) { 421 list_for_each_entry_safe(bpage, tmp, head, list) {
376 list_del_init(&page->list); 422 list_del_init(&bpage->list);
377 free_buffer_page(page); 423 free_buffer_page(bpage);
378 } 424 }
379 kfree(cpu_buffer); 425 kfree(cpu_buffer);
380} 426}
@@ -473,7 +519,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
473static void 519static void
474rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) 520rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
475{ 521{
476 struct buffer_page *page; 522 struct buffer_page *bpage;
477 struct list_head *p; 523 struct list_head *p;
478 unsigned i; 524 unsigned i;
479 525
@@ -481,13 +527,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
481 synchronize_sched(); 527 synchronize_sched();
482 528
483 for (i = 0; i < nr_pages; i++) { 529 for (i = 0; i < nr_pages; i++) {
484 BUG_ON(list_empty(&cpu_buffer->pages)); 530 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
531 return;
485 p = cpu_buffer->pages.next; 532 p = cpu_buffer->pages.next;
486 page = list_entry(p, struct buffer_page, list); 533 bpage = list_entry(p, struct buffer_page, list);
487 list_del_init(&page->list); 534 list_del_init(&bpage->list);
488 free_buffer_page(page); 535 free_buffer_page(bpage);
489 } 536 }
490 BUG_ON(list_empty(&cpu_buffer->pages)); 537 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
538 return;
491 539
492 rb_reset_cpu(cpu_buffer); 540 rb_reset_cpu(cpu_buffer);
493 541
@@ -501,7 +549,7 @@ static void
501rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 549rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
502 struct list_head *pages, unsigned nr_pages) 550 struct list_head *pages, unsigned nr_pages)
503{ 551{
504 struct buffer_page *page; 552 struct buffer_page *bpage;
505 struct list_head *p; 553 struct list_head *p;
506 unsigned i; 554 unsigned i;
507 555
@@ -509,11 +557,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
509 synchronize_sched(); 557 synchronize_sched();
510 558
511 for (i = 0; i < nr_pages; i++) { 559 for (i = 0; i < nr_pages; i++) {
512 BUG_ON(list_empty(pages)); 560 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
561 return;
513 p = pages->next; 562 p = pages->next;
514 page = list_entry(p, struct buffer_page, list); 563 bpage = list_entry(p, struct buffer_page, list);
515 list_del_init(&page->list); 564 list_del_init(&bpage->list);
516 list_add_tail(&page->list, &cpu_buffer->pages); 565 list_add_tail(&bpage->list, &cpu_buffer->pages);
517 } 566 }
518 rb_reset_cpu(cpu_buffer); 567 rb_reset_cpu(cpu_buffer);
519 568
@@ -540,7 +589,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
540{ 589{
541 struct ring_buffer_per_cpu *cpu_buffer; 590 struct ring_buffer_per_cpu *cpu_buffer;
542 unsigned nr_pages, rm_pages, new_pages; 591 unsigned nr_pages, rm_pages, new_pages;
543 struct buffer_page *page, *tmp; 592 struct buffer_page *bpage, *tmp;
544 unsigned long buffer_size; 593 unsigned long buffer_size;
545 unsigned long addr; 594 unsigned long addr;
546 LIST_HEAD(pages); 595 LIST_HEAD(pages);
@@ -570,7 +619,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
570 if (size < buffer_size) { 619 if (size < buffer_size) {
571 620
572 /* easy case, just free pages */ 621 /* easy case, just free pages */
573 BUG_ON(nr_pages >= buffer->pages); 622 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
623 mutex_unlock(&buffer->mutex);
624 return -1;
625 }
574 626
575 rm_pages = buffer->pages - nr_pages; 627 rm_pages = buffer->pages - nr_pages;
576 628
@@ -589,21 +641,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
589 * add these pages to the cpu_buffers. Otherwise we just free 641 * add these pages to the cpu_buffers. Otherwise we just free
590 * them all and return -ENOMEM; 642 * them all and return -ENOMEM;
591 */ 643 */
592 BUG_ON(nr_pages <= buffer->pages); 644 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
645 mutex_unlock(&buffer->mutex);
646 return -1;
647 }
648
593 new_pages = nr_pages - buffer->pages; 649 new_pages = nr_pages - buffer->pages;
594 650
595 for_each_buffer_cpu(buffer, cpu) { 651 for_each_buffer_cpu(buffer, cpu) {
596 for (i = 0; i < new_pages; i++) { 652 for (i = 0; i < new_pages; i++) {
597 page = kzalloc_node(ALIGN(sizeof(*page), 653 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
598 cache_line_size()), 654 cache_line_size()),
599 GFP_KERNEL, cpu_to_node(cpu)); 655 GFP_KERNEL, cpu_to_node(cpu));
600 if (!page) 656 if (!bpage)
601 goto free_pages; 657 goto free_pages;
602 list_add(&page->list, &pages); 658 list_add(&bpage->list, &pages);
603 addr = __get_free_page(GFP_KERNEL); 659 addr = __get_free_page(GFP_KERNEL);
604 if (!addr) 660 if (!addr)
605 goto free_pages; 661 goto free_pages;
606 page->page = (void *)addr; 662 bpage->page = (void *)addr;
663 rb_init_page(bpage->page);
607 } 664 }
608 } 665 }
609 666
@@ -612,7 +669,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
612 rb_insert_pages(cpu_buffer, &pages, new_pages); 669 rb_insert_pages(cpu_buffer, &pages, new_pages);
613 } 670 }
614 671
615 BUG_ON(!list_empty(&pages)); 672 if (RB_WARN_ON(buffer, !list_empty(&pages))) {
673 mutex_unlock(&buffer->mutex);
674 return -1;
675 }
616 676
617 out: 677 out:
618 buffer->pages = nr_pages; 678 buffer->pages = nr_pages;
@@ -621,9 +681,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
621 return size; 681 return size;
622 682
623 free_pages: 683 free_pages:
624 list_for_each_entry_safe(page, tmp, &pages, list) { 684 list_for_each_entry_safe(bpage, tmp, &pages, list) {
625 list_del_init(&page->list); 685 list_del_init(&bpage->list);
626 free_buffer_page(page); 686 free_buffer_page(bpage);
627 } 687 }
628 mutex_unlock(&buffer->mutex); 688 mutex_unlock(&buffer->mutex);
629 return -ENOMEM; 689 return -ENOMEM;
@@ -635,9 +695,15 @@ static inline int rb_null_event(struct ring_buffer_event *event)
635 return event->type == RINGBUF_TYPE_PADDING; 695 return event->type == RINGBUF_TYPE_PADDING;
636} 696}
637 697
638static inline void *__rb_page_index(struct buffer_page *page, unsigned index) 698static inline void *
699__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
700{
701 return bpage->data + index;
702}
703
704static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
639{ 705{
640 return page->page + index; 706 return bpage->page->data + index;
641} 707}
642 708
643static inline struct ring_buffer_event * 709static inline struct ring_buffer_event *
@@ -667,7 +733,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage)
667 733
668static inline unsigned rb_page_commit(struct buffer_page *bpage) 734static inline unsigned rb_page_commit(struct buffer_page *bpage)
669{ 735{
670 return local_read(&bpage->commit); 736 return local_read(&bpage->page->commit);
671} 737}
672 738
673/* Size is determined by what has been commited */ 739/* Size is determined by what has been commited */
@@ -702,7 +768,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
702 head += rb_event_length(event)) { 768 head += rb_event_length(event)) {
703 769
704 event = __rb_page_index(cpu_buffer->head_page, head); 770 event = __rb_page_index(cpu_buffer->head_page, head);
705 BUG_ON(rb_null_event(event)); 771 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
772 return;
706 /* Only count data entries */ 773 /* Only count data entries */
707 if (event->type != RINGBUF_TYPE_DATA) 774 if (event->type != RINGBUF_TYPE_DATA)
708 continue; 775 continue;
@@ -712,14 +779,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
712} 779}
713 780
714static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 781static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page **page) 782 struct buffer_page **bpage)
716{ 783{
717 struct list_head *p = (*page)->list.next; 784 struct list_head *p = (*bpage)->list.next;
718 785
719 if (p == &cpu_buffer->pages) 786 if (p == &cpu_buffer->pages)
720 p = p->next; 787 p = p->next;
721 788
722 *page = list_entry(p, struct buffer_page, list); 789 *bpage = list_entry(p, struct buffer_page, list);
723} 790}
724 791
725static inline unsigned 792static inline unsigned
@@ -755,16 +822,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
755 addr &= PAGE_MASK; 822 addr &= PAGE_MASK;
756 823
757 while (cpu_buffer->commit_page->page != (void *)addr) { 824 while (cpu_buffer->commit_page->page != (void *)addr) {
758 RB_WARN_ON(cpu_buffer, 825 if (RB_WARN_ON(cpu_buffer,
759 cpu_buffer->commit_page == cpu_buffer->tail_page); 826 cpu_buffer->commit_page == cpu_buffer->tail_page))
760 cpu_buffer->commit_page->commit = 827 return;
828 cpu_buffer->commit_page->page->commit =
761 cpu_buffer->commit_page->write; 829 cpu_buffer->commit_page->write;
762 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 830 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
763 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 831 cpu_buffer->write_stamp =
832 cpu_buffer->commit_page->page->time_stamp;
764 } 833 }
765 834
766 /* Now set the commit to the event's index */ 835 /* Now set the commit to the event's index */
767 local_set(&cpu_buffer->commit_page->commit, index); 836 local_set(&cpu_buffer->commit_page->page->commit, index);
768} 837}
769 838
770static inline void 839static inline void
@@ -778,25 +847,38 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
778 * back to us). This allows us to do a simple loop to 847 * back to us). This allows us to do a simple loop to
779 * assign the commit to the tail. 848 * assign the commit to the tail.
780 */ 849 */
850 again:
781 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 851 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
782 cpu_buffer->commit_page->commit = 852 cpu_buffer->commit_page->page->commit =
783 cpu_buffer->commit_page->write; 853 cpu_buffer->commit_page->write;
784 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 854 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
785 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 855 cpu_buffer->write_stamp =
856 cpu_buffer->commit_page->page->time_stamp;
786 /* add barrier to keep gcc from optimizing too much */ 857 /* add barrier to keep gcc from optimizing too much */
787 barrier(); 858 barrier();
788 } 859 }
789 while (rb_commit_index(cpu_buffer) != 860 while (rb_commit_index(cpu_buffer) !=
790 rb_page_write(cpu_buffer->commit_page)) { 861 rb_page_write(cpu_buffer->commit_page)) {
791 cpu_buffer->commit_page->commit = 862 cpu_buffer->commit_page->page->commit =
792 cpu_buffer->commit_page->write; 863 cpu_buffer->commit_page->write;
793 barrier(); 864 barrier();
794 } 865 }
866
867 /* again, keep gcc from optimizing */
868 barrier();
869
870 /*
871 * If an interrupt came in just after the first while loop
872 * and pushed the tail page forward, we will be left with
873 * a dangling commit that will never go forward.
874 */
875 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
876 goto again;
795} 877}
796 878
797static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 879static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
798{ 880{
799 cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; 881 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
800 cpu_buffer->reader_page->read = 0; 882 cpu_buffer->reader_page->read = 0;
801} 883}
802 884
@@ -815,7 +897,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
815 else 897 else
816 rb_inc_page(cpu_buffer, &iter->head_page); 898 rb_inc_page(cpu_buffer, &iter->head_page);
817 899
818 iter->read_stamp = iter->head_page->time_stamp; 900 iter->read_stamp = iter->head_page->page->time_stamp;
819 iter->head = 0; 901 iter->head = 0;
820} 902}
821 903
@@ -889,12 +971,15 @@ static struct ring_buffer_event *
889__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 971__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
890 unsigned type, unsigned long length, u64 *ts) 972 unsigned type, unsigned long length, u64 *ts)
891{ 973{
892 struct buffer_page *tail_page, *head_page, *reader_page; 974 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
893 unsigned long tail, write; 975 unsigned long tail, write;
894 struct ring_buffer *buffer = cpu_buffer->buffer; 976 struct ring_buffer *buffer = cpu_buffer->buffer;
895 struct ring_buffer_event *event; 977 struct ring_buffer_event *event;
896 unsigned long flags; 978 unsigned long flags;
897 979
980 commit_page = cpu_buffer->commit_page;
981 /* we just need to protect against interrupts */
982 barrier();
898 tail_page = cpu_buffer->tail_page; 983 tail_page = cpu_buffer->tail_page;
899 write = local_add_return(length, &tail_page->write); 984 write = local_add_return(length, &tail_page->write);
900 tail = write - length; 985 tail = write - length;
@@ -903,7 +988,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
903 if (write > BUF_PAGE_SIZE) { 988 if (write > BUF_PAGE_SIZE) {
904 struct buffer_page *next_page = tail_page; 989 struct buffer_page *next_page = tail_page;
905 990
906 spin_lock_irqsave(&cpu_buffer->lock, flags); 991 local_irq_save(flags);
992 __raw_spin_lock(&cpu_buffer->lock);
907 993
908 rb_inc_page(cpu_buffer, &next_page); 994 rb_inc_page(cpu_buffer, &next_page);
909 995
@@ -911,14 +997,15 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
911 reader_page = cpu_buffer->reader_page; 997 reader_page = cpu_buffer->reader_page;
912 998
913 /* we grabbed the lock before incrementing */ 999 /* we grabbed the lock before incrementing */
914 RB_WARN_ON(cpu_buffer, next_page == reader_page); 1000 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1001 goto out_unlock;
915 1002
916 /* 1003 /*
917 * If for some reason, we had an interrupt storm that made 1004 * If for some reason, we had an interrupt storm that made
918 * it all the way around the buffer, bail, and warn 1005 * it all the way around the buffer, bail, and warn
919 * about it. 1006 * about it.
920 */ 1007 */
921 if (unlikely(next_page == cpu_buffer->commit_page)) { 1008 if (unlikely(next_page == commit_page)) {
922 WARN_ON_ONCE(1); 1009 WARN_ON_ONCE(1);
923 goto out_unlock; 1010 goto out_unlock;
924 } 1011 }
@@ -949,12 +1036,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
949 */ 1036 */
950 if (tail_page == cpu_buffer->tail_page) { 1037 if (tail_page == cpu_buffer->tail_page) {
951 local_set(&next_page->write, 0); 1038 local_set(&next_page->write, 0);
952 local_set(&next_page->commit, 0); 1039 local_set(&next_page->page->commit, 0);
953 cpu_buffer->tail_page = next_page; 1040 cpu_buffer->tail_page = next_page;
954 1041
955 /* reread the time stamp */ 1042 /* reread the time stamp */
956 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1043 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
957 cpu_buffer->tail_page->time_stamp = *ts; 1044 cpu_buffer->tail_page->page->time_stamp = *ts;
958 } 1045 }
959 1046
960 /* 1047 /*
@@ -979,7 +1066,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
979 rb_set_commit_to_write(cpu_buffer); 1066 rb_set_commit_to_write(cpu_buffer);
980 } 1067 }
981 1068
982 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1069 __raw_spin_unlock(&cpu_buffer->lock);
1070 local_irq_restore(flags);
983 1071
984 /* fail and let the caller try again */ 1072 /* fail and let the caller try again */
985 return ERR_PTR(-EAGAIN); 1073 return ERR_PTR(-EAGAIN);
@@ -987,7 +1075,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
987 1075
988 /* We reserved something on the buffer */ 1076 /* We reserved something on the buffer */
989 1077
990 BUG_ON(write > BUF_PAGE_SIZE); 1078 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1079 return NULL;
991 1080
992 event = __rb_page_index(tail_page, tail); 1081 event = __rb_page_index(tail_page, tail);
993 rb_update_event(event, type, length); 1082 rb_update_event(event, type, length);
@@ -997,12 +1086,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
997 * this page's time stamp. 1086 * this page's time stamp.
998 */ 1087 */
999 if (!tail && rb_is_commit(cpu_buffer, event)) 1088 if (!tail && rb_is_commit(cpu_buffer, event))
1000 cpu_buffer->commit_page->time_stamp = *ts; 1089 cpu_buffer->commit_page->page->time_stamp = *ts;
1001 1090
1002 return event; 1091 return event;
1003 1092
1004 out_unlock: 1093 out_unlock:
1005 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1094 __raw_spin_unlock(&cpu_buffer->lock);
1095 local_irq_restore(flags);
1006 return NULL; 1096 return NULL;
1007} 1097}
1008 1098
@@ -1047,7 +1137,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1047 event->time_delta = *delta & TS_MASK; 1137 event->time_delta = *delta & TS_MASK;
1048 event->array[0] = *delta >> TS_SHIFT; 1138 event->array[0] = *delta >> TS_SHIFT;
1049 } else { 1139 } else {
1050 cpu_buffer->commit_page->time_stamp = *ts; 1140 cpu_buffer->commit_page->page->time_stamp = *ts;
1051 event->time_delta = 0; 1141 event->time_delta = 0;
1052 event->array[0] = 0; 1142 event->array[0] = 0;
1053 } 1143 }
@@ -1085,10 +1175,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1085 * storm or we have something buggy. 1175 * storm or we have something buggy.
1086 * Bail! 1176 * Bail!
1087 */ 1177 */
1088 if (unlikely(++nr_loops > 1000)) { 1178 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1089 RB_WARN_ON(cpu_buffer, 1);
1090 return NULL; 1179 return NULL;
1091 }
1092 1180
1093 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1181 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1094 1182
@@ -1184,15 +1272,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1184 struct ring_buffer_event *event; 1272 struct ring_buffer_event *event;
1185 int cpu, resched; 1273 int cpu, resched;
1186 1274
1187 if (ring_buffers_off) 1275 if (ring_buffer_flags != RB_BUFFERS_ON)
1188 return NULL; 1276 return NULL;
1189 1277
1190 if (atomic_read(&buffer->record_disabled)) 1278 if (atomic_read(&buffer->record_disabled))
1191 return NULL; 1279 return NULL;
1192 1280
1193 /* If we are tracing schedule, we don't want to recurse */ 1281 /* If we are tracing schedule, we don't want to recurse */
1194 resched = need_resched(); 1282 resched = ftrace_preempt_disable();
1195 preempt_disable_notrace();
1196 1283
1197 cpu = raw_smp_processor_id(); 1284 cpu = raw_smp_processor_id();
1198 1285
@@ -1223,10 +1310,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1223 return event; 1310 return event;
1224 1311
1225 out: 1312 out:
1226 if (resched) 1313 ftrace_preempt_enable(resched);
1227 preempt_enable_no_resched_notrace();
1228 else
1229 preempt_enable_notrace();
1230 return NULL; 1314 return NULL;
1231} 1315}
1232EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 1316EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -1269,12 +1353,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1269 /* 1353 /*
1270 * Only the last preempt count needs to restore preemption. 1354 * Only the last preempt count needs to restore preemption.
1271 */ 1355 */
1272 if (preempt_count() == 1) { 1356 if (preempt_count() == 1)
1273 if (per_cpu(rb_need_resched, cpu)) 1357 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1274 preempt_enable_no_resched_notrace(); 1358 else
1275 else
1276 preempt_enable_notrace();
1277 } else
1278 preempt_enable_no_resched_notrace(); 1359 preempt_enable_no_resched_notrace();
1279 1360
1280 return 0; 1361 return 0;
@@ -1305,14 +1386,13 @@ int ring_buffer_write(struct ring_buffer *buffer,
1305 int ret = -EBUSY; 1386 int ret = -EBUSY;
1306 int cpu, resched; 1387 int cpu, resched;
1307 1388
1308 if (ring_buffers_off) 1389 if (ring_buffer_flags != RB_BUFFERS_ON)
1309 return -EBUSY; 1390 return -EBUSY;
1310 1391
1311 if (atomic_read(&buffer->record_disabled)) 1392 if (atomic_read(&buffer->record_disabled))
1312 return -EBUSY; 1393 return -EBUSY;
1313 1394
1314 resched = need_resched(); 1395 resched = ftrace_preempt_disable();
1315 preempt_disable_notrace();
1316 1396
1317 cpu = raw_smp_processor_id(); 1397 cpu = raw_smp_processor_id();
1318 1398
@@ -1338,10 +1418,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1338 1418
1339 ret = 0; 1419 ret = 0;
1340 out: 1420 out:
1341 if (resched) 1421 ftrace_preempt_enable(resched);
1342 preempt_enable_no_resched_notrace();
1343 else
1344 preempt_enable_notrace();
1345 1422
1346 return ret; 1423 return ret;
1347} 1424}
@@ -1509,14 +1586,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1509} 1586}
1510EXPORT_SYMBOL_GPL(ring_buffer_overruns); 1587EXPORT_SYMBOL_GPL(ring_buffer_overruns);
1511 1588
1512/** 1589static void rb_iter_reset(struct ring_buffer_iter *iter)
1513 * ring_buffer_iter_reset - reset an iterator
1514 * @iter: The iterator to reset
1515 *
1516 * Resets the iterator, so that it will start from the beginning
1517 * again.
1518 */
1519void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1520{ 1590{
1521 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1591 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1522 1592
@@ -1531,7 +1601,24 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1531 if (iter->head) 1601 if (iter->head)
1532 iter->read_stamp = cpu_buffer->read_stamp; 1602 iter->read_stamp = cpu_buffer->read_stamp;
1533 else 1603 else
1534 iter->read_stamp = iter->head_page->time_stamp; 1604 iter->read_stamp = iter->head_page->page->time_stamp;
1605}
1606
1607/**
1608 * ring_buffer_iter_reset - reset an iterator
1609 * @iter: The iterator to reset
1610 *
1611 * Resets the iterator, so that it will start from the beginning
1612 * again.
1613 */
1614void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1615{
1616 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1617 unsigned long flags;
1618
1619 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1620 rb_iter_reset(iter);
1621 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1535} 1622}
1536EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 1623EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
1537 1624
@@ -1619,7 +1706,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1619 unsigned long flags; 1706 unsigned long flags;
1620 int nr_loops = 0; 1707 int nr_loops = 0;
1621 1708
1622 spin_lock_irqsave(&cpu_buffer->lock, flags); 1709 local_irq_save(flags);
1710 __raw_spin_lock(&cpu_buffer->lock);
1623 1711
1624 again: 1712 again:
1625 /* 1713 /*
@@ -1628,8 +1716,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1628 * a case where we will loop three times. There should be no 1716 * a case where we will loop three times. There should be no
1629 * reason to loop four times (that I know of). 1717 * reason to loop four times (that I know of).
1630 */ 1718 */
1631 if (unlikely(++nr_loops > 3)) { 1719 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
1632 RB_WARN_ON(cpu_buffer, 1);
1633 reader = NULL; 1720 reader = NULL;
1634 goto out; 1721 goto out;
1635 } 1722 }
@@ -1641,8 +1728,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1641 goto out; 1728 goto out;
1642 1729
1643 /* Never should we have an index greater than the size */ 1730 /* Never should we have an index greater than the size */
1644 RB_WARN_ON(cpu_buffer, 1731 if (RB_WARN_ON(cpu_buffer,
1645 cpu_buffer->reader_page->read > rb_page_size(reader)); 1732 cpu_buffer->reader_page->read > rb_page_size(reader)))
1733 goto out;
1646 1734
1647 /* check if we caught up to the tail */ 1735 /* check if we caught up to the tail */
1648 reader = NULL; 1736 reader = NULL;
@@ -1659,7 +1747,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1659 cpu_buffer->reader_page->list.prev = reader->list.prev; 1747 cpu_buffer->reader_page->list.prev = reader->list.prev;
1660 1748
1661 local_set(&cpu_buffer->reader_page->write, 0); 1749 local_set(&cpu_buffer->reader_page->write, 0);
1662 local_set(&cpu_buffer->reader_page->commit, 0); 1750 local_set(&cpu_buffer->reader_page->page->commit, 0);
1663 1751
1664 /* Make the reader page now replace the head */ 1752 /* Make the reader page now replace the head */
1665 reader->list.prev->next = &cpu_buffer->reader_page->list; 1753 reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1681,7 +1769,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1681 goto again; 1769 goto again;
1682 1770
1683 out: 1771 out:
1684 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1772 __raw_spin_unlock(&cpu_buffer->lock);
1773 local_irq_restore(flags);
1685 1774
1686 return reader; 1775 return reader;
1687} 1776}
@@ -1695,7 +1784,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1695 reader = rb_get_reader_page(cpu_buffer); 1784 reader = rb_get_reader_page(cpu_buffer);
1696 1785
1697 /* This function should not be called when buffer is empty */ 1786 /* This function should not be called when buffer is empty */
1698 BUG_ON(!reader); 1787 if (RB_WARN_ON(cpu_buffer, !reader))
1788 return;
1699 1789
1700 event = rb_reader_event(cpu_buffer); 1790 event = rb_reader_event(cpu_buffer);
1701 1791
@@ -1722,7 +1812,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1722 * Check if we are at the end of the buffer. 1812 * Check if we are at the end of the buffer.
1723 */ 1813 */
1724 if (iter->head >= rb_page_size(iter->head_page)) { 1814 if (iter->head >= rb_page_size(iter->head_page)) {
1725 BUG_ON(iter->head_page == cpu_buffer->commit_page); 1815 if (RB_WARN_ON(buffer,
1816 iter->head_page == cpu_buffer->commit_page))
1817 return;
1726 rb_inc_iter(iter); 1818 rb_inc_iter(iter);
1727 return; 1819 return;
1728 } 1820 }
@@ -1735,8 +1827,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1735 * This should not be called to advance the header if we are 1827 * This should not be called to advance the header if we are
1736 * at the tail of the buffer. 1828 * at the tail of the buffer.
1737 */ 1829 */
1738 BUG_ON((iter->head_page == cpu_buffer->commit_page) && 1830 if (RB_WARN_ON(cpu_buffer,
1739 (iter->head + length > rb_commit_index(cpu_buffer))); 1831 (iter->head_page == cpu_buffer->commit_page) &&
1832 (iter->head + length > rb_commit_index(cpu_buffer))))
1833 return;
1740 1834
1741 rb_update_iter_read_stamp(iter, event); 1835 rb_update_iter_read_stamp(iter, event);
1742 1836
@@ -1748,17 +1842,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1748 rb_advance_iter(iter); 1842 rb_advance_iter(iter);
1749} 1843}
1750 1844
1751/** 1845static struct ring_buffer_event *
1752 * ring_buffer_peek - peek at the next event to be read 1846rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1753 * @buffer: The ring buffer to read
1754 * @cpu: The cpu to peak at
1755 * @ts: The timestamp counter of this event.
1756 *
1757 * This will return the event that will be read next, but does
1758 * not consume the data.
1759 */
1760struct ring_buffer_event *
1761ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1762{ 1847{
1763 struct ring_buffer_per_cpu *cpu_buffer; 1848 struct ring_buffer_per_cpu *cpu_buffer;
1764 struct ring_buffer_event *event; 1849 struct ring_buffer_event *event;
@@ -1779,10 +1864,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1779 * can have. Nesting 10 deep of interrupts is clearly 1864 * can have. Nesting 10 deep of interrupts is clearly
1780 * an anomaly. 1865 * an anomaly.
1781 */ 1866 */
1782 if (unlikely(++nr_loops > 10)) { 1867 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1783 RB_WARN_ON(cpu_buffer, 1);
1784 return NULL; 1868 return NULL;
1785 }
1786 1869
1787 reader = rb_get_reader_page(cpu_buffer); 1870 reader = rb_get_reader_page(cpu_buffer);
1788 if (!reader) 1871 if (!reader)
@@ -1821,16 +1904,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1821} 1904}
1822EXPORT_SYMBOL_GPL(ring_buffer_peek); 1905EXPORT_SYMBOL_GPL(ring_buffer_peek);
1823 1906
1824/** 1907static struct ring_buffer_event *
1825 * ring_buffer_iter_peek - peek at the next event to be read 1908rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1826 * @iter: The ring buffer iterator
1827 * @ts: The timestamp counter of this event.
1828 *
1829 * This will return the event that will be read next, but does
1830 * not increment the iterator.
1831 */
1832struct ring_buffer_event *
1833ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1834{ 1909{
1835 struct ring_buffer *buffer; 1910 struct ring_buffer *buffer;
1836 struct ring_buffer_per_cpu *cpu_buffer; 1911 struct ring_buffer_per_cpu *cpu_buffer;
@@ -1852,10 +1927,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1852 * can have. Nesting 10 deep of interrupts is clearly 1927 * can have. Nesting 10 deep of interrupts is clearly
1853 * an anomaly. 1928 * an anomaly.
1854 */ 1929 */
1855 if (unlikely(++nr_loops > 10)) { 1930 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1856 RB_WARN_ON(cpu_buffer, 1);
1857 return NULL; 1931 return NULL;
1858 }
1859 1932
1860 if (rb_per_cpu_empty(cpu_buffer)) 1933 if (rb_per_cpu_empty(cpu_buffer))
1861 return NULL; 1934 return NULL;
@@ -1893,6 +1966,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1893EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 1966EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
1894 1967
1895/** 1968/**
1969 * ring_buffer_peek - peek at the next event to be read
1970 * @buffer: The ring buffer to read
1971 * @cpu: The cpu to peak at
1972 * @ts: The timestamp counter of this event.
1973 *
1974 * This will return the event that will be read next, but does
1975 * not consume the data.
1976 */
1977struct ring_buffer_event *
1978ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1979{
1980 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1981 struct ring_buffer_event *event;
1982 unsigned long flags;
1983
1984 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1985 event = rb_buffer_peek(buffer, cpu, ts);
1986 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1987
1988 return event;
1989}
1990
1991/**
1992 * ring_buffer_iter_peek - peek at the next event to be read
1993 * @iter: The ring buffer iterator
1994 * @ts: The timestamp counter of this event.
1995 *
1996 * This will return the event that will be read next, but does
1997 * not increment the iterator.
1998 */
1999struct ring_buffer_event *
2000ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2001{
2002 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2003 struct ring_buffer_event *event;
2004 unsigned long flags;
2005
2006 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2007 event = rb_iter_peek(iter, ts);
2008 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2009
2010 return event;
2011}
2012
2013/**
1896 * ring_buffer_consume - return an event and consume it 2014 * ring_buffer_consume - return an event and consume it
1897 * @buffer: The ring buffer to get the next event from 2015 * @buffer: The ring buffer to get the next event from
1898 * 2016 *
@@ -1903,19 +2021,24 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
1903struct ring_buffer_event * 2021struct ring_buffer_event *
1904ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 2022ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1905{ 2023{
1906 struct ring_buffer_per_cpu *cpu_buffer; 2024 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1907 struct ring_buffer_event *event; 2025 struct ring_buffer_event *event;
2026 unsigned long flags;
1908 2027
1909 if (!cpu_isset(cpu, buffer->cpumask)) 2028 if (!cpu_isset(cpu, buffer->cpumask))
1910 return NULL; 2029 return NULL;
1911 2030
1912 event = ring_buffer_peek(buffer, cpu, ts); 2031 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2032
2033 event = rb_buffer_peek(buffer, cpu, ts);
1913 if (!event) 2034 if (!event)
1914 return NULL; 2035 goto out;
1915 2036
1916 cpu_buffer = buffer->buffers[cpu];
1917 rb_advance_reader(cpu_buffer); 2037 rb_advance_reader(cpu_buffer);
1918 2038
2039 out:
2040 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2041
1919 return event; 2042 return event;
1920} 2043}
1921EXPORT_SYMBOL_GPL(ring_buffer_consume); 2044EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -1953,9 +2076,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1953 atomic_inc(&cpu_buffer->record_disabled); 2076 atomic_inc(&cpu_buffer->record_disabled);
1954 synchronize_sched(); 2077 synchronize_sched();
1955 2078
1956 spin_lock_irqsave(&cpu_buffer->lock, flags); 2079 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1957 ring_buffer_iter_reset(iter); 2080 __raw_spin_lock(&cpu_buffer->lock);
1958 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2081 rb_iter_reset(iter);
2082 __raw_spin_unlock(&cpu_buffer->lock);
2083 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1959 2084
1960 return iter; 2085 return iter;
1961} 2086}
@@ -1989,12 +2114,17 @@ struct ring_buffer_event *
1989ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) 2114ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1990{ 2115{
1991 struct ring_buffer_event *event; 2116 struct ring_buffer_event *event;
2117 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2118 unsigned long flags;
1992 2119
1993 event = ring_buffer_iter_peek(iter, ts); 2120 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2121 event = rb_iter_peek(iter, ts);
1994 if (!event) 2122 if (!event)
1995 return NULL; 2123 goto out;
1996 2124
1997 rb_advance_iter(iter); 2125 rb_advance_iter(iter);
2126 out:
2127 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1998 2128
1999 return event; 2129 return event;
2000} 2130}
@@ -2016,7 +2146,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2016 cpu_buffer->head_page 2146 cpu_buffer->head_page
2017 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2147 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
2018 local_set(&cpu_buffer->head_page->write, 0); 2148 local_set(&cpu_buffer->head_page->write, 0);
2019 local_set(&cpu_buffer->head_page->commit, 0); 2149 local_set(&cpu_buffer->head_page->page->commit, 0);
2020 2150
2021 cpu_buffer->head_page->read = 0; 2151 cpu_buffer->head_page->read = 0;
2022 2152
@@ -2025,7 +2155,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2025 2155
2026 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2156 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
2027 local_set(&cpu_buffer->reader_page->write, 0); 2157 local_set(&cpu_buffer->reader_page->write, 0);
2028 local_set(&cpu_buffer->reader_page->commit, 0); 2158 local_set(&cpu_buffer->reader_page->page->commit, 0);
2029 cpu_buffer->reader_page->read = 0; 2159 cpu_buffer->reader_page->read = 0;
2030 2160
2031 cpu_buffer->overrun = 0; 2161 cpu_buffer->overrun = 0;
@@ -2045,11 +2175,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2045 if (!cpu_isset(cpu, buffer->cpumask)) 2175 if (!cpu_isset(cpu, buffer->cpumask))
2046 return; 2176 return;
2047 2177
2048 spin_lock_irqsave(&cpu_buffer->lock, flags); 2178 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2179
2180 __raw_spin_lock(&cpu_buffer->lock);
2049 2181
2050 rb_reset_cpu(cpu_buffer); 2182 rb_reset_cpu(cpu_buffer);
2051 2183
2052 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2184 __raw_spin_unlock(&cpu_buffer->lock);
2185
2186 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2053} 2187}
2054EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 2188EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2055 2189
@@ -2123,8 +2257,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2123 return -EINVAL; 2257 return -EINVAL;
2124 2258
2125 /* At least make sure the two buffers are somewhat the same */ 2259 /* At least make sure the two buffers are somewhat the same */
2126 if (buffer_a->size != buffer_b->size || 2260 if (buffer_a->pages != buffer_b->pages)
2127 buffer_a->pages != buffer_b->pages)
2128 return -EINVAL; 2261 return -EINVAL;
2129 2262
2130 cpu_buffer_a = buffer_a->buffers[cpu]; 2263 cpu_buffer_a = buffer_a->buffers[cpu];
@@ -2152,16 +2285,178 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2152} 2285}
2153EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2286EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2154 2287
2288static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2289 struct buffer_data_page *bpage)
2290{
2291 struct ring_buffer_event *event;
2292 unsigned long head;
2293
2294 __raw_spin_lock(&cpu_buffer->lock);
2295 for (head = 0; head < local_read(&bpage->commit);
2296 head += rb_event_length(event)) {
2297
2298 event = __rb_data_page_index(bpage, head);
2299 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2300 return;
2301 /* Only count data entries */
2302 if (event->type != RINGBUF_TYPE_DATA)
2303 continue;
2304 cpu_buffer->entries--;
2305 }
2306 __raw_spin_unlock(&cpu_buffer->lock);
2307}
2308
2309/**
2310 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2311 * @buffer: the buffer to allocate for.
2312 *
2313 * This function is used in conjunction with ring_buffer_read_page.
2314 * When reading a full page from the ring buffer, these functions
2315 * can be used to speed up the process. The calling function should
2316 * allocate a few pages first with this function. Then when it
2317 * needs to get pages from the ring buffer, it passes the result
2318 * of this function into ring_buffer_read_page, which will swap
2319 * the page that was allocated, with the read page of the buffer.
2320 *
2321 * Returns:
2322 * The page allocated, or NULL on error.
2323 */
2324void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2325{
2326 unsigned long addr;
2327 struct buffer_data_page *bpage;
2328
2329 addr = __get_free_page(GFP_KERNEL);
2330 if (!addr)
2331 return NULL;
2332
2333 bpage = (void *)addr;
2334
2335 return bpage;
2336}
2337
2338/**
2339 * ring_buffer_free_read_page - free an allocated read page
2340 * @buffer: the buffer the page was allocate for
2341 * @data: the page to free
2342 *
2343 * Free a page allocated from ring_buffer_alloc_read_page.
2344 */
2345void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2346{
2347 free_page((unsigned long)data);
2348}
2349
2350/**
2351 * ring_buffer_read_page - extract a page from the ring buffer
2352 * @buffer: buffer to extract from
2353 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2354 * @cpu: the cpu of the buffer to extract
2355 * @full: should the extraction only happen when the page is full.
2356 *
2357 * This function will pull out a page from the ring buffer and consume it.
2358 * @data_page must be the address of the variable that was returned
2359 * from ring_buffer_alloc_read_page. This is because the page might be used
2360 * to swap with a page in the ring buffer.
2361 *
2362 * for example:
2363 * rpage = ring_buffer_alloc_page(buffer);
2364 * if (!rpage)
2365 * return error;
2366 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
2367 * if (ret)
2368 * process_page(rpage);
2369 *
2370 * When @full is set, the function will not return true unless
2371 * the writer is off the reader page.
2372 *
2373 * Note: it is up to the calling functions to handle sleeps and wakeups.
2374 * The ring buffer can be used anywhere in the kernel and can not
2375 * blindly call wake_up. The layer that uses the ring buffer must be
2376 * responsible for that.
2377 *
2378 * Returns:
2379 * 1 if data has been transferred
2380 * 0 if no data has been transferred.
2381 */
2382int ring_buffer_read_page(struct ring_buffer *buffer,
2383 void **data_page, int cpu, int full)
2384{
2385 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2386 struct ring_buffer_event *event;
2387 struct buffer_data_page *bpage;
2388 unsigned long flags;
2389 int ret = 0;
2390
2391 if (!data_page)
2392 return 0;
2393
2394 bpage = *data_page;
2395 if (!bpage)
2396 return 0;
2397
2398 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2399
2400 /*
2401 * rb_buffer_peek will get the next ring buffer if
2402 * the current reader page is empty.
2403 */
2404 event = rb_buffer_peek(buffer, cpu, NULL);
2405 if (!event)
2406 goto out;
2407
2408 /* check for data */
2409 if (!local_read(&cpu_buffer->reader_page->page->commit))
2410 goto out;
2411 /*
2412 * If the writer is already off of the read page, then simply
2413 * switch the read page with the given page. Otherwise
2414 * we need to copy the data from the reader to the writer.
2415 */
2416 if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
2417 unsigned int read = cpu_buffer->reader_page->read;
2418
2419 if (full)
2420 goto out;
2421 /* The writer is still on the reader page, we must copy */
2422 bpage = cpu_buffer->reader_page->page;
2423 memcpy(bpage->data,
2424 cpu_buffer->reader_page->page->data + read,
2425 local_read(&bpage->commit) - read);
2426
2427 /* consume what was read */
2428 cpu_buffer->reader_page += read;
2429
2430 } else {
2431 /* swap the pages */
2432 rb_init_page(bpage);
2433 bpage = cpu_buffer->reader_page->page;
2434 cpu_buffer->reader_page->page = *data_page;
2435 cpu_buffer->reader_page->read = 0;
2436 *data_page = bpage;
2437 }
2438 ret = 1;
2439
2440 /* update the entry counter */
2441 rb_remove_entries(cpu_buffer, bpage);
2442 out:
2443 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2444
2445 return ret;
2446}
2447
2155static ssize_t 2448static ssize_t
2156rb_simple_read(struct file *filp, char __user *ubuf, 2449rb_simple_read(struct file *filp, char __user *ubuf,
2157 size_t cnt, loff_t *ppos) 2450 size_t cnt, loff_t *ppos)
2158{ 2451{
2159 int *p = filp->private_data; 2452 long *p = filp->private_data;
2160 char buf[64]; 2453 char buf[64];
2161 int r; 2454 int r;
2162 2455
2163 /* !ring_buffers_off == tracing_on */ 2456 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
2164 r = sprintf(buf, "%d\n", !*p); 2457 r = sprintf(buf, "permanently disabled\n");
2458 else
2459 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
2165 2460
2166 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2461 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2167} 2462}
@@ -2170,7 +2465,7 @@ static ssize_t
2170rb_simple_write(struct file *filp, const char __user *ubuf, 2465rb_simple_write(struct file *filp, const char __user *ubuf,
2171 size_t cnt, loff_t *ppos) 2466 size_t cnt, loff_t *ppos)
2172{ 2467{
2173 int *p = filp->private_data; 2468 long *p = filp->private_data;
2174 char buf[64]; 2469 char buf[64];
2175 long val; 2470 long val;
2176 int ret; 2471 int ret;
@@ -2187,8 +2482,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2187 if (ret < 0) 2482 if (ret < 0)
2188 return ret; 2483 return ret;
2189 2484
2190 /* !ring_buffers_off == tracing_on */ 2485 if (val)
2191 *p = !val; 2486 set_bit(RB_BUFFERS_ON_BIT, p);
2487 else
2488 clear_bit(RB_BUFFERS_ON_BIT, p);
2192 2489
2193 (*ppos)++; 2490 (*ppos)++;
2194 2491
@@ -2210,7 +2507,7 @@ static __init int rb_init_debugfs(void)
2210 d_tracer = tracing_init_dentry(); 2507 d_tracer = tracing_init_dentry();
2211 2508
2212 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 2509 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2213 &ring_buffers_off, &rb_simple_fops); 2510 &ring_buffer_flags, &rb_simple_fops);
2214 if (!entry) 2511 if (!entry)
2215 pr_warning("Could not create debugfs 'tracing_on' entry\n"); 2512 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2216 2513
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a96b335fe75c..3608f6cb2f7a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/seq_file.h>
33#include <linux/writeback.h> 34#include <linux/writeback.h>
34 35
35#include <linux/stacktrace.h> 36#include <linux/stacktrace.h>
@@ -43,6 +44,38 @@
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 44unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
44unsigned long __read_mostly tracing_thresh; 45unsigned long __read_mostly tracing_thresh;
45 46
47/*
48 * We need to change this state when a selftest is running.
49 * A selftest will lurk into the ring-buffer to count the
50 * entries inserted during the selftest although some concurrent
51 * insertions into the ring-buffer such as ftrace_printk could occurred
52 * at the same time, giving false positive or negative results.
53 */
54static bool __read_mostly tracing_selftest_running;
55
56/* For tracers that don't implement custom flags */
57static struct tracer_opt dummy_tracer_opt[] = {
58 { }
59};
60
61static struct tracer_flags dummy_tracer_flags = {
62 .val = 0,
63 .opts = dummy_tracer_opt
64};
65
66static int dummy_set_flag(u32 old_flags, u32 bit, int set)
67{
68 return 0;
69}
70
71/*
72 * Kill all tracing for good (never come back).
73 * It is initialized to 1 but will turn to zero if the initialization
74 * of the tracer is successful. But that is the only place that sets
75 * this back to zero.
76 */
77int tracing_disabled = 1;
78
46static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 79static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
47 80
48static inline void ftrace_disable_cpu(void) 81static inline void ftrace_disable_cpu(void)
@@ -62,7 +95,36 @@ static cpumask_t __read_mostly tracing_buffer_mask;
62#define for_each_tracing_cpu(cpu) \ 95#define for_each_tracing_cpu(cpu) \
63 for_each_cpu_mask(cpu, tracing_buffer_mask) 96 for_each_cpu_mask(cpu, tracing_buffer_mask)
64 97
65static int tracing_disabled = 1; 98/*
99 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
100 *
101 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
102 * is set, then ftrace_dump is called. This will output the contents
103 * of the ftrace buffers to the console. This is very useful for
104 * capturing traces that lead to crashes and outputing it to a
105 * serial console.
106 *
107 * It is default off, but you can enable it with either specifying
108 * "ftrace_dump_on_oops" in the kernel command line, or setting
109 * /proc/sys/kernel/ftrace_dump_on_oops to true.
110 */
111int ftrace_dump_on_oops;
112
113static int tracing_set_tracer(char *buf);
114
115static int __init set_ftrace(char *str)
116{
117 tracing_set_tracer(str);
118 return 1;
119}
120__setup("ftrace", set_ftrace);
121
122static int __init set_ftrace_dump_on_oops(char *str)
123{
124 ftrace_dump_on_oops = 1;
125 return 1;
126}
127__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
66 128
67long 129long
68ns2usecs(cycle_t nsec) 130ns2usecs(cycle_t nsec)
@@ -112,6 +174,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
112/* tracer_enabled is used to toggle activation of a tracer */ 174/* tracer_enabled is used to toggle activation of a tracer */
113static int tracer_enabled = 1; 175static int tracer_enabled = 1;
114 176
177/**
178 * tracing_is_enabled - return tracer_enabled status
179 *
180 * This function is used by other tracers to know the status
181 * of the tracer_enabled flag. Tracers may use this function
182 * to know if it should enable their features when starting
183 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
184 */
185int tracing_is_enabled(void)
186{
187 return tracer_enabled;
188}
189
115/* function tracing enabled */ 190/* function tracing enabled */
116int ftrace_function_enabled; 191int ftrace_function_enabled;
117 192
@@ -153,8 +228,9 @@ static DEFINE_MUTEX(trace_types_lock);
153/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 228/* trace_wait is a waitqueue for tasks blocked on trace_poll */
154static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 229static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
155 230
156/* trace_flags holds iter_ctrl options */ 231/* trace_flags holds trace_options default values */
157unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 232unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
233 TRACE_ITER_ANNOTATE;
158 234
159/** 235/**
160 * trace_wake_up - wake up tasks waiting for trace input 236 * trace_wake_up - wake up tasks waiting for trace input
@@ -193,13 +269,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
193 return nsecs / 1000; 269 return nsecs / 1000;
194} 270}
195 271
196/*
197 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
198 * control the output of kernel symbols.
199 */
200#define TRACE_ITER_SYM_MASK \
201 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
202
203/* These must match the bit postions in trace_iterator_flags */ 272/* These must match the bit postions in trace_iterator_flags */
204static const char *trace_options[] = { 273static const char *trace_options[] = {
205 "print-parent", 274 "print-parent",
@@ -213,6 +282,12 @@ static const char *trace_options[] = {
213 "stacktrace", 282 "stacktrace",
214 "sched-tree", 283 "sched-tree",
215 "ftrace_printk", 284 "ftrace_printk",
285 "ftrace_preempt",
286 "branch",
287 "annotate",
288 "userstacktrace",
289 "sym-userobj",
290 "printk-msg-only",
216 NULL 291 NULL
217}; 292};
218 293
@@ -246,7 +321,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
246 321
247 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 322 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
248 data->pid = tsk->pid; 323 data->pid = tsk->pid;
249 data->uid = tsk->uid; 324 data->uid = task_uid(tsk);
250 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 325 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
251 data->policy = tsk->policy; 326 data->policy = tsk->policy;
252 data->rt_priority = tsk->rt_priority; 327 data->rt_priority = tsk->rt_priority;
@@ -359,6 +434,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
359 return trace_seq_putmem(s, hex, j); 434 return trace_seq_putmem(s, hex, j);
360} 435}
361 436
437static int
438trace_seq_path(struct trace_seq *s, struct path *path)
439{
440 unsigned char *p;
441
442 if (s->len >= (PAGE_SIZE - 1))
443 return 0;
444 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
445 if (!IS_ERR(p)) {
446 p = mangle_path(s->buffer + s->len, p, "\n");
447 if (p) {
448 s->len = p - s->buffer;
449 return 1;
450 }
451 } else {
452 s->buffer[s->len++] = '?';
453 return 1;
454 }
455
456 return 0;
457}
458
362static void 459static void
363trace_seq_reset(struct trace_seq *s) 460trace_seq_reset(struct trace_seq *s)
364{ 461{
@@ -470,7 +567,17 @@ int register_tracer(struct tracer *type)
470 return -1; 567 return -1;
471 } 568 }
472 569
570 /*
571 * When this gets called we hold the BKL which means that
572 * preemption is disabled. Various trace selftests however
573 * need to disable and enable preemption for successful tests.
574 * So we drop the BKL here and grab it after the tests again.
575 */
576 unlock_kernel();
473 mutex_lock(&trace_types_lock); 577 mutex_lock(&trace_types_lock);
578
579 tracing_selftest_running = true;
580
474 for (t = trace_types; t; t = t->next) { 581 for (t = trace_types; t; t = t->next) {
475 if (strcmp(type->name, t->name) == 0) { 582 if (strcmp(type->name, t->name) == 0) {
476 /* already found */ 583 /* already found */
@@ -481,12 +588,20 @@ int register_tracer(struct tracer *type)
481 } 588 }
482 } 589 }
483 590
591 if (!type->set_flag)
592 type->set_flag = &dummy_set_flag;
593 if (!type->flags)
594 type->flags = &dummy_tracer_flags;
595 else
596 if (!type->flags->opts)
597 type->flags->opts = dummy_tracer_opt;
598
484#ifdef CONFIG_FTRACE_STARTUP_TEST 599#ifdef CONFIG_FTRACE_STARTUP_TEST
485 if (type->selftest) { 600 if (type->selftest) {
486 struct tracer *saved_tracer = current_trace; 601 struct tracer *saved_tracer = current_trace;
487 struct trace_array *tr = &global_trace; 602 struct trace_array *tr = &global_trace;
488 int saved_ctrl = tr->ctrl;
489 int i; 603 int i;
604
490 /* 605 /*
491 * Run a selftest on this tracer. 606 * Run a selftest on this tracer.
492 * Here we reset the trace buffer, and set the current 607 * Here we reset the trace buffer, and set the current
@@ -494,25 +609,23 @@ int register_tracer(struct tracer *type)
494 * internal tracing to verify that everything is in order. 609 * internal tracing to verify that everything is in order.
495 * If we fail, we do not register this tracer. 610 * If we fail, we do not register this tracer.
496 */ 611 */
497 for_each_tracing_cpu(i) { 612 for_each_tracing_cpu(i)
498 tracing_reset(tr, i); 613 tracing_reset(tr, i);
499 } 614
500 current_trace = type; 615 current_trace = type;
501 tr->ctrl = 0;
502 /* the test is responsible for initializing and enabling */ 616 /* the test is responsible for initializing and enabling */
503 pr_info("Testing tracer %s: ", type->name); 617 pr_info("Testing tracer %s: ", type->name);
504 ret = type->selftest(type, tr); 618 ret = type->selftest(type, tr);
505 /* the test is responsible for resetting too */ 619 /* the test is responsible for resetting too */
506 current_trace = saved_tracer; 620 current_trace = saved_tracer;
507 tr->ctrl = saved_ctrl;
508 if (ret) { 621 if (ret) {
509 printk(KERN_CONT "FAILED!\n"); 622 printk(KERN_CONT "FAILED!\n");
510 goto out; 623 goto out;
511 } 624 }
512 /* Only reset on passing, to avoid touching corrupted buffers */ 625 /* Only reset on passing, to avoid touching corrupted buffers */
513 for_each_tracing_cpu(i) { 626 for_each_tracing_cpu(i)
514 tracing_reset(tr, i); 627 tracing_reset(tr, i);
515 } 628
516 printk(KERN_CONT "PASSED\n"); 629 printk(KERN_CONT "PASSED\n");
517 } 630 }
518#endif 631#endif
@@ -524,7 +637,9 @@ int register_tracer(struct tracer *type)
524 max_tracer_type_len = len; 637 max_tracer_type_len = len;
525 638
526 out: 639 out:
640 tracing_selftest_running = false;
527 mutex_unlock(&trace_types_lock); 641 mutex_unlock(&trace_types_lock);
642 lock_kernel();
528 643
529 return ret; 644 return ret;
530} 645}
@@ -564,6 +679,16 @@ void tracing_reset(struct trace_array *tr, int cpu)
564 ftrace_enable_cpu(); 679 ftrace_enable_cpu();
565} 680}
566 681
682void tracing_reset_online_cpus(struct trace_array *tr)
683{
684 int cpu;
685
686 tr->time_start = ftrace_now(tr->cpu);
687
688 for_each_online_cpu(cpu)
689 tracing_reset(tr, cpu);
690}
691
567#define SAVED_CMDLINES 128 692#define SAVED_CMDLINES 128
568static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 693static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
569static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 694static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
@@ -581,6 +706,91 @@ static void trace_init_cmdlines(void)
581 cmdline_idx = 0; 706 cmdline_idx = 0;
582} 707}
583 708
709static int trace_stop_count;
710static DEFINE_SPINLOCK(tracing_start_lock);
711
712/**
713 * ftrace_off_permanent - disable all ftrace code permanently
714 *
715 * This should only be called when a serious anomally has
716 * been detected. This will turn off the function tracing,
717 * ring buffers, and other tracing utilites. It takes no
718 * locks and can be called from any context.
719 */
720void ftrace_off_permanent(void)
721{
722 tracing_disabled = 1;
723 ftrace_stop();
724 tracing_off_permanent();
725}
726
727/**
728 * tracing_start - quick start of the tracer
729 *
730 * If tracing is enabled but was stopped by tracing_stop,
731 * this will start the tracer back up.
732 */
733void tracing_start(void)
734{
735 struct ring_buffer *buffer;
736 unsigned long flags;
737
738 if (tracing_disabled)
739 return;
740
741 spin_lock_irqsave(&tracing_start_lock, flags);
742 if (--trace_stop_count)
743 goto out;
744
745 if (trace_stop_count < 0) {
746 /* Someone screwed up their debugging */
747 WARN_ON_ONCE(1);
748 trace_stop_count = 0;
749 goto out;
750 }
751
752
753 buffer = global_trace.buffer;
754 if (buffer)
755 ring_buffer_record_enable(buffer);
756
757 buffer = max_tr.buffer;
758 if (buffer)
759 ring_buffer_record_enable(buffer);
760
761 ftrace_start();
762 out:
763 spin_unlock_irqrestore(&tracing_start_lock, flags);
764}
765
766/**
767 * tracing_stop - quick stop of the tracer
768 *
769 * Light weight way to stop tracing. Use in conjunction with
770 * tracing_start.
771 */
772void tracing_stop(void)
773{
774 struct ring_buffer *buffer;
775 unsigned long flags;
776
777 ftrace_stop();
778 spin_lock_irqsave(&tracing_start_lock, flags);
779 if (trace_stop_count++)
780 goto out;
781
782 buffer = global_trace.buffer;
783 if (buffer)
784 ring_buffer_record_disable(buffer);
785
786 buffer = max_tr.buffer;
787 if (buffer)
788 ring_buffer_record_disable(buffer);
789
790 out:
791 spin_unlock_irqrestore(&tracing_start_lock, flags);
792}
793
584void trace_stop_cmdline_recording(void); 794void trace_stop_cmdline_recording(void);
585 795
586static void trace_save_cmdline(struct task_struct *tsk) 796static void trace_save_cmdline(struct task_struct *tsk)
@@ -618,7 +828,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
618 spin_unlock(&trace_cmdline_lock); 828 spin_unlock(&trace_cmdline_lock);
619} 829}
620 830
621static char *trace_find_cmdline(int pid) 831char *trace_find_cmdline(int pid)
622{ 832{
623 char *cmdline = "<...>"; 833 char *cmdline = "<...>";
624 unsigned map; 834 unsigned map;
@@ -655,6 +865,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
655 865
656 entry->preempt_count = pc & 0xff; 866 entry->preempt_count = pc & 0xff;
657 entry->pid = (tsk) ? tsk->pid : 0; 867 entry->pid = (tsk) ? tsk->pid : 0;
868 entry->tgid = (tsk) ? tsk->tgid : 0;
658 entry->flags = 869 entry->flags =
659#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 870#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
660 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 871 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -691,6 +902,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
691 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 902 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
692} 903}
693 904
905#ifdef CONFIG_FUNCTION_GRAPH_TRACER
906static void __trace_graph_entry(struct trace_array *tr,
907 struct trace_array_cpu *data,
908 struct ftrace_graph_ent *trace,
909 unsigned long flags,
910 int pc)
911{
912 struct ring_buffer_event *event;
913 struct ftrace_graph_ent_entry *entry;
914 unsigned long irq_flags;
915
916 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
917 return;
918
919 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
920 &irq_flags);
921 if (!event)
922 return;
923 entry = ring_buffer_event_data(event);
924 tracing_generic_entry_update(&entry->ent, flags, pc);
925 entry->ent.type = TRACE_GRAPH_ENT;
926 entry->graph_ent = *trace;
927 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
928}
929
930static void __trace_graph_return(struct trace_array *tr,
931 struct trace_array_cpu *data,
932 struct ftrace_graph_ret *trace,
933 unsigned long flags,
934 int pc)
935{
936 struct ring_buffer_event *event;
937 struct ftrace_graph_ret_entry *entry;
938 unsigned long irq_flags;
939
940 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
941 return;
942
943 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
944 &irq_flags);
945 if (!event)
946 return;
947 entry = ring_buffer_event_data(event);
948 tracing_generic_entry_update(&entry->ent, flags, pc);
949 entry->ent.type = TRACE_GRAPH_RET;
950 entry->ret = *trace;
951 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
952}
953#endif
954
694void 955void
695ftrace(struct trace_array *tr, struct trace_array_cpu *data, 956ftrace(struct trace_array *tr, struct trace_array_cpu *data,
696 unsigned long ip, unsigned long parent_ip, unsigned long flags, 957 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -742,6 +1003,46 @@ void __trace_stack(struct trace_array *tr,
742 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 1003 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
743} 1004}
744 1005
1006static void ftrace_trace_userstack(struct trace_array *tr,
1007 struct trace_array_cpu *data,
1008 unsigned long flags, int pc)
1009{
1010#ifdef CONFIG_STACKTRACE
1011 struct ring_buffer_event *event;
1012 struct userstack_entry *entry;
1013 struct stack_trace trace;
1014 unsigned long irq_flags;
1015
1016 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1017 return;
1018
1019 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
1020 &irq_flags);
1021 if (!event)
1022 return;
1023 entry = ring_buffer_event_data(event);
1024 tracing_generic_entry_update(&entry->ent, flags, pc);
1025 entry->ent.type = TRACE_USER_STACK;
1026
1027 memset(&entry->caller, 0, sizeof(entry->caller));
1028
1029 trace.nr_entries = 0;
1030 trace.max_entries = FTRACE_STACK_ENTRIES;
1031 trace.skip = 0;
1032 trace.entries = entry->caller;
1033
1034 save_stack_trace_user(&trace);
1035 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1036#endif
1037}
1038
1039void __trace_userstack(struct trace_array *tr,
1040 struct trace_array_cpu *data,
1041 unsigned long flags)
1042{
1043 ftrace_trace_userstack(tr, data, flags, preempt_count());
1044}
1045
745static void 1046static void
746ftrace_trace_special(void *__tr, void *__data, 1047ftrace_trace_special(void *__tr, void *__data,
747 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1048 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -765,6 +1066,7 @@ ftrace_trace_special(void *__tr, void *__data,
765 entry->arg3 = arg3; 1066 entry->arg3 = arg3;
766 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1067 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
767 ftrace_trace_stack(tr, data, irq_flags, 4, pc); 1068 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1069 ftrace_trace_userstack(tr, data, irq_flags, pc);
768 1070
769 trace_wake_up(); 1071 trace_wake_up();
770} 1072}
@@ -803,6 +1105,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
803 entry->next_cpu = task_cpu(next); 1105 entry->next_cpu = task_cpu(next);
804 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1106 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
805 ftrace_trace_stack(tr, data, flags, 5, pc); 1107 ftrace_trace_stack(tr, data, flags, 5, pc);
1108 ftrace_trace_userstack(tr, data, flags, pc);
806} 1109}
807 1110
808void 1111void
@@ -832,6 +1135,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
832 entry->next_cpu = task_cpu(wakee); 1135 entry->next_cpu = task_cpu(wakee);
833 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1136 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
834 ftrace_trace_stack(tr, data, flags, 6, pc); 1137 ftrace_trace_stack(tr, data, flags, 6, pc);
1138 ftrace_trace_userstack(tr, data, flags, pc);
835 1139
836 trace_wake_up(); 1140 trace_wake_up();
837} 1141}
@@ -841,26 +1145,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
841{ 1145{
842 struct trace_array *tr = &global_trace; 1146 struct trace_array *tr = &global_trace;
843 struct trace_array_cpu *data; 1147 struct trace_array_cpu *data;
1148 unsigned long flags;
844 int cpu; 1149 int cpu;
845 int pc; 1150 int pc;
846 1151
847 if (tracing_disabled || !tr->ctrl) 1152 if (tracing_disabled)
848 return; 1153 return;
849 1154
850 pc = preempt_count(); 1155 pc = preempt_count();
851 preempt_disable_notrace(); 1156 local_irq_save(flags);
852 cpu = raw_smp_processor_id(); 1157 cpu = raw_smp_processor_id();
853 data = tr->data[cpu]; 1158 data = tr->data[cpu];
854 1159
855 if (likely(!atomic_read(&data->disabled))) 1160 if (likely(atomic_inc_return(&data->disabled) == 1))
856 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1161 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
857 1162
858 preempt_enable_notrace(); 1163 atomic_dec(&data->disabled);
1164 local_irq_restore(flags);
859} 1165}
860 1166
861#ifdef CONFIG_FUNCTION_TRACER 1167#ifdef CONFIG_FUNCTION_TRACER
862static void 1168static void
863function_trace_call(unsigned long ip, unsigned long parent_ip) 1169function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
864{ 1170{
865 struct trace_array *tr = &global_trace; 1171 struct trace_array *tr = &global_trace;
866 struct trace_array_cpu *data; 1172 struct trace_array_cpu *data;
@@ -873,8 +1179,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
873 return; 1179 return;
874 1180
875 pc = preempt_count(); 1181 pc = preempt_count();
876 resched = need_resched(); 1182 resched = ftrace_preempt_disable();
877 preempt_disable_notrace();
878 local_save_flags(flags); 1183 local_save_flags(flags);
879 cpu = raw_smp_processor_id(); 1184 cpu = raw_smp_processor_id();
880 data = tr->data[cpu]; 1185 data = tr->data[cpu];
@@ -884,12 +1189,97 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
884 trace_function(tr, data, ip, parent_ip, flags, pc); 1189 trace_function(tr, data, ip, parent_ip, flags, pc);
885 1190
886 atomic_dec(&data->disabled); 1191 atomic_dec(&data->disabled);
887 if (resched) 1192 ftrace_preempt_enable(resched);
888 preempt_enable_no_resched_notrace();
889 else
890 preempt_enable_notrace();
891} 1193}
892 1194
1195static void
1196function_trace_call(unsigned long ip, unsigned long parent_ip)
1197{
1198 struct trace_array *tr = &global_trace;
1199 struct trace_array_cpu *data;
1200 unsigned long flags;
1201 long disabled;
1202 int cpu;
1203 int pc;
1204
1205 if (unlikely(!ftrace_function_enabled))
1206 return;
1207
1208 /*
1209 * Need to use raw, since this must be called before the
1210 * recursive protection is performed.
1211 */
1212 local_irq_save(flags);
1213 cpu = raw_smp_processor_id();
1214 data = tr->data[cpu];
1215 disabled = atomic_inc_return(&data->disabled);
1216
1217 if (likely(disabled == 1)) {
1218 pc = preempt_count();
1219 trace_function(tr, data, ip, parent_ip, flags, pc);
1220 }
1221
1222 atomic_dec(&data->disabled);
1223 local_irq_restore(flags);
1224}
1225
1226#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1227int trace_graph_entry(struct ftrace_graph_ent *trace)
1228{
1229 struct trace_array *tr = &global_trace;
1230 struct trace_array_cpu *data;
1231 unsigned long flags;
1232 long disabled;
1233 int cpu;
1234 int pc;
1235
1236 if (!ftrace_trace_task(current))
1237 return 0;
1238
1239 if (!ftrace_graph_addr(trace->func))
1240 return 0;
1241
1242 local_irq_save(flags);
1243 cpu = raw_smp_processor_id();
1244 data = tr->data[cpu];
1245 disabled = atomic_inc_return(&data->disabled);
1246 if (likely(disabled == 1)) {
1247 pc = preempt_count();
1248 __trace_graph_entry(tr, data, trace, flags, pc);
1249 }
1250 /* Only do the atomic if it is not already set */
1251 if (!test_tsk_trace_graph(current))
1252 set_tsk_trace_graph(current);
1253 atomic_dec(&data->disabled);
1254 local_irq_restore(flags);
1255
1256 return 1;
1257}
1258
1259void trace_graph_return(struct ftrace_graph_ret *trace)
1260{
1261 struct trace_array *tr = &global_trace;
1262 struct trace_array_cpu *data;
1263 unsigned long flags;
1264 long disabled;
1265 int cpu;
1266 int pc;
1267
1268 local_irq_save(flags);
1269 cpu = raw_smp_processor_id();
1270 data = tr->data[cpu];
1271 disabled = atomic_inc_return(&data->disabled);
1272 if (likely(disabled == 1)) {
1273 pc = preempt_count();
1274 __trace_graph_return(tr, data, trace, flags, pc);
1275 }
1276 if (!trace->depth)
1277 clear_tsk_trace_graph(current);
1278 atomic_dec(&data->disabled);
1279 local_irq_restore(flags);
1280}
1281#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1282
893static struct ftrace_ops trace_ops __read_mostly = 1283static struct ftrace_ops trace_ops __read_mostly =
894{ 1284{
895 .func = function_trace_call, 1285 .func = function_trace_call,
@@ -898,9 +1288,14 @@ static struct ftrace_ops trace_ops __read_mostly =
898void tracing_start_function_trace(void) 1288void tracing_start_function_trace(void)
899{ 1289{
900 ftrace_function_enabled = 0; 1290 ftrace_function_enabled = 0;
1291
1292 if (trace_flags & TRACE_ITER_PREEMPTONLY)
1293 trace_ops.func = function_trace_call_preempt_only;
1294 else
1295 trace_ops.func = function_trace_call;
1296
901 register_ftrace_function(&trace_ops); 1297 register_ftrace_function(&trace_ops);
902 if (tracer_enabled) 1298 ftrace_function_enabled = 1;
903 ftrace_function_enabled = 1;
904} 1299}
905 1300
906void tracing_stop_function_trace(void) 1301void tracing_stop_function_trace(void)
@@ -912,6 +1307,7 @@ void tracing_stop_function_trace(void)
912 1307
913enum trace_file_type { 1308enum trace_file_type {
914 TRACE_FILE_LAT_FMT = 1, 1309 TRACE_FILE_LAT_FMT = 1,
1310 TRACE_FILE_ANNOTATE = 2,
915}; 1311};
916 1312
917static void trace_iterator_increment(struct trace_iterator *iter) 1313static void trace_iterator_increment(struct trace_iterator *iter)
@@ -1047,10 +1443,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1047 1443
1048 atomic_inc(&trace_record_cmdline_disabled); 1444 atomic_inc(&trace_record_cmdline_disabled);
1049 1445
1050 /* let the tracer grab locks here if needed */
1051 if (current_trace->start)
1052 current_trace->start(iter);
1053
1054 if (*pos != iter->pos) { 1446 if (*pos != iter->pos) {
1055 iter->ent = NULL; 1447 iter->ent = NULL;
1056 iter->cpu = 0; 1448 iter->cpu = 0;
@@ -1077,14 +1469,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1077 1469
1078static void s_stop(struct seq_file *m, void *p) 1470static void s_stop(struct seq_file *m, void *p)
1079{ 1471{
1080 struct trace_iterator *iter = m->private;
1081
1082 atomic_dec(&trace_record_cmdline_disabled); 1472 atomic_dec(&trace_record_cmdline_disabled);
1083
1084 /* let the tracer release locks here if needed */
1085 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1086 iter->trace->stop(iter);
1087
1088 mutex_unlock(&trace_types_lock); 1473 mutex_unlock(&trace_types_lock);
1089} 1474}
1090 1475
@@ -1143,7 +1528,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1143# define IP_FMT "%016lx" 1528# define IP_FMT "%016lx"
1144#endif 1529#endif
1145 1530
1146static int 1531int
1147seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 1532seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1148{ 1533{
1149 int ret; 1534 int ret;
@@ -1164,6 +1549,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1164 return ret; 1549 return ret;
1165} 1550}
1166 1551
1552static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1553 unsigned long ip, unsigned long sym_flags)
1554{
1555 struct file *file = NULL;
1556 unsigned long vmstart = 0;
1557 int ret = 1;
1558
1559 if (mm) {
1560 const struct vm_area_struct *vma;
1561
1562 down_read(&mm->mmap_sem);
1563 vma = find_vma(mm, ip);
1564 if (vma) {
1565 file = vma->vm_file;
1566 vmstart = vma->vm_start;
1567 }
1568 if (file) {
1569 ret = trace_seq_path(s, &file->f_path);
1570 if (ret)
1571 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1572 }
1573 up_read(&mm->mmap_sem);
1574 }
1575 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1576 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1577 return ret;
1578}
1579
1580static int
1581seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1582 unsigned long sym_flags)
1583{
1584 struct mm_struct *mm = NULL;
1585 int ret = 1;
1586 unsigned int i;
1587
1588 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1589 struct task_struct *task;
1590 /*
1591 * we do the lookup on the thread group leader,
1592 * since individual threads might have already quit!
1593 */
1594 rcu_read_lock();
1595 task = find_task_by_vpid(entry->ent.tgid);
1596 if (task)
1597 mm = get_task_mm(task);
1598 rcu_read_unlock();
1599 }
1600
1601 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1602 unsigned long ip = entry->caller[i];
1603
1604 if (ip == ULONG_MAX || !ret)
1605 break;
1606 if (i && ret)
1607 ret = trace_seq_puts(s, " <- ");
1608 if (!ip) {
1609 if (ret)
1610 ret = trace_seq_puts(s, "??");
1611 continue;
1612 }
1613 if (!ret)
1614 break;
1615 if (ret)
1616 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1617 }
1618
1619 if (mm)
1620 mmput(mm);
1621 return ret;
1622}
1623
1167static void print_lat_help_header(struct seq_file *m) 1624static void print_lat_help_header(struct seq_file *m)
1168{ 1625{
1169 seq_puts(m, "# _------=> CPU# \n"); 1626 seq_puts(m, "# _------=> CPU# \n");
@@ -1301,6 +1758,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1301 1758
1302static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1759static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1303 1760
1761static int task_state_char(unsigned long state)
1762{
1763 int bit = state ? __ffs(state) + 1 : 0;
1764
1765 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1766}
1767
1304/* 1768/*
1305 * The message is supposed to contain an ending newline. 1769 * The message is supposed to contain an ending newline.
1306 * If the printing stops prematurely, try to add a newline of our own. 1770 * If the printing stops prematurely, try to add a newline of our own.
@@ -1338,6 +1802,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1338 trace_seq_putc(s, '\n'); 1802 trace_seq_putc(s, '\n');
1339} 1803}
1340 1804
1805static void test_cpu_buff_start(struct trace_iterator *iter)
1806{
1807 struct trace_seq *s = &iter->seq;
1808
1809 if (!(trace_flags & TRACE_ITER_ANNOTATE))
1810 return;
1811
1812 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1813 return;
1814
1815 if (cpu_isset(iter->cpu, iter->started))
1816 return;
1817
1818 cpu_set(iter->cpu, iter->started);
1819 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1820}
1821
1341static enum print_line_t 1822static enum print_line_t
1342print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1823print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1343{ 1824{
@@ -1352,11 +1833,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1352 char *comm; 1833 char *comm;
1353 int S, T; 1834 int S, T;
1354 int i; 1835 int i;
1355 unsigned state;
1356 1836
1357 if (entry->type == TRACE_CONT) 1837 if (entry->type == TRACE_CONT)
1358 return TRACE_TYPE_HANDLED; 1838 return TRACE_TYPE_HANDLED;
1359 1839
1840 test_cpu_buff_start(iter);
1841
1360 next_entry = find_next_entry(iter, NULL, &next_ts); 1842 next_entry = find_next_entry(iter, NULL, &next_ts);
1361 if (!next_entry) 1843 if (!next_entry)
1362 next_ts = iter->ts; 1844 next_ts = iter->ts;
@@ -1396,12 +1878,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1396 1878
1397 trace_assign_type(field, entry); 1879 trace_assign_type(field, entry);
1398 1880
1399 T = field->next_state < sizeof(state_to_char) ? 1881 T = task_state_char(field->next_state);
1400 state_to_char[field->next_state] : 'X'; 1882 S = task_state_char(field->prev_state);
1401
1402 state = field->prev_state ?
1403 __ffs(field->prev_state) + 1 : 0;
1404 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1405 comm = trace_find_cmdline(field->next_pid); 1883 comm = trace_find_cmdline(field->next_pid);
1406 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 1884 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1407 field->prev_pid, 1885 field->prev_pid,
@@ -1448,6 +1926,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1448 trace_seq_print_cont(s, iter); 1926 trace_seq_print_cont(s, iter);
1449 break; 1927 break;
1450 } 1928 }
1929 case TRACE_BRANCH: {
1930 struct trace_branch *field;
1931
1932 trace_assign_type(field, entry);
1933
1934 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1935 field->correct ? " ok " : " MISS ",
1936 field->func,
1937 field->file,
1938 field->line);
1939 break;
1940 }
1941 case TRACE_USER_STACK: {
1942 struct userstack_entry *field;
1943
1944 trace_assign_type(field, entry);
1945
1946 seq_print_userip_objs(field, s, sym_flags);
1947 trace_seq_putc(s, '\n');
1948 break;
1949 }
1451 default: 1950 default:
1452 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1951 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1453 } 1952 }
@@ -1472,6 +1971,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1472 if (entry->type == TRACE_CONT) 1971 if (entry->type == TRACE_CONT)
1473 return TRACE_TYPE_HANDLED; 1972 return TRACE_TYPE_HANDLED;
1474 1973
1974 test_cpu_buff_start(iter);
1975
1475 comm = trace_find_cmdline(iter->ent->pid); 1976 comm = trace_find_cmdline(iter->ent->pid);
1476 1977
1477 t = ns2usecs(iter->ts); 1978 t = ns2usecs(iter->ts);
@@ -1519,10 +2020,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1519 2020
1520 trace_assign_type(field, entry); 2021 trace_assign_type(field, entry);
1521 2022
1522 S = field->prev_state < sizeof(state_to_char) ? 2023 T = task_state_char(field->next_state);
1523 state_to_char[field->prev_state] : 'X'; 2024 S = task_state_char(field->prev_state);
1524 T = field->next_state < sizeof(state_to_char) ?
1525 state_to_char[field->next_state] : 'X';
1526 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", 2025 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
1527 field->prev_pid, 2026 field->prev_pid,
1528 field->prev_prio, 2027 field->prev_prio,
@@ -1581,6 +2080,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1581 trace_seq_print_cont(s, iter); 2080 trace_seq_print_cont(s, iter);
1582 break; 2081 break;
1583 } 2082 }
2083 case TRACE_GRAPH_RET: {
2084 return print_graph_function(iter);
2085 }
2086 case TRACE_GRAPH_ENT: {
2087 return print_graph_function(iter);
2088 }
2089 case TRACE_BRANCH: {
2090 struct trace_branch *field;
2091
2092 trace_assign_type(field, entry);
2093
2094 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2095 field->correct ? " ok " : " MISS ",
2096 field->func,
2097 field->file,
2098 field->line);
2099 break;
2100 }
2101 case TRACE_USER_STACK: {
2102 struct userstack_entry *field;
2103
2104 trace_assign_type(field, entry);
2105
2106 ret = seq_print_userip_objs(field, s, sym_flags);
2107 if (!ret)
2108 return TRACE_TYPE_PARTIAL_LINE;
2109 ret = trace_seq_putc(s, '\n');
2110 if (!ret)
2111 return TRACE_TYPE_PARTIAL_LINE;
2112 break;
2113 }
1584 } 2114 }
1585 return TRACE_TYPE_HANDLED; 2115 return TRACE_TYPE_HANDLED;
1586} 2116}
@@ -1621,12 +2151,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1621 2151
1622 trace_assign_type(field, entry); 2152 trace_assign_type(field, entry);
1623 2153
1624 S = field->prev_state < sizeof(state_to_char) ? 2154 T = task_state_char(field->next_state);
1625 state_to_char[field->prev_state] : 'X'; 2155 S = entry->type == TRACE_WAKE ? '+' :
1626 T = field->next_state < sizeof(state_to_char) ? 2156 task_state_char(field->prev_state);
1627 state_to_char[field->next_state] : 'X';
1628 if (entry->type == TRACE_WAKE)
1629 S = '+';
1630 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", 2157 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
1631 field->prev_pid, 2158 field->prev_pid,
1632 field->prev_prio, 2159 field->prev_prio,
@@ -1640,6 +2167,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1640 break; 2167 break;
1641 } 2168 }
1642 case TRACE_SPECIAL: 2169 case TRACE_SPECIAL:
2170 case TRACE_USER_STACK:
1643 case TRACE_STACK: { 2171 case TRACE_STACK: {
1644 struct special_entry *field; 2172 struct special_entry *field;
1645 2173
@@ -1712,12 +2240,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1712 2240
1713 trace_assign_type(field, entry); 2241 trace_assign_type(field, entry);
1714 2242
1715 S = field->prev_state < sizeof(state_to_char) ? 2243 T = task_state_char(field->next_state);
1716 state_to_char[field->prev_state] : 'X'; 2244 S = entry->type == TRACE_WAKE ? '+' :
1717 T = field->next_state < sizeof(state_to_char) ? 2245 task_state_char(field->prev_state);
1718 state_to_char[field->next_state] : 'X';
1719 if (entry->type == TRACE_WAKE)
1720 S = '+';
1721 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
1722 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 2247 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
1723 SEQ_PUT_HEX_FIELD_RET(s, S); 2248 SEQ_PUT_HEX_FIELD_RET(s, S);
@@ -1728,6 +2253,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1728 break; 2253 break;
1729 } 2254 }
1730 case TRACE_SPECIAL: 2255 case TRACE_SPECIAL:
2256 case TRACE_USER_STACK:
1731 case TRACE_STACK: { 2257 case TRACE_STACK: {
1732 struct special_entry *field; 2258 struct special_entry *field;
1733 2259
@@ -1744,6 +2270,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1744 return TRACE_TYPE_HANDLED; 2270 return TRACE_TYPE_HANDLED;
1745} 2271}
1746 2272
2273static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2274{
2275 struct trace_seq *s = &iter->seq;
2276 struct trace_entry *entry = iter->ent;
2277 struct print_entry *field;
2278 int ret;
2279
2280 trace_assign_type(field, entry);
2281
2282 ret = trace_seq_printf(s, field->buf);
2283 if (!ret)
2284 return TRACE_TYPE_PARTIAL_LINE;
2285
2286 if (entry->flags & TRACE_FLAG_CONT)
2287 trace_seq_print_cont(s, iter);
2288
2289 return TRACE_TYPE_HANDLED;
2290}
2291
1747static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2292static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1748{ 2293{
1749 struct trace_seq *s = &iter->seq; 2294 struct trace_seq *s = &iter->seq;
@@ -1782,6 +2327,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1782 break; 2327 break;
1783 } 2328 }
1784 case TRACE_SPECIAL: 2329 case TRACE_SPECIAL:
2330 case TRACE_USER_STACK:
1785 case TRACE_STACK: { 2331 case TRACE_STACK: {
1786 struct special_entry *field; 2332 struct special_entry *field;
1787 2333
@@ -1823,6 +2369,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1823 return ret; 2369 return ret;
1824 } 2370 }
1825 2371
2372 if (iter->ent->type == TRACE_PRINT &&
2373 trace_flags & TRACE_ITER_PRINTK &&
2374 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2375 return print_printk_msg_only(iter);
2376
1826 if (trace_flags & TRACE_ITER_BIN) 2377 if (trace_flags & TRACE_ITER_BIN)
1827 return print_bin_fmt(iter); 2378 return print_bin_fmt(iter);
1828 2379
@@ -1847,7 +2398,9 @@ static int s_show(struct seq_file *m, void *v)
1847 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2398 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1848 seq_puts(m, "#\n"); 2399 seq_puts(m, "#\n");
1849 } 2400 }
1850 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2401 if (iter->trace && iter->trace->print_header)
2402 iter->trace->print_header(m);
2403 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1851 /* print nothing if the buffers are empty */ 2404 /* print nothing if the buffers are empty */
1852 if (trace_empty(iter)) 2405 if (trace_empty(iter))
1853 return 0; 2406 return 0;
@@ -1899,6 +2452,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1899 iter->trace = current_trace; 2452 iter->trace = current_trace;
1900 iter->pos = -1; 2453 iter->pos = -1;
1901 2454
2455 /* Notify the tracer early; before we stop tracing. */
2456 if (iter->trace && iter->trace->open)
2457 iter->trace->open(iter);
2458
2459 /* Annotate start of buffers if we had overruns */
2460 if (ring_buffer_overruns(iter->tr->buffer))
2461 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2462
2463
1902 for_each_tracing_cpu(cpu) { 2464 for_each_tracing_cpu(cpu) {
1903 2465
1904 iter->buffer_iter[cpu] = 2466 iter->buffer_iter[cpu] =
@@ -1917,13 +2479,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1917 m->private = iter; 2479 m->private = iter;
1918 2480
1919 /* stop the trace while dumping */ 2481 /* stop the trace while dumping */
1920 if (iter->tr->ctrl) { 2482 tracing_stop();
1921 tracer_enabled = 0;
1922 ftrace_function_enabled = 0;
1923 }
1924
1925 if (iter->trace && iter->trace->open)
1926 iter->trace->open(iter);
1927 2483
1928 mutex_unlock(&trace_types_lock); 2484 mutex_unlock(&trace_types_lock);
1929 2485
@@ -1966,14 +2522,7 @@ int tracing_release(struct inode *inode, struct file *file)
1966 iter->trace->close(iter); 2522 iter->trace->close(iter);
1967 2523
1968 /* reenable tracing if it was previously enabled */ 2524 /* reenable tracing if it was previously enabled */
1969 if (iter->tr->ctrl) { 2525 tracing_start();
1970 tracer_enabled = 1;
1971 /*
1972 * It is safe to enable function tracing even if it
1973 * isn't used
1974 */
1975 ftrace_function_enabled = 1;
1976 }
1977 mutex_unlock(&trace_types_lock); 2526 mutex_unlock(&trace_types_lock);
1978 2527
1979 seq_release(inode, file); 2528 seq_release(inode, file);
@@ -2151,7 +2700,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2151 if (err) 2700 if (err)
2152 goto err_unlock; 2701 goto err_unlock;
2153 2702
2154 raw_local_irq_disable(); 2703 local_irq_disable();
2155 __raw_spin_lock(&ftrace_max_lock); 2704 __raw_spin_lock(&ftrace_max_lock);
2156 for_each_tracing_cpu(cpu) { 2705 for_each_tracing_cpu(cpu) {
2157 /* 2706 /*
@@ -2168,7 +2717,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2168 } 2717 }
2169 } 2718 }
2170 __raw_spin_unlock(&ftrace_max_lock); 2719 __raw_spin_unlock(&ftrace_max_lock);
2171 raw_local_irq_enable(); 2720 local_irq_enable();
2172 2721
2173 tracing_cpumask = tracing_cpumask_new; 2722 tracing_cpumask = tracing_cpumask_new;
2174 2723
@@ -2189,13 +2738,16 @@ static struct file_operations tracing_cpumask_fops = {
2189}; 2738};
2190 2739
2191static ssize_t 2740static ssize_t
2192tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, 2741tracing_trace_options_read(struct file *filp, char __user *ubuf,
2193 size_t cnt, loff_t *ppos) 2742 size_t cnt, loff_t *ppos)
2194{ 2743{
2744 int i;
2195 char *buf; 2745 char *buf;
2196 int r = 0; 2746 int r = 0;
2197 int len = 0; 2747 int len = 0;
2198 int i; 2748 u32 tracer_flags = current_trace->flags->val;
2749 struct tracer_opt *trace_opts = current_trace->flags->opts;
2750
2199 2751
2200 /* calulate max size */ 2752 /* calulate max size */
2201 for (i = 0; trace_options[i]; i++) { 2753 for (i = 0; trace_options[i]; i++) {
@@ -2203,6 +2755,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2203 len += 3; /* "no" and space */ 2755 len += 3; /* "no" and space */
2204 } 2756 }
2205 2757
2758 /*
2759 * Increase the size with names of options specific
2760 * of the current tracer.
2761 */
2762 for (i = 0; trace_opts[i].name; i++) {
2763 len += strlen(trace_opts[i].name);
2764 len += 3; /* "no" and space */
2765 }
2766
2206 /* +2 for \n and \0 */ 2767 /* +2 for \n and \0 */
2207 buf = kmalloc(len + 2, GFP_KERNEL); 2768 buf = kmalloc(len + 2, GFP_KERNEL);
2208 if (!buf) 2769 if (!buf)
@@ -2215,6 +2776,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2215 r += sprintf(buf + r, "no%s ", trace_options[i]); 2776 r += sprintf(buf + r, "no%s ", trace_options[i]);
2216 } 2777 }
2217 2778
2779 for (i = 0; trace_opts[i].name; i++) {
2780 if (tracer_flags & trace_opts[i].bit)
2781 r += sprintf(buf + r, "%s ",
2782 trace_opts[i].name);
2783 else
2784 r += sprintf(buf + r, "no%s ",
2785 trace_opts[i].name);
2786 }
2787
2218 r += sprintf(buf + r, "\n"); 2788 r += sprintf(buf + r, "\n");
2219 WARN_ON(r >= len + 2); 2789 WARN_ON(r >= len + 2);
2220 2790
@@ -2225,13 +2795,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2225 return r; 2795 return r;
2226} 2796}
2227 2797
2798/* Try to assign a tracer specific option */
2799static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2800{
2801 struct tracer_flags *trace_flags = trace->flags;
2802 struct tracer_opt *opts = NULL;
2803 int ret = 0, i = 0;
2804 int len;
2805
2806 for (i = 0; trace_flags->opts[i].name; i++) {
2807 opts = &trace_flags->opts[i];
2808 len = strlen(opts->name);
2809
2810 if (strncmp(cmp, opts->name, len) == 0) {
2811 ret = trace->set_flag(trace_flags->val,
2812 opts->bit, !neg);
2813 break;
2814 }
2815 }
2816 /* Not found */
2817 if (!trace_flags->opts[i].name)
2818 return -EINVAL;
2819
2820 /* Refused to handle */
2821 if (ret)
2822 return ret;
2823
2824 if (neg)
2825 trace_flags->val &= ~opts->bit;
2826 else
2827 trace_flags->val |= opts->bit;
2828
2829 return 0;
2830}
2831
2228static ssize_t 2832static ssize_t
2229tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, 2833tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2230 size_t cnt, loff_t *ppos) 2834 size_t cnt, loff_t *ppos)
2231{ 2835{
2232 char buf[64]; 2836 char buf[64];
2233 char *cmp = buf; 2837 char *cmp = buf;
2234 int neg = 0; 2838 int neg = 0;
2839 int ret;
2235 int i; 2840 int i;
2236 2841
2237 if (cnt >= sizeof(buf)) 2842 if (cnt >= sizeof(buf))
@@ -2258,11 +2863,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2258 break; 2863 break;
2259 } 2864 }
2260 } 2865 }
2261 /* 2866
2262 * If no option could be set, return an error: 2867 /* If no option could be set, test the specific tracer options */
2263 */ 2868 if (!trace_options[i]) {
2264 if (!trace_options[i]) 2869 ret = set_tracer_option(current_trace, cmp, neg);
2265 return -EINVAL; 2870 if (ret)
2871 return ret;
2872 }
2266 2873
2267 filp->f_pos += cnt; 2874 filp->f_pos += cnt;
2268 2875
@@ -2271,8 +2878,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2271 2878
2272static struct file_operations tracing_iter_fops = { 2879static struct file_operations tracing_iter_fops = {
2273 .open = tracing_open_generic, 2880 .open = tracing_open_generic,
2274 .read = tracing_iter_ctrl_read, 2881 .read = tracing_trace_options_read,
2275 .write = tracing_iter_ctrl_write, 2882 .write = tracing_trace_options_write,
2276}; 2883};
2277 2884
2278static const char readme_msg[] = 2885static const char readme_msg[] =
@@ -2286,9 +2893,9 @@ static const char readme_msg[] =
2286 "# echo sched_switch > /debug/tracing/current_tracer\n" 2893 "# echo sched_switch > /debug/tracing/current_tracer\n"
2287 "# cat /debug/tracing/current_tracer\n" 2894 "# cat /debug/tracing/current_tracer\n"
2288 "sched_switch\n" 2895 "sched_switch\n"
2289 "# cat /debug/tracing/iter_ctrl\n" 2896 "# cat /debug/tracing/trace_options\n"
2290 "noprint-parent nosym-offset nosym-addr noverbose\n" 2897 "noprint-parent nosym-offset nosym-addr noverbose\n"
2291 "# echo print-parent > /debug/tracing/iter_ctrl\n" 2898 "# echo print-parent > /debug/tracing/trace_options\n"
2292 "# echo 1 > /debug/tracing/tracing_enabled\n" 2899 "# echo 1 > /debug/tracing/tracing_enabled\n"
2293 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2900 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2294 "echo 0 > /debug/tracing/tracing_enabled\n" 2901 "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2311,11 +2918,10 @@ static ssize_t
2311tracing_ctrl_read(struct file *filp, char __user *ubuf, 2918tracing_ctrl_read(struct file *filp, char __user *ubuf,
2312 size_t cnt, loff_t *ppos) 2919 size_t cnt, loff_t *ppos)
2313{ 2920{
2314 struct trace_array *tr = filp->private_data;
2315 char buf[64]; 2921 char buf[64];
2316 int r; 2922 int r;
2317 2923
2318 r = sprintf(buf, "%ld\n", tr->ctrl); 2924 r = sprintf(buf, "%u\n", tracer_enabled);
2319 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2925 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2320} 2926}
2321 2927
@@ -2343,16 +2949,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2343 val = !!val; 2949 val = !!val;
2344 2950
2345 mutex_lock(&trace_types_lock); 2951 mutex_lock(&trace_types_lock);
2346 if (tr->ctrl ^ val) { 2952 if (tracer_enabled ^ val) {
2347 if (val) 2953 if (val) {
2348 tracer_enabled = 1; 2954 tracer_enabled = 1;
2349 else 2955 if (current_trace->start)
2956 current_trace->start(tr);
2957 tracing_start();
2958 } else {
2350 tracer_enabled = 0; 2959 tracer_enabled = 0;
2351 2960 tracing_stop();
2352 tr->ctrl = val; 2961 if (current_trace->stop)
2353 2962 current_trace->stop(tr);
2354 if (current_trace && current_trace->ctrl_update) 2963 }
2355 current_trace->ctrl_update(tr);
2356 } 2964 }
2357 mutex_unlock(&trace_types_lock); 2965 mutex_unlock(&trace_types_lock);
2358 2966
@@ -2378,29 +2986,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2378 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2986 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2379} 2987}
2380 2988
2381static ssize_t 2989static int tracing_set_tracer(char *buf)
2382tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2383 size_t cnt, loff_t *ppos)
2384{ 2990{
2385 struct trace_array *tr = &global_trace; 2991 struct trace_array *tr = &global_trace;
2386 struct tracer *t; 2992 struct tracer *t;
2387 char buf[max_tracer_type_len+1]; 2993 int ret = 0;
2388 int i;
2389 size_t ret;
2390
2391 ret = cnt;
2392
2393 if (cnt > max_tracer_type_len)
2394 cnt = max_tracer_type_len;
2395
2396 if (copy_from_user(&buf, ubuf, cnt))
2397 return -EFAULT;
2398
2399 buf[cnt] = 0;
2400
2401 /* strip ending whitespace. */
2402 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2403 buf[i] = 0;
2404 2994
2405 mutex_lock(&trace_types_lock); 2995 mutex_lock(&trace_types_lock);
2406 for (t = trace_types; t; t = t->next) { 2996 for (t = trace_types; t; t = t->next) {
@@ -2414,18 +3004,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2414 if (t == current_trace) 3004 if (t == current_trace)
2415 goto out; 3005 goto out;
2416 3006
3007 trace_branch_disable();
2417 if (current_trace && current_trace->reset) 3008 if (current_trace && current_trace->reset)
2418 current_trace->reset(tr); 3009 current_trace->reset(tr);
2419 3010
2420 current_trace = t; 3011 current_trace = t;
2421 if (t->init) 3012 if (t->init) {
2422 t->init(tr); 3013 ret = t->init(tr);
3014 if (ret)
3015 goto out;
3016 }
2423 3017
3018 trace_branch_enable(tr);
2424 out: 3019 out:
2425 mutex_unlock(&trace_types_lock); 3020 mutex_unlock(&trace_types_lock);
2426 3021
2427 if (ret > 0) 3022 return ret;
2428 filp->f_pos += ret; 3023}
3024
3025static ssize_t
3026tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3027 size_t cnt, loff_t *ppos)
3028{
3029 char buf[max_tracer_type_len+1];
3030 int i;
3031 size_t ret;
3032 int err;
3033
3034 ret = cnt;
3035
3036 if (cnt > max_tracer_type_len)
3037 cnt = max_tracer_type_len;
3038
3039 if (copy_from_user(&buf, ubuf, cnt))
3040 return -EFAULT;
3041
3042 buf[cnt] = 0;
3043
3044 /* strip ending whitespace. */
3045 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3046 buf[i] = 0;
3047
3048 err = tracing_set_tracer(buf);
3049 if (err)
3050 return err;
3051
3052 filp->f_pos += ret;
2429 3053
2430 return ret; 3054 return ret;
2431} 3055}
@@ -2492,6 +3116,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2492 return -ENOMEM; 3116 return -ENOMEM;
2493 3117
2494 mutex_lock(&trace_types_lock); 3118 mutex_lock(&trace_types_lock);
3119
3120 /* trace pipe does not show start of buffer */
3121 cpus_setall(iter->started);
3122
2495 iter->tr = &global_trace; 3123 iter->tr = &global_trace;
2496 iter->trace = current_trace; 3124 iter->trace = current_trace;
2497 filp->private_data = iter; 3125 filp->private_data = iter;
@@ -2667,7 +3295,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
2667 char buf[64]; 3295 char buf[64];
2668 int r; 3296 int r;
2669 3297
2670 r = sprintf(buf, "%lu\n", tr->entries); 3298 r = sprintf(buf, "%lu\n", tr->entries >> 10);
2671 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3299 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2672} 3300}
2673 3301
@@ -2678,7 +3306,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2678 unsigned long val; 3306 unsigned long val;
2679 char buf[64]; 3307 char buf[64];
2680 int ret, cpu; 3308 int ret, cpu;
2681 struct trace_array *tr = filp->private_data;
2682 3309
2683 if (cnt >= sizeof(buf)) 3310 if (cnt >= sizeof(buf))
2684 return -EINVAL; 3311 return -EINVAL;
@@ -2698,12 +3325,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2698 3325
2699 mutex_lock(&trace_types_lock); 3326 mutex_lock(&trace_types_lock);
2700 3327
2701 if (tr->ctrl) { 3328 tracing_stop();
2702 cnt = -EBUSY;
2703 pr_info("ftrace: please disable tracing"
2704 " before modifying buffer size\n");
2705 goto out;
2706 }
2707 3329
2708 /* disable all cpu buffers */ 3330 /* disable all cpu buffers */
2709 for_each_tracing_cpu(cpu) { 3331 for_each_tracing_cpu(cpu) {
@@ -2713,6 +3335,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2713 atomic_inc(&max_tr.data[cpu]->disabled); 3335 atomic_inc(&max_tr.data[cpu]->disabled);
2714 } 3336 }
2715 3337
3338 /* value is in KB */
3339 val <<= 10;
3340
2716 if (val != global_trace.entries) { 3341 if (val != global_trace.entries) {
2717 ret = ring_buffer_resize(global_trace.buffer, val); 3342 ret = ring_buffer_resize(global_trace.buffer, val);
2718 if (ret < 0) { 3343 if (ret < 0) {
@@ -2751,6 +3376,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2751 atomic_dec(&max_tr.data[cpu]->disabled); 3376 atomic_dec(&max_tr.data[cpu]->disabled);
2752 } 3377 }
2753 3378
3379 tracing_start();
2754 max_tr.entries = global_trace.entries; 3380 max_tr.entries = global_trace.entries;
2755 mutex_unlock(&trace_types_lock); 3381 mutex_unlock(&trace_types_lock);
2756 3382
@@ -2762,7 +3388,7 @@ static int mark_printk(const char *fmt, ...)
2762 int ret; 3388 int ret;
2763 va_list args; 3389 va_list args;
2764 va_start(args, fmt); 3390 va_start(args, fmt);
2765 ret = trace_vprintk(0, fmt, args); 3391 ret = trace_vprintk(0, -1, fmt, args);
2766 va_end(args); 3392 va_end(args);
2767 return ret; 3393 return ret;
2768} 3394}
@@ -2773,9 +3399,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
2773{ 3399{
2774 char *buf; 3400 char *buf;
2775 char *end; 3401 char *end;
2776 struct trace_array *tr = &global_trace;
2777 3402
2778 if (!tr->ctrl || tracing_disabled) 3403 if (tracing_disabled)
2779 return -EINVAL; 3404 return -EINVAL;
2780 3405
2781 if (cnt > TRACE_BUF_SIZE) 3406 if (cnt > TRACE_BUF_SIZE)
@@ -2841,22 +3466,38 @@ static struct file_operations tracing_mark_fops = {
2841 3466
2842#ifdef CONFIG_DYNAMIC_FTRACE 3467#ifdef CONFIG_DYNAMIC_FTRACE
2843 3468
3469int __weak ftrace_arch_read_dyn_info(char *buf, int size)
3470{
3471 return 0;
3472}
3473
2844static ssize_t 3474static ssize_t
2845tracing_read_long(struct file *filp, char __user *ubuf, 3475tracing_read_dyn_info(struct file *filp, char __user *ubuf,
2846 size_t cnt, loff_t *ppos) 3476 size_t cnt, loff_t *ppos)
2847{ 3477{
3478 static char ftrace_dyn_info_buffer[1024];
3479 static DEFINE_MUTEX(dyn_info_mutex);
2848 unsigned long *p = filp->private_data; 3480 unsigned long *p = filp->private_data;
2849 char buf[64]; 3481 char *buf = ftrace_dyn_info_buffer;
3482 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
2850 int r; 3483 int r;
2851 3484
2852 r = sprintf(buf, "%ld\n", *p); 3485 mutex_lock(&dyn_info_mutex);
3486 r = sprintf(buf, "%ld ", *p);
2853 3487
2854 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3488 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
3489 buf[r++] = '\n';
3490
3491 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3492
3493 mutex_unlock(&dyn_info_mutex);
3494
3495 return r;
2855} 3496}
2856 3497
2857static struct file_operations tracing_read_long_fops = { 3498static struct file_operations tracing_dyn_info_fops = {
2858 .open = tracing_open_generic, 3499 .open = tracing_open_generic,
2859 .read = tracing_read_long, 3500 .read = tracing_read_dyn_info,
2860}; 3501};
2861#endif 3502#endif
2862 3503
@@ -2897,10 +3538,10 @@ static __init int tracer_init_debugfs(void)
2897 if (!entry) 3538 if (!entry)
2898 pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); 3539 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2899 3540
2900 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, 3541 entry = debugfs_create_file("trace_options", 0644, d_tracer,
2901 NULL, &tracing_iter_fops); 3542 NULL, &tracing_iter_fops);
2902 if (!entry) 3543 if (!entry)
2903 pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); 3544 pr_warning("Could not create debugfs 'trace_options' entry\n");
2904 3545
2905 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3546 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2906 NULL, &tracing_cpumask_fops); 3547 NULL, &tracing_cpumask_fops);
@@ -2950,11 +3591,11 @@ static __init int tracer_init_debugfs(void)
2950 pr_warning("Could not create debugfs " 3591 pr_warning("Could not create debugfs "
2951 "'trace_pipe' entry\n"); 3592 "'trace_pipe' entry\n");
2952 3593
2953 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 3594 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
2954 &global_trace, &tracing_entries_fops); 3595 &global_trace, &tracing_entries_fops);
2955 if (!entry) 3596 if (!entry)
2956 pr_warning("Could not create debugfs " 3597 pr_warning("Could not create debugfs "
2957 "'trace_entries' entry\n"); 3598 "'buffer_size_kb' entry\n");
2958 3599
2959 entry = debugfs_create_file("trace_marker", 0220, d_tracer, 3600 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2960 NULL, &tracing_mark_fops); 3601 NULL, &tracing_mark_fops);
@@ -2965,7 +3606,7 @@ static __init int tracer_init_debugfs(void)
2965#ifdef CONFIG_DYNAMIC_FTRACE 3606#ifdef CONFIG_DYNAMIC_FTRACE
2966 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 3607 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2967 &ftrace_update_tot_cnt, 3608 &ftrace_update_tot_cnt,
2968 &tracing_read_long_fops); 3609 &tracing_dyn_info_fops);
2969 if (!entry) 3610 if (!entry)
2970 pr_warning("Could not create debugfs " 3611 pr_warning("Could not create debugfs "
2971 "'dyn_ftrace_total_info' entry\n"); 3612 "'dyn_ftrace_total_info' entry\n");
@@ -2976,7 +3617,7 @@ static __init int tracer_init_debugfs(void)
2976 return 0; 3617 return 0;
2977} 3618}
2978 3619
2979int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 3620int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
2980{ 3621{
2981 static DEFINE_SPINLOCK(trace_buf_lock); 3622 static DEFINE_SPINLOCK(trace_buf_lock);
2982 static char trace_buf[TRACE_BUF_SIZE]; 3623 static char trace_buf[TRACE_BUF_SIZE];
@@ -2984,11 +3625,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2984 struct ring_buffer_event *event; 3625 struct ring_buffer_event *event;
2985 struct trace_array *tr = &global_trace; 3626 struct trace_array *tr = &global_trace;
2986 struct trace_array_cpu *data; 3627 struct trace_array_cpu *data;
2987 struct print_entry *entry;
2988 unsigned long flags, irq_flags;
2989 int cpu, len = 0, size, pc; 3628 int cpu, len = 0, size, pc;
3629 struct print_entry *entry;
3630 unsigned long irq_flags;
2990 3631
2991 if (!tr->ctrl || tracing_disabled) 3632 if (tracing_disabled || tracing_selftest_running)
2992 return 0; 3633 return 0;
2993 3634
2994 pc = preempt_count(); 3635 pc = preempt_count();
@@ -2999,7 +3640,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2999 if (unlikely(atomic_read(&data->disabled))) 3640 if (unlikely(atomic_read(&data->disabled)))
3000 goto out; 3641 goto out;
3001 3642
3002 spin_lock_irqsave(&trace_buf_lock, flags); 3643 pause_graph_tracing();
3644 spin_lock_irqsave(&trace_buf_lock, irq_flags);
3003 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 3645 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
3004 3646
3005 len = min(len, TRACE_BUF_SIZE-1); 3647 len = min(len, TRACE_BUF_SIZE-1);
@@ -3010,17 +3652,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
3010 if (!event) 3652 if (!event)
3011 goto out_unlock; 3653 goto out_unlock;
3012 entry = ring_buffer_event_data(event); 3654 entry = ring_buffer_event_data(event);
3013 tracing_generic_entry_update(&entry->ent, flags, pc); 3655 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3014 entry->ent.type = TRACE_PRINT; 3656 entry->ent.type = TRACE_PRINT;
3015 entry->ip = ip; 3657 entry->ip = ip;
3658 entry->depth = depth;
3016 3659
3017 memcpy(&entry->buf, trace_buf, len); 3660 memcpy(&entry->buf, trace_buf, len);
3018 entry->buf[len] = 0; 3661 entry->buf[len] = 0;
3019 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 3662 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3020 3663
3021 out_unlock: 3664 out_unlock:
3022 spin_unlock_irqrestore(&trace_buf_lock, flags); 3665 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3023 3666 unpause_graph_tracing();
3024 out: 3667 out:
3025 preempt_enable_notrace(); 3668 preempt_enable_notrace();
3026 3669
@@ -3037,7 +3680,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3037 return 0; 3680 return 0;
3038 3681
3039 va_start(ap, fmt); 3682 va_start(ap, fmt);
3040 ret = trace_vprintk(ip, fmt, ap); 3683 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3041 va_end(ap); 3684 va_end(ap);
3042 return ret; 3685 return ret;
3043} 3686}
@@ -3046,7 +3689,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
3046static int trace_panic_handler(struct notifier_block *this, 3689static int trace_panic_handler(struct notifier_block *this,
3047 unsigned long event, void *unused) 3690 unsigned long event, void *unused)
3048{ 3691{
3049 ftrace_dump(); 3692 if (ftrace_dump_on_oops)
3693 ftrace_dump();
3050 return NOTIFY_OK; 3694 return NOTIFY_OK;
3051} 3695}
3052 3696
@@ -3062,7 +3706,8 @@ static int trace_die_handler(struct notifier_block *self,
3062{ 3706{
3063 switch (val) { 3707 switch (val) {
3064 case DIE_OOPS: 3708 case DIE_OOPS:
3065 ftrace_dump(); 3709 if (ftrace_dump_on_oops)
3710 ftrace_dump();
3066 break; 3711 break;
3067 default: 3712 default:
3068 break; 3713 break;
@@ -3103,7 +3748,6 @@ trace_printk_seq(struct trace_seq *s)
3103 trace_seq_reset(s); 3748 trace_seq_reset(s);
3104} 3749}
3105 3750
3106
3107void ftrace_dump(void) 3751void ftrace_dump(void)
3108{ 3752{
3109 static DEFINE_SPINLOCK(ftrace_dump_lock); 3753 static DEFINE_SPINLOCK(ftrace_dump_lock);
@@ -3128,6 +3772,9 @@ void ftrace_dump(void)
3128 atomic_inc(&global_trace.data[cpu]->disabled); 3772 atomic_inc(&global_trace.data[cpu]->disabled);
3129 } 3773 }
3130 3774
3775 /* don't look at user memory in panic mode */
3776 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3777
3131 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 3778 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3132 3779
3133 iter.tr = &global_trace; 3780 iter.tr = &global_trace;
@@ -3221,7 +3868,6 @@ __init static int tracer_alloc_buffers(void)
3221#endif 3868#endif
3222 3869
3223 /* All seems OK, enable tracing */ 3870 /* All seems OK, enable tracing */
3224 global_trace.ctrl = tracer_enabled;
3225 tracing_disabled = 0; 3871 tracing_disabled = 0;
3226 3872
3227 atomic_notifier_chain_register(&panic_notifier_list, 3873 atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad052707..cc7a4f864036 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h>
11 12
12enum trace_type { 13enum trace_type {
13 __TRACE_FIRST_TYPE = 0, 14 __TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
21 TRACE_SPECIAL, 22 TRACE_SPECIAL,
22 TRACE_MMIO_RW, 23 TRACE_MMIO_RW,
23 TRACE_MMIO_MAP, 24 TRACE_MMIO_MAP,
24 TRACE_BOOT, 25 TRACE_BRANCH,
26 TRACE_BOOT_CALL,
27 TRACE_BOOT_RET,
28 TRACE_GRAPH_RET,
29 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK,
31 TRACE_HW_BRANCHES,
32 TRACE_POWER,
25 33
26 __TRACE_LAST_TYPE 34 __TRACE_LAST_TYPE
27}; 35};
@@ -38,6 +46,7 @@ struct trace_entry {
38 unsigned char flags; 46 unsigned char flags;
39 unsigned char preempt_count; 47 unsigned char preempt_count;
40 int pid; 48 int pid;
49 int tgid;
41}; 50};
42 51
43/* 52/*
@@ -48,6 +57,18 @@ struct ftrace_entry {
48 unsigned long ip; 57 unsigned long ip;
49 unsigned long parent_ip; 58 unsigned long parent_ip;
50}; 59};
60
61/* Function call entry */
62struct ftrace_graph_ent_entry {
63 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent;
65};
66
67/* Function return entry */
68struct ftrace_graph_ret_entry {
69 struct trace_entry ent;
70 struct ftrace_graph_ret ret;
71};
51extern struct tracer boot_tracer; 72extern struct tracer boot_tracer;
52 73
53/* 74/*
@@ -85,12 +106,18 @@ struct stack_entry {
85 unsigned long caller[FTRACE_STACK_ENTRIES]; 106 unsigned long caller[FTRACE_STACK_ENTRIES];
86}; 107};
87 108
109struct userstack_entry {
110 struct trace_entry ent;
111 unsigned long caller[FTRACE_STACK_ENTRIES];
112};
113
88/* 114/*
89 * ftrace_printk entry: 115 * ftrace_printk entry:
90 */ 116 */
91struct print_entry { 117struct print_entry {
92 struct trace_entry ent; 118 struct trace_entry ent;
93 unsigned long ip; 119 unsigned long ip;
120 int depth;
94 char buf[]; 121 char buf[];
95}; 122};
96 123
@@ -112,9 +139,35 @@ struct trace_mmiotrace_map {
112 struct mmiotrace_map map; 139 struct mmiotrace_map map;
113}; 140};
114 141
115struct trace_boot { 142struct trace_boot_call {
116 struct trace_entry ent; 143 struct trace_entry ent;
117 struct boot_trace initcall; 144 struct boot_trace_call boot_call;
145};
146
147struct trace_boot_ret {
148 struct trace_entry ent;
149 struct boot_trace_ret boot_ret;
150};
151
152#define TRACE_FUNC_SIZE 30
153#define TRACE_FILE_SIZE 20
154struct trace_branch {
155 struct trace_entry ent;
156 unsigned line;
157 char func[TRACE_FUNC_SIZE+1];
158 char file[TRACE_FILE_SIZE+1];
159 char correct;
160};
161
162struct hw_branch_entry {
163 struct trace_entry ent;
164 u64 from;
165 u64 to;
166};
167
168struct trace_power {
169 struct trace_entry ent;
170 struct power_trace state_data;
118}; 171};
119 172
120/* 173/*
@@ -172,7 +225,6 @@ struct trace_iterator;
172struct trace_array { 225struct trace_array {
173 struct ring_buffer *buffer; 226 struct ring_buffer *buffer;
174 unsigned long entries; 227 unsigned long entries;
175 long ctrl;
176 int cpu; 228 int cpu;
177 cycle_t time_start; 229 cycle_t time_start;
178 struct task_struct *waiter; 230 struct task_struct *waiter;
@@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void);
212 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
213 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ 265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
214 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
215 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
216 IF_ASSIGN(var, ent, struct special_entry, 0); \ 269 IF_ASSIGN(var, ent, struct special_entry, 0); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
218 TRACE_MMIO_RW); \ 271 TRACE_MMIO_RW); \
219 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 272 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
220 TRACE_MMIO_MAP); \ 273 TRACE_MMIO_MAP); \
221 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ 274 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
275 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
276 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
277 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
278 TRACE_GRAPH_ENT); \
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
222 __ftrace_bad_type(); \ 283 __ftrace_bad_type(); \
223 } while (0) 284 } while (0)
224 285
@@ -229,29 +290,56 @@ enum print_line_t {
229 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
230}; 291};
231 292
293
294/*
295 * An option specific to a tracer. This is a boolean value.
296 * The bit is the bit index that sets its value on the
297 * flags value in struct tracer_flags.
298 */
299struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */
302};
303
304/*
305 * The set of specific options for a tracer. Your tracer
306 * have to set the initial value of the flags val.
307 */
308struct tracer_flags {
309 u32 val;
310 struct tracer_opt *opts;
311};
312
313/* Makes more easy to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b
315
232/* 316/*
233 * A specific tracer, represented by methods that operate on a trace array: 317 * A specific tracer, represented by methods that operate on a trace array:
234 */ 318 */
235struct tracer { 319struct tracer {
236 const char *name; 320 const char *name;
237 void (*init)(struct trace_array *tr); 321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr);
238 void (*reset)(struct trace_array *tr); 323 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr);
239 void (*open)(struct trace_iterator *iter); 326 void (*open)(struct trace_iterator *iter);
240 void (*pipe_open)(struct trace_iterator *iter); 327 void (*pipe_open)(struct trace_iterator *iter);
241 void (*close)(struct trace_iterator *iter); 328 void (*close)(struct trace_iterator *iter);
242 void (*start)(struct trace_iterator *iter);
243 void (*stop)(struct trace_iterator *iter);
244 ssize_t (*read)(struct trace_iterator *iter, 329 ssize_t (*read)(struct trace_iterator *iter,
245 struct file *filp, char __user *ubuf, 330 struct file *filp, char __user *ubuf,
246 size_t cnt, loff_t *ppos); 331 size_t cnt, loff_t *ppos);
247 void (*ctrl_update)(struct trace_array *tr);
248#ifdef CONFIG_FTRACE_STARTUP_TEST 332#ifdef CONFIG_FTRACE_STARTUP_TEST
249 int (*selftest)(struct tracer *trace, 333 int (*selftest)(struct tracer *trace,
250 struct trace_array *tr); 334 struct trace_array *tr);
251#endif 335#endif
336 void (*print_header)(struct seq_file *m);
252 enum print_line_t (*print_line)(struct trace_iterator *iter); 337 enum print_line_t (*print_line)(struct trace_iterator *iter);
338 /* If you handled the flag setting, return 0 */
339 int (*set_flag)(u32 old_flags, u32 bit, int set);
253 struct tracer *next; 340 struct tracer *next;
254 int print_max; 341 int print_max;
342 struct tracer_flags *flags;
255}; 343};
256 344
257struct trace_seq { 345struct trace_seq {
@@ -279,10 +367,14 @@ struct trace_iterator {
279 unsigned long iter_flags; 367 unsigned long iter_flags;
280 loff_t pos; 368 loff_t pos;
281 long idx; 369 long idx;
370
371 cpumask_t started;
282}; 372};
283 373
374int tracing_is_enabled(void);
284void trace_wake_up(void); 375void trace_wake_up(void);
285void tracing_reset(struct trace_array *tr, int cpu); 376void tracing_reset(struct trace_array *tr, int cpu);
377void tracing_reset_online_cpus(struct trace_array *tr);
286int tracing_open_generic(struct inode *inode, struct file *filp); 378int tracing_open_generic(struct inode *inode, struct file *filp);
287struct dentry *tracing_init_dentry(void); 379struct dentry *tracing_init_dentry(void);
288void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 380void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
@@ -321,8 +413,15 @@ void trace_function(struct trace_array *tr,
321 unsigned long parent_ip, 413 unsigned long parent_ip,
322 unsigned long flags, int pc); 414 unsigned long flags, int pc);
323 415
416void trace_graph_return(struct ftrace_graph_ret *trace);
417int trace_graph_entry(struct ftrace_graph_ent *trace);
418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
419
324void tracing_start_cmdline_record(void); 420void tracing_start_cmdline_record(void);
325void tracing_stop_cmdline_record(void); 421void tracing_stop_cmdline_record(void);
422void tracing_sched_switch_assign_trace(struct trace_array *tr);
423void tracing_stop_sched_switch_record(void);
424void tracing_start_sched_switch_record(void);
326int register_tracer(struct tracer *type); 425int register_tracer(struct tracer *type);
327void unregister_tracer(struct tracer *type); 426void unregister_tracer(struct tracer *type);
328 427
@@ -358,6 +457,7 @@ struct tracer_switch_ops {
358 struct tracer_switch_ops *next; 457 struct tracer_switch_ops *next;
359}; 458};
360 459
460char *trace_find_cmdline(int pid);
361#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 461#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
362 462
363#ifdef CONFIG_DYNAMIC_FTRACE 463#ifdef CONFIG_DYNAMIC_FTRACE
@@ -383,19 +483,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
383 struct trace_array *tr); 483 struct trace_array *tr);
384extern int trace_selftest_startup_sysprof(struct tracer *trace, 484extern int trace_selftest_startup_sysprof(struct tracer *trace,
385 struct trace_array *tr); 485 struct trace_array *tr);
486extern int trace_selftest_startup_branch(struct tracer *trace,
487 struct trace_array *tr);
386#endif /* CONFIG_FTRACE_STARTUP_TEST */ 488#endif /* CONFIG_FTRACE_STARTUP_TEST */
387 489
388extern void *head_page(struct trace_array_cpu *data); 490extern void *head_page(struct trace_array_cpu *data);
389extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 491extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
390extern void trace_seq_print_cont(struct trace_seq *s, 492extern void trace_seq_print_cont(struct trace_seq *s,
391 struct trace_iterator *iter); 493 struct trace_iterator *iter);
494
495extern int
496seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
497 unsigned long sym_flags);
392extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 498extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
393 size_t cnt); 499 size_t cnt);
394extern long ns2usecs(cycle_t nsec); 500extern long ns2usecs(cycle_t nsec);
395extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); 501extern int
502trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
396 503
397extern unsigned long trace_flags; 504extern unsigned long trace_flags;
398 505
506/* Standard output formatting function used for function return traces */
507#ifdef CONFIG_FUNCTION_GRAPH_TRACER
508extern enum print_line_t print_graph_function(struct trace_iterator *iter);
509
510#ifdef CONFIG_DYNAMIC_FTRACE
511/* TODO: make this variable */
512#define FTRACE_GRAPH_MAX_FUNCS 32
513extern int ftrace_graph_count;
514extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
515
516static inline int ftrace_graph_addr(unsigned long addr)
517{
518 int i;
519
520 if (!ftrace_graph_count || test_tsk_trace_graph(current))
521 return 1;
522
523 for (i = 0; i < ftrace_graph_count; i++) {
524 if (addr == ftrace_graph_funcs[i])
525 return 1;
526 }
527
528 return 0;
529}
530#else
531static inline int ftrace_trace_addr(unsigned long addr)
532{
533 return 1;
534}
535static inline int ftrace_graph_addr(unsigned long addr)
536{
537 return 1;
538}
539#endif /* CONFIG_DYNAMIC_FTRACE */
540
541#else /* CONFIG_FUNCTION_GRAPH_TRACER */
542static inline enum print_line_t
543print_graph_function(struct trace_iterator *iter)
544{
545 return TRACE_TYPE_UNHANDLED;
546}
547#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
548
549extern struct pid *ftrace_pid_trace;
550
551static inline int ftrace_trace_task(struct task_struct *task)
552{
553 if (!ftrace_pid_trace)
554 return 1;
555
556 return test_tsk_trace_trace(task);
557}
558
399/* 559/*
400 * trace_iterator_flags is an enumeration that defines bit 560 * trace_iterator_flags is an enumeration that defines bit
401 * positions into trace_flags that controls the output. 561 * positions into trace_flags that controls the output.
@@ -415,8 +575,93 @@ enum trace_iterator_flags {
415 TRACE_ITER_STACKTRACE = 0x100, 575 TRACE_ITER_STACKTRACE = 0x100,
416 TRACE_ITER_SCHED_TREE = 0x200, 576 TRACE_ITER_SCHED_TREE = 0x200,
417 TRACE_ITER_PRINTK = 0x400, 577 TRACE_ITER_PRINTK = 0x400,
578 TRACE_ITER_PREEMPTONLY = 0x800,
579 TRACE_ITER_BRANCH = 0x1000,
580 TRACE_ITER_ANNOTATE = 0x2000,
581 TRACE_ITER_USERSTACKTRACE = 0x4000,
582 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000
418}; 584};
419 585
586/*
587 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
588 * control the output of kernel symbols.
589 */
590#define TRACE_ITER_SYM_MASK \
591 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
592
420extern struct tracer nop_trace; 593extern struct tracer nop_trace;
421 594
595/**
596 * ftrace_preempt_disable - disable preemption scheduler safe
597 *
598 * When tracing can happen inside the scheduler, there exists
599 * cases that the tracing might happen before the need_resched
600 * flag is checked. If this happens and the tracer calls
601 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion.
603 *
604 * To prevent this, we read the need_recshed flag before
605 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable.
608 *
609 * The rational for doing the above is that if need resched is set
610 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched.
613 */
614static inline int ftrace_preempt_disable(void)
615{
616 int resched;
617
618 resched = need_resched();
619 preempt_disable_notrace();
620
621 return resched;
622}
623
624/**
625 * ftrace_preempt_enable - enable preemption scheduler safe
626 * @resched: the return value from ftrace_preempt_disable
627 *
628 * This is a scheduler safe way to enable preemption and not miss
629 * any preemption checks. The disabled saved the state of preemption.
630 * If resched is set, then we were either inside an atomic or
631 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead.
634 */
635static inline void ftrace_preempt_enable(int resched)
636{
637 if (resched)
638 preempt_enable_no_resched_notrace();
639 else
640 preempt_enable_notrace();
641}
642
643#ifdef CONFIG_BRANCH_TRACER
644extern int enable_branch_tracing(struct trace_array *tr);
645extern void disable_branch_tracing(void);
646static inline int trace_branch_enable(struct trace_array *tr)
647{
648 if (trace_flags & TRACE_ITER_BRANCH)
649 return enable_branch_tracing(tr);
650 return 0;
651}
652static inline void trace_branch_disable(void)
653{
654 /* due to races, always disable */
655 disable_branch_tracing();
656}
657#else
658static inline int trace_branch_enable(struct trace_array *tr)
659{
660 return 0;
661}
662static inline void trace_branch_disable(void)
663{
664}
665#endif /* CONFIG_BRANCH_TRACER */
666
422#endif /* _LINUX_KERNEL_TRACE_H */ 667#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff2..3ccebde28482 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,101 +13,161 @@
13#include "trace.h" 13#include "trace.h"
14 14
15static struct trace_array *boot_trace; 15static struct trace_array *boot_trace;
16static int trace_boot_enabled; 16static bool pre_initcalls_finished;
17 17
18 18/* Tells the boot tracer that the pre_smp_initcalls are finished.
19/* Should be started after do_pre_smp_initcalls() in init/main.c */ 19 * So we are ready .
20 * It doesn't enable sched events tracing however.
21 * You have to call enable_boot_trace to do so.
22 */
20void start_boot_trace(void) 23void start_boot_trace(void)
21{ 24{
22 trace_boot_enabled = 1; 25 pre_initcalls_finished = true;
23} 26}
24 27
25void stop_boot_trace(void) 28void enable_boot_trace(void)
26{ 29{
27 trace_boot_enabled = 0; 30 if (pre_initcalls_finished)
31 tracing_start_sched_switch_record();
28} 32}
29 33
30void reset_boot_trace(struct trace_array *tr) 34void disable_boot_trace(void)
31{ 35{
32 stop_boot_trace(); 36 if (pre_initcalls_finished)
37 tracing_stop_sched_switch_record();
33} 38}
34 39
35static void boot_trace_init(struct trace_array *tr) 40static int boot_trace_init(struct trace_array *tr)
36{ 41{
37 int cpu; 42 int cpu;
38 boot_trace = tr; 43 boot_trace = tr;
39 44
40 trace_boot_enabled = 0;
41
42 for_each_cpu_mask(cpu, cpu_possible_map) 45 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu); 46 tracing_reset(tr, cpu);
47
48 tracing_sched_switch_assign_trace(tr);
49 return 0;
44} 50}
45 51
46static void boot_trace_ctrl_update(struct trace_array *tr) 52static enum print_line_t
53initcall_call_print_line(struct trace_iterator *iter)
47{ 54{
48 if (tr->ctrl) 55 struct trace_entry *entry = iter->ent;
49 start_boot_trace(); 56 struct trace_seq *s = &iter->seq;
57 struct trace_boot_call *field;
58 struct boot_trace_call *call;
59 u64 ts;
60 unsigned long nsec_rem;
61 int ret;
62
63 trace_assign_type(field, entry);
64 call = &field->boot_call;
65 ts = iter->ts;
66 nsec_rem = do_div(ts, 1000000000);
67
68 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
69 (unsigned long)ts, nsec_rem, call->func, call->caller);
70
71 if (!ret)
72 return TRACE_TYPE_PARTIAL_LINE;
50 else 73 else
51 stop_boot_trace(); 74 return TRACE_TYPE_HANDLED;
52} 75}
53 76
54static enum print_line_t initcall_print_line(struct trace_iterator *iter) 77static enum print_line_t
78initcall_ret_print_line(struct trace_iterator *iter)
55{ 79{
56 int ret;
57 struct trace_entry *entry = iter->ent; 80 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq; 81 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime); 82 struct trace_boot_ret *field;
62 struct timespec rettime = ktime_to_timespec(it->rettime); 83 struct boot_trace_ret *init_ret;
63 84 u64 ts;
64 if (entry->type == TRACE_BOOT) { 85 unsigned long nsec_rem;
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 86 int ret;
66 calltime.tv_sec, 87
67 calltime.tv_nsec, 88 trace_assign_type(field, entry);
68 it->func, it->caller); 89 init_ret = &field->boot_ret;
69 if (!ret) 90 ts = iter->ts;
70 return TRACE_TYPE_PARTIAL_LINE; 91 nsec_rem = do_div(ts, 1000000000);
71 92
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 93 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n", 94 "returned %d after %llu msecs\n",
74 rettime.tv_sec, 95 (unsigned long) ts,
75 rettime.tv_nsec, 96 nsec_rem,
76 it->func, it->result, it->duration); 97 init_ret->func, init_ret->result, init_ret->duration);
77 98
78 if (!ret) 99 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE; 100 return TRACE_TYPE_PARTIAL_LINE;
101 else
80 return TRACE_TYPE_HANDLED; 102 return TRACE_TYPE_HANDLED;
103}
104
105static enum print_line_t initcall_print_line(struct trace_iterator *iter)
106{
107 struct trace_entry *entry = iter->ent;
108
109 switch (entry->type) {
110 case TRACE_BOOT_CALL:
111 return initcall_call_print_line(iter);
112 case TRACE_BOOT_RET:
113 return initcall_ret_print_line(iter);
114 default:
115 return TRACE_TYPE_UNHANDLED;
81 } 116 }
82 return TRACE_TYPE_UNHANDLED;
83} 117}
84 118
85struct tracer boot_tracer __read_mostly = 119struct tracer boot_tracer __read_mostly =
86{ 120{
87 .name = "initcall", 121 .name = "initcall",
88 .init = boot_trace_init, 122 .init = boot_trace_init,
89 .reset = reset_boot_trace, 123 .reset = tracing_reset_online_cpus,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line, 124 .print_line = initcall_print_line,
92}; 125};
93 126
94void trace_boot(struct boot_trace *it, initcall_t fn) 127void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
95{ 128{
96 struct ring_buffer_event *event; 129 struct ring_buffer_event *event;
97 struct trace_boot *entry; 130 struct trace_boot_call *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags; 131 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace; 132 struct trace_array *tr = boot_trace;
101 133
102 if (!trace_boot_enabled) 134 if (!pre_initcalls_finished)
103 return; 135 return;
104 136
105 /* Get its name now since this function could 137 /* Get its name now since this function could
106 * disappear because it is in the .init section. 138 * disappear because it is in the .init section.
107 */ 139 */
108 sprint_symbol(it->func, (unsigned long)fn); 140 sprint_symbol(bt->func, (unsigned long)fn);
141 preempt_disable();
142
143 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
144 &irq_flags);
145 if (!event)
146 goto out;
147 entry = ring_buffer_event_data(event);
148 tracing_generic_entry_update(&entry->ent, 0, 0);
149 entry->ent.type = TRACE_BOOT_CALL;
150 entry->boot_call = *bt;
151 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
152
153 trace_wake_up();
154
155 out:
156 preempt_enable();
157}
158
159void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
160{
161 struct ring_buffer_event *event;
162 struct trace_boot_ret *entry;
163 unsigned long irq_flags;
164 struct trace_array *tr = boot_trace;
165
166 if (!pre_initcalls_finished)
167 return;
168
169 sprint_symbol(bt->func, (unsigned long)fn);
109 preempt_disable(); 170 preempt_disable();
110 data = tr->data[smp_processor_id()];
111 171
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 172 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags); 173 &irq_flags);
@@ -115,8 +175,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
115 goto out; 175 goto out;
116 entry = ring_buffer_event_data(event); 176 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0); 177 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT; 178 entry->ent.type = TRACE_BOOT_RET;
119 entry->initcall = *it; 179 entry->boot_ret = *bt;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 180 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121 181
122 trace_wake_up(); 182 trace_wake_up();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000000..6c00feb3bac7
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,342 @@
1/*
2 * unlikely profiler
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h>
12#include <linux/module.h>
13#include <linux/ftrace.h>
14#include <linux/hash.h>
15#include <linux/fs.h>
16#include <asm/local.h>
17#include "trace.h"
18
19#ifdef CONFIG_BRANCH_TRACER
20
21static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex);
23static struct trace_array *branch_tracer;
24
25static void
26probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
27{
28 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event;
30 struct trace_branch *entry;
31 unsigned long flags, irq_flags;
32 int cpu, pc;
33 const char *p;
34
35 /*
36 * I would love to save just the ftrace_likely_data pointer, but
37 * this code can also be used by modules. Ugly things can happen
38 * if the module is unloaded, and then we go and read the
39 * pointer. This is slower, but much safer.
40 */
41
42 if (unlikely(!tr))
43 return;
44
45 local_irq_save(flags);
46 cpu = raw_smp_processor_id();
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
51 &irq_flags);
52 if (!event)
53 goto out;
54
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59
60 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file);
62 while (p >= f->file && *p != '/')
63 p--;
64 p++;
65
66 strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
67 strncpy(entry->file, p, TRACE_FILE_SIZE);
68 entry->func[TRACE_FUNC_SIZE] = 0;
69 entry->file[TRACE_FILE_SIZE] = 0;
70 entry->line = f->line;
71 entry->correct = val == expect;
72
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
74
75 out:
76 atomic_dec(&tr->data[cpu]->disabled);
77 local_irq_restore(flags);
78}
79
80static inline
81void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
82{
83 if (!branch_tracing_enabled)
84 return;
85
86 probe_likely_condition(f, val, expect);
87}
88
89int enable_branch_tracing(struct trace_array *tr)
90{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr;
95 /*
96 * Must be seen before enabling. The reader is a condition
97 * where we do not need a matching rmb()
98 */
99 smp_wmb();
100 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex);
102
103 return ret;
104}
105
106void disable_branch_tracing(void)
107{
108 mutex_lock(&branch_tracing_mutex);
109
110 if (!branch_tracing_enabled)
111 goto out_unlock;
112
113 branch_tracing_enabled--;
114
115 out_unlock:
116 mutex_unlock(&branch_tracing_mutex);
117}
118
119static void start_branch_trace(struct trace_array *tr)
120{
121 enable_branch_tracing(tr);
122}
123
124static void stop_branch_trace(struct trace_array *tr)
125{
126 disable_branch_tracing();
127}
128
129static int branch_trace_init(struct trace_array *tr)
130{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr);
137 return 0;
138}
139
140static void branch_trace_reset(struct trace_array *tr)
141{
142 stop_branch_trace(tr);
143}
144
145struct tracer branch_trace __read_mostly =
146{
147 .name = "branch",
148 .init = branch_trace_init,
149 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch,
152#endif
153};
154
155__init static int init_branch_trace(void)
156{
157 return register_tracer(&branch_trace);
158}
159
160device_initcall(init_branch_trace);
161#else
162static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
164{
165}
166#endif /* CONFIG_BRANCH_TRACER */
167
168void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
169{
170 /*
171 * I would love to have a trace point here instead, but the
172 * trace point code is so inundated with unlikely and likely
173 * conditions that the recursive nightmare that exists is too
174 * much to try to get working. At least for now.
175 */
176 trace_likely_condition(f, val, expect);
177
178 /* FIXME: Make this atomic! */
179 if (val == expect)
180 f->correct++;
181 else
182 f->incorrect++;
183}
184EXPORT_SYMBOL(ftrace_likely_update);
185
186struct ftrace_pointer {
187 void *start;
188 void *stop;
189 int hit;
190};
191
192static void *
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{
195 const struct ftrace_pointer *f = m->private;
196 struct ftrace_branch_data *p = v;
197
198 (*pos)++;
199
200 if (v == (void *)1)
201 return f->start;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209}
210
211static void *t_start(struct seq_file *m, loff_t *pos)
212{
213 void *t = (void *)1;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218
219 return t;
220}
221
222static void t_stop(struct seq_file *m, void *p)
223{
224}
225
226static int t_show(struct seq_file *m, void *v)
227{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v;
230 const char *f;
231 long percent;
232
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/')
249 f--;
250 f++;
251
252 /*
253 * The miss is overlayed on correct, and hit on incorrect.
254 */
255 if (p->correct) {
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0)
263 seq_printf(m, " X ");
264 else
265 seq_printf(m, "%3ld ", percent);
266 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
267 return 0;
268}
269
270static struct seq_operations tracing_likely_seq_ops = {
271 .start = t_start,
272 .next = t_next,
273 .stop = t_stop,
274 .show = t_show,
275};
276
277static int tracing_branch_open(struct inode *inode, struct file *file)
278{
279 int ret;
280
281 ret = seq_open(file, &tracing_likely_seq_ops);
282 if (!ret) {
283 struct seq_file *m = file->private_data;
284 m->private = (void *)inode->i_private;
285 }
286
287 return ret;
288}
289
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295
296#ifdef CONFIG_PROFILE_ALL_BRANCHES
297extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[];
299
300static const struct ftrace_pointer ftrace_branch_pos = {
301 .start = __start_branch_profile,
302 .stop = __stop_branch_profile,
303 .hit = 1,
304};
305
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */
307
308extern unsigned long __start_annotated_branch_profile[];
309extern unsigned long __stop_annotated_branch_profile[];
310
311static const struct ftrace_pointer ftrace_annotated_branch_pos = {
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315
316static __init int ftrace_branch_init(void)
317{
318 struct dentry *d_tracer;
319 struct dentry *entry;
320
321 d_tracer = tracing_init_dentry();
322
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
324 (void *)&ftrace_annotated_branch_pos,
325 &tracing_branch_fops);
326 if (!entry)
327 pr_warning("Could not create debugfs "
328 "'profile_annotatet_branch' entry\n");
329
330#ifdef CONFIG_PROFILE_ALL_BRANCHES
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer,
332 (void *)&ftrace_branch_pos,
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338
339 return 0;
340}
341
342device_initcall(ftrace_branch_init);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0f85a64003d3..9236d7e25a16 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,20 +16,10 @@
16 16
17#include "trace.h" 17#include "trace.h"
18 18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr, cpu);
27}
28
29static void start_function_trace(struct trace_array *tr) 19static void start_function_trace(struct trace_array *tr)
30{ 20{
31 tr->cpu = get_cpu(); 21 tr->cpu = get_cpu();
32 function_reset(tr); 22 tracing_reset_online_cpus(tr);
33 put_cpu(); 23 put_cpu();
34 24
35 tracing_start_cmdline_record(); 25 tracing_start_cmdline_record();
@@ -42,24 +32,20 @@ static void stop_function_trace(struct trace_array *tr)
42 tracing_stop_cmdline_record(); 32 tracing_stop_cmdline_record();
43} 33}
44 34
45static void function_trace_init(struct trace_array *tr) 35static int function_trace_init(struct trace_array *tr)
46{ 36{
47 if (tr->ctrl) 37 start_function_trace(tr);
48 start_function_trace(tr); 38 return 0;
49} 39}
50 40
51static void function_trace_reset(struct trace_array *tr) 41static void function_trace_reset(struct trace_array *tr)
52{ 42{
53 if (tr->ctrl) 43 stop_function_trace(tr);
54 stop_function_trace(tr);
55} 44}
56 45
57static void function_trace_ctrl_update(struct trace_array *tr) 46static void function_trace_start(struct trace_array *tr)
58{ 47{
59 if (tr->ctrl) 48 tracing_reset_online_cpus(tr);
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63} 49}
64 50
65static struct tracer function_trace __read_mostly = 51static struct tracer function_trace __read_mostly =
@@ -67,7 +53,7 @@ static struct tracer function_trace __read_mostly =
67 .name = "function", 53 .name = "function",
68 .init = function_trace_init, 54 .init = function_trace_init,
69 .reset = function_trace_reset, 55 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 56 .start = function_trace_start,
71#ifdef CONFIG_FTRACE_SELFTEST 57#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function, 58 .selftest = trace_selftest_startup_function,
73#endif 59#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 000000000000..4bf39fcae97a
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,669 @@
1/*
2 *
3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 *
8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/fs.h>
13
14#include "trace.h"
15
16#define TRACE_GRAPH_INDENT 2
17
18/* Flag options */
19#define TRACE_GRAPH_PRINT_OVERRUN 0x1
20#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8
23
24static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
29 /* Display Overhead ? */
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
33 { } /* Empty entry */
34};
35
36static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
39 .opts = trace_opts
40};
41
42/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44
45static int graph_trace_init(struct trace_array *tr)
46{
47 int cpu, ret;
48
49 for_each_online_cpu(cpu)
50 tracing_reset(tr, cpu);
51
52 ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry);
54 if (ret)
55 return ret;
56 tracing_start_cmdline_record();
57
58 return 0;
59}
60
61static void graph_trace_reset(struct trace_array *tr)
62{
63 tracing_stop_cmdline_record();
64 unregister_ftrace_graph();
65}
66
67static inline int log10_cpu(int nb)
68{
69 if (nb / 100)
70 return 3;
71 if (nb / 10)
72 return 2;
73 return 1;
74}
75
76static enum print_line_t
77print_graph_cpu(struct trace_seq *s, int cpu)
78{
79 int i;
80 int ret;
81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
83
84
85 /*
86 * Start with a space character - to make it stand out
87 * to the right a bit when trace output is pasted into
88 * email:
89 */
90 ret = trace_seq_printf(s, " ");
91
92 /*
93 * Tricky - we space the CPU field according to the max
94 * number of online CPUs. On a 2-cpu system it would take
95 * a maximum of 1 digit - on a 128 cpu system it would
96 * take up to 3 digits:
97 */
98 for (i = 0; i < log10_all - log10_this; i++) {
99 ret = trace_seq_printf(s, " ");
100 if (!ret)
101 return TRACE_TYPE_PARTIAL_LINE;
102 }
103 ret = trace_seq_printf(s, "%d) ", cpu);
104 if (!ret)
105 return TRACE_TYPE_PARTIAL_LINE;
106
107 return TRACE_TYPE_HANDLED;
108}
109
110#define TRACE_GRAPH_PROCINFO_LENGTH 14
111
112static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid)
114{
115 int i;
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11];
122
123 strncpy(comm, trace_find_cmdline(pid), 7);
124 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid);
126
127 /* 1 stands for the "-" character */
128 len = strlen(comm) + strlen(pid_str) + 1;
129
130 if (len < TRACE_GRAPH_PROCINFO_LENGTH)
131 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
132
133 /* First spaces to align center */
134 for (i = 0; i < spaces / 2; i++) {
135 ret = trace_seq_printf(s, " ");
136 if (!ret)
137 return TRACE_TYPE_PARTIAL_LINE;
138 }
139
140 ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
141 if (!ret)
142 return TRACE_TYPE_PARTIAL_LINE;
143
144 /* Last spaces to align center */
145 for (i = 0; i < spaces - (spaces / 2); i++) {
146 ret = trace_seq_printf(s, " ");
147 if (!ret)
148 return TRACE_TYPE_PARTIAL_LINE;
149 }
150 return TRACE_TYPE_HANDLED;
151}
152
153
154/* If the pid changed since the last trace, output this event */
155static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu)
157{
158 pid_t prev_pid;
159 int ret;
160
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
162 return TRACE_TYPE_HANDLED;
163
164 prev_pid = last_pid[cpu];
165 last_pid[cpu] = pid;
166
167/*
168 * Context-switch trace line:
169
170 ------------------------------------------
171 | 1) migration/0--1 => sshd-1755
172 ------------------------------------------
173
174 */
175 ret = trace_seq_printf(s,
176 " ------------------------------------------\n");
177 if (!ret)
178 TRACE_TYPE_PARTIAL_LINE;
179
180 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE)
182 TRACE_TYPE_PARTIAL_LINE;
183
184 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE)
186 TRACE_TYPE_PARTIAL_LINE;
187
188 ret = trace_seq_printf(s, " => ");
189 if (!ret)
190 TRACE_TYPE_PARTIAL_LINE;
191
192 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE)
194 TRACE_TYPE_PARTIAL_LINE;
195
196 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n");
198 if (!ret)
199 TRACE_TYPE_PARTIAL_LINE;
200
201 return ret;
202}
203
204static bool
205trace_branch_is_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr)
207{
208 struct ring_buffer_iter *ring_iter;
209 struct ring_buffer_event *event;
210 struct ftrace_graph_ret_entry *next;
211
212 ring_iter = iter->buffer_iter[iter->cpu];
213
214 if (!ring_iter)
215 return false;
216
217 event = ring_buffer_iter_peek(ring_iter, NULL);
218
219 if (!event)
220 return false;
221
222 next = ring_buffer_event_data(event);
223
224 if (next->ent.type != TRACE_GRAPH_RET)
225 return false;
226
227 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func)
229 return false;
230
231 return true;
232}
233
234static enum print_line_t
235print_graph_irq(struct trace_seq *s, unsigned long addr,
236 enum trace_type type, int cpu, pid_t pid)
237{
238 int ret;
239
240 if (addr < (unsigned long)__irqentry_text_start ||
241 addr >= (unsigned long)__irqentry_text_end)
242 return TRACE_TYPE_UNHANDLED;
243
244 if (type == TRACE_GRAPH_ENT) {
245 ret = trace_seq_printf(s, "==========> | ");
246 } else {
247 /* Cpu */
248 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
249 ret = print_graph_cpu(s, cpu);
250 if (ret == TRACE_TYPE_PARTIAL_LINE)
251 return TRACE_TYPE_PARTIAL_LINE;
252 }
253 /* Proc */
254 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
255 ret = print_graph_proc(s, pid);
256 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 return TRACE_TYPE_PARTIAL_LINE;
258
259 ret = trace_seq_printf(s, " | ");
260 if (!ret)
261 return TRACE_TYPE_PARTIAL_LINE;
262 }
263
264 /* No overhead */
265 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
266 ret = trace_seq_printf(s, " ");
267 if (!ret)
268 return TRACE_TYPE_PARTIAL_LINE;
269 }
270
271 ret = trace_seq_printf(s, "<========== |\n");
272 }
273 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE;
275 return TRACE_TYPE_HANDLED;
276}
277
278static enum print_line_t
279print_graph_duration(unsigned long long duration, struct trace_seq *s)
280{
281 unsigned long nsecs_rem = do_div(duration, 1000);
282 /* log10(ULONG_MAX) + '\0' */
283 char msecs_str[21];
284 char nsecs_str[5];
285 int ret, len;
286 int i;
287
288 sprintf(msecs_str, "%lu", (unsigned long) duration);
289
290 /* Print msecs */
291 ret = trace_seq_printf(s, msecs_str);
292 if (!ret)
293 return TRACE_TYPE_PARTIAL_LINE;
294
295 len = strlen(msecs_str);
296
297 /* Print nsecs (we don't want to exceed 7 numbers) */
298 if (len < 7) {
299 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
300 ret = trace_seq_printf(s, ".%s", nsecs_str);
301 if (!ret)
302 return TRACE_TYPE_PARTIAL_LINE;
303 len += strlen(nsecs_str);
304 }
305
306 ret = trace_seq_printf(s, " us ");
307 if (!ret)
308 return TRACE_TYPE_PARTIAL_LINE;
309
310 /* Print remaining spaces to fit the row's width */
311 for (i = len; i < 7; i++) {
312 ret = trace_seq_printf(s, " ");
313 if (!ret)
314 return TRACE_TYPE_PARTIAL_LINE;
315 }
316
317 ret = trace_seq_printf(s, "| ");
318 if (!ret)
319 return TRACE_TYPE_PARTIAL_LINE;
320 return TRACE_TYPE_HANDLED;
321
322}
323
324/* Signal a overhead of time execution to the output */
325static int
326print_graph_overhead(unsigned long long duration, struct trace_seq *s)
327{
328 /* Duration exceeded 100 msecs */
329 if (duration > 100000ULL)
330 return trace_seq_printf(s, "! ");
331
332 /* Duration exceeded 10 msecs */
333 if (duration > 10000ULL)
334 return trace_seq_printf(s, "+ ");
335
336 return trace_seq_printf(s, " ");
337}
338
339/* Case of a leaf function on its call entry */
340static enum print_line_t
341print_graph_entry_leaf(struct trace_iterator *iter,
342 struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
343{
344 struct ftrace_graph_ret_entry *ret_entry;
345 struct ftrace_graph_ret *graph_ret;
346 struct ring_buffer_event *event;
347 struct ftrace_graph_ent *call;
348 unsigned long long duration;
349 int ret;
350 int i;
351
352 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
353 ret_entry = ring_buffer_event_data(event);
354 graph_ret = &ret_entry->ret;
355 call = &entry->graph_ent;
356 duration = graph_ret->rettime - graph_ret->calltime;
357
358 /* Overhead */
359 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
360 ret = print_graph_overhead(duration, s);
361 if (!ret)
362 return TRACE_TYPE_PARTIAL_LINE;
363 }
364
365 /* Duration */
366 ret = print_graph_duration(duration, s);
367 if (ret == TRACE_TYPE_PARTIAL_LINE)
368 return TRACE_TYPE_PARTIAL_LINE;
369
370 /* Function */
371 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
372 ret = trace_seq_printf(s, " ");
373 if (!ret)
374 return TRACE_TYPE_PARTIAL_LINE;
375 }
376
377 ret = seq_print_ip_sym(s, call->func, 0);
378 if (!ret)
379 return TRACE_TYPE_PARTIAL_LINE;
380
381 ret = trace_seq_printf(s, "();\n");
382 if (!ret)
383 return TRACE_TYPE_PARTIAL_LINE;
384
385 return TRACE_TYPE_HANDLED;
386}
387
388static enum print_line_t
389print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
390 struct trace_seq *s, pid_t pid, int cpu)
391{
392 int i;
393 int ret;
394 struct ftrace_graph_ent *call = &entry->graph_ent;
395
396 /* No overhead */
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
398 ret = trace_seq_printf(s, " ");
399 if (!ret)
400 return TRACE_TYPE_PARTIAL_LINE;
401 }
402
403 /* Interrupt */
404 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
405 if (ret == TRACE_TYPE_UNHANDLED) {
406 /* No time */
407 ret = trace_seq_printf(s, " | ");
408 if (!ret)
409 return TRACE_TYPE_PARTIAL_LINE;
410 } else {
411 if (ret == TRACE_TYPE_PARTIAL_LINE)
412 return TRACE_TYPE_PARTIAL_LINE;
413 }
414
415
416 /* Function */
417 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
418 ret = trace_seq_printf(s, " ");
419 if (!ret)
420 return TRACE_TYPE_PARTIAL_LINE;
421 }
422
423 ret = seq_print_ip_sym(s, call->func, 0);
424 if (!ret)
425 return TRACE_TYPE_PARTIAL_LINE;
426
427 ret = trace_seq_printf(s, "() {\n");
428 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE;
430
431 return TRACE_TYPE_HANDLED;
432}
433
434static enum print_line_t
435print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
436 struct trace_iterator *iter, int cpu)
437{
438 int ret;
439 struct trace_entry *ent = iter->ent;
440
441 /* Pid */
442 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
443 return TRACE_TYPE_PARTIAL_LINE;
444
445 /* Cpu */
446 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
447 ret = print_graph_cpu(s, cpu);
448 if (ret == TRACE_TYPE_PARTIAL_LINE)
449 return TRACE_TYPE_PARTIAL_LINE;
450 }
451
452 /* Proc */
453 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
454 ret = print_graph_proc(s, ent->pid);
455 if (ret == TRACE_TYPE_PARTIAL_LINE)
456 return TRACE_TYPE_PARTIAL_LINE;
457
458 ret = trace_seq_printf(s, " | ");
459 if (!ret)
460 return TRACE_TYPE_PARTIAL_LINE;
461 }
462
463 if (trace_branch_is_leaf(iter, field))
464 return print_graph_entry_leaf(iter, field, s);
465 else
466 return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
467
468}
469
470static enum print_line_t
471print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
472 struct trace_entry *ent, int cpu)
473{
474 int i;
475 int ret;
476 unsigned long long duration = trace->rettime - trace->calltime;
477
478 /* Pid */
479 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
480 return TRACE_TYPE_PARTIAL_LINE;
481
482 /* Cpu */
483 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
484 ret = print_graph_cpu(s, cpu);
485 if (ret == TRACE_TYPE_PARTIAL_LINE)
486 return TRACE_TYPE_PARTIAL_LINE;
487 }
488
489 /* Proc */
490 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
491 ret = print_graph_proc(s, ent->pid);
492 if (ret == TRACE_TYPE_PARTIAL_LINE)
493 return TRACE_TYPE_PARTIAL_LINE;
494
495 ret = trace_seq_printf(s, " | ");
496 if (!ret)
497 return TRACE_TYPE_PARTIAL_LINE;
498 }
499
500 /* Overhead */
501 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
502 ret = print_graph_overhead(duration, s);
503 if (!ret)
504 return TRACE_TYPE_PARTIAL_LINE;
505 }
506
507 /* Duration */
508 ret = print_graph_duration(duration, s);
509 if (ret == TRACE_TYPE_PARTIAL_LINE)
510 return TRACE_TYPE_PARTIAL_LINE;
511
512 /* Closing brace */
513 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
514 ret = trace_seq_printf(s, " ");
515 if (!ret)
516 return TRACE_TYPE_PARTIAL_LINE;
517 }
518
519 ret = trace_seq_printf(s, "}\n");
520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE;
522
523 /* Overrun */
524 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
525 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
526 trace->overrun);
527 if (!ret)
528 return TRACE_TYPE_PARTIAL_LINE;
529 }
530
531 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
532 if (ret == TRACE_TYPE_PARTIAL_LINE)
533 return TRACE_TYPE_PARTIAL_LINE;
534
535 return TRACE_TYPE_HANDLED;
536}
537
538static enum print_line_t
539print_graph_comment(struct print_entry *trace, struct trace_seq *s,
540 struct trace_entry *ent, struct trace_iterator *iter)
541{
542 int i;
543 int ret;
544
545 /* Pid */
546 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
547 return TRACE_TYPE_PARTIAL_LINE;
548
549 /* Cpu */
550 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
551 ret = print_graph_cpu(s, iter->cpu);
552 if (ret == TRACE_TYPE_PARTIAL_LINE)
553 return TRACE_TYPE_PARTIAL_LINE;
554 }
555
556 /* Proc */
557 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
558 ret = print_graph_proc(s, ent->pid);
559 if (ret == TRACE_TYPE_PARTIAL_LINE)
560 return TRACE_TYPE_PARTIAL_LINE;
561
562 ret = trace_seq_printf(s, " | ");
563 if (!ret)
564 return TRACE_TYPE_PARTIAL_LINE;
565 }
566
567 /* No overhead */
568 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
569 ret = trace_seq_printf(s, " ");
570 if (!ret)
571 return TRACE_TYPE_PARTIAL_LINE;
572 }
573
574 /* No time */
575 ret = trace_seq_printf(s, " | ");
576 if (!ret)
577 return TRACE_TYPE_PARTIAL_LINE;
578
579 /* Indentation */
580 if (trace->depth > 0)
581 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
582 ret = trace_seq_printf(s, " ");
583 if (!ret)
584 return TRACE_TYPE_PARTIAL_LINE;
585 }
586
587 /* The comment */
588 ret = trace_seq_printf(s, "/* %s", trace->buf);
589 if (!ret)
590 return TRACE_TYPE_PARTIAL_LINE;
591
592 if (ent->flags & TRACE_FLAG_CONT)
593 trace_seq_print_cont(s, iter);
594
595 ret = trace_seq_printf(s, " */\n");
596 if (!ret)
597 return TRACE_TYPE_PARTIAL_LINE;
598
599 return TRACE_TYPE_HANDLED;
600}
601
602
603enum print_line_t
604print_graph_function(struct trace_iterator *iter)
605{
606 struct trace_seq *s = &iter->seq;
607 struct trace_entry *entry = iter->ent;
608
609 switch (entry->type) {
610 case TRACE_GRAPH_ENT: {
611 struct ftrace_graph_ent_entry *field;
612 trace_assign_type(field, entry);
613 return print_graph_entry(field, s, iter,
614 iter->cpu);
615 }
616 case TRACE_GRAPH_RET: {
617 struct ftrace_graph_ret_entry *field;
618 trace_assign_type(field, entry);
619 return print_graph_return(&field->ret, s, entry, iter->cpu);
620 }
621 case TRACE_PRINT: {
622 struct print_entry *field;
623 trace_assign_type(field, entry);
624 return print_graph_comment(field, s, entry, iter);
625 }
626 default:
627 return TRACE_TYPE_UNHANDLED;
628 }
629}
630
631static void print_graph_headers(struct seq_file *s)
632{
633 /* 1st line */
634 seq_printf(s, "# ");
635 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
636 seq_printf(s, "CPU ");
637 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
638 seq_printf(s, "TASK/PID ");
639 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
640 seq_printf(s, "OVERHEAD/");
641 seq_printf(s, "DURATION FUNCTION CALLS\n");
642
643 /* 2nd line */
644 seq_printf(s, "# ");
645 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
646 seq_printf(s, "| ");
647 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
648 seq_printf(s, "| | ");
649 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
650 seq_printf(s, "| ");
651 seq_printf(s, "| | | | |\n");
652 } else
653 seq_printf(s, " | | | | |\n");
654}
655static struct tracer graph_trace __read_mostly = {
656 .name = "function_graph",
657 .init = graph_trace_init,
658 .reset = graph_trace_reset,
659 .print_line = print_graph_function,
660 .print_header = print_graph_headers,
661 .flags = &tracer_flags,
662};
663
664static __init int init_graph_trace(void)
665{
666 return register_tracer(&graph_trace);
667}
668
669device_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
new file mode 100644
index 000000000000..b6a3e20a49a9
--- /dev/null
+++ b/kernel/trace/trace_hw_branches.c
@@ -0,0 +1,195 @@
1/*
2 * h/w branch tracer for x86 based on bts
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28static void bts_trace_start_cpu(void *arg)
29{
30 if (this_tracer)
31 ds_release_bts(this_tracer);
32
33 this_tracer =
34 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
35 /* ovfl = */ NULL, /* th = */ (size_t)-1,
36 BTS_KERNEL);
37 if (IS_ERR(this_tracer)) {
38 this_tracer = NULL;
39 return;
40 }
41}
42
43static void bts_trace_start(struct trace_array *tr)
44{
45 int cpu;
46
47 tracing_reset_online_cpus(tr);
48
49 for_each_cpu_mask(cpu, cpu_possible_map)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51}
52
53static void bts_trace_stop_cpu(void *arg)
54{
55 if (this_tracer) {
56 ds_release_bts(this_tracer);
57 this_tracer = NULL;
58 }
59}
60
61static void bts_trace_stop(struct trace_array *tr)
62{
63 int cpu;
64
65 for_each_cpu_mask(cpu, cpu_possible_map)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67}
68
69static int bts_trace_init(struct trace_array *tr)
70{
71 tracing_reset_online_cpus(tr);
72 bts_trace_start(tr);
73
74 return 0;
75}
76
77static void bts_trace_print_header(struct seq_file *m)
78{
79 seq_puts(m,
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83}
84
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
86{
87 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it;
90
91 trace_assign_type(it, entry);
92
93 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
96 it->from, it->to) &&
97 (!it->from ||
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;;
102 }
103 return TRACE_TYPE_UNHANDLED;
104}
105
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
107{
108 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry;
110 unsigned long irq;
111
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
113 if (!event)
114 return;
115 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from;
120 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq);
122}
123
124static void trace_bts_at(struct trace_array *tr,
125 const struct bts_trace *trace, void *at)
126{
127 struct bts_struct bts;
128 int err = 0;
129
130 WARN_ON_ONCE(!trace->read);
131 if (!trace->read)
132 return;
133
134 err = trace->read(this_tracer, at, &bts);
135 if (err < 0)
136 return;
137
138 switch (bts.qualifier) {
139 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
141 break;
142 }
143}
144
145static void trace_bts_cpu(void *arg)
146{
147 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace;
149 unsigned char *at;
150
151 if (!this_tracer)
152 return;
153
154 ds_suspend_bts(this_tracer);
155 trace = ds_read_bts(this_tracer);
156 if (!trace)
157 goto out;
158
159 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size)
161 trace_bts_at(tr, trace, at);
162
163 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size)
165 trace_bts_at(tr, trace, at);
166
167out:
168 ds_resume_bts(this_tracer);
169}
170
171static void trace_bts_prepare(struct trace_iterator *iter)
172{
173 int cpu;
174
175 for_each_cpu_mask(cpu, cpu_possible_map)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177}
178
179struct tracer bts_tracer __read_mostly =
180{
181 .name = "hw-branch-tracer",
182 .init = bts_trace_init,
183 .reset = bts_trace_stop,
184 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line,
186 .start = bts_trace_start,
187 .stop = bts_trace_stop,
188 .open = trace_bts_prepare
189};
190
191__init static int init_bts_trace(void)
192{
193 return register_tracer(&bts_tracer);
194}
195device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9c74071c10e0..7c2e326bbc8b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 353}
354#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
355 355
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
356static void start_irqsoff_tracer(struct trace_array *tr) 362static void start_irqsoff_tracer(struct trace_array *tr)
357{ 363{
358 register_ftrace_function(&trace_ops); 364 register_ftrace_function(&trace_ops);
359 tracer_enabled = 1; 365 if (tracing_is_enabled()) {
366 tracer_enabled = 1;
367 save_tracer_enabled = 1;
368 } else {
369 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
360} 372}
361 373
362static void stop_irqsoff_tracer(struct trace_array *tr) 374static void stop_irqsoff_tracer(struct trace_array *tr)
363{ 375{
364 tracer_enabled = 0; 376 tracer_enabled = 0;
377 save_tracer_enabled = 0;
365 unregister_ftrace_function(&trace_ops); 378 unregister_ftrace_function(&trace_ops);
366} 379}
367 380
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
370 irqsoff_trace = tr; 383 irqsoff_trace = tr;
371 /* make sure that the tracer is visible */ 384 /* make sure that the tracer is visible */
372 smp_wmb(); 385 smp_wmb();
373 386 start_irqsoff_tracer(tr);
374 if (tr->ctrl)
375 start_irqsoff_tracer(tr);
376} 387}
377 388
378static void irqsoff_tracer_reset(struct trace_array *tr) 389static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 390{
380 if (tr->ctrl) 391 stop_irqsoff_tracer(tr);
381 stop_irqsoff_tracer(tr);
382} 392}
383 393
384static void irqsoff_tracer_ctrl_update(struct trace_array *tr) 394static void irqsoff_tracer_start(struct trace_array *tr)
385{ 395{
386 if (tr->ctrl) 396 tracer_enabled = 1;
387 start_irqsoff_tracer(tr); 397 save_tracer_enabled = 1;
388 else 398}
389 stop_irqsoff_tracer(tr); 399
400static void irqsoff_tracer_stop(struct trace_array *tr)
401{
402 tracer_enabled = 0;
403 save_tracer_enabled = 0;
390} 404}
391 405
392static void irqsoff_tracer_open(struct trace_iterator *iter) 406static void irqsoff_tracer_open(struct trace_iterator *iter)
393{ 407{
394 /* stop the trace while dumping */ 408 /* stop the trace while dumping */
395 if (iter->tr->ctrl) 409 tracer_enabled = 0;
396 stop_irqsoff_tracer(iter->tr);
397} 410}
398 411
399static void irqsoff_tracer_close(struct trace_iterator *iter) 412static void irqsoff_tracer_close(struct trace_iterator *iter)
400{ 413{
401 if (iter->tr->ctrl) 414 /* restart tracing */
402 start_irqsoff_tracer(iter->tr); 415 tracer_enabled = save_tracer_enabled;
403} 416}
404 417
405#ifdef CONFIG_IRQSOFF_TRACER 418#ifdef CONFIG_IRQSOFF_TRACER
406static void irqsoff_tracer_init(struct trace_array *tr) 419static int irqsoff_tracer_init(struct trace_array *tr)
407{ 420{
408 trace_type = TRACER_IRQS_OFF; 421 trace_type = TRACER_IRQS_OFF;
409 422
410 __irqsoff_tracer_init(tr); 423 __irqsoff_tracer_init(tr);
424 return 0;
411} 425}
412static struct tracer irqsoff_tracer __read_mostly = 426static struct tracer irqsoff_tracer __read_mostly =
413{ 427{
414 .name = "irqsoff", 428 .name = "irqsoff",
415 .init = irqsoff_tracer_init, 429 .init = irqsoff_tracer_init,
416 .reset = irqsoff_tracer_reset, 430 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop,
417 .open = irqsoff_tracer_open, 433 .open = irqsoff_tracer_open,
418 .close = irqsoff_tracer_close, 434 .close = irqsoff_tracer_close,
419 .ctrl_update = irqsoff_tracer_ctrl_update,
420 .print_max = 1, 435 .print_max = 1,
421#ifdef CONFIG_FTRACE_SELFTEST 436#ifdef CONFIG_FTRACE_SELFTEST
422 .selftest = trace_selftest_startup_irqsoff, 437 .selftest = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
428#endif 443#endif
429 444
430#ifdef CONFIG_PREEMPT_TRACER 445#ifdef CONFIG_PREEMPT_TRACER
431static void preemptoff_tracer_init(struct trace_array *tr) 446static int preemptoff_tracer_init(struct trace_array *tr)
432{ 447{
433 trace_type = TRACER_PREEMPT_OFF; 448 trace_type = TRACER_PREEMPT_OFF;
434 449
435 __irqsoff_tracer_init(tr); 450 __irqsoff_tracer_init(tr);
451 return 0;
436} 452}
437 453
438static struct tracer preemptoff_tracer __read_mostly = 454static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
440 .name = "preemptoff", 456 .name = "preemptoff",
441 .init = preemptoff_tracer_init, 457 .init = preemptoff_tracer_init,
442 .reset = irqsoff_tracer_reset, 458 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop,
443 .open = irqsoff_tracer_open, 461 .open = irqsoff_tracer_open,
444 .close = irqsoff_tracer_close, 462 .close = irqsoff_tracer_close,
445 .ctrl_update = irqsoff_tracer_ctrl_update,
446 .print_max = 1, 463 .print_max = 1,
447#ifdef CONFIG_FTRACE_SELFTEST 464#ifdef CONFIG_FTRACE_SELFTEST
448 .selftest = trace_selftest_startup_preemptoff, 465 .selftest = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
456#if defined(CONFIG_IRQSOFF_TRACER) && \ 473#if defined(CONFIG_IRQSOFF_TRACER) && \
457 defined(CONFIG_PREEMPT_TRACER) 474 defined(CONFIG_PREEMPT_TRACER)
458 475
459static void preemptirqsoff_tracer_init(struct trace_array *tr) 476static int preemptirqsoff_tracer_init(struct trace_array *tr)
460{ 477{
461 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 478 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
462 479
463 __irqsoff_tracer_init(tr); 480 __irqsoff_tracer_init(tr);
481 return 0;
464} 482}
465 483
466static struct tracer preemptirqsoff_tracer __read_mostly = 484static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
468 .name = "preemptirqsoff", 486 .name = "preemptirqsoff",
469 .init = preemptirqsoff_tracer_init, 487 .init = preemptirqsoff_tracer_init,
470 .reset = irqsoff_tracer_reset, 488 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop,
471 .open = irqsoff_tracer_open, 491 .open = irqsoff_tracer_open,
472 .close = irqsoff_tracer_close, 492 .close = irqsoff_tracer_close,
473 .ctrl_update = irqsoff_tracer_ctrl_update,
474 .print_max = 1, 493 .print_max = 1,
475#ifdef CONFIG_FTRACE_SELFTEST 494#ifdef CONFIG_FTRACE_SELFTEST
476 .selftest = trace_selftest_startup_preemptirqsoff, 495 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index e62cbf78eab6..fffcb069f1dc 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -22,44 +22,35 @@ static unsigned long prev_overruns;
22 22
23static void mmio_reset_data(struct trace_array *tr) 23static void mmio_reset_data(struct trace_array *tr)
24{ 24{
25 int cpu;
26
27 overrun_detected = false; 25 overrun_detected = false;
28 prev_overruns = 0; 26 prev_overruns = 0;
29 tr->time_start = ftrace_now(tr->cpu);
30 27
31 for_each_online_cpu(cpu) 28 tracing_reset_online_cpus(tr);
32 tracing_reset(tr, cpu);
33} 29}
34 30
35static void mmio_trace_init(struct trace_array *tr) 31static int mmio_trace_init(struct trace_array *tr)
36{ 32{
37 pr_debug("in %s\n", __func__); 33 pr_debug("in %s\n", __func__);
38 mmio_trace_array = tr; 34 mmio_trace_array = tr;
39 if (tr->ctrl) { 35
40 mmio_reset_data(tr); 36 mmio_reset_data(tr);
41 enable_mmiotrace(); 37 enable_mmiotrace();
42 } 38 return 0;
43} 39}
44 40
45static void mmio_trace_reset(struct trace_array *tr) 41static void mmio_trace_reset(struct trace_array *tr)
46{ 42{
47 pr_debug("in %s\n", __func__); 43 pr_debug("in %s\n", __func__);
48 if (tr->ctrl) 44
49 disable_mmiotrace(); 45 disable_mmiotrace();
50 mmio_reset_data(tr); 46 mmio_reset_data(tr);
51 mmio_trace_array = NULL; 47 mmio_trace_array = NULL;
52} 48}
53 49
54static void mmio_trace_ctrl_update(struct trace_array *tr) 50static void mmio_trace_start(struct trace_array *tr)
55{ 51{
56 pr_debug("in %s\n", __func__); 52 pr_debug("in %s\n", __func__);
57 if (tr->ctrl) { 53 mmio_reset_data(tr);
58 mmio_reset_data(tr);
59 enable_mmiotrace();
60 } else {
61 disable_mmiotrace();
62 }
63} 54}
64 55
65static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 56static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -296,10 +287,10 @@ static struct tracer mmio_tracer __read_mostly =
296 .name = "mmiotrace", 287 .name = "mmiotrace",
297 .init = mmio_trace_init, 288 .init = mmio_trace_init,
298 .reset = mmio_trace_reset, 289 .reset = mmio_trace_reset,
290 .start = mmio_trace_start,
299 .pipe_open = mmio_pipe_open, 291 .pipe_open = mmio_pipe_open,
300 .close = mmio_close, 292 .close = mmio_close,
301 .read = mmio_read, 293 .read = mmio_read,
302 .ctrl_update = mmio_trace_ctrl_update,
303 .print_line = mmio_print_line, 294 .print_line = mmio_print_line,
304}; 295};
305 296
@@ -371,5 +362,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
371 362
372int mmio_trace_printk(const char *fmt, va_list args) 363int mmio_trace_printk(const char *fmt, va_list args)
373{ 364{
374 return trace_vprintk(0, fmt, args); 365 return trace_vprintk(0, -1, fmt, args);
375} 366}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b4862515..b9767acd30ac 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15/* Our two options */
16enum {
17 TRACE_NOP_OPT_ACCEPT = 0x1,
18 TRACE_NOP_OPT_REFUSE = 0x2
19};
20
21/* Options for the tracer (see trace_options file) */
22static struct tracer_opt nop_opts[] = {
23 /* Option that will be accepted by set_flag callback */
24 { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
25 /* Option that will be refused by set_flag callback */
26 { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
27 { } /* Always set a last empty entry */
28};
29
30static struct tracer_flags nop_flags = {
31 /* You can check your flags value here when you want. */
32 .val = 0, /* By default: all flags disabled */
33 .opts = nop_opts
34};
35
15static struct trace_array *ctx_trace; 36static struct trace_array *ctx_trace;
16 37
17static void start_nop_trace(struct trace_array *tr) 38static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
24 /* Nothing to do! */ 45 /* Nothing to do! */
25} 46}
26 47
27static void nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
28{ 49{
29 int cpu; 50 int cpu;
30 ctx_trace = tr; 51 ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
32 for_each_online_cpu(cpu) 53 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu); 54 tracing_reset(tr, cpu);
34 55
35 if (tr->ctrl) 56 start_nop_trace(tr);
36 start_nop_trace(tr); 57 return 0;
37} 58}
38 59
39static void nop_trace_reset(struct trace_array *tr) 60static void nop_trace_reset(struct trace_array *tr)
40{ 61{
41 if (tr->ctrl) 62 stop_nop_trace(tr);
42 stop_nop_trace(tr);
43} 63}
44 64
45static void nop_trace_ctrl_update(struct trace_array *tr) 65/* It only serves as a signal handler and a callback to
66 * accept or refuse tthe setting of a flag.
67 * If you don't implement it, then the flag setting will be
68 * automatically accepted.
69 */
70static int nop_set_flag(u32 old_flags, u32 bit, int set)
46{ 71{
47 /* When starting a new trace, reset the buffers */ 72 /*
48 if (tr->ctrl) 73 * Note that you don't need to update nop_flags.val yourself.
49 start_nop_trace(tr); 74 * The tracing Api will do it automatically if you return 0
50 else 75 */
51 stop_nop_trace(tr); 76 if (bit == TRACE_NOP_OPT_ACCEPT) {
77 printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
78 " Now cat trace_options to see the result\n",
79 set);
80 return 0;
81 }
82
83 if (bit == TRACE_NOP_OPT_REFUSE) {
84 printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
85 "Now cat trace_options to see the result\n",
86 set);
87 return -EINVAL;
88 }
89
90 return 0;
52} 91}
53 92
93
54struct tracer nop_trace __read_mostly = 94struct tracer nop_trace __read_mostly =
55{ 95{
56 .name = "nop", 96 .name = "nop",
57 .init = nop_trace_init, 97 .init = nop_trace_init,
58 .reset = nop_trace_reset, 98 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST 99#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop, 100 .selftest = trace_selftest_startup_nop,
62#endif 101#endif
102 .flags = &nop_flags,
103 .set_flag = nop_set_flag
63}; 104};
64 105
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 000000000000..a7172a352f62
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <linux/ftrace.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19
20static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled;
22
23
24static void start_power_trace(struct trace_array *tr)
25{
26 trace_power_enabled = 1;
27}
28
29static void stop_power_trace(struct trace_array *tr)
30{
31 trace_power_enabled = 0;
32}
33
34
35static int power_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 power_trace = tr;
39
40 trace_power_enabled = 1;
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu);
44 return 0;
45}
46
47static enum print_line_t power_print_line(struct trace_iterator *iter)
48{
49 int ret = 0;
50 struct trace_entry *entry = iter->ent;
51 struct trace_power *field ;
52 struct power_trace *it;
53 struct trace_seq *s = &iter->seq;
54 struct timespec stamp;
55 struct timespec duration;
56
57 trace_assign_type(field, entry);
58 it = &field->state_data;
59 stamp = ktime_to_timespec(it->stamp);
60 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
61
62 if (entry->type == TRACE_POWER) {
63 if (it->type == POWER_CSTATE)
64 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
65 stamp.tv_sec,
66 stamp.tv_nsec,
67 it->state, iter->cpu,
68 duration.tv_sec,
69 duration.tv_nsec);
70 if (it->type == POWER_PSTATE)
71 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
72 stamp.tv_sec,
73 stamp.tv_nsec,
74 it->state, iter->cpu);
75 if (!ret)
76 return TRACE_TYPE_PARTIAL_LINE;
77 return TRACE_TYPE_HANDLED;
78 }
79 return TRACE_TYPE_UNHANDLED;
80}
81
82static struct tracer power_tracer __read_mostly =
83{
84 .name = "power",
85 .init = power_trace_init,
86 .start = start_power_trace,
87 .stop = stop_power_trace,
88 .reset = stop_power_trace,
89 .print_line = power_print_line,
90};
91
92static int init_power_trace(void)
93{
94 return register_tracer(&power_tracer);
95}
96device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a62..df175cb4564f 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@
16 16
17static struct trace_array *ctx_trace; 17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex);
20 21
21static void 22static void
22probe_sched_switch(struct rq *__rq, struct task_struct *prev, 23probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
27 int cpu; 28 int cpu;
28 int pc; 29 int pc;
29 30
30 if (!atomic_read(&sched_ref)) 31 if (!sched_ref)
31 return; 32 return;
32 33
33 tracing_record_cmdline(prev); 34 tracing_record_cmdline(prev);
@@ -48,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
48} 49}
49 50
50static void 51static void
51probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) 52probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
52{ 53{
53 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
54 unsigned long flags; 55 unsigned long flags;
@@ -71,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
71 local_irq_restore(flags); 72 local_irq_restore(flags);
72} 73}
73 74
74static void sched_switch_reset(struct trace_array *tr)
75{
76 int cpu;
77
78 tr->time_start = ftrace_now(tr->cpu);
79
80 for_each_online_cpu(cpu)
81 tracing_reset(tr, cpu);
82}
83
84static int tracing_sched_register(void) 75static int tracing_sched_register(void)
85{ 76{
86 int ret; 77 int ret;
@@ -123,20 +114,18 @@ static void tracing_sched_unregister(void)
123 114
124static void tracing_start_sched_switch(void) 115static void tracing_start_sched_switch(void)
125{ 116{
126 long ref; 117 mutex_lock(&sched_register_mutex);
127 118 if (!(sched_ref++))
128 ref = atomic_inc_return(&sched_ref);
129 if (ref == 1)
130 tracing_sched_register(); 119 tracing_sched_register();
120 mutex_unlock(&sched_register_mutex);
131} 121}
132 122
133static void tracing_stop_sched_switch(void) 123static void tracing_stop_sched_switch(void)
134{ 124{
135 long ref; 125 mutex_lock(&sched_register_mutex);
136 126 if (!(--sched_ref))
137 ref = atomic_dec_and_test(&sched_ref);
138 if (ref)
139 tracing_sched_unregister(); 127 tracing_sched_unregister();
128 mutex_unlock(&sched_register_mutex);
140} 129}
141 130
142void tracing_start_cmdline_record(void) 131void tracing_start_cmdline_record(void)
@@ -149,40 +138,86 @@ void tracing_stop_cmdline_record(void)
149 tracing_stop_sched_switch(); 138 tracing_stop_sched_switch();
150} 139}
151 140
141/**
142 * tracing_start_sched_switch_record - start tracing context switches
143 *
144 * Turns on context switch tracing for a tracer.
145 */
146void tracing_start_sched_switch_record(void)
147{
148 if (unlikely(!ctx_trace)) {
149 WARN_ON(1);
150 return;
151 }
152
153 tracing_start_sched_switch();
154
155 mutex_lock(&sched_register_mutex);
156 tracer_enabled++;
157 mutex_unlock(&sched_register_mutex);
158}
159
160/**
161 * tracing_stop_sched_switch_record - start tracing context switches
162 *
163 * Turns off context switch tracing for a tracer.
164 */
165void tracing_stop_sched_switch_record(void)
166{
167 mutex_lock(&sched_register_mutex);
168 tracer_enabled--;
169 WARN_ON(tracer_enabled < 0);
170 mutex_unlock(&sched_register_mutex);
171
172 tracing_stop_sched_switch();
173}
174
175/**
176 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
177 * @tr: trace array pointer to assign
178 *
179 * Some tracers might want to record the context switches in their
180 * trace. This function lets those tracers assign the trace array
181 * to use.
182 */
183void tracing_sched_switch_assign_trace(struct trace_array *tr)
184{
185 ctx_trace = tr;
186}
187
152static void start_sched_trace(struct trace_array *tr) 188static void start_sched_trace(struct trace_array *tr)
153{ 189{
154 sched_switch_reset(tr); 190 tracing_reset_online_cpus(tr);
155 tracing_start_cmdline_record(); 191 tracing_start_sched_switch_record();
156 tracer_enabled = 1;
157} 192}
158 193
159static void stop_sched_trace(struct trace_array *tr) 194static void stop_sched_trace(struct trace_array *tr)
160{ 195{
161 tracer_enabled = 0; 196 tracing_stop_sched_switch_record();
162 tracing_stop_cmdline_record();
163} 197}
164 198
165static void sched_switch_trace_init(struct trace_array *tr) 199static int sched_switch_trace_init(struct trace_array *tr)
166{ 200{
167 ctx_trace = tr; 201 ctx_trace = tr;
168 202 start_sched_trace(tr);
169 if (tr->ctrl) 203 return 0;
170 start_sched_trace(tr);
171} 204}
172 205
173static void sched_switch_trace_reset(struct trace_array *tr) 206static void sched_switch_trace_reset(struct trace_array *tr)
174{ 207{
175 if (tr->ctrl) 208 if (sched_ref)
176 stop_sched_trace(tr); 209 stop_sched_trace(tr);
177} 210}
178 211
179static void sched_switch_trace_ctrl_update(struct trace_array *tr) 212static void sched_switch_trace_start(struct trace_array *tr)
180{ 213{
181 /* When starting a new trace, reset the buffers */ 214 tracing_reset_online_cpus(tr);
182 if (tr->ctrl) 215 tracing_start_sched_switch();
183 start_sched_trace(tr); 216}
184 else 217
185 stop_sched_trace(tr); 218static void sched_switch_trace_stop(struct trace_array *tr)
219{
220 tracing_stop_sched_switch();
186} 221}
187 222
188static struct tracer sched_switch_trace __read_mostly = 223static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +225,8 @@ static struct tracer sched_switch_trace __read_mostly =
190 .name = "sched_switch", 225 .name = "sched_switch",
191 .init = sched_switch_trace_init, 226 .init = sched_switch_trace_init,
192 .reset = sched_switch_trace_reset, 227 .reset = sched_switch_trace_reset,
193 .ctrl_update = sched_switch_trace_ctrl_update, 228 .start = sched_switch_trace_start,
229 .stop = sched_switch_trace_stop,
194#ifdef CONFIG_FTRACE_SELFTEST 230#ifdef CONFIG_FTRACE_SELFTEST
195 .selftest = trace_selftest_startup_sched_switch, 231 .selftest = trace_selftest_startup_sched_switch,
196#endif 232#endif
@@ -198,14 +234,7 @@ static struct tracer sched_switch_trace __read_mostly =
198 234
199__init static int init_sched_switch_trace(void) 235__init static int init_sched_switch_trace(void)
200{ 236{
201 int ret = 0;
202
203 if (atomic_read(&sched_ref))
204 ret = tracing_sched_register();
205 if (ret) {
206 pr_info("error registering scheduler trace\n");
207 return ret;
208 }
209 return register_tracer(&sched_switch_trace); 237 return register_tracer(&sched_switch_trace);
210} 238}
211device_initcall(init_sched_switch_trace); 239device_initcall(init_sched_switch_trace);
240
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3ae93f16b565..43586b689e31 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
50 return; 50 return;
51 51
52 pc = preempt_count(); 52 pc = preempt_count();
53 resched = need_resched(); 53 resched = ftrace_preempt_disable();
54 preempt_disable_notrace();
55 54
56 cpu = raw_smp_processor_id(); 55 cpu = raw_smp_processor_id();
57 data = tr->data[cpu]; 56 data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
81 out: 80 out:
82 atomic_dec(&data->disabled); 81 atomic_dec(&data->disabled);
83 82
84 /* 83 ftrace_preempt_enable(resched);
85 * To prevent recursion from the scheduler, if the
86 * resched flag was set before we entered, then
87 * don't reschedule.
88 */
89 if (resched)
90 preempt_enable_no_resched_notrace();
91 else
92 preempt_enable_notrace();
93} 84}
94 85
95static struct ftrace_ops trace_ops __read_mostly = 86static struct ftrace_ops trace_ops __read_mostly =
@@ -220,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr)
220} 211}
221 212
222static void 213static void
223probe_wakeup(struct rq *rq, struct task_struct *p) 214probe_wakeup(struct rq *rq, struct task_struct *p, int success)
224{ 215{
225 int cpu = smp_processor_id(); 216 int cpu = smp_processor_id();
226 unsigned long flags; 217 unsigned long flags;
@@ -271,6 +262,12 @@ out:
271 atomic_dec(&wakeup_trace->data[cpu]->disabled); 262 atomic_dec(&wakeup_trace->data[cpu]->disabled);
272} 263}
273 264
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
274static void start_wakeup_tracer(struct trace_array *tr) 271static void start_wakeup_tracer(struct trace_array *tr)
275{ 272{
276 int ret; 273 int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
309 306
310 register_ftrace_function(&trace_ops); 307 register_ftrace_function(&trace_ops);
311 308
312 tracer_enabled = 1; 309 if (tracing_is_enabled()) {
310 tracer_enabled = 1;
311 save_tracer_enabled = 1;
312 } else {
313 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
313 316
314 return; 317 return;
315fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
322{ 325{
323 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 328 unregister_ftrace_function(&trace_ops);
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 329 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 330 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 331 unregister_trace_sched_wakeup(probe_wakeup);
328} 332}
329 333
330static void wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
331{ 335{
332 wakeup_trace = tr; 336 wakeup_trace = tr;
333 337 start_wakeup_tracer(tr);
334 if (tr->ctrl) 338 return 0;
335 start_wakeup_tracer(tr);
336} 339}
337 340
338static void wakeup_tracer_reset(struct trace_array *tr) 341static void wakeup_tracer_reset(struct trace_array *tr)
339{ 342{
340 if (tr->ctrl) { 343 stop_wakeup_tracer(tr);
341 stop_wakeup_tracer(tr); 344 /* make sure we put back any tasks we are tracing */
342 /* make sure we put back any tasks we are tracing */ 345 wakeup_reset(tr);
343 wakeup_reset(tr); 346}
344 } 347
348static void wakeup_tracer_start(struct trace_array *tr)
349{
350 wakeup_reset(tr);
351 tracer_enabled = 1;
352 save_tracer_enabled = 1;
345} 353}
346 354
347static void wakeup_tracer_ctrl_update(struct trace_array *tr) 355static void wakeup_tracer_stop(struct trace_array *tr)
348{ 356{
349 if (tr->ctrl) 357 tracer_enabled = 0;
350 start_wakeup_tracer(tr); 358 save_tracer_enabled = 0;
351 else
352 stop_wakeup_tracer(tr);
353} 359}
354 360
355static void wakeup_tracer_open(struct trace_iterator *iter) 361static void wakeup_tracer_open(struct trace_iterator *iter)
356{ 362{
357 /* stop the trace while dumping */ 363 /* stop the trace while dumping */
358 if (iter->tr->ctrl) 364 tracer_enabled = 0;
359 stop_wakeup_tracer(iter->tr);
360} 365}
361 366
362static void wakeup_tracer_close(struct trace_iterator *iter) 367static void wakeup_tracer_close(struct trace_iterator *iter)
363{ 368{
364 /* forget about any processes we were recording */ 369 /* forget about any processes we were recording */
365 if (iter->tr->ctrl) 370 if (save_tracer_enabled) {
366 start_wakeup_tracer(iter->tr); 371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
367} 374}
368 375
369static struct tracer wakeup_tracer __read_mostly = 376static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
371 .name = "wakeup", 378 .name = "wakeup",
372 .init = wakeup_tracer_init, 379 .init = wakeup_tracer_init,
373 .reset = wakeup_tracer_reset, 380 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop,
374 .open = wakeup_tracer_open, 383 .open = wakeup_tracer_open,
375 .close = wakeup_tracer_close, 384 .close = wakeup_tracer_close,
376 .ctrl_update = wakeup_tracer_ctrl_update,
377 .print_max = 1, 385 .print_max = 1,
378#ifdef CONFIG_FTRACE_SELFTEST 386#ifdef CONFIG_FTRACE_SELFTEST
379 .selftest = trace_selftest_startup_wakeup, 387 .selftest = trace_selftest_startup_wakeup,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 90bc752a7580..88c8eb70f54a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH:
16 return 1; 17 return 1;
17 } 18 }
18 return 0; 19 return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
51 int cpu, ret = 0; 52 int cpu, ret = 0;
52 53
53 /* Don't allow flipping of max traces now */ 54 /* Don't allow flipping of max traces now */
54 raw_local_irq_save(flags); 55 local_irq_save(flags);
55 __raw_spin_lock(&ftrace_max_lock); 56 __raw_spin_lock(&ftrace_max_lock);
56 57
57 cnt = ring_buffer_entries(tr->buffer); 58 cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
62 break; 63 break;
63 } 64 }
64 __raw_spin_unlock(&ftrace_max_lock); 65 __raw_spin_unlock(&ftrace_max_lock);
65 raw_local_irq_restore(flags); 66 local_irq_restore(flags);
66 67
67 if (count) 68 if (count)
68 *count = cnt; 69 *count = cnt;
@@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
70 return ret; 71 return ret;
71} 72}
72 73
74static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
75{
76 printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
77 trace->name, init_ret);
78}
73#ifdef CONFIG_FUNCTION_TRACER 79#ifdef CONFIG_FUNCTION_TRACER
74 80
75#ifdef CONFIG_DYNAMIC_FTRACE 81#ifdef CONFIG_DYNAMIC_FTRACE
@@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
110 ftrace_set_filter(func_name, strlen(func_name), 1); 116 ftrace_set_filter(func_name, strlen(func_name), 1);
111 117
112 /* enable tracing */ 118 /* enable tracing */
113 tr->ctrl = 1; 119 ret = trace->init(tr);
114 trace->init(tr); 120 if (ret) {
121 warn_failed_init_tracer(trace, ret);
122 goto out;
123 }
115 124
116 /* Sleep for a 1/10 of a second */ 125 /* Sleep for a 1/10 of a second */
117 msleep(100); 126 msleep(100);
@@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
134 msleep(100); 143 msleep(100);
135 144
136 /* stop the tracing. */ 145 /* stop the tracing. */
137 tr->ctrl = 0; 146 tracing_stop();
138 trace->ctrl_update(tr);
139 ftrace_enabled = 0; 147 ftrace_enabled = 0;
140 148
141 /* check the trace buffer */ 149 /* check the trace buffer */
142 ret = trace_test_buffer(tr, &count); 150 ret = trace_test_buffer(tr, &count);
143 trace->reset(tr); 151 trace->reset(tr);
152 tracing_start();
144 153
145 /* we should only have one item */ 154 /* we should only have one item */
146 if (!ret && count != 1) { 155 if (!ret && count != 1) {
@@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
148 ret = -1; 157 ret = -1;
149 goto out; 158 goto out;
150 } 159 }
160
151 out: 161 out:
152 ftrace_enabled = save_ftrace_enabled; 162 ftrace_enabled = save_ftrace_enabled;
153 tracer_enabled = save_tracer_enabled; 163 tracer_enabled = save_tracer_enabled;
@@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
180 ftrace_enabled = 1; 190 ftrace_enabled = 1;
181 tracer_enabled = 1; 191 tracer_enabled = 1;
182 192
183 tr->ctrl = 1; 193 ret = trace->init(tr);
184 trace->init(tr); 194 if (ret) {
195 warn_failed_init_tracer(trace, ret);
196 goto out;
197 }
198
185 /* Sleep for a 1/10 of a second */ 199 /* Sleep for a 1/10 of a second */
186 msleep(100); 200 msleep(100);
187 /* stop the tracing. */ 201 /* stop the tracing. */
188 tr->ctrl = 0; 202 tracing_stop();
189 trace->ctrl_update(tr);
190 ftrace_enabled = 0; 203 ftrace_enabled = 0;
191 204
192 /* check the trace buffer */ 205 /* check the trace buffer */
193 ret = trace_test_buffer(tr, &count); 206 ret = trace_test_buffer(tr, &count);
194 trace->reset(tr); 207 trace->reset(tr);
208 tracing_start();
195 209
196 if (!ret && !count) { 210 if (!ret && !count) {
197 printk(KERN_CONT ".. no entries found .."); 211 printk(KERN_CONT ".. no entries found ..");
@@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
223 int ret; 237 int ret;
224 238
225 /* start the tracing */ 239 /* start the tracing */
226 tr->ctrl = 1; 240 ret = trace->init(tr);
227 trace->init(tr); 241 if (ret) {
242 warn_failed_init_tracer(trace, ret);
243 return ret;
244 }
245
228 /* reset the max latency */ 246 /* reset the max latency */
229 tracing_max_latency = 0; 247 tracing_max_latency = 0;
230 /* disable interrupts for a bit */ 248 /* disable interrupts for a bit */
@@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
232 udelay(100); 250 udelay(100);
233 local_irq_enable(); 251 local_irq_enable();
234 /* stop the tracing. */ 252 /* stop the tracing. */
235 tr->ctrl = 0; 253 tracing_stop();
236 trace->ctrl_update(tr);
237 /* check both trace buffers */ 254 /* check both trace buffers */
238 ret = trace_test_buffer(tr, NULL); 255 ret = trace_test_buffer(tr, NULL);
239 if (!ret) 256 if (!ret)
240 ret = trace_test_buffer(&max_tr, &count); 257 ret = trace_test_buffer(&max_tr, &count);
241 trace->reset(tr); 258 trace->reset(tr);
259 tracing_start();
242 260
243 if (!ret && !count) { 261 if (!ret && !count) {
244 printk(KERN_CONT ".. no entries found .."); 262 printk(KERN_CONT ".. no entries found ..");
@@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
259 unsigned long count; 277 unsigned long count;
260 int ret; 278 int ret;
261 279
280 /*
281 * Now that the big kernel lock is no longer preemptable,
282 * and this is called with the BKL held, it will always
283 * fail. If preemption is already disabled, simply
284 * pass the test. When the BKL is removed, or becomes
285 * preemptible again, we will once again test this,
286 * so keep it in.
287 */
288 if (preempt_count()) {
289 printk(KERN_CONT "can not test ... force ");
290 return 0;
291 }
292
262 /* start the tracing */ 293 /* start the tracing */
263 tr->ctrl = 1; 294 ret = trace->init(tr);
264 trace->init(tr); 295 if (ret) {
296 warn_failed_init_tracer(trace, ret);
297 return ret;
298 }
299
265 /* reset the max latency */ 300 /* reset the max latency */
266 tracing_max_latency = 0; 301 tracing_max_latency = 0;
267 /* disable preemption for a bit */ 302 /* disable preemption for a bit */
@@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
269 udelay(100); 304 udelay(100);
270 preempt_enable(); 305 preempt_enable();
271 /* stop the tracing. */ 306 /* stop the tracing. */
272 tr->ctrl = 0; 307 tracing_stop();
273 trace->ctrl_update(tr);
274 /* check both trace buffers */ 308 /* check both trace buffers */
275 ret = trace_test_buffer(tr, NULL); 309 ret = trace_test_buffer(tr, NULL);
276 if (!ret) 310 if (!ret)
277 ret = trace_test_buffer(&max_tr, &count); 311 ret = trace_test_buffer(&max_tr, &count);
278 trace->reset(tr); 312 trace->reset(tr);
313 tracing_start();
279 314
280 if (!ret && !count) { 315 if (!ret && !count) {
281 printk(KERN_CONT ".. no entries found .."); 316 printk(KERN_CONT ".. no entries found ..");
@@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
296 unsigned long count; 331 unsigned long count;
297 int ret; 332 int ret;
298 333
334 /*
335 * Now that the big kernel lock is no longer preemptable,
336 * and this is called with the BKL held, it will always
337 * fail. If preemption is already disabled, simply
338 * pass the test. When the BKL is removed, or becomes
339 * preemptible again, we will once again test this,
340 * so keep it in.
341 */
342 if (preempt_count()) {
343 printk(KERN_CONT "can not test ... force ");
344 return 0;
345 }
346
299 /* start the tracing */ 347 /* start the tracing */
300 tr->ctrl = 1; 348 ret = trace->init(tr);
301 trace->init(tr); 349 if (ret) {
350 warn_failed_init_tracer(trace, ret);
351 goto out;
352 }
302 353
303 /* reset the max latency */ 354 /* reset the max latency */
304 tracing_max_latency = 0; 355 tracing_max_latency = 0;
@@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
312 local_irq_enable(); 363 local_irq_enable();
313 364
314 /* stop the tracing. */ 365 /* stop the tracing. */
315 tr->ctrl = 0; 366 tracing_stop();
316 trace->ctrl_update(tr);
317 /* check both trace buffers */ 367 /* check both trace buffers */
318 ret = trace_test_buffer(tr, NULL); 368 ret = trace_test_buffer(tr, NULL);
319 if (ret) 369 if (ret) {
370 tracing_start();
320 goto out; 371 goto out;
372 }
321 373
322 ret = trace_test_buffer(&max_tr, &count); 374 ret = trace_test_buffer(&max_tr, &count);
323 if (ret) 375 if (ret) {
376 tracing_start();
324 goto out; 377 goto out;
378 }
325 379
326 if (!ret && !count) { 380 if (!ret && !count) {
327 printk(KERN_CONT ".. no entries found .."); 381 printk(KERN_CONT ".. no entries found ..");
328 ret = -1; 382 ret = -1;
383 tracing_start();
329 goto out; 384 goto out;
330 } 385 }
331 386
332 /* do the test by disabling interrupts first this time */ 387 /* do the test by disabling interrupts first this time */
333 tracing_max_latency = 0; 388 tracing_max_latency = 0;
334 tr->ctrl = 1; 389 tracing_start();
335 trace->ctrl_update(tr);
336 preempt_disable(); 390 preempt_disable();
337 local_irq_disable(); 391 local_irq_disable();
338 udelay(100); 392 udelay(100);
@@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
341 local_irq_enable(); 395 local_irq_enable();
342 396
343 /* stop the tracing. */ 397 /* stop the tracing. */
344 tr->ctrl = 0; 398 tracing_stop();
345 trace->ctrl_update(tr);
346 /* check both trace buffers */ 399 /* check both trace buffers */
347 ret = trace_test_buffer(tr, NULL); 400 ret = trace_test_buffer(tr, NULL);
348 if (ret) 401 if (ret)
@@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
358 411
359 out: 412 out:
360 trace->reset(tr); 413 trace->reset(tr);
414 tracing_start();
361 tracing_max_latency = save_max; 415 tracing_max_latency = save_max;
362 416
363 return ret; 417 return ret;
@@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
423 wait_for_completion(&isrt); 477 wait_for_completion(&isrt);
424 478
425 /* start the tracing */ 479 /* start the tracing */
426 tr->ctrl = 1; 480 ret = trace->init(tr);
427 trace->init(tr); 481 if (ret) {
482 warn_failed_init_tracer(trace, ret);
483 return ret;
484 }
485
428 /* reset the max latency */ 486 /* reset the max latency */
429 tracing_max_latency = 0; 487 tracing_max_latency = 0;
430 488
@@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448 msleep(100); 506 msleep(100);
449 507
450 /* stop the tracing. */ 508 /* stop the tracing. */
451 tr->ctrl = 0; 509 tracing_stop();
452 trace->ctrl_update(tr);
453 /* check both trace buffers */ 510 /* check both trace buffers */
454 ret = trace_test_buffer(tr, NULL); 511 ret = trace_test_buffer(tr, NULL);
455 if (!ret) 512 if (!ret)
@@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
457 514
458 515
459 trace->reset(tr); 516 trace->reset(tr);
517 tracing_start();
460 518
461 tracing_max_latency = save_max; 519 tracing_max_latency = save_max;
462 520
@@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
480 int ret; 538 int ret;
481 539
482 /* start the tracing */ 540 /* start the tracing */
483 tr->ctrl = 1; 541 ret = trace->init(tr);
484 trace->init(tr); 542 if (ret) {
543 warn_failed_init_tracer(trace, ret);
544 return ret;
545 }
546
485 /* Sleep for a 1/10 of a second */ 547 /* Sleep for a 1/10 of a second */
486 msleep(100); 548 msleep(100);
487 /* stop the tracing. */ 549 /* stop the tracing. */
488 tr->ctrl = 0; 550 tracing_stop();
489 trace->ctrl_update(tr);
490 /* check the trace buffer */ 551 /* check the trace buffer */
491 ret = trace_test_buffer(tr, &count); 552 ret = trace_test_buffer(tr, &count);
492 trace->reset(tr); 553 trace->reset(tr);
554 tracing_start();
493 555
494 if (!ret && !count) { 556 if (!ret && !count) {
495 printk(KERN_CONT ".. no entries found .."); 557 printk(KERN_CONT ".. no entries found ..");
@@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
508 int ret; 570 int ret;
509 571
510 /* start the tracing */ 572 /* start the tracing */
511 tr->ctrl = 1; 573 ret = trace->init(tr);
512 trace->init(tr); 574 if (ret) {
575 warn_failed_init_tracer(trace, ret);
576 return 0;
577 }
578
513 /* Sleep for a 1/10 of a second */ 579 /* Sleep for a 1/10 of a second */
514 msleep(100); 580 msleep(100);
515 /* stop the tracing. */ 581 /* stop the tracing. */
516 tr->ctrl = 0; 582 tracing_stop();
517 trace->ctrl_update(tr);
518 /* check the trace buffer */ 583 /* check the trace buffer */
519 ret = trace_test_buffer(tr, &count); 584 ret = trace_test_buffer(tr, &count);
520 trace->reset(tr); 585 trace->reset(tr);
586 tracing_start();
521 587
522 return ret; 588 return ret;
523} 589}
524#endif /* CONFIG_SYSPROF_TRACER */ 590#endif /* CONFIG_SYSPROF_TRACER */
591
592#ifdef CONFIG_BRANCH_TRACER
593int
594trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
595{
596 unsigned long count;
597 int ret;
598
599 /* start the tracing */
600 ret = trace->init(tr);
601 if (ret) {
602 warn_failed_init_tracer(trace, ret);
603 return ret;
604 }
605
606 /* Sleep for a 1/10 of a second */
607 msleep(100);
608 /* stop the tracing. */
609 tracing_stop();
610 /* check the trace buffer */
611 ret = trace_test_buffer(tr, &count);
612 trace->reset(tr);
613 tracing_start();
614
615 return ret;
616}
617#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3bdb44bde4b7..d0871bc0aca5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/sysctl.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/fs.h> 15#include <linux/fs.h>
15#include "trace.h" 16#include "trace.h"
@@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock =
31 32
32static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
33static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
35static DEFINE_MUTEX(stack_sysctl_mutex);
36
37int stack_tracer_enabled;
38static int last_stack_tracer_enabled;
34 39
35static inline void check_stack(void) 40static inline void check_stack(void)
36{ 41{
@@ -48,7 +53,7 @@ static inline void check_stack(void)
48 if (!object_is_on_stack(&this_size)) 53 if (!object_is_on_stack(&this_size))
49 return; 54 return;
50 55
51 raw_local_irq_save(flags); 56 local_irq_save(flags);
52 __raw_spin_lock(&max_stack_lock); 57 __raw_spin_lock(&max_stack_lock);
53 58
54 /* a race could have already updated it */ 59 /* a race could have already updated it */
@@ -78,6 +83,7 @@ static inline void check_stack(void)
78 * on a new max, so it is far from a fast path. 83 * on a new max, so it is far from a fast path.
79 */ 84 */
80 while (i < max_stack_trace.nr_entries) { 85 while (i < max_stack_trace.nr_entries) {
86 int found = 0;
81 87
82 stack_dump_index[i] = this_size; 88 stack_dump_index[i] = this_size;
83 p = start; 89 p = start;
@@ -86,17 +92,19 @@ static inline void check_stack(void)
86 if (*p == stack_dump_trace[i]) { 92 if (*p == stack_dump_trace[i]) {
87 this_size = stack_dump_index[i++] = 93 this_size = stack_dump_index[i++] =
88 (top - p) * sizeof(unsigned long); 94 (top - p) * sizeof(unsigned long);
95 found = 1;
89 /* Start the search from here */ 96 /* Start the search from here */
90 start = p + 1; 97 start = p + 1;
91 } 98 }
92 } 99 }
93 100
94 i++; 101 if (!found)
102 i++;
95 } 103 }
96 104
97 out: 105 out:
98 __raw_spin_unlock(&max_stack_lock); 106 __raw_spin_unlock(&max_stack_lock);
99 raw_local_irq_restore(flags); 107 local_irq_restore(flags);
100} 108}
101 109
102static void 110static void
@@ -107,8 +115,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
107 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
108 return; 116 return;
109 117
110 resched = need_resched(); 118 resched = ftrace_preempt_disable();
111 preempt_disable_notrace();
112 119
113 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
114 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 out: 127 out:
121 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
122 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
123 if (resched) 130 ftrace_preempt_enable(resched);
124 preempt_enable_no_resched_notrace();
125 else
126 preempt_enable_notrace();
127} 131}
128 132
129static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -166,16 +170,16 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
166 if (ret < 0) 170 if (ret < 0)
167 return ret; 171 return ret;
168 172
169 raw_local_irq_save(flags); 173 local_irq_save(flags);
170 __raw_spin_lock(&max_stack_lock); 174 __raw_spin_lock(&max_stack_lock);
171 *ptr = val; 175 *ptr = val;
172 __raw_spin_unlock(&max_stack_lock); 176 __raw_spin_unlock(&max_stack_lock);
173 raw_local_irq_restore(flags); 177 local_irq_restore(flags);
174 178
175 return count; 179 return count;
176} 180}
177 181
178static struct file_operations stack_max_size_fops = { 182static const struct file_operations stack_max_size_fops = {
179 .open = tracing_open_generic, 183 .open = tracing_open_generic,
180 .read = stack_max_size_read, 184 .read = stack_max_size_read,
181 .write = stack_max_size_write, 185 .write = stack_max_size_write,
@@ -273,7 +277,7 @@ static int t_show(struct seq_file *m, void *v)
273 return 0; 277 return 0;
274} 278}
275 279
276static struct seq_operations stack_trace_seq_ops = { 280static const struct seq_operations stack_trace_seq_ops = {
277 .start = t_start, 281 .start = t_start,
278 .next = t_next, 282 .next = t_next,
279 .stop = t_stop, 283 .stop = t_stop,
@@ -289,12 +293,47 @@ static int stack_trace_open(struct inode *inode, struct file *file)
289 return ret; 293 return ret;
290} 294}
291 295
292static struct file_operations stack_trace_fops = { 296static const struct file_operations stack_trace_fops = {
293 .open = stack_trace_open, 297 .open = stack_trace_open,
294 .read = seq_read, 298 .read = seq_read,
295 .llseek = seq_lseek, 299 .llseek = seq_lseek,
296}; 300};
297 301
302int
303stack_trace_sysctl(struct ctl_table *table, int write,
304 struct file *file, void __user *buffer, size_t *lenp,
305 loff_t *ppos)
306{
307 int ret;
308
309 mutex_lock(&stack_sysctl_mutex);
310
311 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
312
313 if (ret || !write ||
314 (last_stack_tracer_enabled == stack_tracer_enabled))
315 goto out;
316
317 last_stack_tracer_enabled = stack_tracer_enabled;
318
319 if (stack_tracer_enabled)
320 register_ftrace_function(&trace_ops);
321 else
322 unregister_ftrace_function(&trace_ops);
323
324 out:
325 mutex_unlock(&stack_sysctl_mutex);
326 return ret;
327}
328
329static __init int enable_stacktrace(char *str)
330{
331 stack_tracer_enabled = 1;
332 last_stack_tracer_enabled = 1;
333 return 1;
334}
335__setup("stacktrace", enable_stacktrace);
336
298static __init int stack_trace_init(void) 337static __init int stack_trace_init(void)
299{ 338{
300 struct dentry *d_tracer; 339 struct dentry *d_tracer;
@@ -312,7 +351,8 @@ static __init int stack_trace_init(void)
312 if (!entry) 351 if (!entry)
313 pr_warning("Could not create debugfs 'stack_trace' entry\n"); 352 pr_warning("Could not create debugfs 'stack_trace' entry\n");
314 353
315 register_ftrace_function(&trace_ops); 354 if (stack_tracer_enabled)
355 register_ftrace_function(&trace_ops);
316 356
317 return 0; 357 return 0;
318} 358}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba55..a5779bd975db 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,6 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 205
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 207}
@@ -234,20 +233,10 @@ static void stop_stack_timers(void)
234 stop_stack_timer(cpu); 233 stop_stack_timer(cpu);
235} 234}
236 235
237static void stack_reset(struct trace_array *tr)
238{
239 int cpu;
240
241 tr->time_start = ftrace_now(tr->cpu);
242
243 for_each_online_cpu(cpu)
244 tracing_reset(tr, cpu);
245}
246
247static void start_stack_trace(struct trace_array *tr) 236static void start_stack_trace(struct trace_array *tr)
248{ 237{
249 mutex_lock(&sample_timer_lock); 238 mutex_lock(&sample_timer_lock);
250 stack_reset(tr); 239 tracing_reset_online_cpus(tr);
251 start_stack_timers(); 240 start_stack_timers();
252 tracer_enabled = 1; 241 tracer_enabled = 1;
253 mutex_unlock(&sample_timer_lock); 242 mutex_unlock(&sample_timer_lock);
@@ -261,27 +250,17 @@ static void stop_stack_trace(struct trace_array *tr)
261 mutex_unlock(&sample_timer_lock); 250 mutex_unlock(&sample_timer_lock);
262} 251}
263 252
264static void stack_trace_init(struct trace_array *tr) 253static int stack_trace_init(struct trace_array *tr)
265{ 254{
266 sysprof_trace = tr; 255 sysprof_trace = tr;
267 256
268 if (tr->ctrl) 257 start_stack_trace(tr);
269 start_stack_trace(tr); 258 return 0;
270} 259}
271 260
272static void stack_trace_reset(struct trace_array *tr) 261static void stack_trace_reset(struct trace_array *tr)
273{ 262{
274 if (tr->ctrl) 263 stop_stack_trace(tr);
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285} 264}
286 265
287static struct tracer stack_trace __read_mostly = 266static struct tracer stack_trace __read_mostly =
@@ -289,7 +268,6 @@ static struct tracer stack_trace __read_mostly =
289 .name = "sysprof", 268 .name = "sysprof",
290 .init = stack_trace_init, 269 .init = stack_trace_init,
291 .reset = stack_trace_reset, 270 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST 271#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof, 272 .selftest = trace_selftest_startup_sysprof,
295#endif 273#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index af8c85664882..79602740bbb5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
43 */ 43 */
44#define TRACEPOINT_HASH_BITS 6 44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) 45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 void **funcs; 56 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */ 57 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0]; 58 char name[0];
61}; 59};
62 60
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; 61struct tp_probes {
62 union {
63 struct rcu_head rcu;
64 struct list_head list;
65 } u;
66 void *probes[0];
67};
64 68
65static void free_old_closure(struct rcu_head *head) 69static inline void *allocate_probes(int count)
66{ 70{
67 struct tracepoint_entry *entry = container_of(head, 71 struct tp_probes *p = kmalloc(count * sizeof(void *)
68 struct tracepoint_entry, rcu); 72 + sizeof(struct tp_probes), GFP_KERNEL);
69 kfree(entry->oldptr); 73 return p == NULL ? NULL : p->probes;
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73} 74}
74 75
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) 76static void rcu_free_old_probes(struct rcu_head *head)
76{ 77{
77 if (!old) 78 kfree(container_of(head, struct tp_probes, u.rcu));
78 return; 79}
79 entry->oldptr = old; 80
80 entry->rcu_pending = 1; 81static inline void release_probes(void *old)
81 /* write rcu_pending before calling the RCU callback */ 82{
82 smp_wmb(); 83 if (old) {
83 call_rcu_sched(&entry->rcu, free_old_closure); 84 struct tp_probes *tp_probes = container_of(old,
85 struct tp_probes, probes[0]);
86 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
87 }
84} 88}
85 89
86static void debug_print_probes(struct tracepoint_entry *entry) 90static void debug_print_probes(struct tracepoint_entry *entry)
87{ 91{
88 int i; 92 int i;
89 93
90 if (!tracepoint_debug) 94 if (!tracepoint_debug || !entry->funcs)
91 return; 95 return;
92 96
93 for (i = 0; entry->funcs[i]; i++) 97 for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 return ERR_PTR(-EEXIST); 115 return ERR_PTR(-EEXIST);
112 } 116 }
113 /* + 2 : one for new probe, one for NULL func */ 117 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); 118 new = allocate_probes(nr_probes + 2);
115 if (new == NULL) 119 if (new == NULL)
116 return ERR_PTR(-ENOMEM); 120 return ERR_PTR(-ENOMEM);
117 if (old) 121 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *)); 122 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe; 123 new[nr_probes] = probe;
124 new[nr_probes + 1] = NULL;
120 entry->refcount = nr_probes + 1; 125 entry->refcount = nr_probes + 1;
121 entry->funcs = new; 126 entry->funcs = new;
122 debug_print_probes(entry); 127 debug_print_probes(entry);
@@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
132 old = entry->funcs; 137 old = entry->funcs;
133 138
134 if (!old) 139 if (!old)
135 return NULL; 140 return ERR_PTR(-ENOENT);
136 141
137 debug_print_probes(entry); 142 debug_print_probes(entry);
138 /* (N -> M), (N > 1, M >= 0) probes */ 143 /* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
151 int j = 0; 156 int j = 0;
152 /* N -> M, (N > 1, M > 0) */ 157 /* N -> M, (N > 1, M > 0) */
153 /* + 1 for NULL */ 158 /* + 1 for NULL */
154 new = kzalloc((nr_probes - nr_del + 1) 159 new = allocate_probes(nr_probes - nr_del + 1);
155 * sizeof(void *), GFP_KERNEL);
156 if (new == NULL) 160 if (new == NULL)
157 return ERR_PTR(-ENOMEM); 161 return ERR_PTR(-ENOMEM);
158 for (i = 0; old[i]; i++) 162 for (i = 0; old[i]; i++)
159 if ((probe && old[i] != probe)) 163 if ((probe && old[i] != probe))
160 new[j++] = old[i]; 164 new[j++] = old[i];
165 new[nr_probes - nr_del] = NULL;
161 entry->refcount = nr_probes - nr_del; 166 entry->refcount = nr_probes - nr_del;
162 entry->funcs = new; 167 entry->funcs = new;
163 } 168 }
@@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
215 memcpy(&e->name[0], name, name_len); 220 memcpy(&e->name[0], name, name_len);
216 e->funcs = NULL; 221 e->funcs = NULL;
217 e->refcount = 0; 222 e->refcount = 0;
218 e->rcu_pending = 0;
219 hlist_add_head(&e->hlist, head); 223 hlist_add_head(&e->hlist, head);
220 return e; 224 return e;
221} 225}
@@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
224 * Remove the tracepoint from the tracepoint hash table. Must be called with 228 * Remove the tracepoint from the tracepoint hash table. Must be called with
225 * mutex_lock held. 229 * mutex_lock held.
226 */ 230 */
227static int remove_tracepoint(const char *name) 231static inline void remove_tracepoint(struct tracepoint_entry *e)
228{ 232{
229 struct hlist_head *head;
230 struct hlist_node *node;
231 struct tracepoint_entry *e;
232 int found = 0;
233 size_t len = strlen(name) + 1;
234 u32 hash = jhash(name, len-1, 0);
235
236 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
237 hlist_for_each_entry(e, node, head, hlist) {
238 if (!strcmp(name, e->name)) {
239 found = 1;
240 break;
241 }
242 }
243 if (!found)
244 return -ENOENT;
245 if (e->refcount)
246 return -EBUSY;
247 hlist_del(&e->hlist); 233 hlist_del(&e->hlist);
248 /* Make sure the call_rcu_sched has been executed */
249 if (e->rcu_pending)
250 rcu_barrier_sched();
251 kfree(e); 234 kfree(e);
252 return 0;
253} 235}
254 236
255/* 237/*
@@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
280static void disable_tracepoint(struct tracepoint *elem) 262static void disable_tracepoint(struct tracepoint *elem)
281{ 263{
282 elem->state = 0; 264 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL);
283} 266}
284 267
285/** 268/**
@@ -320,6 +303,23 @@ static void tracepoint_update_probes(void)
320 module_update_tracepoints(); 303 module_update_tracepoints();
321} 304}
322 305
306static void *tracepoint_add_probe(const char *name, void *probe)
307{
308 struct tracepoint_entry *entry;
309 void *old;
310
311 entry = get_tracepoint(name);
312 if (!entry) {
313 entry = add_tracepoint(name);
314 if (IS_ERR(entry))
315 return entry;
316 }
317 old = tracepoint_entry_add_probe(entry, probe);
318 if (IS_ERR(old) && !entry->refcount)
319 remove_tracepoint(entry);
320 return old;
321}
322
323/** 323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint 324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name 325 * @name: tracepoint name
@@ -330,44 +330,36 @@ static void tracepoint_update_probes(void)
330 */ 330 */
331int tracepoint_probe_register(const char *name, void *probe) 331int tracepoint_probe_register(const char *name, void *probe)
332{ 332{
333 struct tracepoint_entry *entry;
334 int ret = 0;
335 void *old; 333 void *old;
336 334
337 mutex_lock(&tracepoints_mutex); 335 mutex_lock(&tracepoints_mutex);
338 entry = get_tracepoint(name); 336 old = tracepoint_add_probe(name, probe);
339 if (!entry) {
340 entry = add_tracepoint(name);
341 if (IS_ERR(entry)) {
342 ret = PTR_ERR(entry);
343 goto end;
344 }
345 }
346 /*
347 * If we detect that a call_rcu_sched is pending for this tracepoint,
348 * make sure it's executed now.
349 */
350 if (entry->rcu_pending)
351 rcu_barrier_sched();
352 old = tracepoint_entry_add_probe(entry, probe);
353 if (IS_ERR(old)) {
354 ret = PTR_ERR(old);
355 goto end;
356 }
357 mutex_unlock(&tracepoints_mutex); 337 mutex_unlock(&tracepoints_mutex);
338 if (IS_ERR(old))
339 return PTR_ERR(old);
340
358 tracepoint_update_probes(); /* may update entry */ 341 tracepoint_update_probes(); /* may update entry */
359 mutex_lock(&tracepoints_mutex); 342 release_probes(old);
360 entry = get_tracepoint(name); 343 return 0;
361 WARN_ON(!entry);
362 if (entry->rcu_pending)
363 rcu_barrier_sched();
364 tracepoint_entry_free_old(entry, old);
365end:
366 mutex_unlock(&tracepoints_mutex);
367 return ret;
368} 344}
369EXPORT_SYMBOL_GPL(tracepoint_probe_register); 345EXPORT_SYMBOL_GPL(tracepoint_probe_register);
370 346
347static void *tracepoint_remove_probe(const char *name, void *probe)
348{
349 struct tracepoint_entry *entry;
350 void *old;
351
352 entry = get_tracepoint(name);
353 if (!entry)
354 return ERR_PTR(-ENOENT);
355 old = tracepoint_entry_remove_probe(entry, probe);
356 if (IS_ERR(old))
357 return old;
358 if (!entry->refcount)
359 remove_tracepoint(entry);
360 return old;
361}
362
371/** 363/**
372 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 364 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
373 * @name: tracepoint name 365 * @name: tracepoint name
@@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
380 */ 372 */
381int tracepoint_probe_unregister(const char *name, void *probe) 373int tracepoint_probe_unregister(const char *name, void *probe)
382{ 374{
383 struct tracepoint_entry *entry;
384 void *old; 375 void *old;
385 int ret = -ENOENT;
386 376
387 mutex_lock(&tracepoints_mutex); 377 mutex_lock(&tracepoints_mutex);
388 entry = get_tracepoint(name); 378 old = tracepoint_remove_probe(name, probe);
389 if (!entry)
390 goto end;
391 if (entry->rcu_pending)
392 rcu_barrier_sched();
393 old = tracepoint_entry_remove_probe(entry, probe);
394 if (!old) {
395 printk(KERN_WARNING "Warning: Trying to unregister a probe"
396 "that doesn't exist\n");
397 goto end;
398 }
399 mutex_unlock(&tracepoints_mutex); 379 mutex_unlock(&tracepoints_mutex);
380 if (IS_ERR(old))
381 return PTR_ERR(old);
382
400 tracepoint_update_probes(); /* may update entry */ 383 tracepoint_update_probes(); /* may update entry */
384 release_probes(old);
385 return 0;
386}
387EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
388
389static LIST_HEAD(old_probes);
390static int need_update;
391
392static void tracepoint_add_old_probes(void *old)
393{
394 need_update = 1;
395 if (old) {
396 struct tp_probes *tp_probes = container_of(old,
397 struct tp_probes, probes[0]);
398 list_add(&tp_probes->u.list, &old_probes);
399 }
400}
401
402/**
403 * tracepoint_probe_register_noupdate - register a probe but not connect
404 * @name: tracepoint name
405 * @probe: probe handler
406 *
407 * caller must call tracepoint_probe_update_all()
408 */
409int tracepoint_probe_register_noupdate(const char *name, void *probe)
410{
411 void *old;
412
401 mutex_lock(&tracepoints_mutex); 413 mutex_lock(&tracepoints_mutex);
402 entry = get_tracepoint(name); 414 old = tracepoint_add_probe(name, probe);
403 if (!entry) 415 if (IS_ERR(old)) {
404 goto end; 416 mutex_unlock(&tracepoints_mutex);
405 if (entry->rcu_pending) 417 return PTR_ERR(old);
406 rcu_barrier_sched(); 418 }
407 tracepoint_entry_free_old(entry, old); 419 tracepoint_add_old_probes(old);
408 remove_tracepoint(name); /* Ignore busy error message */
409 ret = 0;
410end:
411 mutex_unlock(&tracepoints_mutex); 420 mutex_unlock(&tracepoints_mutex);
412 return ret; 421 return 0;
413} 422}
414EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 423EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
424
425/**
426 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
427 * @name: tracepoint name
428 * @probe: probe function pointer
429 *
430 * caller must call tracepoint_probe_update_all()
431 */
432int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
433{
434 void *old;
435
436 mutex_lock(&tracepoints_mutex);
437 old = tracepoint_remove_probe(name, probe);
438 if (IS_ERR(old)) {
439 mutex_unlock(&tracepoints_mutex);
440 return PTR_ERR(old);
441 }
442 tracepoint_add_old_probes(old);
443 mutex_unlock(&tracepoints_mutex);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
447
448/**
449 * tracepoint_probe_update_all - update tracepoints
450 */
451void tracepoint_probe_update_all(void)
452{
453 LIST_HEAD(release_probes);
454 struct tp_probes *pos, *next;
455
456 mutex_lock(&tracepoints_mutex);
457 if (!need_update) {
458 mutex_unlock(&tracepoints_mutex);
459 return;
460 }
461 if (!list_empty(&old_probes))
462 list_replace_init(&old_probes, &release_probes);
463 need_update = 0;
464 mutex_unlock(&tracepoints_mutex);
465
466 tracepoint_update_probes();
467 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
468 list_del(&pos->u.list);
469 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
470 }
471}
472EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
415 473
416/** 474/**
417 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. 475 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
@@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
483 iter->tracepoint = NULL; 541 iter->tracepoint = NULL;
484} 542}
485EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 543EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
544
545#ifdef CONFIG_MODULES
546
547int tracepoint_module_notify(struct notifier_block *self,
548 unsigned long val, void *data)
549{
550 struct module *mod = data;
551
552 switch (val) {
553 case MODULE_STATE_COMING:
554 tracepoint_update_probe_range(mod->tracepoints,
555 mod->tracepoints + mod->num_tracepoints);
556 break;
557 case MODULE_STATE_GOING:
558 tracepoint_update_probe_range(mod->tracepoints,
559 mod->tracepoints + mod->num_tracepoints);
560 break;
561 }
562 return 0;
563}
564
565struct notifier_block tracepoint_module_nb = {
566 .notifier_call = tracepoint_module_notify,
567 .priority = 0,
568};
569
570static int init_tracepoints(void)
571{
572 return register_module_notifier(&tracepoint_module_nb);
573}
574__initcall(init_tracepoints);
575
576#endif /* CONFIG_MODULES */
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 8ebcd8532dfb..2dc06ab35716 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -27,6 +27,7 @@
27 */ 27 */
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 const struct cred *tcred;
30 struct timespec uptime, ts; 31 struct timespec uptime, ts;
31 u64 ac_etime; 32 u64 ac_etime;
32 33
@@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
53 stats->ac_flag |= AXSIG; 54 stats->ac_flag |= AXSIG;
54 stats->ac_nice = task_nice(tsk); 55 stats->ac_nice = task_nice(tsk);
55 stats->ac_sched = tsk->policy; 56 stats->ac_sched = tsk->policy;
56 stats->ac_uid = tsk->uid;
57 stats->ac_gid = tsk->gid;
58 stats->ac_pid = tsk->pid; 57 stats->ac_pid = tsk->pid;
59 rcu_read_lock(); 58 rcu_read_lock();
59 tcred = __task_cred(tsk);
60 stats->ac_uid = tcred->uid;
61 stats->ac_gid = tcred->gid;
60 stats->ac_ppid = pid_alive(tsk) ? 62 stats->ac_ppid = pid_alive(tsk) ?
61 rcu_dereference(tsk->real_parent)->tgid : 0; 63 rcu_dereference(tsk->real_parent)->tgid : 0;
62 rcu_read_unlock(); 64 rcu_read_unlock();
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 3e41c1673e2f..2460c3199b5a 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
84 84
85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
86{ 86{
87 const struct cred *cred = current_cred();
87 int retval; 88 int retval;
88 89
89 if (!(retval = put_user(high2lowuid(current->uid), ruid)) && 90 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
90 !(retval = put_user(high2lowuid(current->euid), euid))) 91 !(retval = put_user(high2lowuid(cred->euid), euid)))
91 retval = put_user(high2lowuid(current->suid), suid); 92 retval = put_user(high2lowuid(cred->suid), suid);
92 93
93 return retval; 94 return retval;
94} 95}
@@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
104 105
105asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
106{ 107{
108 const struct cred *cred = current_cred();
107 int retval; 109 int retval;
108 110
109 if (!(retval = put_user(high2lowgid(current->gid), rgid)) && 111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
110 !(retval = put_user(high2lowgid(current->egid), egid))) 112 !(retval = put_user(high2lowgid(cred->egid), egid)))
111 retval = put_user(high2lowgid(current->sgid), sgid); 113 retval = put_user(high2lowgid(cred->sgid), sgid);
112 114
113 return retval; 115 return retval;
114} 116}
@@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info,
161 163
162asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) 164asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
163{ 165{
164 int i = 0; 166 const struct cred *cred = current_cred();
167 int i;
165 168
166 if (gidsetsize < 0) 169 if (gidsetsize < 0)
167 return -EINVAL; 170 return -EINVAL;
168 171
169 get_group_info(current->group_info); 172 i = cred->group_info->ngroups;
170 i = current->group_info->ngroups;
171 if (gidsetsize) { 173 if (gidsetsize) {
172 if (i > gidsetsize) { 174 if (i > gidsetsize) {
173 i = -EINVAL; 175 i = -EINVAL;
174 goto out; 176 goto out;
175 } 177 }
176 if (groups16_to_user(grouplist, current->group_info)) { 178 if (groups16_to_user(grouplist, cred->group_info)) {
177 i = -EFAULT; 179 i = -EFAULT;
178 goto out; 180 goto out;
179 } 181 }
180 } 182 }
181out: 183out:
182 put_group_info(current->group_info);
183 return i; 184 return i;
184} 185}
185 186
@@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
210 211
211asmlinkage long sys_getuid16(void) 212asmlinkage long sys_getuid16(void)
212{ 213{
213 return high2lowuid(current->uid); 214 return high2lowuid(current_uid());
214} 215}
215 216
216asmlinkage long sys_geteuid16(void) 217asmlinkage long sys_geteuid16(void)
217{ 218{
218 return high2lowuid(current->euid); 219 return high2lowuid(current_euid());
219} 220}
220 221
221asmlinkage long sys_getgid16(void) 222asmlinkage long sys_getgid16(void)
222{ 223{
223 return high2lowgid(current->gid); 224 return high2lowgid(current_gid());
224} 225}
225 226
226asmlinkage long sys_getegid16(void) 227asmlinkage long sys_getegid16(void)
227{ 228{
228 return high2lowgid(current->egid); 229 return high2lowgid(current_egid());
229} 230}
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae43..477b6660f447 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,12 +16,13 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
19 20
20struct user_namespace init_user_ns = { 21struct user_namespace init_user_ns = {
21 .kref = { 22 .kref = {
22 .refcount = ATOMIC_INIT(2), 23 .refcount = ATOMIC_INIT(1),
23 }, 24 },
24 .root_user = &root_user, 25 .creator = &root_user,
25}; 26};
26EXPORT_SYMBOL_GPL(init_user_ns); 27EXPORT_SYMBOL_GPL(init_user_ns);
27 28
@@ -47,12 +48,14 @@ static struct kmem_cache *uid_cachep;
47 */ 48 */
48static DEFINE_SPINLOCK(uidhash_lock); 49static DEFINE_SPINLOCK(uidhash_lock);
49 50
51/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
50struct user_struct root_user = { 52struct user_struct root_user = {
51 .__count = ATOMIC_INIT(1), 53 .__count = ATOMIC_INIT(2),
52 .processes = ATOMIC_INIT(1), 54 .processes = ATOMIC_INIT(1),
53 .files = ATOMIC_INIT(0), 55 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns,
56#ifdef CONFIG_USER_SCHED 59#ifdef CONFIG_USER_SCHED
57 .tg = &init_task_group, 60 .tg = &init_task_group,
58#endif 61#endif
@@ -101,19 +104,15 @@ static int sched_create_user(struct user_struct *up)
101 if (IS_ERR(up->tg)) 104 if (IS_ERR(up->tg))
102 rc = -ENOMEM; 105 rc = -ENOMEM;
103 106
104 return rc; 107 set_tg_uid(up);
105}
106 108
107static void sched_switch_user(struct task_struct *p) 109 return rc;
108{
109 sched_move_task(p);
110} 110}
111 111
112#else /* CONFIG_USER_SCHED */ 112#else /* CONFIG_USER_SCHED */
113 113
114static void sched_destroy_user(struct user_struct *up) { } 114static void sched_destroy_user(struct user_struct *up) { }
115static int sched_create_user(struct user_struct *up) { return 0; } 115static int sched_create_user(struct user_struct *up) { return 0; }
116static void sched_switch_user(struct task_struct *p) { }
117 116
118#endif /* CONFIG_USER_SCHED */ 117#endif /* CONFIG_USER_SCHED */
119 118
@@ -242,13 +241,21 @@ static struct kobj_type uids_ktype = {
242 .release = uids_release, 241 .release = uids_release,
243}; 242};
244 243
245/* create /sys/kernel/uids/<uid>/cpu_share file for this user */ 244/*
245 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
246 * We do not create this file for users in a user namespace (until
247 * sysfs tagging is implemented).
248 *
249 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
250 */
246static int uids_user_create(struct user_struct *up) 251static int uids_user_create(struct user_struct *up)
247{ 252{
248 struct kobject *kobj = &up->kobj; 253 struct kobject *kobj = &up->kobj;
249 int error; 254 int error;
250 255
251 memset(kobj, 0, sizeof(struct kobject)); 256 memset(kobj, 0, sizeof(struct kobject));
257 if (up->user_ns != &init_user_ns)
258 return 0;
252 kobj->kset = uids_kset; 259 kobj->kset = uids_kset;
253 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); 260 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
254 if (error) { 261 if (error) {
@@ -284,6 +291,8 @@ static void remove_user_sysfs_dir(struct work_struct *w)
284 unsigned long flags; 291 unsigned long flags;
285 int remove_user = 0; 292 int remove_user = 0;
286 293
294 if (up->user_ns != &init_user_ns)
295 return;
287 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() 296 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
288 * atomic. 297 * atomic.
289 */ 298 */
@@ -319,12 +328,13 @@ done:
319 * IRQ state (as stored in flags) is restored and uidhash_lock released 328 * IRQ state (as stored in flags) is restored and uidhash_lock released
320 * upon function exit. 329 * upon function exit.
321 */ 330 */
322static inline void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
323{ 332{
324 /* restore back the count */ 333 /* restore back the count */
325 atomic_inc(&up->__count); 334 atomic_inc(&up->__count);
326 spin_unlock_irqrestore(&uidhash_lock, flags); 335 spin_unlock_irqrestore(&uidhash_lock, flags);
327 336
337 put_user_ns(up->user_ns);
328 INIT_WORK(&up->work, remove_user_sysfs_dir); 338 INIT_WORK(&up->work, remove_user_sysfs_dir);
329 schedule_work(&up->work); 339 schedule_work(&up->work);
330} 340}
@@ -340,13 +350,14 @@ static inline void uids_mutex_unlock(void) { }
340 * IRQ state (as stored in flags) is restored and uidhash_lock released 350 * IRQ state (as stored in flags) is restored and uidhash_lock released
341 * upon function exit. 351 * upon function exit.
342 */ 352 */
343static inline void free_user(struct user_struct *up, unsigned long flags) 353static void free_user(struct user_struct *up, unsigned long flags)
344{ 354{
345 uid_hash_remove(up); 355 uid_hash_remove(up);
346 spin_unlock_irqrestore(&uidhash_lock, flags); 356 spin_unlock_irqrestore(&uidhash_lock, flags);
347 sched_destroy_user(up); 357 sched_destroy_user(up);
348 key_put(up->uid_keyring); 358 key_put(up->uid_keyring);
349 key_put(up->session_keyring); 359 key_put(up->session_keyring);
360 put_user_ns(up->user_ns);
350 kmem_cache_free(uid_cachep, up); 361 kmem_cache_free(uid_cachep, up);
351} 362}
352 363
@@ -362,7 +373,7 @@ struct user_struct *find_user(uid_t uid)
362{ 373{
363 struct user_struct *ret; 374 struct user_struct *ret;
364 unsigned long flags; 375 unsigned long flags;
365 struct user_namespace *ns = current->nsproxy->user_ns; 376 struct user_namespace *ns = current_user_ns();
366 377
367 spin_lock_irqsave(&uidhash_lock, flags); 378 spin_lock_irqsave(&uidhash_lock, flags);
368 ret = uid_hash_find(uid, uidhashentry(ns, uid)); 379 ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -409,6 +420,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
409 if (sched_create_user(new) < 0) 420 if (sched_create_user(new) < 0)
410 goto out_free_user; 421 goto out_free_user;
411 422
423 new->user_ns = get_user_ns(ns);
424
412 if (uids_user_create(new)) 425 if (uids_user_create(new))
413 goto out_destoy_sched; 426 goto out_destoy_sched;
414 427
@@ -432,7 +445,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
432 up = new; 445 up = new;
433 } 446 }
434 spin_unlock_irq(&uidhash_lock); 447 spin_unlock_irq(&uidhash_lock);
435
436 } 448 }
437 449
438 uids_mutex_unlock(); 450 uids_mutex_unlock();
@@ -441,6 +453,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
441 453
442out_destoy_sched: 454out_destoy_sched:
443 sched_destroy_user(new); 455 sched_destroy_user(new);
456 put_user_ns(new->user_ns);
444out_free_user: 457out_free_user:
445 kmem_cache_free(uid_cachep, new); 458 kmem_cache_free(uid_cachep, new);
446out_unlock: 459out_unlock:
@@ -448,63 +461,6 @@ out_unlock:
448 return NULL; 461 return NULL;
449} 462}
450 463
451void switch_uid(struct user_struct *new_user)
452{
453 struct user_struct *old_user;
454
455 /* What if a process setreuid()'s and this brings the
456 * new uid over his NPROC rlimit? We can check this now
457 * cheaply with the new uid cache, so if it matters
458 * we should be checking for it. -DaveM
459 */
460 old_user = current->user;
461 atomic_inc(&new_user->processes);
462 atomic_dec(&old_user->processes);
463 switch_uid_keyring(new_user);
464 current->user = new_user;
465 sched_switch_user(current);
466
467 /*
468 * We need to synchronize with __sigqueue_alloc()
469 * doing a get_uid(p->user).. If that saw the old
470 * user value, we need to wait until it has exited
471 * its critical region before we can free the old
472 * structure.
473 */
474 smp_mb();
475 spin_unlock_wait(&current->sighand->siglock);
476
477 free_uid(old_user);
478 suid_keys(current);
479}
480
481#ifdef CONFIG_USER_NS
482void release_uids(struct user_namespace *ns)
483{
484 int i;
485 unsigned long flags;
486 struct hlist_head *head;
487 struct hlist_node *nd;
488
489 spin_lock_irqsave(&uidhash_lock, flags);
490 /*
491 * collapse the chains so that the user_struct-s will
492 * be still alive, but not in hashes. subsequent free_uid()
493 * will free them.
494 */
495 for (i = 0; i < UIDHASH_SZ; i++) {
496 head = ns->uidhash_table + i;
497 while (!hlist_empty(head)) {
498 nd = head->first;
499 hlist_del_init(nd);
500 }
501 }
502 spin_unlock_irqrestore(&uidhash_lock, flags);
503
504 free_uid(ns->root_user);
505}
506#endif
507
508static int __init uid_cache_init(void) 464static int __init uid_cache_init(void)
509{ 465{
510 int n; 466 int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 532858fa5b88..79084311ee57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,60 +9,55 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/cred.h>
12 13
13/* 14/*
14 * Clone a new ns copying an original user ns, setting refcount to 1 15 * Create a new user namespace, deriving the creator from the user in the
15 * @old_ns: namespace to clone 16 * passed credentials, and replacing that user with the new root user for the
16 * Return NULL on error (failure to kmalloc), new ns otherwise 17 * new namespace.
18 *
19 * This is called by copy_creds(), which will finish setting the target task's
20 * credentials.
17 */ 21 */
18static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) 22int create_user_ns(struct cred *new)
19{ 23{
20 struct user_namespace *ns; 24 struct user_namespace *ns;
21 struct user_struct *new_user; 25 struct user_struct *root_user;
22 int n; 26 int n;
23 27
24 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 28 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
25 if (!ns) 29 if (!ns)
26 return ERR_PTR(-ENOMEM); 30 return -ENOMEM;
27 31
28 kref_init(&ns->kref); 32 kref_init(&ns->kref);
29 33
30 for (n = 0; n < UIDHASH_SZ; ++n) 34 for (n = 0; n < UIDHASH_SZ; ++n)
31 INIT_HLIST_HEAD(ns->uidhash_table + n); 35 INIT_HLIST_HEAD(ns->uidhash_table + n);
32 36
33 /* Insert new root user. */ 37 /* Alloc new root user. */
34 ns->root_user = alloc_uid(ns, 0); 38 root_user = alloc_uid(ns, 0);
35 if (!ns->root_user) { 39 if (!root_user) {
36 kfree(ns); 40 kfree(ns);
37 return ERR_PTR(-ENOMEM); 41 return -ENOMEM;
38 } 42 }
39 43
40 /* Reset current->user with a new one */ 44 /* set the new root user in the credentials under preparation */
41 new_user = alloc_uid(ns, current->uid); 45 ns->creator = new->user;
42 if (!new_user) { 46 new->user = root_user;
43 free_uid(ns->root_user); 47 new->uid = new->euid = new->suid = new->fsuid = 0;
44 kfree(ns); 48 new->gid = new->egid = new->sgid = new->fsgid = 0;
45 return ERR_PTR(-ENOMEM); 49 put_group_info(new->group_info);
46 } 50 new->group_info = get_group_info(&init_groups);
47 51#ifdef CONFIG_KEYS
48 switch_uid(new_user); 52 key_put(new->request_key_auth);
49 return ns; 53 new->request_key_auth = NULL;
50} 54#endif
51 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
52struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
53{
54 struct user_namespace *new_ns;
55
56 BUG_ON(!old_ns);
57 get_user_ns(old_ns);
58
59 if (!(flags & CLONE_NEWUSER))
60 return old_ns;
61 56
62 new_ns = clone_user_ns(old_ns); 57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */
58 kref_set(&ns->kref, 1);
63 59
64 put_user_ns(old_ns); 60 return 0;
65 return new_ns;
66} 61}
67 62
68void free_user_ns(struct kref *kref) 63void free_user_ns(struct kref *kref)
@@ -70,7 +65,7 @@ void free_user_ns(struct kref *kref)
70 struct user_namespace *ns; 65 struct user_namespace *ns;
71 66
72 ns = container_of(kref, struct user_namespace, kref); 67 ns = container_of(kref, struct user_namespace, kref);
73 release_uids(ns); 68 free_uid(ns->creator);
74 kfree(ns); 69 kfree(ns);
75} 70}
76EXPORT_SYMBOL(free_user_ns); 71EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d4dc69ddebd7..4952322cba45 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -84,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly;
84static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_t cpu_populated_map __read_mostly;
85 85
86/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
87static inline int is_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
88{ 88{
89 return wq->singlethread; 89 return wq->singlethread;
90} 90}
91 91
92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
93{ 93{
94 return is_single_threaded(wq) 94 return is_wq_single_threaded(wq)
95 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? &cpu_singlethread_map : &cpu_populated_map;
96} 96}
97 97
98static 98static
99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) 99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
100{ 100{
101 if (unlikely(is_single_threaded(wq))) 101 if (unlikely(is_wq_single_threaded(wq)))
102 cpu = singlethread_cpu; 102 cpu = singlethread_cpu;
103 return per_cpu_ptr(wq->cpu_wq, cpu); 103 return per_cpu_ptr(wq->cpu_wq, cpu);
104} 104}
@@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
769{ 769{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
771 struct workqueue_struct *wq = cwq->wq; 771 struct workqueue_struct *wq = cwq->wq;
772 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; 772 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
773 struct task_struct *p; 773 struct task_struct *p;
774 774
775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);