path: root/kernel
author	Ingo Molnar <mingo@elte.hu>	2008-12-31 02:31:57 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-12-31 02:31:57 -0500
commit	a9de18eb761f7c1c860964b2e5addc1a35c7e861 (patch)
tree	886e75fdfd09690cd262ca69cb7f5d1d42b48602 /kernel
parent	b2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (diff)
parent	6a94cb73064c952255336cc57731904174b2c58f (diff)
Merge branch 'linus' into stackprotector
Conflicts:
	arch/x86/include/asm/pda.h
	kernel/fork.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.freezer  2
-rw-r--r--  kernel/Kconfig.preempt  25
-rw-r--r--  kernel/Makefile  20
-rw-r--r--  kernel/acct.c  7
-rw-r--r--  kernel/audit.c  32
-rw-r--r--  kernel/audit_tree.c  139
-rw-r--r--  kernel/auditfilter.c  14
-rw-r--r--  kernel/auditsc.c  279
-rw-r--r--  kernel/capability.c  288
-rw-r--r--  kernel/cgroup.c  318
-rw-r--r--  kernel/cgroup_debug.c  4
-rw-r--r--  kernel/cgroup_freezer.c  379
-rw-r--r--  kernel/compat.c  111
-rw-r--r--  kernel/configs.c  9
-rw-r--r--  kernel/cpu.c  5
-rw-r--r--  kernel/cpuset.c  50
-rw-r--r--  kernel/cred-internals.h  21
-rw-r--r--  kernel/cred.c  588
-rw-r--r--  kernel/delayacct.c  2
-rw-r--r--  kernel/dma.c  2
-rw-r--r--  kernel/exec_domain.c  33
-rw-r--r--  kernel/exit.c  82
-rw-r--r--  kernel/extable.c  21
-rw-r--r--  kernel/fork.c  204
-rw-r--r--  kernel/freezer.c  154
-rw-r--r--  kernel/futex.c  382
-rw-r--r--  kernel/futex_compat.c  7
-rw-r--r--  kernel/hrtimer.c  538
-rw-r--r--  kernel/irq/Makefile  1
-rw-r--r--  kernel/irq/autoprobe.c  48
-rw-r--r--  kernel/irq/chip.c  114
-rw-r--r--  kernel/irq/handle.c  210
-rw-r--r--  kernel/irq/internals.h  14
-rw-r--r--  kernel/irq/manage.c  206
-rw-r--r--  kernel/irq/migration.c  21
-rw-r--r--  kernel/irq/numa_migrate.c  122
-rw-r--r--  kernel/irq/proc.c  53
-rw-r--r--  kernel/irq/resend.c  6
-rw-r--r--  kernel/irq/spurious.c  167
-rw-r--r--  kernel/itimer.c  33
-rw-r--r--  kernel/kallsyms.c  34
-rw-r--r--  kernel/kexec.c  3
-rw-r--r--  kernel/kmod.c  97
-rw-r--r--  kernel/kprobes.c  25
-rw-r--r--  kernel/ksysfs.c  35
-rw-r--r--  kernel/kthread.c  13
-rw-r--r--  kernel/latencytop.c  2
-rw-r--r--  kernel/lockdep.c  82
-rw-r--r--  kernel/lockdep_proc.c  28
-rw-r--r--  kernel/marker.c  218
-rw-r--r--  kernel/module.c  395
-rw-r--r--  kernel/mutex.c  10
-rw-r--r--  kernel/notifier.c  10
-rw-r--r--  kernel/nsproxy.c  15
-rw-r--r--  kernel/panic.c  117
-rw-r--r--  kernel/params.c  276
-rw-r--r--  kernel/posix-cpu-timers.c  515
-rw-r--r--  kernel/posix-timers.c  181
-rw-r--r--  kernel/power/Kconfig  2
-rw-r--r--  kernel/power/disk.c  26
-rw-r--r--  kernel/power/main.c  14
-rw-r--r--  kernel/power/power.h  2
-rw-r--r--  kernel/power/process.c  119
-rw-r--r--  kernel/power/swap.c  14
-rw-r--r--  kernel/power/user.c  10
-rw-r--r--  kernel/printk.c  67
-rw-r--r--  kernel/profile.c  45
-rw-r--r--  kernel/ptrace.c  47
-rw-r--r--  kernel/rcuclassic.c  4
-rw-r--r--  kernel/rcupdate.c  19
-rw-r--r--  kernel/rcupreempt.c  12
-rw-r--r--  kernel/rcupreempt_trace.c  10
-rw-r--r--  kernel/rcutorture.c  68
-rw-r--r--  kernel/rcutree.c  1535
-rw-r--r--  kernel/rcutree_trace.c  271
-rw-r--r--  kernel/relay.c  16
-rw-r--r--  kernel/resource.c  107
-rw-r--r--  kernel/rtmutex.c  3
-rw-r--r--  kernel/sched.c  540
-rw-r--r--  kernel/sched_debug.c  105
-rw-r--r--  kernel/sched_fair.c  314
-rw-r--r--  kernel/sched_features.h  3
-rw-r--r--  kernel/sched_idletask.c  5
-rw-r--r--  kernel/sched_rt.c  18
-rw-r--r--  kernel/sched_stats.h  109
-rw-r--r--  kernel/signal.c  76
-rw-r--r--  kernel/smp.c  18
-rw-r--r--  kernel/softirq.c  176
-rw-r--r--  kernel/softlockup.c  6
-rw-r--r--  kernel/stacktrace.c  11
-rw-r--r--  kernel/stop_machine.c  123
-rw-r--r--  kernel/sys.c  705
-rw-r--r--  kernel/sys_ni.c  7
-rw-r--r--  kernel/sysctl.c  166
-rw-r--r--  kernel/sysctl_check.c  1
-rw-r--r--  kernel/time.c  18
-rw-r--r--  kernel/time/Kconfig  1
-rw-r--r--  kernel/time/clocksource.c  3
-rw-r--r--  kernel/time/jiffies.c  1
-rw-r--r--  kernel/time/ntp.c  100
-rw-r--r--  kernel/time/tick-broadcast.c  13
-rw-r--r--  kernel/time/tick-internal.h  2
-rw-r--r--  kernel/time/tick-sched.c  164
-rw-r--r--  kernel/time/timekeeping.c  144
-rw-r--r--  kernel/time/timer_list.c  28
-rw-r--r--  kernel/timer.c  149
-rw-r--r--  kernel/trace/Kconfig  194
-rw-r--r--  kernel/trace/Makefile  19
-rw-r--r--  kernel/trace/ftrace.c  1541
-rw-r--r--  kernel/trace/ring_buffer.c  2517
-rw-r--r--  kernel/trace/trace.c  2721
-rw-r--r--  kernel/trace/trace.h  470
-rw-r--r--  kernel/trace/trace_boot.c  186
-rw-r--r--  kernel/trace/trace_branch.c  342
-rw-r--r--  kernel/trace/trace_functions.c  32
-rw-r--r--  kernel/trace/trace_functions_graph.c  669
-rw-r--r--  kernel/trace/trace_hw_branches.c  195
-rw-r--r--  kernel/trace/trace_irqsoff.c  84
-rw-r--r--  kernel/trace/trace_mmiotrace.c  153
-rw-r--r--  kernel/trace/trace_nop.c  105
-rw-r--r--  kernel/trace/trace_power.c  179
-rw-r--r--  kernel/trace/trace_sched_switch.c  252
-rw-r--r--  kernel/trace/trace_sched_wakeup.c  220
-rw-r--r--  kernel/trace/trace_selftest.c  274
-rw-r--r--  kernel/trace/trace_stack.c  360
-rw-r--r--  kernel/trace/trace_sysprof.c  32
-rw-r--r--  kernel/tracepoint.c  576
-rw-r--r--  kernel/tsacct.c  6
-rw-r--r--  kernel/uid16.c  31
-rw-r--r--  kernel/user.c  98
-rw-r--r--  kernel/user_namespace.c  65
-rw-r--r--  kernel/utsname_sysctl.c  5
-rw-r--r--  kernel/wait.c  14
-rw-r--r--  kernel/workqueue.c  62
134 files changed, 17467 insertions(+), 6099 deletions(-)
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000000..a3bb4cb52539
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
1config FREEZER
2 def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
69config RCU_TRACE
70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
72 select DEBUG_FS
73 default y
74 help
75 This option provides tracing in RCU which presents stats
76 in debugfs for debugging RCU implementation.
77
78 Say Y here if you want to enable RCU tracing
79 Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df7c3e2..e1c5bf3365c0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,11 +9,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
13 13
14CFLAGS_REMOVE_sched.o = -mno-spe 14ifdef CONFIG_FUNCTION_TRACER
15
16ifdef CONFIG_FTRACE
17# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
18CFLAGS_REMOVE_lockdep.o = -pg 16CFLAGS_REMOVE_lockdep.o = -pg
19CFLAGS_REMOVE_lockdep_proc.o = -pg 17CFLAGS_REMOVE_lockdep_proc.o = -pg
@@ -21,9 +19,9 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_sched.o = -mno-spe -pg
25endif 22endif
26 23
24obj-$(CONFIG_FREEZER) += freezer.o
27obj-$(CONFIG_PROFILING) += profile.o 25obj-$(CONFIG_PROFILING) += profile.o
28obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 26obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
29obj-$(CONFIG_STACKTRACE) += stacktrace.o 27obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -55,6 +53,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 53obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 54obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 55obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
56obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 57obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 58obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
60obj-$(CONFIG_UTS_NS) += utsname.o 59obj-$(CONFIG_UTS_NS) += utsname.o
@@ -74,22 +73,23 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
74obj-$(CONFIG_SECCOMP) += seccomp.o 73obj-$(CONFIG_SECCOMP) += seccomp.o
75obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
76obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o 75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
76obj-$(CONFIG_TREE_RCU) += rcutree.o
77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
78ifeq ($(CONFIG_PREEMPT_RCU),y) 78obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
79obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o 79obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
80endif
81obj-$(CONFIG_RELAY) += relay.o 80obj-$(CONFIG_RELAY) += relay.o
82obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
84obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 83obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
85obj-$(CONFIG_MARKERS) += marker.o 84obj-$(CONFIG_MARKERS) += marker.o
85obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
86obj-$(CONFIG_LATENCYTOP) += latencytop.o 86obj-$(CONFIG_LATENCYTOP) += latencytop.o
87obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o 87obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
88obj-$(CONFIG_FTRACE) += trace/ 88obj-$(CONFIG_FUNCTION_TRACER) += trace/
89obj-$(CONFIG_TRACING) += trace/ 89obj-$(CONFIG_TRACING) += trace/
90obj-$(CONFIG_SMP) += sched_cpupri.o 90obj-$(CONFIG_SMP) += sched_cpupri.o
91 91
92ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 92ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
93# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 93# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
94# needed for x86 only. Why this used to be enabled for all architectures is beyond 94# needed for x86 only. Why this used to be enabled for all architectures is beyond
95# me. I suspect most platforms don't need this, but until we know that for sure 95# me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/acct.c b/kernel/acct.c
index f6006a60df5d..d57b7cbb98b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct,
530 do_div(elapsed, AHZ); 530 do_div(elapsed, AHZ);
531 ac.ac_btime = get_seconds() - elapsed; 531 ac.ac_btime = get_seconds() - elapsed;
532 /* we really need to bite the bullet and change layout */ 532 /* we really need to bite the bullet and change layout */
533 ac.ac_uid = current->uid; 533 current_uid_gid(&ac.ac_uid, &ac.ac_gid);
534 ac.ac_gid = current->gid;
535#if ACCT_VERSION==2 534#if ACCT_VERSION==2
536 ac.ac_ahz = AHZ; 535 ac.ac_ahz = AHZ;
537#endif 536#endif
538#if ACCT_VERSION==1 || ACCT_VERSION==2 537#if ACCT_VERSION==1 || ACCT_VERSION==2
539 /* backward-compatible 16 bit fields */ 538 /* backward-compatible 16 bit fields */
540 ac.ac_uid16 = current->uid; 539 ac.ac_uid16 = ac.ac_uid;
541 ac.ac_gid16 = current->gid; 540 ac.ac_gid16 = ac.ac_gid;
542#endif 541#endif
543#if ACCT_VERSION==3 542#if ACCT_VERSION==3
544 ac.ac_pid = task_tgid_nr_ns(current, ns); 543 ac.ac_pid = task_tgid_nr_ns(current, ns);
diff --git a/kernel/audit.c b/kernel/audit.c
index 4414e93d8750..ce6d8ea3131e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,8 +61,11 @@
61 61
62#include "audit.h" 62#include "audit.h"
63 63
64/* No auditing will take place until audit_initialized != 0. 64/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66#define AUDIT_DISABLED -1
67#define AUDIT_UNINITIALIZED 0
68#define AUDIT_INITIALIZED 1
66static int audit_initialized; 69static int audit_initialized;
67 70
68#define AUDIT_OFF 0 71#define AUDIT_OFF 0
@@ -965,6 +968,9 @@ static int __init audit_init(void)
965{ 968{
966 int i; 969 int i;
967 970
971 if (audit_initialized == AUDIT_DISABLED)
972 return 0;
973
968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 974 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
969 audit_default ? "enabled" : "disabled"); 975 audit_default ? "enabled" : "disabled");
970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, 976 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
@@ -976,7 +982,7 @@ static int __init audit_init(void)
976 982
977 skb_queue_head_init(&audit_skb_queue); 983 skb_queue_head_init(&audit_skb_queue);
978 skb_queue_head_init(&audit_skb_hold_queue); 984 skb_queue_head_init(&audit_skb_hold_queue);
979 audit_initialized = 1; 985 audit_initialized = AUDIT_INITIALIZED;
980 audit_enabled = audit_default; 986 audit_enabled = audit_default;
981 audit_ever_enabled |= !!audit_default; 987 audit_ever_enabled |= !!audit_default;
982 988
@@ -999,13 +1005,21 @@ __initcall(audit_init);
999static int __init audit_enable(char *str) 1005static int __init audit_enable(char *str)
1000{ 1006{
1001 audit_default = !!simple_strtol(str, NULL, 0); 1007 audit_default = !!simple_strtol(str, NULL, 0);
1002 printk(KERN_INFO "audit: %s%s\n", 1008 if (!audit_default)
1003 audit_default ? "enabled" : "disabled", 1009 audit_initialized = AUDIT_DISABLED;
1004 audit_initialized ? "" : " (after initialization)"); 1010
1005 if (audit_initialized) { 1011 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
1012
1013 if (audit_initialized == AUDIT_INITIALIZED) {
1006 audit_enabled = audit_default; 1014 audit_enabled = audit_default;
1007 audit_ever_enabled |= !!audit_default; 1015 audit_ever_enabled |= !!audit_default;
1016 } else if (audit_initialized == AUDIT_UNINITIALIZED) {
1017 printk(" (after initialization)");
1018 } else {
1019 printk(" (until reboot)");
1008 } 1020 }
1021 printk("\n");
1022
1009 return 1; 1023 return 1;
1010} 1024}
1011 1025
@@ -1107,9 +1121,7 @@ unsigned int audit_serial(void)
1107static inline void audit_get_stamp(struct audit_context *ctx, 1121static inline void audit_get_stamp(struct audit_context *ctx,
1108 struct timespec *t, unsigned int *serial) 1122 struct timespec *t, unsigned int *serial)
1109{ 1123{
1110 if (ctx) 1124 if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
1111 auditsc_get_stamp(ctx, t, serial);
1112 else {
1113 *t = CURRENT_TIME; 1125 *t = CURRENT_TIME;
1114 *serial = audit_serial(); 1126 *serial = audit_serial();
1115 } 1127 }
@@ -1146,7 +1158,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1146 int reserve; 1158 int reserve;
1147 unsigned long timeout_start = jiffies; 1159 unsigned long timeout_start = jiffies;
1148 1160
1149 if (!audit_initialized) 1161 if (audit_initialized != AUDIT_INITIALIZED)
1150 return NULL; 1162 return NULL;
1151 1163
1152 if (unlikely(audit_filter_type(type))) 1164 if (unlikely(audit_filter_type(type)))
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f7921a2ecf16..8b509441f49a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
24 struct list_head trees; /* with root here */ 24 struct list_head trees; /* with root here */
25 int dead; 25 int dead;
26 int count; 26 int count;
27 atomic_long_t refs;
27 struct rcu_head head; 28 struct rcu_head head;
28 struct node { 29 struct node {
29 struct list_head list; 30 struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
56 * tree is refcounted; one reference for "some rules on rules_list refer to 57 * tree is refcounted; one reference for "some rules on rules_list refer to
57 * it", one for each chunk with pointer to it. 58 * it", one for each chunk with pointer to it.
58 * 59 *
59 * chunk is refcounted by embedded inotify_watch. 60 * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
61 * of watch contributes 1 to .refs).
60 * 62 *
61 * node.index allows to get from node.list to containing chunk. 63 * node.index allows to get from node.list to containing chunk.
62 * MSB of that sucker is stolen to mark taggings that we might have to 64 * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
121 INIT_LIST_HEAD(&chunk->hash); 123 INIT_LIST_HEAD(&chunk->hash);
122 INIT_LIST_HEAD(&chunk->trees); 124 INIT_LIST_HEAD(&chunk->trees);
123 chunk->count = count; 125 chunk->count = count;
126 atomic_long_set(&chunk->refs, 1);
124 for (i = 0; i < count; i++) { 127 for (i = 0; i < count; i++) {
125 INIT_LIST_HEAD(&chunk->owners[i].list); 128 INIT_LIST_HEAD(&chunk->owners[i].list);
126 chunk->owners[i].index = i; 129 chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
129 return chunk; 132 return chunk;
130} 133}
131 134
132static void __free_chunk(struct rcu_head *rcu) 135static void free_chunk(struct audit_chunk *chunk)
133{ 136{
134 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
135 int i; 137 int i;
136 138
137 for (i = 0; i < chunk->count; i++) { 139 for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
141 kfree(chunk); 143 kfree(chunk);
142} 144}
143 145
144static inline void free_chunk(struct audit_chunk *chunk) 146void audit_put_chunk(struct audit_chunk *chunk)
145{ 147{
146 call_rcu(&chunk->head, __free_chunk); 148 if (atomic_long_dec_and_test(&chunk->refs))
149 free_chunk(chunk);
147} 150}
148 151
149void audit_put_chunk(struct audit_chunk *chunk) 152static void __put_chunk(struct rcu_head *rcu)
150{ 153{
151 put_inotify_watch(&chunk->watch); 154 struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
155 audit_put_chunk(chunk);
152} 156}
153 157
154enum {HASH_SIZE = 128}; 158enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
176 180
177 list_for_each_entry_rcu(p, list, hash) { 181 list_for_each_entry_rcu(p, list, hash) {
178 if (p->watch.inode == inode) { 182 if (p->watch.inode == inode) {
179 get_inotify_watch(&p->watch); 183 atomic_long_inc(&p->refs);
180 return p; 184 return p;
181 } 185 }
182 } 186 }
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
194 198
195/* tagging and untagging inodes with trees */ 199/* tagging and untagging inodes with trees */
196 200
197static void untag_chunk(struct audit_chunk *chunk, struct node *p) 201static struct audit_chunk *find_chunk(struct node *p)
202{
203 int index = p->index & ~(1U<<31);
204 p -= index;
205 return container_of(p, struct audit_chunk, owners[0]);
206}
207
208static void untag_chunk(struct node *p)
198{ 209{
210 struct audit_chunk *chunk = find_chunk(p);
199 struct audit_chunk *new; 211 struct audit_chunk *new;
200 struct audit_tree *owner; 212 struct audit_tree *owner;
201 int size = chunk->count - 1; 213 int size = chunk->count - 1;
202 int i, j; 214 int i, j;
203 215
216 if (!pin_inotify_watch(&chunk->watch)) {
217 /*
218 * Filesystem is shutting down; all watches are getting
219 * evicted, just take it off the node list for this
220 * tree and let the eviction logics take care of the
221 * rest.
222 */
223 owner = p->owner;
224 if (owner->root == chunk) {
225 list_del_init(&owner->same_root);
226 owner->root = NULL;
227 }
228 list_del_init(&p->list);
229 p->owner = NULL;
230 put_tree(owner);
231 return;
232 }
233
234 spin_unlock(&hash_lock);
235
236 /*
237 * pin_inotify_watch() succeeded, so the watch won't go away
238 * from under us.
239 */
204 mutex_lock(&chunk->watch.inode->inotify_mutex); 240 mutex_lock(&chunk->watch.inode->inotify_mutex);
205 if (chunk->dead) { 241 if (chunk->dead) {
206 mutex_unlock(&chunk->watch.inode->inotify_mutex); 242 mutex_unlock(&chunk->watch.inode->inotify_mutex);
207 return; 243 goto out;
208 } 244 }
209 245
210 owner = p->owner; 246 owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
221 inotify_evict_watch(&chunk->watch); 257 inotify_evict_watch(&chunk->watch);
222 mutex_unlock(&chunk->watch.inode->inotify_mutex); 258 mutex_unlock(&chunk->watch.inode->inotify_mutex);
223 put_inotify_watch(&chunk->watch); 259 put_inotify_watch(&chunk->watch);
224 return; 260 goto out;
225 } 261 }
226 262
227 new = alloc_chunk(size); 263 new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
263 inotify_evict_watch(&chunk->watch); 299 inotify_evict_watch(&chunk->watch);
264 mutex_unlock(&chunk->watch.inode->inotify_mutex); 300 mutex_unlock(&chunk->watch.inode->inotify_mutex);
265 put_inotify_watch(&chunk->watch); 301 put_inotify_watch(&chunk->watch);
266 return; 302 goto out;
267 303
268Fallback: 304Fallback:
269 // do the best we can 305 // do the best we can
@@ -277,6 +313,9 @@ Fallback:
277 put_tree(owner); 313 put_tree(owner);
278 spin_unlock(&hash_lock); 314 spin_unlock(&hash_lock);
279 mutex_unlock(&chunk->watch.inode->inotify_mutex); 315 mutex_unlock(&chunk->watch.inode->inotify_mutex);
316out:
317 unpin_inotify_watch(&chunk->watch);
318 spin_lock(&hash_lock);
280} 319}
281 320
282static int create_chunk(struct inode *inode, struct audit_tree *tree) 321static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
387 return 0; 426 return 0;
388} 427}
389 428
390static struct audit_chunk *find_chunk(struct node *p)
391{
392 int index = p->index & ~(1U<<31);
393 p -= index;
394 return container_of(p, struct audit_chunk, owners[0]);
395}
396
397static void kill_rules(struct audit_tree *tree) 429static void kill_rules(struct audit_tree *tree)
398{ 430{
399 struct audit_krule *rule, *next; 431 struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
431 spin_lock(&hash_lock); 463 spin_lock(&hash_lock);
432 while (!list_empty(&victim->chunks)) { 464 while (!list_empty(&victim->chunks)) {
433 struct node *p; 465 struct node *p;
434 struct audit_chunk *chunk;
435 466
436 p = list_entry(victim->chunks.next, struct node, list); 467 p = list_entry(victim->chunks.next, struct node, list);
437 chunk = find_chunk(p);
438 get_inotify_watch(&chunk->watch);
439 spin_unlock(&hash_lock);
440
441 untag_chunk(chunk, p);
442 468
443 put_inotify_watch(&chunk->watch); 469 untag_chunk(p);
444 spin_lock(&hash_lock);
445 } 470 }
446 spin_unlock(&hash_lock); 471 spin_unlock(&hash_lock);
447 put_tree(victim); 472 put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
469 494
470 while (!list_empty(&tree->chunks)) { 495 while (!list_empty(&tree->chunks)) {
471 struct node *node; 496 struct node *node;
472 struct audit_chunk *chunk;
473 497
474 node = list_entry(tree->chunks.next, struct node, list); 498 node = list_entry(tree->chunks.next, struct node, list);
475 499
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
477 if (!(node->index & (1U<<31))) 501 if (!(node->index & (1U<<31)))
478 break; 502 break;
479 503
480 chunk = find_chunk(node); 504 untag_chunk(node);
481 get_inotify_watch(&chunk->watch);
482 spin_unlock(&hash_lock);
483
484 untag_chunk(chunk, node);
485
486 put_inotify_watch(&chunk->watch);
487 spin_lock(&hash_lock);
488 } 505 }
489 if (!tree->root && !tree->goner) { 506 if (!tree->root && !tree->goner) {
490 tree->goner = 1; 507 tree->goner = 1;
@@ -532,7 +549,7 @@ void audit_trim_trees(void)
532 list_add(&cursor, &tree_list); 549 list_add(&cursor, &tree_list);
533 while (cursor.next != &tree_list) { 550 while (cursor.next != &tree_list) {
534 struct audit_tree *tree; 551 struct audit_tree *tree;
535 struct nameidata nd; 552 struct path path;
536 struct vfsmount *root_mnt; 553 struct vfsmount *root_mnt;
537 struct node *node; 554 struct node *node;
538 struct list_head list; 555 struct list_head list;
@@ -544,12 +561,12 @@ void audit_trim_trees(void)
544 list_add(&cursor, &tree->list); 561 list_add(&cursor, &tree->list);
545 mutex_unlock(&audit_filter_mutex); 562 mutex_unlock(&audit_filter_mutex);
546 563
547 err = path_lookup(tree->pathname, 0, &nd); 564 err = kern_path(tree->pathname, 0, &path);
548 if (err) 565 if (err)
549 goto skip_it; 566 goto skip_it;
550 567
551 root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); 568 root_mnt = collect_mounts(path.mnt, path.dentry);
552 path_put(&nd.path); 569 path_put(&path);
553 if (!root_mnt) 570 if (!root_mnt)
554 goto skip_it; 571 goto skip_it;
555 572
@@ -580,19 +597,19 @@ skip_it:
580} 597}
581 598
582static int is_under(struct vfsmount *mnt, struct dentry *dentry, 599static int is_under(struct vfsmount *mnt, struct dentry *dentry,
583 struct nameidata *nd) 600 struct path *path)
584{ 601{
585 if (mnt != nd->path.mnt) { 602 if (mnt != path->mnt) {
586 for (;;) { 603 for (;;) {
587 if (mnt->mnt_parent == mnt) 604 if (mnt->mnt_parent == mnt)
588 return 0; 605 return 0;
589 if (mnt->mnt_parent == nd->path.mnt) 606 if (mnt->mnt_parent == path->mnt)
590 break; 607 break;
591 mnt = mnt->mnt_parent; 608 mnt = mnt->mnt_parent;
592 } 609 }
593 dentry = mnt->mnt_mountpoint; 610 dentry = mnt->mnt_mountpoint;
594 } 611 }
595 return is_subdir(dentry, nd->path.dentry); 612 return is_subdir(dentry, path->dentry);
596} 613}
597 614
598int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 615int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -618,7 +635,7 @@ void audit_put_tree(struct audit_tree *tree)
618int audit_add_tree_rule(struct audit_krule *rule) 635int audit_add_tree_rule(struct audit_krule *rule)
619{ 636{
620 struct audit_tree *seed = rule->tree, *tree; 637 struct audit_tree *seed = rule->tree, *tree;
621 struct nameidata nd; 638 struct path path;
622 struct vfsmount *mnt, *p; 639 struct vfsmount *mnt, *p;
623 struct list_head list; 640 struct list_head list;
624 int err; 641 int err;
@@ -637,11 +654,11 @@ int audit_add_tree_rule(struct audit_krule *rule)
637 /* do not set rule->tree yet */ 654 /* do not set rule->tree yet */
638 mutex_unlock(&audit_filter_mutex); 655 mutex_unlock(&audit_filter_mutex);
639 656
640 err = path_lookup(tree->pathname, 0, &nd); 657 err = kern_path(tree->pathname, 0, &path);
641 if (err) 658 if (err)
642 goto Err; 659 goto Err;
643 mnt = collect_mounts(nd.path.mnt, nd.path.dentry); 660 mnt = collect_mounts(path.mnt, path.dentry);
644 path_put(&nd.path); 661 path_put(&path);
645 if (!mnt) { 662 if (!mnt) {
646 err = -ENOMEM; 663 err = -ENOMEM;
647 goto Err; 664 goto Err;
@@ -690,29 +707,29 @@ int audit_tag_tree(char *old, char *new)
690{ 707{
691 struct list_head cursor, barrier; 708 struct list_head cursor, barrier;
692 int failed = 0; 709 int failed = 0;
693 struct nameidata nd; 710 struct path path;
694 struct vfsmount *tagged; 711 struct vfsmount *tagged;
695 struct list_head list; 712 struct list_head list;
696 struct vfsmount *mnt; 713 struct vfsmount *mnt;
697 struct dentry *dentry; 714 struct dentry *dentry;
698 int err; 715 int err;
699 716
700 err = path_lookup(new, 0, &nd); 717 err = kern_path(new, 0, &path);
701 if (err) 718 if (err)
702 return err; 719 return err;
703 tagged = collect_mounts(nd.path.mnt, nd.path.dentry); 720 tagged = collect_mounts(path.mnt, path.dentry);
704 path_put(&nd.path); 721 path_put(&path);
705 if (!tagged) 722 if (!tagged)
706 return -ENOMEM; 723 return -ENOMEM;
707 724
708 err = path_lookup(old, 0, &nd); 725 err = kern_path(old, 0, &path);
709 if (err) { 726 if (err) {
710 drop_collected_mounts(tagged); 727 drop_collected_mounts(tagged);
711 return err; 728 return err;
712 } 729 }
713 mnt = mntget(nd.path.mnt); 730 mnt = mntget(path.mnt);
714 dentry = dget(nd.path.dentry); 731 dentry = dget(path.dentry);
715 path_put(&nd.path); 732 path_put(&path);
716 733
717 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) 734 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
718 follow_up(&mnt, &dentry); 735 follow_up(&mnt, &dentry);
@@ -733,7 +750,7 @@ int audit_tag_tree(char *old, char *new)
733 list_add(&cursor, &tree->list); 750 list_add(&cursor, &tree->list);
734 mutex_unlock(&audit_filter_mutex); 751 mutex_unlock(&audit_filter_mutex);
735 752
736 err = path_lookup(tree->pathname, 0, &nd); 753 err = kern_path(tree->pathname, 0, &path);
737 if (err) { 754 if (err) {
738 put_tree(tree); 755 put_tree(tree);
739 mutex_lock(&audit_filter_mutex); 756 mutex_lock(&audit_filter_mutex);
@@ -741,15 +758,15 @@ int audit_tag_tree(char *old, char *new)
741 } 758 }
742 759
743 spin_lock(&vfsmount_lock); 760 spin_lock(&vfsmount_lock);
744 if (!is_under(mnt, dentry, &nd)) { 761 if (!is_under(mnt, dentry, &path)) {
745 spin_unlock(&vfsmount_lock); 762 spin_unlock(&vfsmount_lock);
746 path_put(&nd.path); 763 path_put(&path);
747 put_tree(tree); 764 put_tree(tree);
748 mutex_lock(&audit_filter_mutex); 765 mutex_lock(&audit_filter_mutex);
749 continue; 766 continue;
750 } 767 }
751 spin_unlock(&vfsmount_lock); 768 spin_unlock(&vfsmount_lock);
752 path_put(&nd.path); 769 path_put(&path);
753 770
754 list_for_each_entry(p, &list, mnt_list) { 771 list_for_each_entry(p, &list, mnt_list) {
755 failed = tag_chunk(p->mnt_root->d_inode, tree); 772 failed = tag_chunk(p->mnt_root->d_inode, tree);
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
878static void destroy_watch(struct inotify_watch *watch) 895static void destroy_watch(struct inotify_watch *watch)
879{ 896{
880 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); 897 struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
881 free_chunk(chunk); 898 call_rcu(&chunk->head, __put_chunk);
882} 899}
883 900
884static const struct inotify_operations rtree_inotify_ops = { 901static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0ef..9fd85a4640a0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
1094 list_for_each_entry_safe(p, n, in_list, ilist) { 1094 list_for_each_entry_safe(p, n, in_list, ilist) {
1095 list_del(&p->ilist); 1095 list_del(&p->ilist);
1096 inotify_rm_watch(audit_ih, &p->wdata); 1096 inotify_rm_watch(audit_ih, &p->wdata);
1097 /* the put matching the get in audit_do_del_rule() */ 1097 /* the unpin matching the pin in audit_do_del_rule() */
1098 put_inotify_watch(&p->wdata); 1098 unpin_inotify_watch(&p->wdata);
1099 } 1099 }
1100} 1100}
1101 1101
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
1389 /* Put parent on the inotify un-registration 1389 /* Put parent on the inotify un-registration
1390 * list. Grab a reference before releasing 1390 * list. Grab a reference before releasing
1391 * audit_filter_mutex, to be released in 1391 * audit_filter_mutex, to be released in
1392 * audit_inotify_unregister(). */ 1392 * audit_inotify_unregister().
1393 list_add(&parent->ilist, &inotify_list); 1393 * If filesystem is going away, just leave
1394 get_inotify_watch(&parent->wdata); 1394 * the sucker alone, eviction will take
1395 * care of it.
1396 */
1397 if (pin_inotify_watch(&parent->wdata))
1398 list_add(&parent->ilist, &inotify_list);
1395 } 1399 }
1396 } 1400 }
1397 } 1401 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c3..4819f3711973 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,6 +65,7 @@
65#include <linux/highmem.h> 65#include <linux/highmem.h>
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h>
68 69
69#include "audit.h" 70#include "audit.h"
70 71
@@ -84,6 +85,15 @@ int audit_n_rules;
84/* determines whether we collect data for signals sent */ 85/* determines whether we collect data for signals sent */
85int audit_signals; 86int audit_signals;
86 87
88struct audit_cap_data {
89 kernel_cap_t permitted;
90 kernel_cap_t inheritable;
91 union {
92 unsigned int fE; /* effective bit of a file capability */
93 kernel_cap_t effective; /* effective set of a process */
94 };
95};
96
87/* When fs/namei.c:getname() is called, we store the pointer in name and 97/* When fs/namei.c:getname() is called, we store the pointer in name and
88 * we don't let putname() free it (instead we free all of the saved 98 * we don't let putname() free it (instead we free all of the saved
89 * pointers at syscall exit time). 99 * pointers at syscall exit time).
@@ -100,6 +110,8 @@ struct audit_names {
100 gid_t gid; 110 gid_t gid;
101 dev_t rdev; 111 dev_t rdev;
102 u32 osid; 112 u32 osid;
113 struct audit_cap_data fcap;
114 unsigned int fcap_ver;
103}; 115};
104 116
105struct audit_aux_data { 117struct audit_aux_data {
@@ -184,6 +196,20 @@ struct audit_aux_data_pids {
184 int pid_count; 196 int pid_count;
185}; 197};
186 198
199struct audit_aux_data_bprm_fcaps {
200 struct audit_aux_data d;
201 struct audit_cap_data fcap;
202 unsigned int fcap_ver;
203 struct audit_cap_data old_pcap;
204 struct audit_cap_data new_pcap;
205};
206
207struct audit_aux_data_capset {
208 struct audit_aux_data d;
209 pid_t pid;
210 struct audit_cap_data cap;
211};
212
187struct audit_tree_refs { 213struct audit_tree_refs {
188 struct audit_tree_refs *next; 214 struct audit_tree_refs *next;
189 struct audit_chunk *c[31]; 215 struct audit_chunk *c[31];
@@ -421,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
421 struct audit_names *name, 447 struct audit_names *name,
422 enum audit_state *state) 448 enum audit_state *state)
423{ 449{
450 const struct cred *cred = get_task_cred(tsk);
424 int i, j, need_sid = 1; 451 int i, j, need_sid = 1;
425 u32 sid; 452 u32 sid;
426 453
@@ -440,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk,
440 } 467 }
441 break; 468 break;
442 case AUDIT_UID: 469 case AUDIT_UID:
443 result = audit_comparator(tsk->uid, f->op, f->val); 470 result = audit_comparator(cred->uid, f->op, f->val);
444 break; 471 break;
445 case AUDIT_EUID: 472 case AUDIT_EUID:
446 result = audit_comparator(tsk->euid, f->op, f->val); 473 result = audit_comparator(cred->euid, f->op, f->val);
447 break; 474 break;
448 case AUDIT_SUID: 475 case AUDIT_SUID:
449 result = audit_comparator(tsk->suid, f->op, f->val); 476 result = audit_comparator(cred->suid, f->op, f->val);
450 break; 477 break;
451 case AUDIT_FSUID: 478 case AUDIT_FSUID:
452 result = audit_comparator(tsk->fsuid, f->op, f->val); 479 result = audit_comparator(cred->fsuid, f->op, f->val);
453 break; 480 break;
454 case AUDIT_GID: 481 case AUDIT_GID:
455 result = audit_comparator(tsk->gid, f->op, f->val); 482 result = audit_comparator(cred->gid, f->op, f->val);
456 break; 483 break;
457 case AUDIT_EGID: 484 case AUDIT_EGID:
458 result = audit_comparator(tsk->egid, f->op, f->val); 485 result = audit_comparator(cred->egid, f->op, f->val);
459 break; 486 break;
460 case AUDIT_SGID: 487 case AUDIT_SGID:
461 result = audit_comparator(tsk->sgid, f->op, f->val); 488 result = audit_comparator(cred->sgid, f->op, f->val);
462 break; 489 break;
463 case AUDIT_FSGID: 490 case AUDIT_FSGID:
464 result = audit_comparator(tsk->fsgid, f->op, f->val); 491 result = audit_comparator(cred->fsgid, f->op, f->val);
465 break; 492 break;
466 case AUDIT_PERS: 493 case AUDIT_PERS:
467 result = audit_comparator(tsk->personality, f->op, f->val); 494 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -615,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk,
615 break; 642 break;
616 } 643 }
617 644
618 if (!result) 645 if (!result) {
646 put_cred(cred);
619 return 0; 647 return 0;
648 }
620 } 649 }
621 if (rule->filterkey && ctx) 650 if (rule->filterkey && ctx)
622 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 651 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -624,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk,
624 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 653 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
625 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 654 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
626 } 655 }
656 put_cred(cred);
627 return 1; 657 return 1;
628} 658}
629 659
@@ -1171,8 +1201,38 @@ static void audit_log_execve_info(struct audit_context *context,
1171 kfree(buf); 1201 kfree(buf);
1172} 1202}
1173 1203
1204static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1205{
1206 int i;
1207
1208 audit_log_format(ab, " %s=", prefix);
1209 CAP_FOR_EACH_U32(i) {
1210 audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
1211 }
1212}
1213
1214static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1215{
1216 kernel_cap_t *perm = &name->fcap.permitted;
1217 kernel_cap_t *inh = &name->fcap.inheritable;
1218 int log = 0;
1219
1220 if (!cap_isclear(*perm)) {
1221 audit_log_cap(ab, "cap_fp", perm);
1222 log = 1;
1223 }
1224 if (!cap_isclear(*inh)) {
1225 audit_log_cap(ab, "cap_fi", inh);
1226 log = 1;
1227 }
1228
1229 if (log)
1230 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1231}
1232
1174static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1233static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1175{ 1234{
1235 const struct cred *cred;
1176 int i, call_panic = 0; 1236 int i, call_panic = 0;
1177 struct audit_buffer *ab; 1237 struct audit_buffer *ab;
1178 struct audit_aux_data *aux; 1238 struct audit_aux_data *aux;
@@ -1182,14 +1242,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1182 context->pid = tsk->pid; 1242 context->pid = tsk->pid;
1183 if (!context->ppid) 1243 if (!context->ppid)
1184 context->ppid = sys_getppid(); 1244 context->ppid = sys_getppid();
1185 context->uid = tsk->uid; 1245 cred = current_cred();
1186 context->gid = tsk->gid; 1246 context->uid = cred->uid;
1187 context->euid = tsk->euid; 1247 context->gid = cred->gid;
1188 context->suid = tsk->suid; 1248 context->euid = cred->euid;
1189 context->fsuid = tsk->fsuid; 1249 context->suid = cred->suid;
1190 context->egid = tsk->egid; 1250 context->fsuid = cred->fsuid;
1191 context->sgid = tsk->sgid; 1251 context->egid = cred->egid;
1192 context->fsgid = tsk->fsgid; 1252 context->sgid = cred->sgid;
1253 context->fsgid = cred->fsgid;
1193 context->personality = tsk->personality; 1254 context->personality = tsk->personality;
1194 1255
1195 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1256 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1334,6 +1395,28 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1334 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); 1395 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
1335 break; } 1396 break; }
1336 1397
1398 case AUDIT_BPRM_FCAPS: {
1399 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1400 audit_log_format(ab, "fver=%x", axs->fcap_ver);
1401 audit_log_cap(ab, "fp", &axs->fcap.permitted);
1402 audit_log_cap(ab, "fi", &axs->fcap.inheritable);
1403 audit_log_format(ab, " fe=%d", axs->fcap.fE);
1404 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
1405 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
1406 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
1407 audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
1408 audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
1409 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1410 break; }
1411
1412 case AUDIT_CAPSET: {
1413 struct audit_aux_data_capset *axs = (void *)aux;
1414 audit_log_format(ab, "pid=%d", axs->pid);
1415 audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
1416 audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
1417 audit_log_cap(ab, "cap_pe", &axs->cap.effective);
1418 break; }
1419
1337 } 1420 }
1338 audit_log_end(ab); 1421 audit_log_end(ab);
1339 } 1422 }
@@ -1421,6 +1504,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1421 } 1504 }
1422 } 1505 }
1423 1506
1507 audit_log_fcaps(ab, n);
1508
1424 audit_log_end(ab); 1509 audit_log_end(ab);
1425 } 1510 }
1426 1511
@@ -1459,7 +1544,6 @@ void audit_free(struct task_struct *tsk)
1459 1544
1460/** 1545/**
1461 * audit_syscall_entry - fill in an audit record at syscall entry 1546 * audit_syscall_entry - fill in an audit record at syscall entry
1462 * @tsk: task being audited
1463 * @arch: architecture type 1547 * @arch: architecture type
1464 * @major: major syscall type (function) 1548 * @major: major syscall type (function)
1465 * @a1: additional syscall register 1 1549 * @a1: additional syscall register 1
@@ -1548,9 +1632,25 @@ void audit_syscall_entry(int arch, int major,
1548 context->ppid = 0; 1632 context->ppid = 0;
1549} 1633}
1550 1634
1635void audit_finish_fork(struct task_struct *child)
1636{
1637 struct audit_context *ctx = current->audit_context;
1638 struct audit_context *p = child->audit_context;
1639 if (!p || !ctx || !ctx->auditable)
1640 return;
1641 p->arch = ctx->arch;
1642 p->major = ctx->major;
1643 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1644 p->ctime = ctx->ctime;
1645 p->dummy = ctx->dummy;
1646 p->auditable = ctx->auditable;
1647 p->in_syscall = ctx->in_syscall;
1648 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1649 p->ppid = current->pid;
1650}
1651
1551/** 1652/**
1552 * audit_syscall_exit - deallocate audit context after a system call 1653 * audit_syscall_exit - deallocate audit context after a system call
1553 * @tsk: task being audited
1554 * @valid: success/failure flag 1654 * @valid: success/failure flag
1555 * @return_code: syscall return value 1655 * @return_code: syscall return value
1556 * 1656 *
@@ -1787,8 +1887,36 @@ static int audit_inc_name_count(struct audit_context *context,
1787 return 0; 1887 return 0;
1788} 1888}
1789 1889
1890
1891static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1892{
1893 struct cpu_vfs_cap_data caps;
1894 int rc;
1895
1896 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1897 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1898 name->fcap.fE = 0;
1899 name->fcap_ver = 0;
1900
1901 if (!dentry)
1902 return 0;
1903
1904 rc = get_vfs_caps_from_disk(dentry, &caps);
1905 if (rc)
1906 return rc;
1907
1908 name->fcap.permitted = caps.permitted;
1909 name->fcap.inheritable = caps.inheritable;
1910 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
1911 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
1912
1913 return 0;
1914}
1915
1916
1790/* Copy inode data into an audit_names. */ 1917/* Copy inode data into an audit_names. */
1791static void audit_copy_inode(struct audit_names *name, const struct inode *inode) 1918static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
1919 const struct inode *inode)
1792{ 1920{
1793 name->ino = inode->i_ino; 1921 name->ino = inode->i_ino;
1794 name->dev = inode->i_sb->s_dev; 1922 name->dev = inode->i_sb->s_dev;
@@ -1797,6 +1925,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
1797 name->gid = inode->i_gid; 1925 name->gid = inode->i_gid;
1798 name->rdev = inode->i_rdev; 1926 name->rdev = inode->i_rdev;
1799 security_inode_getsecid(inode, &name->osid); 1927 security_inode_getsecid(inode, &name->osid);
1928 audit_copy_fcaps(name, dentry);
1800} 1929}
1801 1930
1802/** 1931/**
@@ -1831,7 +1960,7 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1831 context->names[idx].name = NULL; 1960 context->names[idx].name = NULL;
1832 } 1961 }
1833 handle_path(dentry); 1962 handle_path(dentry);
1834 audit_copy_inode(&context->names[idx], inode); 1963 audit_copy_inode(&context->names[idx], dentry, inode);
1835} 1964}
1836 1965
1837/** 1966/**
@@ -1892,7 +2021,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
1892 if (!strcmp(dname, n->name) || 2021 if (!strcmp(dname, n->name) ||
1893 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2022 !audit_compare_dname_path(dname, n->name, &dirlen)) {
1894 if (inode) 2023 if (inode)
1895 audit_copy_inode(n, inode); 2024 audit_copy_inode(n, NULL, inode);
1896 else 2025 else
1897 n->ino = (unsigned long)-1; 2026 n->ino = (unsigned long)-1;
1898 found_child = n->name; 2027 found_child = n->name;
@@ -1906,7 +2035,7 @@ add_names:
1906 return; 2035 return;
1907 idx = context->name_count - 1; 2036 idx = context->name_count - 1;
1908 context->names[idx].name = NULL; 2037 context->names[idx].name = NULL;
1909 audit_copy_inode(&context->names[idx], parent); 2038 audit_copy_inode(&context->names[idx], NULL, parent);
1910 } 2039 }
1911 2040
1912 if (!found_child) { 2041 if (!found_child) {
@@ -1927,7 +2056,7 @@ add_names:
1927 } 2056 }
1928 2057
1929 if (inode) 2058 if (inode)
1930 audit_copy_inode(&context->names[idx], inode); 2059 audit_copy_inode(&context->names[idx], NULL, inode);
1931 else 2060 else
1932 context->names[idx].ino = (unsigned long)-1; 2061 context->names[idx].ino = (unsigned long)-1;
1933 } 2062 }
@@ -1942,15 +2071,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
1942 * 2071 *
1943 * Also sets the context as auditable. 2072 * Also sets the context as auditable.
1944 */ 2073 */
1945void auditsc_get_stamp(struct audit_context *ctx, 2074int auditsc_get_stamp(struct audit_context *ctx,
1946 struct timespec *t, unsigned int *serial) 2075 struct timespec *t, unsigned int *serial)
1947{ 2076{
2077 if (!ctx->in_syscall)
2078 return 0;
1948 if (!ctx->serial) 2079 if (!ctx->serial)
1949 ctx->serial = audit_serial(); 2080 ctx->serial = audit_serial();
1950 t->tv_sec = ctx->ctime.tv_sec; 2081 t->tv_sec = ctx->ctime.tv_sec;
1951 t->tv_nsec = ctx->ctime.tv_nsec; 2082 t->tv_nsec = ctx->ctime.tv_nsec;
1952 *serial = ctx->serial; 2083 *serial = ctx->serial;
1953 ctx->auditable = 1; 2084 ctx->auditable = 1;
2085 return 1;
1954} 2086}
1955 2087
1956/* global counter which is incremented every time something logs in */ 2088/* global counter which is incremented every time something logs in */
@@ -1978,7 +2110,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1978 audit_log_format(ab, "login pid=%d uid=%u " 2110 audit_log_format(ab, "login pid=%d uid=%u "
1979 "old auid=%u new auid=%u" 2111 "old auid=%u new auid=%u"
1980 " old ses=%u new ses=%u", 2112 " old ses=%u new ses=%u",
1981 task->pid, task->uid, 2113 task->pid, task_uid(task),
1982 task->loginuid, loginuid, 2114 task->loginuid, loginuid,
1983 task->sessionid, sessionid); 2115 task->sessionid, sessionid);
1984 audit_log_end(ab); 2116 audit_log_end(ab);
@@ -2361,7 +2493,7 @@ void __audit_ptrace(struct task_struct *t)
2361 2493
2362 context->target_pid = t->pid; 2494 context->target_pid = t->pid;
2363 context->target_auid = audit_get_loginuid(t); 2495 context->target_auid = audit_get_loginuid(t);
2364 context->target_uid = t->uid; 2496 context->target_uid = task_uid(t);
2365 context->target_sessionid = audit_get_sessionid(t); 2497 context->target_sessionid = audit_get_sessionid(t);
2366 security_task_getsecid(t, &context->target_sid); 2498 security_task_getsecid(t, &context->target_sid);
2367 memcpy(context->target_comm, t->comm, TASK_COMM_LEN); 2499 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2380,6 +2512,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2380 struct audit_aux_data_pids *axp; 2512 struct audit_aux_data_pids *axp;
2381 struct task_struct *tsk = current; 2513 struct task_struct *tsk = current;
2382 struct audit_context *ctx = tsk->audit_context; 2514 struct audit_context *ctx = tsk->audit_context;
2515 uid_t uid = current_uid(), t_uid = task_uid(t);
2383 2516
2384 if (audit_pid && t->tgid == audit_pid) { 2517 if (audit_pid && t->tgid == audit_pid) {
2385 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2518 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
@@ -2387,7 +2520,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2387 if (tsk->loginuid != -1) 2520 if (tsk->loginuid != -1)
2388 audit_sig_uid = tsk->loginuid; 2521 audit_sig_uid = tsk->loginuid;
2389 else 2522 else
2390 audit_sig_uid = tsk->uid; 2523 audit_sig_uid = uid;
2391 security_task_getsecid(tsk, &audit_sig_sid); 2524 security_task_getsecid(tsk, &audit_sig_sid);
2392 } 2525 }
2393 if (!audit_signals || audit_dummy_context()) 2526 if (!audit_signals || audit_dummy_context())
@@ -2399,7 +2532,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2399 if (!ctx->target_pid) { 2532 if (!ctx->target_pid) {
2400 ctx->target_pid = t->tgid; 2533 ctx->target_pid = t->tgid;
2401 ctx->target_auid = audit_get_loginuid(t); 2534 ctx->target_auid = audit_get_loginuid(t);
2402 ctx->target_uid = t->uid; 2535 ctx->target_uid = t_uid;
2403 ctx->target_sessionid = audit_get_sessionid(t); 2536 ctx->target_sessionid = audit_get_sessionid(t);
2404 security_task_getsecid(t, &ctx->target_sid); 2537 security_task_getsecid(t, &ctx->target_sid);
2405 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); 2538 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2420,7 +2553,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2420 2553
2421 axp->target_pid[axp->pid_count] = t->tgid; 2554 axp->target_pid[axp->pid_count] = t->tgid;
2422 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2555 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2423 axp->target_uid[axp->pid_count] = t->uid; 2556 axp->target_uid[axp->pid_count] = t_uid;
2424 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2557 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2425 security_task_getsecid(t, &axp->target_sid[axp->pid_count]); 2558 security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
2426 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); 2559 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
@@ -2430,6 +2563,86 @@ int __audit_signal_info(int sig, struct task_struct *t)
2430} 2563}
2431 2564
2432/** 2565/**
2566 * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
2567 * @bprm: pointer to the bprm being processed
2568 * @new: the proposed new credentials
2569 * @old: the old credentials
2570 *
2571 * Simply check if the proc already has the caps given by the file and if not
2572 * store the priv escalation info for later auditing at the end of the syscall
2573 *
2574 * -Eric
2575 */
2576int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2577 const struct cred *new, const struct cred *old)
2578{
2579 struct audit_aux_data_bprm_fcaps *ax;
2580 struct audit_context *context = current->audit_context;
2581 struct cpu_vfs_cap_data vcaps;
2582 struct dentry *dentry;
2583
2584 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2585 if (!ax)
2586 return -ENOMEM;
2587
2588 ax->d.type = AUDIT_BPRM_FCAPS;
2589 ax->d.next = context->aux;
2590 context->aux = (void *)ax;
2591
2592 dentry = dget(bprm->file->f_dentry);
2593 get_vfs_caps_from_disk(dentry, &vcaps);
2594 dput(dentry);
2595
2596 ax->fcap.permitted = vcaps.permitted;
2597 ax->fcap.inheritable = vcaps.inheritable;
2598 ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2599 ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
2600
2601 ax->old_pcap.permitted = old->cap_permitted;
2602 ax->old_pcap.inheritable = old->cap_inheritable;
2603 ax->old_pcap.effective = old->cap_effective;
2604
2605 ax->new_pcap.permitted = new->cap_permitted;
2606 ax->new_pcap.inheritable = new->cap_inheritable;
2607 ax->new_pcap.effective = new->cap_effective;
2608 return 0;
2609}
2610
2611/**
2612 * __audit_log_capset - store information about the arguments to the capset syscall
2613 * @pid: target pid of the capset call
2614 * @new: the new credentials
2615 * @old: the old (current) credentials
2616 *
2617 * Record the aguments userspace sent to sys_capset for later printing by the
2618 * audit system if applicable
2619 */
2620int __audit_log_capset(pid_t pid,
2621 const struct cred *new, const struct cred *old)
2622{
2623 struct audit_aux_data_capset *ax;
2624 struct audit_context *context = current->audit_context;
2625
2626 if (likely(!audit_enabled || !context || context->dummy))
2627 return 0;
2628
2629 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2630 if (!ax)
2631 return -ENOMEM;
2632
2633 ax->d.type = AUDIT_CAPSET;
2634 ax->d.next = context->aux;
2635 context->aux = (void *)ax;
2636
2637 ax->pid = pid;
2638 ax->cap.effective = new->cap_effective;
2639 ax->cap.inheritable = new->cap_effective;
2640 ax->cap.permitted = new->cap_permitted;
2641
2642 return 0;
2643}
2644
2645/**
2433 * audit_core_dumps - record information about processes that end abnormally 2646 * audit_core_dumps - record information about processes that end abnormally
2434 * @signr: signal value 2647 * @signr: signal value
2435 * 2648 *
@@ -2440,7 +2653,8 @@ void audit_core_dumps(long signr)
2440{ 2653{
2441 struct audit_buffer *ab; 2654 struct audit_buffer *ab;
2442 u32 sid; 2655 u32 sid;
2443 uid_t auid = audit_get_loginuid(current); 2656 uid_t auid = audit_get_loginuid(current), uid;
2657 gid_t gid;
2444 unsigned int sessionid = audit_get_sessionid(current); 2658 unsigned int sessionid = audit_get_sessionid(current);
2445 2659
2446 if (!audit_enabled) 2660 if (!audit_enabled)
@@ -2450,8 +2664,9 @@ void audit_core_dumps(long signr)
2450 return; 2664 return;
2451 2665
2452 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2667 current_uid_gid(&uid, &gid);
2453 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2668 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2454 auid, current->uid, current->gid, sessionid); 2669 auid, uid, gid, sessionid);
2455 security_task_getsecid(current, &sid); 2670 security_task_getsecid(current, &sid);
2456 if (sid) { 2671 if (sid) {
2457 char *ctx = NULL; 2672 char *ctx = NULL;
diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d8..36b4b4daebec 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#include <linux/audit.h>
10#include <linux/capability.h> 11#include <linux/capability.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
@@ -14,12 +15,7 @@
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
15#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18#include "cred-internals.h"
18/*
19 * This lock protects task->cap_* for all tasks including current.
20 * Locking rule: acquire this prior to tasklist_lock.
21 */
22static DEFINE_SPINLOCK(task_capability_lock);
23 19
24/* 20/*
25 * Leveraged for setting/resetting capabilities 21 * Leveraged for setting/resetting capabilities
@@ -33,6 +29,17 @@ EXPORT_SYMBOL(__cap_empty_set);
33EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
34EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
35 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1;
34
35static int __init file_caps_disable(char *str)
36{
37 file_caps_enabled = 0;
38 return 1;
39}
40__setup("no_file_caps", file_caps_disable);
41#endif
42
36/* 43/*
37 * More recent versions of libcap are available from: 44 * More recent versions of libcap are available from:
38 * 45 *
@@ -115,167 +122,12 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
115 return 0; 122 return 0;
116} 123}
117 124
118#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
119
120/*
121 * Without filesystem capability support, we nominally support one process
122 * setting the capabilities of another
123 */
124static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
125 kernel_cap_t *pIp, kernel_cap_t *pPp)
126{
127 struct task_struct *target;
128 int ret;
129
130 spin_lock(&task_capability_lock);
131 read_lock(&tasklist_lock);
132
133 if (pid && pid != task_pid_vnr(current)) {
134 target = find_task_by_vpid(pid);
135 if (!target) {
136 ret = -ESRCH;
137 goto out;
138 }
139 } else
140 target = current;
141
142 ret = security_capget(target, pEp, pIp, pPp);
143
144out:
145 read_unlock(&tasklist_lock);
146 spin_unlock(&task_capability_lock);
147
148 return ret;
149}
150
151/*
152 * cap_set_pg - set capabilities for all processes in a given process
153 * group. We call this holding task_capability_lock and tasklist_lock.
154 */
155static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
156 kernel_cap_t *inheritable,
157 kernel_cap_t *permitted)
158{
159 struct task_struct *g, *target;
160 int ret = -EPERM;
161 int found = 0;
162 struct pid *pgrp;
163
164 spin_lock(&task_capability_lock);
165 read_lock(&tasklist_lock);
166
167 pgrp = find_vpid(pgrp_nr);
168 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
169 target = g;
170 while_each_thread(g, target) {
171 if (!security_capset_check(target, effective,
172 inheritable, permitted)) {
173 security_capset_set(target, effective,
174 inheritable, permitted);
175 ret = 0;
176 }
177 found = 1;
178 }
179 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
180
181 read_unlock(&tasklist_lock);
182 spin_unlock(&task_capability_lock);
183
184 if (!found)
185 ret = 0;
186 return ret;
187}
188
189/*
190 * cap_set_all - set capabilities for all processes other than init
191 * and self. We call this holding task_capability_lock and tasklist_lock.
192 */
193static inline int cap_set_all(kernel_cap_t *effective,
194 kernel_cap_t *inheritable,
195 kernel_cap_t *permitted)
196{
197 struct task_struct *g, *target;
198 int ret = -EPERM;
199 int found = 0;
200
201 spin_lock(&task_capability_lock);
202 read_lock(&tasklist_lock);
203
204 do_each_thread(g, target) {
205 if (target == current
206 || is_container_init(target->group_leader))
207 continue;
208 found = 1;
209 if (security_capset_check(target, effective, inheritable,
210 permitted))
211 continue;
212 ret = 0;
213 security_capset_set(target, effective, inheritable, permitted);
214 } while_each_thread(g, target);
215
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 if (!found)
220 ret = 0;
221
222 return ret;
223}
224
225/*
226 * Given the target pid does not refer to the current process we
227 * need more elaborate support... (This support is not present when
228 * filesystem capabilities are configured.)
229 */
230static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
231 kernel_cap_t *inheritable,
232 kernel_cap_t *permitted)
233{
234 struct task_struct *target;
235 int ret;
236
237 if (!capable(CAP_SETPCAP))
238 return -EPERM;
239
240 if (pid == -1) /* all procs other than current and init */
241 return cap_set_all(effective, inheritable, permitted);
242
243 else if (pid < 0) /* all procs in process group */
244 return cap_set_pg(-pid, effective, inheritable, permitted);
245
246 /* target != current */
247 spin_lock(&task_capability_lock);
248 read_lock(&tasklist_lock);
249
250 target = find_task_by_vpid(pid);
251 if (!target)
252 ret = -ESRCH;
253 else {
254 ret = security_capset_check(target, effective, inheritable,
255 permitted);
256
257 /* having verified that the proposed changes are legal,
258 we now put them into effect. */
259 if (!ret)
260 security_capset_set(target, effective, inheritable,
261 permitted);
262 }
263
264 read_unlock(&tasklist_lock);
265 spin_unlock(&task_capability_lock);
266
267 return ret;
268}
269
270#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
271
272/* 125/*
273 * If we have configured with filesystem capability support, then the 126 * The only thing that can change the capabilities of the current
274 * only thing that can change the capabilities of the current process 127 * process is the current process. As such, we can't be in this code
275 * is the current process. As such, we can't be in this code at the 128 * at the same time as we are in the process of setting capabilities
276 * same time as we are in the process of setting capabilities in this 129 * in this process. The net result is that we can limit our use of
277 * process. The net result is that we can limit our use of locks to 130 * locks to when we are reading the caps of another process.
278 * when we are reading the caps of another process.
279 */ 131 */
280static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, 132static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
281 kernel_cap_t *pIp, kernel_cap_t *pPp) 133 kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -285,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
285 if (pid && (pid != task_pid_vnr(current))) { 137 if (pid && (pid != task_pid_vnr(current))) {
286 struct task_struct *target; 138 struct task_struct *target;
287 139
288 spin_lock(&task_capability_lock);
289 read_lock(&tasklist_lock); 140 read_lock(&tasklist_lock);
290 141
291 target = find_task_by_vpid(pid); 142 target = find_task_by_vpid(pid);
@@ -295,50 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
295 ret = security_capget(target, pEp, pIp, pPp); 146 ret = security_capget(target, pEp, pIp, pPp);
296 147
297 read_unlock(&tasklist_lock); 148 read_unlock(&tasklist_lock);
298 spin_unlock(&task_capability_lock);
299 } else 149 } else
300 ret = security_capget(current, pEp, pIp, pPp); 150 ret = security_capget(current, pEp, pIp, pPp);
301 151
302 return ret; 152 return ret;
303} 153}
304 154
305/*
306 * With filesystem capability support configured, the kernel does not
307 * permit the changing of capabilities in one process by another
308 * process. (CAP_SETPCAP has much less broad semantics when configured
309 * this way.)
310 */
311static inline int do_sys_capset_other_tasks(pid_t pid,
312 kernel_cap_t *effective,
313 kernel_cap_t *inheritable,
314 kernel_cap_t *permitted)
315{
316 return -EPERM;
317}
318
319#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
320
321/*
322 * Atomically modify the effective capabilities returning the original
323 * value. No permission check is performed here - it is assumed that the
324 * caller is permitted to set the desired effective capabilities.
325 */
326kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
327{
328 kernel_cap_t pE_old;
329
330 spin_lock(&task_capability_lock);
331
332 pE_old = current->cap_effective;
333 current->cap_effective = pE_new;
334
335 spin_unlock(&task_capability_lock);
336
337 return pE_old;
338}
339
340EXPORT_SYMBOL(cap_set_effective);
341
342/** 155/**
343 * sys_capget - get the capabilities of a given process. 156 * sys_capget - get the capabilities of a given process.
344 * @header: pointer to struct that contains capability version and 157 * @header: pointer to struct that contains capability version and
@@ -366,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
366 return -EINVAL; 179 return -EINVAL;
367 180
368 ret = cap_get_target_pid(pid, &pE, &pI, &pP); 181 ret = cap_get_target_pid(pid, &pE, &pI, &pP);
369
370 if (!ret) { 182 if (!ret) {
371 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 183 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
372 unsigned i; 184 unsigned i;
@@ -412,16 +224,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
412 * @data: pointer to struct that contains the effective, permitted, 224 * @data: pointer to struct that contains the effective, permitted,
413 * and inheritable capabilities 225 * and inheritable capabilities
414 * 226 *
415 * Set capabilities for a given process, all processes, or all 227 * Set capabilities for the current process only. The ability to set the
416 * processes in a given process group. 228 * capabilities of any other process(es) has been deprecated and removed.
417 * 229 *
418 * The restrictions on setting capabilities are specified as: 230 * The restrictions on setting capabilities are specified as:
419 * 231 *
420 * [pid is for the 'target' task. 'current' is the calling task.] 232 * I: any raised capabilities must be a subset of the old permitted
421 * 233 * P: any raised capabilities must be a subset of the old permitted
422 * I: any raised capabilities must be a subset of the (old current) permitted 234 * E: must be set to a subset of new permitted
423 * P: any raised capabilities must be a subset of the (old current) permitted
424 * E: must be set to a subset of (new target) permitted
425 * 235 *
426 * Returns 0 on success and < 0 on error. 236 * Returns 0 on success and < 0 on error.
427 */ 237 */
@@ -430,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
430 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
431 unsigned i, tocopy; 241 unsigned i, tocopy;
432 kernel_cap_t inheritable, permitted, effective; 242 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new;
433 int ret; 244 int ret;
434 pid_t pid; 245 pid_t pid;
435 246
@@ -440,10 +251,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
440 if (get_user(pid, &header->pid)) 251 if (get_user(pid, &header->pid))
441 return -EFAULT; 252 return -EFAULT;
442 253
443 if (copy_from_user(&kdata, data, tocopy 254 /* may only affect current now */
444 * sizeof(struct __user_cap_data_struct))) { 255 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM;
257
258 if (copy_from_user(&kdata, data,
259 tocopy * sizeof(struct __user_cap_data_struct)))
445 return -EFAULT; 260 return -EFAULT;
446 }
447 261
448 for (i = 0; i < tocopy; i++) { 262 for (i = 0; i < tocopy; i++) {
449 effective.cap[i] = kdata[i].effective; 263 effective.cap[i] = kdata[i].effective;
@@ -457,32 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
457 i++; 271 i++;
458 } 272 }
459 273
460 if (pid && (pid != task_pid_vnr(current))) 274 new = prepare_creds();
461 ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, 275 if (!new)
462 &permitted); 276 return -ENOMEM;
463 else {
464 /*
465 * This lock is required even when filesystem
466 * capability support is configured - it protects the
467 * sys_capget() call from returning incorrect data in
468 * the case that the targeted process is not the
469 * current one.
470 */
471 spin_lock(&task_capability_lock);
472 277
473 ret = security_capset_check(current, &effective, &inheritable, 278 ret = security_capset(new, current_cred(),
474 &permitted); 279 &effective, &inheritable, &permitted);
475 /* 280 if (ret < 0)
476 * Having verified that the proposed changes are 281 goto error;
477 * legal, we now put them into effect. 282
478 */ 283 ret = audit_log_capset(pid, new, current_cred());
479 if (!ret) 284 if (ret < 0)
480 security_capset_set(current, &effective, &inheritable, 285 return ret;
481 &permitted);
482 spin_unlock(&task_capability_lock);
483 }
484 286
287 return commit_creds(new);
485 288
289error:
290 abort_creds(new);
486 return ret; 291 return ret;
487} 292}
488 293
@@ -498,6 +303,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
498 */ 303 */
499int capable(int cap) 304int capable(int cap)
500{ 305{
306 if (unlikely(!cap_valid(cap))) {
307 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
308 BUG();
309 }
310
501 if (has_capability(current, cap)) { 311 if (has_capability(current, cap)) {
502 current->flags |= PF_SUPERPRIV; 312 current->flags |= PF_SUPERPRIV;
503 return 1; 313 return 1;
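The rewritten sys_capset() above prepares a private copy of the caller's credentials, lets the security module vet it, and then either commits or aborts it. A hedged sketch of that prepare/commit/abort flow follows; demo_set_securebits() and its sanity check are illustrative, not kernel code.

/*
 * Sketch of the prepare_creds()/commit_creds() flow used by sys_capset()
 * above. The error path must call abort_creds() so the unpublished copy
 * is freed.
 */
#include <linux/cred.h>
#include <linux/errno.h>

static int demo_set_securebits(unsigned bits)
{
	struct cred *new;

	new = prepare_creds();			/* private copy of current creds */
	if (!new)
		return -ENOMEM;

	if (bits & ~0xffu) {			/* illustrative validity check */
		abort_creds(new);		/* discard the unpublished copy */
		return -EINVAL;
	}

	new->securebits = bits;			/* mutate only the copy */

	return commit_creds(new);		/* publish atomically to current */
}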
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a0123d75ec9a..48348dde6d81 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
241 struct cg_cgroup_link *link; 241 struct cg_cgroup_link *link;
242 struct cg_cgroup_link *saved_link; 242 struct cg_cgroup_link *saved_link;
243 243
244 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 246
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
251 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
252 kfree(link); 251 kfree(link);
253 } 252 }
254
255 write_unlock(&css_set_lock);
256} 253}
257 254
258static void __release_css_set(struct kref *k, int taskexit) 255static void __put_css_set(struct css_set *cg, int taskexit)
259{ 256{
260 int i; 257 int i;
261 struct css_set *cg = container_of(k, struct css_set, ref); 258 /*
262 259 * Ensure that the refcount doesn't hit zero while any readers
260 * can see it. Similar to atomic_dec_and_lock(), but for an
261 * rwlock
262 */
263 if (atomic_add_unless(&cg->refcount, -1, 1))
264 return;
265 write_lock(&css_set_lock);
266 if (!atomic_dec_and_test(&cg->refcount)) {
267 write_unlock(&css_set_lock);
268 return;
269 }
263 unlink_css_set(cg); 270 unlink_css_set(cg);
271 write_unlock(&css_set_lock);
264 272
265 rcu_read_lock(); 273 rcu_read_lock();
266 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
276 kfree(cg); 284 kfree(cg);
277} 285}
278 286
279static void release_css_set(struct kref *k)
280{
281 __release_css_set(k, 0);
282}
283
284static void release_css_set_taskexit(struct kref *k)
285{
286 __release_css_set(k, 1);
287}
288
289/* 287/*
290 * refcounted get/put for css_set objects 288 * refcounted get/put for css_set objects
291 */ 289 */
292static inline void get_css_set(struct css_set *cg) 290static inline void get_css_set(struct css_set *cg)
293{ 291{
294 kref_get(&cg->ref); 292 atomic_inc(&cg->refcount);
295} 293}
296 294
297static inline void put_css_set(struct css_set *cg) 295static inline void put_css_set(struct css_set *cg)
298{ 296{
299 kref_put(&cg->ref, release_css_set); 297 __put_css_set(cg, 0);
300} 298}
301 299
302static inline void put_css_set_taskexit(struct css_set *cg) 300static inline void put_css_set_taskexit(struct css_set *cg)
303{ 301{
304 kref_put(&cg->ref, release_css_set_taskexit); 302 __put_css_set(cg, 1);
305} 303}
306 304
307/* 305/*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
427 return NULL; 425 return NULL;
428 } 426 }
429 427
430 kref_init(&res->ref); 428 atomic_set(&res->refcount, 1);
431 INIT_LIST_HEAD(&res->cg_links); 429 INIT_LIST_HEAD(&res->cg_links);
432 INIT_LIST_HEAD(&res->tasks); 430 INIT_LIST_HEAD(&res->tasks);
433 INIT_HLIST_NODE(&res->hlist); 431 INIT_HLIST_NODE(&res->hlist);
@@ -573,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
573 571
574 if (inode) { 572 if (inode) {
575 inode->i_mode = mode; 573 inode->i_mode = mode;
576 inode->i_uid = current->fsuid; 574 inode->i_uid = current_fsuid();
577 inode->i_gid = current->fsgid; 575 inode->i_gid = current_fsgid();
578 inode->i_blocks = 0; 576 inode->i_blocks = 0;
579 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
580 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
@@ -704,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
704 * any child cgroups exist. This is theoretically supportable 702 * any child cgroups exist. This is theoretically supportable
705 * but involves complex error handling, so it's being left until 703 * but involves complex error handling, so it's being left until
706 * later */ 704 * later */
707 if (!list_empty(&cgrp->children)) 705 if (root->number_of_cgroups > 1)
708 return -EBUSY; 706 return -EBUSY;
709 707
710 /* Process each subsystem */ 708 /* Process each subsystem */
@@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {
870 .remount_fs = cgroup_remount, 868 .remount_fs = cgroup_remount,
871}; 869};
872 870
871static void init_cgroup_housekeeping(struct cgroup *cgrp)
872{
873 INIT_LIST_HEAD(&cgrp->sibling);
874 INIT_LIST_HEAD(&cgrp->children);
875 INIT_LIST_HEAD(&cgrp->css_sets);
876 INIT_LIST_HEAD(&cgrp->release_list);
877 init_rwsem(&cgrp->pids_mutex);
878}
873static void init_cgroup_root(struct cgroupfs_root *root) 879static void init_cgroup_root(struct cgroupfs_root *root)
874{ 880{
875 struct cgroup *cgrp = &root->top_cgroup; 881 struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
878 root->number_of_cgroups = 1; 884 root->number_of_cgroups = 1;
879 cgrp->root = root; 885 cgrp->root = root;
880 cgrp->top_cgroup = cgrp; 886 cgrp->top_cgroup = cgrp;
881 INIT_LIST_HEAD(&cgrp->sibling); 887 init_cgroup_housekeeping(cgrp);
882 INIT_LIST_HEAD(&cgrp->children);
883 INIT_LIST_HEAD(&cgrp->css_sets);
884 INIT_LIST_HEAD(&cgrp->release_list);
885} 888}
886 889
887static int cgroup_test_super(struct super_block *sb, void *data) 890static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1021,7 +1024,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1021 if (ret == -EBUSY) { 1024 if (ret == -EBUSY) {
1022 mutex_unlock(&cgroup_mutex); 1025 mutex_unlock(&cgroup_mutex);
1023 mutex_unlock(&inode->i_mutex); 1026 mutex_unlock(&inode->i_mutex);
1024 goto drop_new_super; 1027 goto free_cg_links;
1025 } 1028 }
1026 1029
1027 /* EBUSY should be the only error here */ 1030 /* EBUSY should be the only error here */
@@ -1070,10 +1073,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1070 1073
1071 return simple_set_mnt(mnt, sb); 1074 return simple_set_mnt(mnt, sb);
1072 1075
1076 free_cg_links:
1077 free_cg_links(&tmp_cg_links);
1073 drop_new_super: 1078 drop_new_super:
1074 up_write(&sb->s_umount); 1079 up_write(&sb->s_umount);
1075 deactivate_super(sb); 1080 deactivate_super(sb);
1076 free_cg_links(&tmp_cg_links);
1077 return ret; 1081 return ret;
1078} 1082}
1079 1083
@@ -1276,6 +1280,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1276static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 1280static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1277{ 1281{
1278 struct task_struct *tsk; 1282 struct task_struct *tsk;
1283 const struct cred *cred = current_cred(), *tcred;
1279 int ret; 1284 int ret;
1280 1285
1281 if (pid) { 1286 if (pid) {
@@ -1285,14 +1290,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1285 rcu_read_unlock(); 1290 rcu_read_unlock();
1286 return -ESRCH; 1291 return -ESRCH;
1287 } 1292 }
1288 get_task_struct(tsk);
1289 rcu_read_unlock();
1290 1293
1291 if ((current->euid) && (current->euid != tsk->uid) 1294 tcred = __task_cred(tsk);
1292 && (current->euid != tsk->suid)) { 1295 if (cred->euid &&
1293 put_task_struct(tsk); 1296 cred->euid != tcred->uid &&
1297 cred->euid != tcred->suid) {
1298 rcu_read_unlock();
1294 return -EACCES; 1299 return -EACCES;
1295 } 1300 }
1301 get_task_struct(tsk);
1302 rcu_read_unlock();
1296 } else { 1303 } else {
1297 tsk = current; 1304 tsk = current;
1298 get_task_struct(tsk); 1305 get_task_struct(tsk);
@@ -1728,7 +1735,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1728 1735
1729 read_lock(&css_set_lock); 1736 read_lock(&css_set_lock);
1730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 1737 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1731 count += atomic_read(&link->cg->ref.refcount); 1738 count += atomic_read(&link->cg->refcount);
1732 } 1739 }
1733 read_unlock(&css_set_lock); 1740 read_unlock(&css_set_lock);
1734 return count; 1741 return count;
@@ -1997,16 +2004,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1997 * but we cannot guarantee that the information we produce is correct 2004 * but we cannot guarantee that the information we produce is correct
1998 * unless we produce it entirely atomically. 2005 * unless we produce it entirely atomically.
1999 * 2006 *
2000 * Upon tasks file open(), a struct ctr_struct is allocated, that
2001 * will have a pointer to an array (also allocated here). The struct
2002 * ctr_struct * is stored in file->private_data. Its resources will
2003 * be freed by release() when the file is closed. The array is used
2004 * to sprintf the PIDs and then used by read().
2005 */ 2007 */
2006struct ctr_struct {
2007 char *buf;
2008 int bufsz;
2009};
2010 2008
2011/* 2009/*
2012 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2010 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2045,10 +2043,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2045 struct cgroup *cgrp; 2043 struct cgroup *cgrp;
2046 struct cgroup_iter it; 2044 struct cgroup_iter it;
2047 struct task_struct *tsk; 2045 struct task_struct *tsk;
2046
2048 /* 2047 /*
2049 * Validate dentry by checking the superblock operations 2048 * Validate dentry by checking the superblock operations,
2049 * and make sure it's a directory.
2050 */ 2050 */
2051 if (dentry->d_sb->s_op != &cgroup_ops) 2051 if (dentry->d_sb->s_op != &cgroup_ops ||
2052 !S_ISDIR(dentry->d_inode->i_mode))
2052 goto err; 2053 goto err;
2053 2054
2054 ret = 0; 2055 ret = 0;
@@ -2088,42 +2089,132 @@ static int cmppid(const void *a, const void *b)
2088 return *(pid_t *)a - *(pid_t *)b; 2089 return *(pid_t *)a - *(pid_t *)b;
2089} 2090}
2090 2091
2092
2091/* 2093/*
2092 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2094 * seq_file methods for the "tasks" file. The seq_file position is the
2093 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2095 * next pid to display; the seq_file iterator is a pointer to the pid
2094 * count 'cnt' of how many chars would be written if buf were large enough. 2096 * in the cgroup->tasks_pids array.
2095 */ 2097 */
2096static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2098
2099static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2097{ 2100{
2098 int cnt = 0; 2101 /*
2099 int i; 2102 * Initially we receive a position value that corresponds to
2103 * one more than the last pid shown (or 0 on the first call or
2104 * after a seek to the start). Use a binary-search to find the
2105 * next pid to display, if any
2106 */
2107 struct cgroup *cgrp = s->private;
2108 int index = 0, pid = *pos;
2109 int *iter;
2110
2111 down_read(&cgrp->pids_mutex);
2112 if (pid) {
2113 int end = cgrp->pids_length;
2114
2115 while (index < end) {
2116 int mid = (index + end) / 2;
2117 if (cgrp->tasks_pids[mid] == pid) {
2118 index = mid;
2119 break;
2120 } else if (cgrp->tasks_pids[mid] <= pid)
2121 index = mid + 1;
2122 else
2123 end = mid;
2124 }
2125 }
2126 /* If we're off the end of the array, we're done */
2127 if (index >= cgrp->pids_length)
2128 return NULL;
2129 /* Update the abstract position to be the actual pid that we found */
2130 iter = cgrp->tasks_pids + index;
2131 *pos = *iter;
2132 return iter;
2133}
2134
2135static void cgroup_tasks_stop(struct seq_file *s, void *v)
2136{
2137 struct cgroup *cgrp = s->private;
2138 up_read(&cgrp->pids_mutex);
2139}
2140
2141static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2142{
2143 struct cgroup *cgrp = s->private;
2144 int *p = v;
2145 int *end = cgrp->tasks_pids + cgrp->pids_length;
2146
2147 /*
2148 * Advance to the next pid in the array. If this goes off the
2149 * end, we're done
2150 */
2151 p++;
2152 if (p >= end) {
2153 return NULL;
2154 } else {
2155 *pos = *p;
2156 return p;
2157 }
2158}
2100 2159
2101 for (i = 0; i < npids; i++) 2160static int cgroup_tasks_show(struct seq_file *s, void *v)
2102 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2161{
2103 return cnt; 2162 return seq_printf(s, "%d\n", *(int *)v);
2104} 2163}
2105 2164
2165static struct seq_operations cgroup_tasks_seq_operations = {
2166 .start = cgroup_tasks_start,
2167 .stop = cgroup_tasks_stop,
2168 .next = cgroup_tasks_next,
2169 .show = cgroup_tasks_show,
2170};
2171
2172static void release_cgroup_pid_array(struct cgroup *cgrp)
2173{
2174 down_write(&cgrp->pids_mutex);
2175 BUG_ON(!cgrp->pids_use_count);
2176 if (!--cgrp->pids_use_count) {
2177 kfree(cgrp->tasks_pids);
2178 cgrp->tasks_pids = NULL;
2179 cgrp->pids_length = 0;
2180 }
2181 up_write(&cgrp->pids_mutex);
2182}
2183
2184static int cgroup_tasks_release(struct inode *inode, struct file *file)
2185{
2186 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2187
2188 if (!(file->f_mode & FMODE_READ))
2189 return 0;
2190
2191 release_cgroup_pid_array(cgrp);
2192 return seq_release(inode, file);
2193}
2194
2195static struct file_operations cgroup_tasks_operations = {
2196 .read = seq_read,
2197 .llseek = seq_lseek,
2198 .write = cgroup_file_write,
2199 .release = cgroup_tasks_release,
2200};
2201
2106/* 2202/*
2107 * Handle an open on 'tasks' file. Prepare a buffer listing the 2203 * Handle an open on 'tasks' file. Prepare an array containing the
2108 * process id's of tasks currently attached to the cgroup being opened. 2204 * process id's of tasks currently attached to the cgroup being opened.
2109 *
2110 * Does not require any specific cgroup mutexes, and does not take any.
2111 */ 2205 */
2206
2112static int cgroup_tasks_open(struct inode *unused, struct file *file) 2207static int cgroup_tasks_open(struct inode *unused, struct file *file)
2113{ 2208{
2114 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2209 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2115 struct ctr_struct *ctr;
2116 pid_t *pidarray; 2210 pid_t *pidarray;
2117 int npids; 2211 int npids;
2118 char c; 2212 int retval;
2119 2213
2214 /* Nothing to do for write-only files */
2120 if (!(file->f_mode & FMODE_READ)) 2215 if (!(file->f_mode & FMODE_READ))
2121 return 0; 2216 return 0;
2122 2217
2123 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2124 if (!ctr)
2125 goto err0;
2126
2127 /* 2218 /*
2128 * If cgroup gets more users after we read count, we won't have 2219 * If cgroup gets more users after we read count, we won't have
2129 * enough space - tough. This race is indistinguishable to the 2220 * enough space - tough. This race is indistinguishable to the
@@ -2131,57 +2222,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2131 * show up until sometime later on. 2222 * show up until sometime later on.
2132 */ 2223 */
2133 npids = cgroup_task_count(cgrp); 2224 npids = cgroup_task_count(cgrp);
2134 if (npids) { 2225 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2135 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2226 if (!pidarray)
2136 if (!pidarray) 2227 return -ENOMEM;
2137 goto err1; 2228 npids = pid_array_load(pidarray, npids, cgrp);
2138 2229 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2139 npids = pid_array_load(pidarray, npids, cgrp);
2140 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2141
2142 /* Call pid_array_to_buf() twice, first just to get bufsz */
2143 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2144 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2145 if (!ctr->buf)
2146 goto err2;
2147 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2148
2149 kfree(pidarray);
2150 } else {
2151 ctr->buf = NULL;
2152 ctr->bufsz = 0;
2153 }
2154 file->private_data = ctr;
2155 return 0;
2156
2157err2:
2158 kfree(pidarray);
2159err1:
2160 kfree(ctr);
2161err0:
2162 return -ENOMEM;
2163}
2164
2165static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
2166 struct cftype *cft,
2167 struct file *file, char __user *buf,
2168 size_t nbytes, loff_t *ppos)
2169{
2170 struct ctr_struct *ctr = file->private_data;
2171 2230
2172 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2231 /*
2173} 2232 * Store the array in the cgroup, freeing the old
2233 * array if necessary
2234 */
2235 down_write(&cgrp->pids_mutex);
2236 kfree(cgrp->tasks_pids);
2237 cgrp->tasks_pids = pidarray;
2238 cgrp->pids_length = npids;
2239 cgrp->pids_use_count++;
2240 up_write(&cgrp->pids_mutex);
2174 2241
2175static int cgroup_tasks_release(struct inode *unused_inode, 2242 file->f_op = &cgroup_tasks_operations;
2176 struct file *file)
2177{
2178 struct ctr_struct *ctr;
2179 2243
2180 if (file->f_mode & FMODE_READ) { 2244 retval = seq_open(file, &cgroup_tasks_seq_operations);
2181 ctr = file->private_data; 2245 if (retval) {
2182 kfree(ctr->buf); 2246 release_cgroup_pid_array(cgrp);
2183 kfree(ctr); 2247 return retval;
2184 } 2248 }
2249 ((struct seq_file *)file->private_data)->private = cgrp;
2185 return 0; 2250 return 0;
2186} 2251}
2187 2252
@@ -2210,7 +2275,6 @@ static struct cftype files[] = {
2210 { 2275 {
2211 .name = "tasks", 2276 .name = "tasks",
2212 .open = cgroup_tasks_open, 2277 .open = cgroup_tasks_open,
2213 .read = cgroup_tasks_read,
2214 .write_u64 = cgroup_tasks_write, 2278 .write_u64 = cgroup_tasks_write,
2215 .release = cgroup_tasks_release, 2279 .release = cgroup_tasks_release,
2216 .private = FILE_TASKLIST, 2280 .private = FILE_TASKLIST,
@@ -2300,10 +2364,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2300 2364
2301 mutex_lock(&cgroup_mutex); 2365 mutex_lock(&cgroup_mutex);
2302 2366
2303 INIT_LIST_HEAD(&cgrp->sibling); 2367 init_cgroup_housekeeping(cgrp);
2304 INIT_LIST_HEAD(&cgrp->children);
2305 INIT_LIST_HEAD(&cgrp->css_sets);
2306 INIT_LIST_HEAD(&cgrp->release_list);
2307 2368
2308 cgrp->parent = parent; 2369 cgrp->parent = parent;
2309 cgrp->root = parent->root; 2370 cgrp->root = parent->root;
@@ -2418,10 +2479,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2418 mutex_unlock(&cgroup_mutex); 2479 mutex_unlock(&cgroup_mutex);
2419 return -EBUSY; 2480 return -EBUSY;
2420 } 2481 }
2421 2482 mutex_unlock(&cgroup_mutex);
2422 parent = cgrp->parent;
2423 root = cgrp->root;
2424 sb = root->sb;
2425 2483
2426 /* 2484 /*
2427 * Call pre_destroy handlers of subsys. Notify subsystems 2485 * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2429,7 +2487,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2429 */ 2487 */
2430 cgroup_call_pre_destroy(cgrp); 2488 cgroup_call_pre_destroy(cgrp);
2431 2489
2432 if (cgroup_has_css_refs(cgrp)) { 2490 mutex_lock(&cgroup_mutex);
2491 parent = cgrp->parent;
2492 root = cgrp->root;
2493 sb = root->sb;
2494
2495 if (atomic_read(&cgrp->count)
2496 || !list_empty(&cgrp->children)
2497 || cgroup_has_css_refs(cgrp)) {
2433 mutex_unlock(&cgroup_mutex); 2498 mutex_unlock(&cgroup_mutex);
2434 return -EBUSY; 2499 return -EBUSY;
2435 } 2500 }
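cgroup_rmdir() now drops cgroup_mutex around cgroup_call_pre_destroy(), then re-takes the mutex and re-checks the busy conditions, since the state may have changed while unlocked. A hedged sketch of that drop-and-revalidate pattern follows; everything named demo_* is hypothetical.

/*
 * Sketch of the "drop the mutex around a sleeping callback, then
 * re-take it and re-validate" pattern used by cgroup_rmdir() above.
 */
#include <linux/errno.h>
#include <linux/mutex.h>

struct demo_object {
	int users;
};

static DEFINE_MUTEX(demo_mutex);

static int demo_still_busy(struct demo_object *obj)
{
	return obj->users > 0;		/* illustrative busy test */
}

static void demo_notify(struct demo_object *obj)
{
	/* placeholder for a callback that may sleep */
}

static int demo_teardown(struct demo_object *obj)
{
	mutex_lock(&demo_mutex);
	if (demo_still_busy(obj)) {
		mutex_unlock(&demo_mutex);
		return -EBUSY;
	}
	mutex_unlock(&demo_mutex);

	demo_notify(obj);		/* may sleep: must not hold demo_mutex */

	mutex_lock(&demo_mutex);
	/* the object may have gained users while unlocked: check again */
	if (demo_still_busy(obj)) {
		mutex_unlock(&demo_mutex);
		return -EBUSY;
	}
	/* ... the real teardown happens here, still under the mutex ... */
	mutex_unlock(&demo_mutex);
	return 0;
}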
@@ -2443,7 +2508,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2443 list_del(&cgrp->sibling); 2508 list_del(&cgrp->sibling);
2444 spin_lock(&cgrp->dentry->d_lock); 2509 spin_lock(&cgrp->dentry->d_lock);
2445 d = dget(cgrp->dentry); 2510 d = dget(cgrp->dentry);
2446 cgrp->dentry = NULL;
2447 spin_unlock(&d->d_lock); 2511 spin_unlock(&d->d_lock);
2448 2512
2449 cgroup_d_remove_dir(d); 2513 cgroup_d_remove_dir(d);
@@ -2495,8 +2559,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2495int __init cgroup_init_early(void) 2559int __init cgroup_init_early(void)
2496{ 2560{
2497 int i; 2561 int i;
2498 kref_init(&init_css_set.ref); 2562 atomic_set(&init_css_set.refcount, 1);
2499 kref_get(&init_css_set.ref);
2500 INIT_LIST_HEAD(&init_css_set.cg_links); 2563 INIT_LIST_HEAD(&init_css_set.cg_links);
2501 INIT_LIST_HEAD(&init_css_set.tasks); 2564 INIT_LIST_HEAD(&init_css_set.tasks);
2502 INIT_HLIST_NODE(&init_css_set.hlist); 2565 INIT_HLIST_NODE(&init_css_set.hlist);
@@ -2735,6 +2798,8 @@ void cgroup_fork_callbacks(struct task_struct *child)
2735 * Called on every change to mm->owner. mm_init_owner() does not 2798 * Called on every change to mm->owner. mm_init_owner() does not
2736 * invoke this routine, since it assigns the mm->owner the first time 2799 * invoke this routine, since it assigns the mm->owner the first time
2737 * and does not change it. 2800 * and does not change it.
2801 *
2802 * The callbacks are invoked with mmap_sem held in read mode.
2738 */ 2803 */
2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2804void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2740{ 2805{
@@ -2750,7 +2815,7 @@ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2750 if (oldcgrp == newcgrp) 2815 if (oldcgrp == newcgrp)
2751 continue; 2816 continue;
2752 if (ss->mm_owner_changed) 2817 if (ss->mm_owner_changed)
2753 ss->mm_owner_changed(ss, oldcgrp, newcgrp); 2818 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2754 } 2819 }
2755 } 2820 }
2756} 2821}
@@ -2873,9 +2938,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2873 again: 2938 again:
2874 root = subsys->root; 2939 root = subsys->root;
2875 if (root == &rootnode) { 2940 if (root == &rootnode) {
2876 printk(KERN_INFO
2877 "Not cloning cgroup for unused subsystem %s\n",
2878 subsys->name);
2879 mutex_unlock(&cgroup_mutex); 2941 mutex_unlock(&cgroup_mutex);
2880 return 0; 2942 return 0;
2881 } 2943 }
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index c3dc3aba4c02..daca6209202d 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
57 u64 count; 57 u64 count;
58 58
59 rcu_read_lock(); 59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount); 60 count = atomic_read(&current->cgroups->refcount);
61 rcu_read_unlock(); 61 rcu_read_unlock();
62 return count; 62 return count;
63} 63}
@@ -90,7 +90,7 @@ static struct cftype files[] = {
90 { 90 {
91 .name = "releasable", 91 .name = "releasable",
92 .read_u64 = releasable_read, 92 .read_u64 = releasable_read,
93 } 93 },
94}; 94};
95 95
96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000000..fb249e2bcada
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,379 @@
1/*
2 * cgroup_freezer.c - control group freezer subsystem
3 *
4 * Copyright IBM Corporation, 2007
5 *
6 * Author : Cedric Le Goater <clg@fr.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of version 2.1 of the GNU Lesser General Public License
10 * as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it would be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */
16
17#include <linux/module.h>
18#include <linux/cgroup.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/freezer.h>
22#include <linux/seq_file.h>
23
24enum freezer_state {
25 CGROUP_THAWED = 0,
26 CGROUP_FREEZING,
27 CGROUP_FROZEN,
28};
29
30struct freezer {
31 struct cgroup_subsys_state css;
32 enum freezer_state state;
33 spinlock_t lock; /* protects _writes_ to state */
34};
35
36static inline struct freezer *cgroup_freezer(
37 struct cgroup *cgroup)
38{
39 return container_of(
40 cgroup_subsys_state(cgroup, freezer_subsys_id),
41 struct freezer, css);
42}
43
44static inline struct freezer *task_freezer(struct task_struct *task)
45{
46 return container_of(task_subsys_state(task, freezer_subsys_id),
47 struct freezer, css);
48}
49
50int cgroup_frozen(struct task_struct *task)
51{
52 struct freezer *freezer;
53 enum freezer_state state;
54
55 task_lock(task);
56 freezer = task_freezer(task);
57 state = freezer->state;
58 task_unlock(task);
59
60 return state == CGROUP_FROZEN;
61}
62
63/*
64 * cgroups_write_string() limits the size of freezer state strings to
65 * CGROUP_LOCAL_BUFFER_SIZE
66 */
67static const char *freezer_state_strs[] = {
68 "THAWED",
69 "FREEZING",
70 "FROZEN",
71};
72
73/*
74 * State diagram
75 * Transitions are caused by userspace writes to the freezer.state file.
76 * The values in parentheses are state labels. The rest are edge labels.
77 *
78 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
79 *    ^ ^                    |                     |
80 *    | \_______THAWED_______/                     |
81 *    \__________________________THAWED____________/
82 */
83
84struct cgroup_subsys freezer_subsys;
85
86/* Locks taken and their ordering
87 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock
92 * task->sighand->siglock
93 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock
95 *
96 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ]
98 *
99 * can_attach():
100 * cgroup_mutex
101 *
102 * cgroup_frozen():
103 * task->alloc_lock (to get task's cgroup)
104 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * task->alloc_lock (to get task's cgroup)
107 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing)
109 *
110 * freezer_read():
111 * cgroup_mutex
112 * freezer->lock
113 * read_lock css_set_lock (cgroup iterator start)
114 *
115 * freezer_write() (freeze):
116 * cgroup_mutex
117 * freezer->lock
118 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock
120 *
121 * freezer_write() (unfreeze):
122 * cgroup_mutex
123 * freezer->lock
124 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task())
126 * sighand->siglock
127 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
129 struct cgroup *cgroup)
130{
131 struct freezer *freezer;
132
133 freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
134 if (!freezer)
135 return ERR_PTR(-ENOMEM);
136
137 spin_lock_init(&freezer->lock);
138 freezer->state = CGROUP_THAWED;
139 return &freezer->css;
140}
141
142static void freezer_destroy(struct cgroup_subsys *ss,
143 struct cgroup *cgroup)
144{
145 kfree(cgroup_freezer(cgroup));
146}
147
148/* Task is frozen or will freeze immediately when next it gets woken */
149static bool is_task_frozen_enough(struct task_struct *task)
150{
151 return frozen(task) ||
152 (task_is_stopped_or_traced(task) && freezing(task));
153}
154
155/*
156 * The call to cgroup_lock() in the freezer.state write method prevents
157 * a write to that file racing against an attach, and hence the
158 * can_attach() result will remain valid until the attach completes.
159 */
160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup,
162 struct task_struct *task)
163{
164 struct freezer *freezer;
165
166 /*
167 * Anything frozen can't move or be moved to/from.
168 *
169 * Since orig_freezer->state == FROZEN means that @task has been
170 * frozen, it's sufficient to check the latter condition.
171 */
172
173 if (is_task_frozen_enough(task))
174 return -EBUSY;
175
176 freezer = cgroup_freezer(new_cgroup);
177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY;
179
180 return 0;
181}
182
183static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
184{
185 struct freezer *freezer;
186
187 /*
188 * No lock is needed, since the task isn't on tasklist yet,
189 * so it can't be moved to another cgroup, which means the
190 * freezer won't be removed and will be valid during this
191 * function call.
192 */
193 freezer = task_freezer(task);
194
195 /*
196 * The root cgroup is non-freezable, so we can skip the
197 * following check.
198 */
199 if (!freezer->css.cgroup->parent)
200 return;
201
202 spin_lock_irq(&freezer->lock);
203 BUG_ON(freezer->state == CGROUP_FROZEN);
204
205 /* Locking avoids race with FREEZING -> THAWED transitions. */
206 if (freezer->state == CGROUP_FREEZING)
207 freeze_task(task, true);
208 spin_unlock_irq(&freezer->lock);
209}
210
211/*
212 * caller must hold freezer->lock
213 */
214static void update_freezer_state(struct cgroup *cgroup,
215 struct freezer *freezer)
216{
217 struct cgroup_iter it;
218 struct task_struct *task;
219 unsigned int nfrozen = 0, ntotal = 0;
220
221 cgroup_iter_start(cgroup, &it);
222 while ((task = cgroup_iter_next(cgroup, &it))) {
223 ntotal++;
224 if (is_task_frozen_enough(task))
225 nfrozen++;
226 }
227
228 /*
229 * Transition to FROZEN when no new tasks can be added ensures
230 * that we never exist in the FROZEN state while there are unfrozen
231 * tasks.
232 */
233 if (nfrozen == ntotal)
234 freezer->state = CGROUP_FROZEN;
235 else if (nfrozen > 0)
236 freezer->state = CGROUP_FREEZING;
237 else
238 freezer->state = CGROUP_THAWED;
239 cgroup_iter_end(cgroup, &it);
240}
241
242static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
243 struct seq_file *m)
244{
245 struct freezer *freezer;
246 enum freezer_state state;
247
248 if (!cgroup_lock_live_group(cgroup))
249 return -ENODEV;
250
251 freezer = cgroup_freezer(cgroup);
252 spin_lock_irq(&freezer->lock);
253 state = freezer->state;
254 if (state == CGROUP_FREEZING) {
255 /* We change from FREEZING to FROZEN lazily if the cgroup was
256 * only partially frozen when we exited write. */
257 update_freezer_state(cgroup, freezer);
258 state = freezer->state;
259 }
260 spin_unlock_irq(&freezer->lock);
261 cgroup_unlock();
262
263 seq_puts(m, freezer_state_strs[state]);
264 seq_putc(m, '\n');
265 return 0;
266}
267
268static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
269{
270 struct cgroup_iter it;
271 struct task_struct *task;
272 unsigned int num_cant_freeze_now = 0;
273
274 freezer->state = CGROUP_FREEZING;
275 cgroup_iter_start(cgroup, &it);
276 while ((task = cgroup_iter_next(cgroup, &it))) {
277 if (!freeze_task(task, true))
278 continue;
279 if (is_task_frozen_enough(task))
280 continue;
281 if (!freezing(task) && !freezer_should_skip(task))
282 num_cant_freeze_now++;
283 }
284 cgroup_iter_end(cgroup, &it);
285
286 return num_cant_freeze_now ? -EBUSY : 0;
287}
288
289static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
290{
291 struct cgroup_iter it;
292 struct task_struct *task;
293
294 cgroup_iter_start(cgroup, &it);
295 while ((task = cgroup_iter_next(cgroup, &it))) {
296 thaw_process(task);
297 }
298 cgroup_iter_end(cgroup, &it);
299
300 freezer->state = CGROUP_THAWED;
301}
302
303static int freezer_change_state(struct cgroup *cgroup,
304 enum freezer_state goal_state)
305{
306 struct freezer *freezer;
307 int retval = 0;
308
309 freezer = cgroup_freezer(cgroup);
310
311 spin_lock_irq(&freezer->lock);
312
313 update_freezer_state(cgroup, freezer);
314 if (goal_state == freezer->state)
315 goto out;
316
317 switch (goal_state) {
318 case CGROUP_THAWED:
319 unfreeze_cgroup(cgroup, freezer);
320 break;
321 case CGROUP_FROZEN:
322 retval = try_to_freeze_cgroup(cgroup, freezer);
323 break;
324 default:
325 BUG();
326 }
327out:
328 spin_unlock_irq(&freezer->lock);
329
330 return retval;
331}
332
333static int freezer_write(struct cgroup *cgroup,
334 struct cftype *cft,
335 const char *buffer)
336{
337 int retval;
338 enum freezer_state goal_state;
339
340 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
341 goal_state = CGROUP_THAWED;
342 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
343 goal_state = CGROUP_FROZEN;
344 else
345 return -EINVAL;
346
347 if (!cgroup_lock_live_group(cgroup))
348 return -ENODEV;
349 retval = freezer_change_state(cgroup, goal_state);
350 cgroup_unlock();
351 return retval;
352}
353
354static struct cftype files[] = {
355 {
356 .name = "state",
357 .read_seq_string = freezer_read,
358 .write_string = freezer_write,
359 },
360};
361
362static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
363{
364 if (!cgroup->parent)
365 return 0;
366 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
367}
368
369struct cgroup_subsys freezer_subsys = {
370 .name = "freezer",
371 .create = freezer_create,
372 .destroy = freezer_destroy,
373 .populate = freezer_populate,
374 .subsys_id = freezer_subsys_id,
375 .can_attach = freezer_can_attach,
376 .attach = NULL,
377 .fork = freezer_fork,
378 .exit = NULL,
379};
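From userspace the new freezer subsystem is driven entirely through its freezer.state file, which accepts the strings listed in freezer_state_strs[] above. A small usage sketch follows, assuming the freezer hierarchy is mounted at /dev/cgroup and a group named "demo" already exists; both names are assumptions, not part of the patch.

/*
 * Userspace sketch: freeze and thaw a cgroup through freezer.state.
 * The mount point (/dev/cgroup) and group name (demo) are assumptions.
 */
#include <stdio.h>

static int write_state(const char *path, const char *state)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(state, f);
	return fclose(f);
}

int main(void)
{
	const char *path = "/dev/cgroup/demo/freezer.state";
	char buf[32];
	FILE *f;

	if (write_state(path, "FROZEN"))	/* -> FREEZING, then FROZEN */
		return 1;

	f = fopen(path, "r");			/* read back the current state */
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("state: %s", buf);
		fclose(f);
	}

	return write_state(path, "THAWED") ? 1 : 0;
}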
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..8eafe3eb50d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,9 +23,68 @@
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h>
26 27
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
30/*
31 * Note that the native side is already converted to a timespec, because
32 * that's what we want anyway.
33 */
34static int compat_get_timeval(struct timespec *o,
35 struct compat_timeval __user *i)
36{
37 long usec;
38
39 if (get_user(o->tv_sec, &i->tv_sec) ||
40 get_user(usec, &i->tv_usec))
41 return -EFAULT;
42 o->tv_nsec = usec * 1000;
43 return 0;
44}
45
46static int compat_put_timeval(struct compat_timeval __user *o,
47 struct timeval *i)
48{
49 return (put_user(i->tv_sec, &o->tv_sec) ||
50 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
51}
52
53asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
54 struct timezone __user *tz)
55{
56 if (tv) {
57 struct timeval ktv;
58 do_gettimeofday(&ktv);
59 if (compat_put_timeval(tv, &ktv))
60 return -EFAULT;
61 }
62 if (tz) {
63 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
64 return -EFAULT;
65 }
66
67 return 0;
68}
69
70asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
71 struct timezone __user *tz)
72{
73 struct timespec kts;
74 struct timezone ktz;
75
76 if (tv) {
77 if (compat_get_timeval(&kts, tv))
78 return -EFAULT;
79 }
80 if (tz) {
81 if (copy_from_user(&ktz, tz, sizeof(ktz)))
82 return -EFAULT;
83 }
84
85 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
86}
87
29int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 88int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
30{ 89{
31 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 90 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
@@ -150,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which,
150 return 0; 209 return 0;
151} 210}
152 211
212static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
213{
214 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
215}
216
153asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) 217asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
154{ 218{
155 /*
156 * In the SMP world we might just be unlucky and have one of
157 * the times increment as we use it. Since the value is an
158 * atomically safe type this is just fine. Conceptually its
159 * as if the syscall took an instant longer to occur.
160 */
161 if (tbuf) { 219 if (tbuf) {
220 struct tms tms;
162 struct compat_tms tmp; 221 struct compat_tms tmp;
163 struct task_struct *tsk = current; 222
164 struct task_struct *t; 223 do_sys_times(&tms);
165 cputime_t utime, stime, cutime, cstime; 224 /* Convert our struct tms to the compat version. */
166 225 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
167 read_lock(&tasklist_lock); 226 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
168 utime = tsk->signal->utime; 227 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
169 stime = tsk->signal->stime; 228 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
170 t = tsk;
171 do {
172 utime = cputime_add(utime, t->utime);
173 stime = cputime_add(stime, t->stime);
174 t = next_thread(t);
175 } while (t != tsk);
176
177 /*
178 * While we have tasklist_lock read-locked, no dying thread
179 * can be updating current->signal->[us]time. Instead,
180 * we got their counts included in the live thread loop.
181 * However, another thread can come in right now and
182 * do a wait call that updates current->signal->c[us]time.
183 * To make sure we always see that pair updated atomically,
184 * we take the siglock around fetching them.
185 */
186 spin_lock_irq(&tsk->sighand->siglock);
187 cutime = tsk->signal->cutime;
188 cstime = tsk->signal->cstime;
189 spin_unlock_irq(&tsk->sighand->siglock);
190 read_unlock(&tasklist_lock);
191
192 tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
193 tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
194 tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
195 tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
196 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 229 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
197 return -EFAULT; 230 return -EFAULT;
198 } 231 }
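The compat helpers above reduce to two conversions: a 32-bit timeval's microseconds scale by 1000 into a native timespec's nanoseconds, and cputime values round-trip through clock_t. A small sketch of the first conversion follows; demo_timeval32 and demo_timespec stand in for the real compat types.

/*
 * Sketch of the conversion done by compat_get_timeval() above: seconds
 * copy across unchanged, microseconds scale by 1000 into nanoseconds.
 */
struct demo_timeval32 {
	int tv_sec;
	int tv_usec;
};

struct demo_timespec {
	long tv_sec;
	long tv_nsec;
};

static void demo_timeval32_to_timespec(const struct demo_timeval32 *in,
					struct demo_timespec *out)
{
	out->tv_sec = in->tv_sec;
	out->tv_nsec = (long)in->tv_usec * 1000;	/* usec -> nsec */
}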
diff --git a/kernel/configs.c b/kernel/configs.c
index 4c345210ed8c..abaee684ecbf 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -54,9 +54,6 @@
54 54
55#ifdef CONFIG_IKCONFIG_PROC 55#ifdef CONFIG_IKCONFIG_PROC
56 56
57/**************************************************/
58/* globals and useful constants */
59
60static ssize_t 57static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 58ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 59 size_t len, loff_t * offset)
@@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = {
71 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
72}; 69};
73 70
74/***************************************************/
75/* ikconfig_init: start up everything we need to */
76
77static int __init ikconfig_init(void) 71static int __init ikconfig_init(void)
78{ 72{
79 struct proc_dir_entry *entry; 73 struct proc_dir_entry *entry;
@@ -89,9 +83,6 @@ static int __init ikconfig_init(void)
89 return 0; 83 return 0;
90} 84}
91 85
92/***************************************************/
93/* ikconfig_cleanup: clean up our mess */
94
95static void __exit ikconfig_cleanup(void) 86static void __exit ikconfig_cleanup(void)
96{ 87{
97 remove_proc_entry("config.gz", NULL); 88 remove_proc_entry("config.gz", NULL);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045daed..8ea32e8d68b0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,7 +462,7 @@ out:
462 * It must be called by the arch code on the new cpu, before the new cpu 462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */ 464 */
465void notify_cpu_starting(unsigned int cpu) 465void __cpuinit notify_cpu_starting(unsigned int cpu)
466{ 466{
467 unsigned long val = CPU_STARTING; 467 unsigned long val = CPU_STARTING;
468 468
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
499#endif 499#endif
500}; 500};
501EXPORT_SYMBOL_GPL(cpu_bit_bitmap); 501EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
502
503const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
504EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index eab7bd6628e0..96c0ba13b8cd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h>
39#include <linux/module.h> 40#include <linux/module.h>
40#include <linux/mount.h> 41#include <linux/mount.h>
41#include <linux/namei.h> 42#include <linux/namei.h>
@@ -584,10 +585,9 @@ static int generate_sched_domains(cpumask_t **domains,
584 int i, j, k; /* indices for partition finding loops */ 585 int i, j, k; /* indices for partition finding loops */
585 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 586 cpumask_t *doms; /* resulting partition; i.e. sched domains */
586 struct sched_domain_attr *dattr; /* attributes for custom domains */ 587 struct sched_domain_attr *dattr; /* attributes for custom domains */
587 int ndoms; /* number of sched domains in result */ 588 int ndoms = 0; /* number of sched domains in result */
588 int nslot; /* next empty doms[] cpumask_t slot */ 589 int nslot; /* next empty doms[] cpumask_t slot */
589 590
590 ndoms = 0;
591 doms = NULL; 591 doms = NULL;
592 dattr = NULL; 592 dattr = NULL;
593 csa = NULL; 593 csa = NULL;
@@ -674,10 +674,8 @@ restart:
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 675 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
677 if (!doms) { 677 if (!doms)
678 ndoms = 0;
679 goto done; 678 goto done;
680 }
681 679
682 /* 680 /*
683 * The rest of the code, including the scheduler, can deal with 681 * The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
732done: 730done:
733 kfree(csa); 731 kfree(csa);
734 732
733 /*
734 * Fallback to the default domain if kmalloc() failed.
735 * See comments in partition_sched_domains().
736 */
737 if (doms == NULL)
738 ndoms = 1;
739
735 *domains = doms; 740 *domains = doms;
736 *attributes = dattr; 741 *attributes = dattr;
737 return ndoms; 742 return ndoms;
@@ -1172,7 +1177,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1172{ 1177{
1173 struct cpuset trialcs; 1178 struct cpuset trialcs;
1174 int err; 1179 int err;
1175 int cpus_nonempty, balance_flag_changed; 1180 int balance_flag_changed;
1176 1181
1177 trialcs = *cs; 1182 trialcs = *cs;
1178 if (turning_on) 1183 if (turning_on)
@@ -1184,7 +1189,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1184 if (err < 0) 1189 if (err < 0)
1185 return err; 1190 return err;
1186 1191
1187 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1188 balance_flag_changed = (is_sched_load_balance(cs) != 1192 balance_flag_changed = (is_sched_load_balance(cs) !=
1189 is_sched_load_balance(&trialcs)); 1193 is_sched_load_balance(&trialcs));
1190 1194
@@ -1192,7 +1196,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1192 cs->flags = trialcs.flags; 1196 cs->flags = trialcs.flags;
1193 mutex_unlock(&callback_mutex); 1197 mutex_unlock(&callback_mutex);
1194 1198
1195 if (cpus_nonempty && balance_flag_changed) 1199 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
1196 async_rebuild_sched_domains(); 1200 async_rebuild_sched_domains();
1197 1201
1198 return 0; 1202 return 0;
@@ -2012,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2016 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2013 * See also the previous routine cpuset_track_online_cpus(). 2017 * See also the previous routine cpuset_track_online_cpus().
2014 */ 2018 */
2015void cpuset_track_online_nodes(void) 2019static int cpuset_track_online_nodes(struct notifier_block *self,
2020 unsigned long action, void *arg)
2016{ 2021{
2017 cgroup_lock(); 2022 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2023 switch (action) {
2019 scan_for_empty_cpusets(&top_cpuset); 2024 case MEM_ONLINE:
2025 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2026 break;
2027 case MEM_OFFLINE:
2028 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2029 scan_for_empty_cpusets(&top_cpuset);
2030 break;
2031 default:
2032 break;
2033 }
2020 cgroup_unlock(); 2034 cgroup_unlock();
2035 return NOTIFY_OK;
2021} 2036}
2022#endif 2037#endif
2023 2038
@@ -2033,6 +2048,7 @@ void __init cpuset_init_smp(void)
2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2034 2049
2035 hotcpu_notifier(cpuset_track_online_cpus, 0); 2050 hotcpu_notifier(cpuset_track_online_cpus, 0);
2051 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2036} 2052}
2037 2053
2038/** 2054/**
@@ -2437,19 +2453,15 @@ const struct file_operations proc_cpuset_operations = {
2437void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2453void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2438{ 2454{
2439 seq_printf(m, "Cpus_allowed:\t"); 2455 seq_printf(m, "Cpus_allowed:\t");
2440 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2456 seq_cpumask(m, &task->cpus_allowed);
2441 task->cpus_allowed);
2442 seq_printf(m, "\n"); 2457 seq_printf(m, "\n");
2443 seq_printf(m, "Cpus_allowed_list:\t"); 2458 seq_printf(m, "Cpus_allowed_list:\t");
2444 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, 2459 seq_cpumask_list(m, &task->cpus_allowed);
2445 task->cpus_allowed);
2446 seq_printf(m, "\n"); 2460 seq_printf(m, "\n");
2447 seq_printf(m, "Mems_allowed:\t"); 2461 seq_printf(m, "Mems_allowed:\t");
2448 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2462 seq_nodemask(m, &task->mems_allowed);
2449 task->mems_allowed);
2450 seq_printf(m, "\n"); 2463 seq_printf(m, "\n");
2451 seq_printf(m, "Mems_allowed_list:\t"); 2464 seq_printf(m, "Mems_allowed_list:\t");
2452 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, 2465 seq_nodemask_list(m, &task->mems_allowed);
2453 task->mems_allowed);
2454 seq_printf(m, "\n"); 2466 seq_printf(m, "\n");
2455} 2467}
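The hunks above turn cpuset's node-tracking hook into a memory-hotplug notifier registered with hotplug_memory_notifier(). As orientation only, here is a minimal sketch of that callback-plus-registration pattern (the callback and init names are illustrative, and the headers are assumed to be linux/memory.h and linux/notifier.h):

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>

/* Illustrative sketch, not part of the patch: react to memory nodes
 * coming and going, as cpuset_track_online_nodes() now does above. */
static int example_mem_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/* refresh whatever state depends on node_states[] */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int __init example_mem_callback_init(void)
{
	/* priority 10, mirroring the cpuset registration above */
	hotplug_memory_notifier(example_mem_callback, 10);
	return 0;
}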
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 000000000000..2dc4fc2d0bf1
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 000000000000..ff7bc071991c
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,588 @@
1/* Task credentials management - see Documentation/credentials.txt
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11#include <linux/module.h>
12#include <linux/cred.h>
13#include <linux/sched.h>
14#include <linux/key.h>
15#include <linux/keyctl.h>
16#include <linux/init_task.h>
17#include <linux/security.h>
18#include <linux/cn_proc.h>
19#include "cred-internals.h"
20
21static struct kmem_cache *cred_jar;
22
23/*
24 * The common credentials for the initial task's thread group
25 */
26#ifdef CONFIG_KEYS
27static struct thread_group_cred init_tgcred = {
28 .usage = ATOMIC_INIT(2),
29 .tgid = 0,
30 .lock = SPIN_LOCK_UNLOCKED,
31};
32#endif
33
34/*
35 * The initial credentials for the initial task
36 */
37struct cred init_cred = {
38 .usage = ATOMIC_INIT(4),
39 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET,
42 .cap_effective = CAP_INIT_EFF_SET,
43 .cap_bset = CAP_INIT_BSET,
44 .user = INIT_USER,
45 .group_info = &init_groups,
46#ifdef CONFIG_KEYS
47 .tgcred = &init_tgcred,
48#endif
49};
50
51/*
52 * Dispose of the shared task group credentials
53 */
54#ifdef CONFIG_KEYS
55static void release_tgcred_rcu(struct rcu_head *rcu)
56{
57 struct thread_group_cred *tgcred =
58 container_of(rcu, struct thread_group_cred, rcu);
59
60 BUG_ON(atomic_read(&tgcred->usage) != 0);
61
62 key_put(tgcred->session_keyring);
63 key_put(tgcred->process_keyring);
64 kfree(tgcred);
65}
66#endif
67
68/*
69 * Release a set of thread group credentials.
70 */
71static void release_tgcred(struct cred *cred)
72{
73#ifdef CONFIG_KEYS
74 struct thread_group_cred *tgcred = cred->tgcred;
75
76 if (atomic_dec_and_test(&tgcred->usage))
77 call_rcu(&tgcred->rcu, release_tgcred_rcu);
78#endif
79}
80
81/*
82 * The RCU callback to actually dispose of a set of credentials
83 */
84static void put_cred_rcu(struct rcu_head *rcu)
85{
86 struct cred *cred = container_of(rcu, struct cred, rcu);
87
88 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage));
91
92 security_cred_free(cred);
93 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth);
95 release_tgcred(cred);
96 put_group_info(cred->group_info);
97 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred);
99}
100
101/**
102 * __put_cred - Destroy a set of credentials
103 * @cred: The record to release
104 *
105 * Destroy a set of credentials on which no references remain.
106 */
107void __put_cred(struct cred *cred)
108{
109 BUG_ON(atomic_read(&cred->usage) != 0);
110
111 call_rcu(&cred->rcu, put_cred_rcu);
112}
113EXPORT_SYMBOL(__put_cred);
114
115/**
116 * prepare_creds - Prepare a new set of credentials for modification
117 *
118 * Prepare a new set of task credentials for modification. A task's creds
119 * shouldn't generally be modified directly, therefore this function is used to
120 * prepare a new copy, which the caller then modifies and then commits by
121 * calling commit_creds().
122 *
123 * Preparation involves making a copy of the objective creds for modification.
124 *
125 * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
126 *
127 * Call commit_creds() or abort_creds() to clean up.
128 */
129struct cred *prepare_creds(void)
130{
131 struct task_struct *task = current;
132 const struct cred *old;
133 struct cred *new;
134
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1);
136
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new)
139 return NULL;
140
141 old = task->cred;
142 memcpy(new, old, sizeof(struct cred));
143
144 atomic_set(&new->usage, 1);
145 get_group_info(new->group_info);
146 get_uid(new->user);
147
148#ifdef CONFIG_KEYS
149 key_get(new->thread_keyring);
150 key_get(new->request_key_auth);
151 atomic_inc(&new->tgcred->usage);
152#endif
153
154#ifdef CONFIG_SECURITY
155 new->security = NULL;
156#endif
157
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error;
160 return new;
161
162error:
163 abort_creds(new);
164 return NULL;
165}
166EXPORT_SYMBOL(prepare_creds);
167
168/*
169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex
171 */
172struct cred *prepare_exec_creds(void)
173{
174 struct thread_group_cred *tgcred = NULL;
175 struct cred *new;
176
177#ifdef CONFIG_KEYS
178 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
179 if (!tgcred)
180 return NULL;
181#endif
182
183 new = prepare_creds();
184 if (!new) {
185 kfree(tgcred);
186 return new;
187 }
188
189#ifdef CONFIG_KEYS
190 /* newly exec'd tasks don't get a thread keyring */
191 key_put(new->thread_keyring);
192 new->thread_keyring = NULL;
193
194 /* create a new per-thread-group creds for all this set of threads to
195 * share */
196 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
197
198 atomic_set(&tgcred->usage, 1);
199 spin_lock_init(&tgcred->lock);
200
201 /* inherit the session keyring; new process keyring */
202 key_get(tgcred->session_keyring);
203 tgcred->process_keyring = NULL;
204
205 release_tgcred(new);
206 new->tgcred = tgcred;
207#endif
208
209 return new;
210}
211
212/*
213 * prepare new credentials for the usermode helper dispatcher
214 */
215struct cred *prepare_usermodehelper_creds(void)
216{
217#ifdef CONFIG_KEYS
218 struct thread_group_cred *tgcred = NULL;
219#endif
220 struct cred *new;
221
222#ifdef CONFIG_KEYS
223 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
224 if (!tgcred)
225 return NULL;
226#endif
227
228 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
229 if (!new)
230 return NULL;
231
232 memcpy(new, &init_cred, sizeof(struct cred));
233
234 atomic_set(&new->usage, 1);
235 get_group_info(new->group_info);
236 get_uid(new->user);
237
238#ifdef CONFIG_KEYS
239 new->thread_keyring = NULL;
240 new->request_key_auth = NULL;
241 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
242
243 atomic_set(&tgcred->usage, 1);
244 spin_lock_init(&tgcred->lock);
245 new->tgcred = tgcred;
246#endif
247
248#ifdef CONFIG_SECURITY
249 new->security = NULL;
250#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error;
253
254 BUG_ON(atomic_read(&new->usage) != 1);
255 return new;
256
257error:
258 put_cred(new);
259 return NULL;
260}
261
262/*
263 * Copy credentials for the new process created by fork()
264 *
265 * We share if we can, but under some circumstances we have to generate a new
266 * set.
267 *
268 * The new process gets the current process's subjective credentials as its
269 * objective and subjective credentials
270 */
271int copy_creds(struct task_struct *p, unsigned long clone_flags)
272{
273#ifdef CONFIG_KEYS
274 struct thread_group_cred *tgcred;
275#endif
276 struct cred *new;
277 int ret;
278
279 mutex_init(&p->cred_exec_mutex);
280
281 if (
282#ifdef CONFIG_KEYS
283 !p->cred->thread_keyring &&
284#endif
285 clone_flags & CLONE_THREAD
286 ) {
287 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred);
289 atomic_inc(&p->cred->user->processes);
290 return 0;
291 }
292
293 new = prepare_creds();
294 if (!new)
295 return -ENOMEM;
296
297 if (clone_flags & CLONE_NEWUSER) {
298 ret = create_user_ns(new);
299 if (ret < 0)
300 goto error_put;
301 }
302
303#ifdef CONFIG_KEYS
304 /* new threads get their own thread keyrings if their parent already
305 * had one */
306 if (new->thread_keyring) {
307 key_put(new->thread_keyring);
308 new->thread_keyring = NULL;
309 if (clone_flags & CLONE_THREAD)
310 install_thread_keyring_to_cred(new);
311 }
312
313 /* we share the process and session keyrings between all the threads in
314 * a process - this is slightly icky as we violate COW credentials a
315 * bit */
316 if (!(clone_flags & CLONE_THREAD)) {
317 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
318 if (!tgcred) {
319 ret = -ENOMEM;
320 goto error_put;
321 }
322 atomic_set(&tgcred->usage, 1);
323 spin_lock_init(&tgcred->lock);
324 tgcred->process_keyring = NULL;
325 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
326
327 release_tgcred(new);
328 new->tgcred = tgcred;
329 }
330#endif
331
332 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new);
334 return 0;
335
336error_put:
337 put_cred(new);
338 return ret;
339}
340
341/**
342 * commit_creds - Install new credentials upon the current task
343 * @new: The credentials to be assigned
344 *
345 * Install a new set of credentials to the current task, using RCU to replace
346 * the old set. Both the objective and the subjective credentials pointers are
347 * updated. This function may not be called if the subjective credentials are
348 * in an overridden state.
349 *
350 * This function eats the caller's reference to the new credentials.
351 *
352 * Always returns 0 thus allowing this function to be tail-called at the end
353 * of, say, sys_setgid().
354 */
355int commit_creds(struct cred *new)
356{
357 struct task_struct *task = current;
358 const struct cred *old;
359
360 BUG_ON(task->cred != task->real_cred);
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2);
362 BUG_ON(atomic_read(&new->usage) < 1);
363
364 old = task->real_cred;
365 security_commit_creds(new, old);
366
367 get_cred(new); /* we will require a ref for the subj creds too */
368
369 /* dumpability changes */
370 if (old->euid != new->euid ||
371 old->egid != new->egid ||
372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0;
377 smp_wmb();
378 }
379
380 /* alter the thread keyring */
381 if (new->fsuid != old->fsuid)
382 key_fsuid_changed(task);
383 if (new->fsgid != old->fsgid)
384 key_fsgid_changed(task);
385
386 /* do it
387 * - What if a process setreuid()'s and this brings the
388 * new uid over his NPROC rlimit? We can check this now
389 * cheaply with the new uid cache, so if it matters
390 * we should be checking for it. -DaveM
391 */
392 if (new->user != old->user)
393 atomic_inc(&new->user->processes);
394 rcu_assign_pointer(task->real_cred, new);
395 rcu_assign_pointer(task->cred, new);
396 if (new->user != old->user)
397 atomic_dec(&old->user->processes);
398
399 sched_switch_user(task);
400
401 /* send notifications */
402 if (new->uid != old->uid ||
403 new->euid != old->euid ||
404 new->suid != old->suid ||
405 new->fsuid != old->fsuid)
406 proc_id_connector(task, PROC_EVENT_UID);
407
408 if (new->gid != old->gid ||
409 new->egid != old->egid ||
410 new->sgid != old->sgid ||
411 new->fsgid != old->fsgid)
412 proc_id_connector(task, PROC_EVENT_GID);
413
414 /* release the old obj and subj refs both */
415 put_cred(old);
416 put_cred(old);
417 return 0;
418}
419EXPORT_SYMBOL(commit_creds);
420
421/**
422 * abort_creds - Discard a set of credentials and unlock the current task
423 * @new: The credentials that were going to be applied
424 *
425 * Discard a set of credentials that were under construction and unlock the
426 * current task.
427 */
428void abort_creds(struct cred *new)
429{
430 BUG_ON(atomic_read(&new->usage) < 1);
431 put_cred(new);
432}
433EXPORT_SYMBOL(abort_creds);
434
435/**
436 * override_creds - Override the current process's subjective credentials
437 * @new: The credentials to be assigned
438 *
439 * Install a set of temporary override subjective credentials on the current
440 * process, returning the old set for later reversion.
441 */
442const struct cred *override_creds(const struct cred *new)
443{
444 const struct cred *old = current->cred;
445
446 rcu_assign_pointer(current->cred, get_cred(new));
447 return old;
448}
449EXPORT_SYMBOL(override_creds);
450
451/**
452 * revert_creds - Revert a temporary subjective credentials override
453 * @old: The credentials to be restored
454 *
455 * Revert a temporary set of override subjective credentials to an old set,
456 * discarding the override set.
457 */
458void revert_creds(const struct cred *old)
459{
460 const struct cred *override = current->cred;
461
462 rcu_assign_pointer(current->cred, old);
463 put_cred(override);
464}
465EXPORT_SYMBOL(revert_creds);
466
467/*
468 * initialise the credentials stuff
469 */
470void __init cred_init(void)
471{
472 /* allocate a slab in which we can store credentials */
473 cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
474 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
475}
476
477/**
478 * prepare_kernel_cred - Prepare a set of credentials for a kernel service
479 * @daemon: A userspace daemon to be used as a reference
480 *
481 * Prepare a set of credentials for a kernel service. This can then be used to
482 * override a task's own credentials so that work can be done on behalf of that
483 * task that requires a different subjective context.
484 *
485 * @daemon is used to provide a base for the security record, but can be NULL.
486 * If @daemon is supplied, then the security data will be derived from that;
487 * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
488 *
489 * The caller may change these controls afterwards if desired.
490 *
491 * Returns the new credentials or NULL if out of memory.
492 *
493 * Does not take, and does not return holding current->cred_replace_mutex.
494 */
495struct cred *prepare_kernel_cred(struct task_struct *daemon)
496{
497 const struct cred *old;
498 struct cred *new;
499
500 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
501 if (!new)
502 return NULL;
503
504 if (daemon)
505 old = get_task_cred(daemon);
506 else
507 old = get_cred(&init_cred);
508
509 get_uid(new->user);
510 get_group_info(new->group_info);
511
512#ifdef CONFIG_KEYS
513 atomic_inc(&init_tgcred.usage);
514 new->tgcred = &init_tgcred;
515 new->request_key_auth = NULL;
516 new->thread_keyring = NULL;
517 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
518#endif
519
520#ifdef CONFIG_SECURITY
521 new->security = NULL;
522#endif
523 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
524 goto error;
525
526 atomic_set(&new->usage, 1);
527 put_cred(old);
528 return new;
529
530error:
531 put_cred(new);
532 return NULL;
533}
534EXPORT_SYMBOL(prepare_kernel_cred);
535
536/**
537 * set_security_override - Set the security ID in a set of credentials
538 * @new: The credentials to alter
539 * @secid: The LSM security ID to set
540 *
541 * Set the LSM security ID in a set of credentials so that the subjective
542 * security is overridden when an alternative set of credentials is used.
543 */
544int set_security_override(struct cred *new, u32 secid)
545{
546 return security_kernel_act_as(new, secid);
547}
548EXPORT_SYMBOL(set_security_override);
549
550/**
551 * set_security_override_from_ctx - Set the security ID in a set of credentials
552 * @new: The credentials to alter
553 * @secctx: The LSM security context to generate the security ID from.
554 *
555 * Set the LSM security ID in a set of credentials so that the subjective
556 * security is overridden when an alternative set of credentials is used. The
557 * security ID is specified in string form as a security context to be
558 * interpreted by the LSM.
559 */
560int set_security_override_from_ctx(struct cred *new, const char *secctx)
561{
562 u32 secid;
563 int ret;
564
565 ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
566 if (ret < 0)
567 return ret;
568
569 return set_security_override(new, secid);
570}
571EXPORT_SYMBOL(set_security_override_from_ctx);
572
573/**
574 * set_create_files_as - Set the LSM file create context in a set of credentials
575 * @new: The credentials to alter
576 * @inode: The inode to take the context from
577 *
578 * Change the LSM file creation context in a set of credentials to be the same
579 * as the object context of the specified inode, so that the new inodes have
580 * the same MAC context as that inode.
581 */
582int set_create_files_as(struct cred *new, struct inode *inode)
583{
584 new->fsuid = inode->i_uid;
585 new->fsgid = inode->i_gid;
586 return security_kernel_create_files_as(new, inode);
587}
588EXPORT_SYMBOL(set_create_files_as);
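kernel/cred.c above defines a copy-modify-commit model for task credentials. As a hedged sketch of the calling convention only (the helper names and the fsuid change are illustrative, not taken from the patch), the two common patterns look roughly like this:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Sketch: change current's fsuid via prepare_creds()/commit_creds(). */
static int example_change_fsuid(uid_t fsuid)
{
	struct cred *new = prepare_creds();

	if (!new)
		return -ENOMEM;
	new->fsuid = fsuid;		/* modify the private copy */
	return commit_creds(new);	/* installs it and eats the ref */
}

/* Sketch: temporarily act with an alternative subjective credential
 * set, e.g. one built with prepare_kernel_cred() above. */
static void example_do_work_as(const struct cred *kcred)
{
	const struct cred *old = override_creds(kcred);

	/* ... perform the work under kcred here ... */

	revert_creds(old);
}

Note that, per the comment on commit_creds() above, commit_creds() must not be called while a temporary override installed by override_creds() is still in effect.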
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71be..abb6e17505e2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
127 */ 127 */
128 t1 = tsk->sched_info.pcount; 128 t1 = tsk->sched_info.pcount;
129 t2 = tsk->sched_info.run_delay; 129 t2 = tsk->sched_info.run_delay;
130 t3 = tsk->sched_info.cpu_time; 130 t3 = tsk->se.sum_exec_runtime;
131 131
132 d->cpu_count += t1; 132 d->cpu_count += t1;
133 133
diff --git a/kernel/dma.c b/kernel/dma.c
index d2c60a822790..f903189c5304 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,4 +1,4 @@
1/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ 1/*
2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. 2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
3 * 3 *
4 * Written by Hennus Bergman, 1992. 4 * Written by Hennus Bergman, 1992.
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0d407e886735..0511716e9424 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -12,7 +12,9 @@
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/proc_fs.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h>
16#include <linux/syscalls.h> 18#include <linux/syscalls.h>
17#include <linux/sysctl.h> 19#include <linux/sysctl.h>
18#include <linux/types.h> 20#include <linux/types.h>
@@ -173,20 +175,39 @@ __set_personality(u_long personality)
173 return 0; 175 return 0;
174} 176}
175 177
176int 178#ifdef CONFIG_PROC_FS
177get_exec_domain_list(char *page) 179static int execdomains_proc_show(struct seq_file *m, void *v)
178{ 180{
179 struct exec_domain *ep; 181 struct exec_domain *ep;
180 int len = 0;
181 182
182 read_lock(&exec_domains_lock); 183 read_lock(&exec_domains_lock);
183 for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) 184 for (ep = exec_domains; ep; ep = ep->next)
184 len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", 185 seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
185 ep->pers_low, ep->pers_high, ep->name, 186 ep->pers_low, ep->pers_high, ep->name,
186 module_name(ep->module)); 187 module_name(ep->module));
187 read_unlock(&exec_domains_lock); 188 read_unlock(&exec_domains_lock);
188 return (len); 189 return 0;
190}
191
192static int execdomains_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, execdomains_proc_show, NULL);
195}
196
197static const struct file_operations execdomains_proc_fops = {
198 .open = execdomains_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
203
204static int __init proc_execdomains_init(void)
205{
206 proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
207 return 0;
189} 208}
209module_init(proc_execdomains_init);
210#endif
190 211
191asmlinkage long 212asmlinkage long
192sys_personality(u_long personality) 213sys_personality(u_long personality)
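The exec_domain.c hunk above replaces the old get_exec_domain_list() page formatter with the seq_file single_open() pattern. A hedged, generic sketch of that same boilerplate follows; the "foo" names are placeholders, not anything added by this patch:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int foo_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello from /proc/foo\n");
	return 0;
}

static int foo_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, foo_proc_show, NULL);
}

static const struct file_operations foo_proc_fops = {
	.open		= foo_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init foo_proc_init(void)
{
	proc_create("foo", 0, NULL, &foo_proc_fops);
	return 0;
}
module_init(foo_proc_init);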
diff --git a/kernel/exit.c b/kernel/exit.c
index c8d0485578be..e69edc74aeeb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,18 +40,24 @@
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
44#include <linux/pipe_fs_i.h> 43#include <linux/pipe_fs_i.h>
45#include <linux/audit.h> /* for audit_free() */ 44#include <linux/audit.h> /* for audit_free() */
46#include <linux/resource.h> 45#include <linux/resource.h>
47#include <linux/blkdev.h> 46#include <linux/blkdev.h>
48#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
49#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/init_task.h>
50#include <trace/sched.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52#include <asm/unistd.h> 53#include <asm/unistd.h>
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
56#include "cred-internals.h"
57
58DEFINE_TRACE(sched_process_free);
59DEFINE_TRACE(sched_process_exit);
60DEFINE_TRACE(sched_process_wait);
55 61
56static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct * tsk);
57 63
@@ -112,8 +118,6 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 118 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 119 * will have been the last reference on the signal_struct.
114 */ 120 */
115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 121 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
118 sig->min_flt += tsk->min_flt; 122 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 123 sig->maj_flt += tsk->maj_flt;
@@ -122,7 +126,6 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->inblock += task_io_get_inblock(tsk); 126 sig->inblock += task_io_get_inblock(tsk);
123 sig->oublock += task_io_get_oublock(tsk); 127 sig->oublock += task_io_get_oublock(tsk);
124 task_io_accounting_add(&sig->ioac, &tsk->ioac); 128 task_io_accounting_add(&sig->ioac, &tsk->ioac);
125 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
126 sig = NULL; /* Marker for below. */ 129 sig = NULL; /* Marker for below. */
127 } 130 }
128 131
@@ -143,13 +146,21 @@ static void __exit_signal(struct task_struct *tsk)
143 if (sig) { 146 if (sig) {
144 flush_sigqueue(&sig->shared_pending); 147 flush_sigqueue(&sig->shared_pending);
145 taskstats_tgid_free(sig); 148 taskstats_tgid_free(sig);
149 /*
150 * Make sure ->signal can't go away under rq->lock,
151 * see account_group_exec_runtime().
152 */
153 task_rq_unlock_wait(tsk);
146 __cleanup_signal(sig); 154 __cleanup_signal(sig);
147 } 155 }
148} 156}
149 157
150static void delayed_put_task_struct(struct rcu_head *rhp) 158static void delayed_put_task_struct(struct rcu_head *rhp)
151{ 159{
152 put_task_struct(container_of(rhp, struct task_struct, rcu)); 160 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
161
162 trace_sched_process_free(tsk);
163 put_task_struct(tsk);
153} 164}
154 165
155 166
@@ -159,7 +170,10 @@ void release_task(struct task_struct * p)
159 int zap_leader; 170 int zap_leader;
160repeat: 171repeat:
161 tracehook_prepare_release_task(p); 172 tracehook_prepare_release_task(p);
162 atomic_dec(&p->user->processes); 173 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials */
175 atomic_dec(&__task_cred(p)->user->processes);
176
163 proc_flush_task(p); 177 proc_flush_task(p);
164 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
165 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
@@ -334,12 +348,12 @@ static void reparent_to_kthreadd(void)
334 /* cpus_allowed? */ 348 /* cpus_allowed? */
335 /* rt_priority? */ 349 /* rt_priority? */
336 /* signals? */ 350 /* signals? */
337 security_task_reparent_to_init(current);
338 memcpy(current->signal->rlim, init_task.signal->rlim, 351 memcpy(current->signal->rlim, init_task.signal->rlim,
339 sizeof(current->signal->rlim)); 352 sizeof(current->signal->rlim));
340 atomic_inc(&(INIT_USER->__count)); 353
354 atomic_inc(&init_cred.usage);
355 commit_creds(&init_cred);
341 write_unlock_irq(&tasklist_lock); 356 write_unlock_irq(&tasklist_lock);
342 switch_uid(INIT_USER);
343} 357}
344 358
345void __set_special_pids(struct pid *pid) 359void __set_special_pids(struct pid *pid)
@@ -640,24 +654,23 @@ retry:
640assign_new_owner: 654assign_new_owner:
641 BUG_ON(c == p); 655 BUG_ON(c == p);
642 get_task_struct(c); 656 get_task_struct(c);
657 read_unlock(&tasklist_lock);
658 down_write(&mm->mmap_sem);
643 /* 659 /*
644 * The task_lock protects c->mm from changing. 660 * The task_lock protects c->mm from changing.
645 * We always want mm->owner->mm == mm 661 * We always want mm->owner->mm == mm
646 */ 662 */
647 task_lock(c); 663 task_lock(c);
648 /*
649 * Delay read_unlock() till we have the task_lock()
650 * to ensure that c does not slip away underneath us
651 */
652 read_unlock(&tasklist_lock);
653 if (c->mm != mm) { 664 if (c->mm != mm) {
654 task_unlock(c); 665 task_unlock(c);
666 up_write(&mm->mmap_sem);
655 put_task_struct(c); 667 put_task_struct(c);
656 goto retry; 668 goto retry;
657 } 669 }
658 cgroup_mm_owner_callbacks(mm->owner, c); 670 cgroup_mm_owner_callbacks(mm->owner, c);
659 mm->owner = c; 671 mm->owner = c;
660 task_unlock(c); 672 task_unlock(c);
673 up_write(&mm->mmap_sem);
661 put_task_struct(c); 674 put_task_struct(c);
662} 675}
663#endif /* CONFIG_MM_OWNER */ 676#endif /* CONFIG_MM_OWNER */
@@ -1021,8 +1034,6 @@ NORET_TYPE void do_exit(long code)
1021 * task into the wait for ever nirwana as well. 1034 * task into the wait for ever nirwana as well.
1022 */ 1035 */
1023 tsk->flags |= PF_EXITPIDONE; 1036 tsk->flags |= PF_EXITPIDONE;
1024 if (tsk->io_context)
1025 exit_io_context();
1026 set_current_state(TASK_UNINTERRUPTIBLE); 1037 set_current_state(TASK_UNINTERRUPTIBLE);
1027 schedule(); 1038 schedule();
1028 } 1039 }
@@ -1051,14 +1062,6 @@ NORET_TYPE void do_exit(long code)
1051 exit_itimers(tsk->signal); 1062 exit_itimers(tsk->signal);
1052 } 1063 }
1053 acct_collect(code, group_dead); 1064 acct_collect(code, group_dead);
1054#ifdef CONFIG_FUTEX
1055 if (unlikely(tsk->robust_list))
1056 exit_robust_list(tsk);
1057#ifdef CONFIG_COMPAT
1058 if (unlikely(tsk->compat_robust_list))
1059 compat_exit_robust_list(tsk);
1060#endif
1061#endif
1062 if (group_dead) 1065 if (group_dead)
1063 tty_audit_exit(); 1066 tty_audit_exit();
1064 if (unlikely(tsk->audit_context)) 1067 if (unlikely(tsk->audit_context))
@@ -1071,13 +1074,14 @@ NORET_TYPE void do_exit(long code)
1071 1074
1072 if (group_dead) 1075 if (group_dead)
1073 acct_process(); 1076 acct_process();
1077 trace_sched_process_exit(tsk);
1078
1074 exit_sem(tsk); 1079 exit_sem(tsk);
1075 exit_files(tsk); 1080 exit_files(tsk);
1076 exit_fs(tsk); 1081 exit_fs(tsk);
1077 check_stack_usage(); 1082 check_stack_usage();
1078 exit_thread(); 1083 exit_thread();
1079 cgroup_exit(tsk, 1); 1084 cgroup_exit(tsk, 1);
1080 exit_keys(tsk);
1081 1085
1082 if (group_dead && tsk->signal->leader) 1086 if (group_dead && tsk->signal->leader)
1083 disassociate_ctty(1); 1087 disassociate_ctty(1);
@@ -1122,7 +1126,6 @@ NORET_TYPE void do_exit(long code)
1122 preempt_disable(); 1126 preempt_disable();
1123 /* causes final put_task_struct in finish_task_switch(). */ 1127 /* causes final put_task_struct in finish_task_switch(). */
1124 tsk->state = TASK_DEAD; 1128 tsk->state = TASK_DEAD;
1125
1126 schedule(); 1129 schedule();
1127 BUG(); 1130 BUG();
1128 /* Avoid "noreturn function does return". */ 1131 /* Avoid "noreturn function does return". */
@@ -1262,12 +1265,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1262 unsigned long state; 1265 unsigned long state;
1263 int retval, status, traced; 1266 int retval, status, traced;
1264 pid_t pid = task_pid_vnr(p); 1267 pid_t pid = task_pid_vnr(p);
1268 uid_t uid = __task_cred(p)->uid;
1265 1269
1266 if (!likely(options & WEXITED)) 1270 if (!likely(options & WEXITED))
1267 return 0; 1271 return 0;
1268 1272
1269 if (unlikely(options & WNOWAIT)) { 1273 if (unlikely(options & WNOWAIT)) {
1270 uid_t uid = p->uid;
1271 int exit_code = p->exit_code; 1274 int exit_code = p->exit_code;
1272 int why, status; 1275 int why, status;
1273 1276
@@ -1299,6 +1302,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1299 if (likely(!traced)) { 1302 if (likely(!traced)) {
1300 struct signal_struct *psig; 1303 struct signal_struct *psig;
1301 struct signal_struct *sig; 1304 struct signal_struct *sig;
1305 struct task_cputime cputime;
1302 1306
1303 /* 1307 /*
1304 * The resource counters for the group leader are in its 1308 * The resource counters for the group leader are in its
@@ -1314,20 +1318,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1314 * need to protect the access to p->parent->signal fields, 1318 * need to protect the access to p->parent->signal fields,
1315 * as other threads in the parent group can be right 1319 * as other threads in the parent group can be right
1316 * here reaping other children at the same time. 1320 * here reaping other children at the same time.
1321 *
1322 * We use thread_group_cputime() to get times for the thread
1323 * group, which consolidates times for all threads in the
1324 * group including the group leader.
1317 */ 1325 */
1326 thread_group_cputime(p, &cputime);
1318 spin_lock_irq(&p->parent->sighand->siglock); 1327 spin_lock_irq(&p->parent->sighand->siglock);
1319 psig = p->parent->signal; 1328 psig = p->parent->signal;
1320 sig = p->signal; 1329 sig = p->signal;
1321 psig->cutime = 1330 psig->cutime =
1322 cputime_add(psig->cutime, 1331 cputime_add(psig->cutime,
1323 cputime_add(p->utime, 1332 cputime_add(cputime.utime,
1324 cputime_add(sig->utime, 1333 sig->cutime));
1325 sig->cutime)));
1326 psig->cstime = 1334 psig->cstime =
1327 cputime_add(psig->cstime, 1335 cputime_add(psig->cstime,
1328 cputime_add(p->stime, 1336 cputime_add(cputime.stime,
1329 cputime_add(sig->stime, 1337 sig->cstime));
1330 sig->cstime)));
1331 psig->cgtime = 1338 psig->cgtime =
1332 cputime_add(psig->cgtime, 1339 cputime_add(psig->cgtime,
1333 cputime_add(p->gtime, 1340 cputime_add(p->gtime,
@@ -1384,7 +1391,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1384 if (!retval && infop) 1391 if (!retval && infop)
1385 retval = put_user(pid, &infop->si_pid); 1392 retval = put_user(pid, &infop->si_pid);
1386 if (!retval && infop) 1393 if (!retval && infop)
1387 retval = put_user(p->uid, &infop->si_uid); 1394 retval = put_user(uid, &infop->si_uid);
1388 if (!retval) 1395 if (!retval)
1389 retval = pid; 1396 retval = pid;
1390 1397
@@ -1449,7 +1456,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1449 if (!unlikely(options & WNOWAIT)) 1456 if (!unlikely(options & WNOWAIT))
1450 p->exit_code = 0; 1457 p->exit_code = 0;
1451 1458
1452 uid = p->uid; 1459 /* don't need the RCU readlock here as we're holding a spinlock */
1460 uid = __task_cred(p)->uid;
1453unlock_sig: 1461unlock_sig:
1454 spin_unlock_irq(&p->sighand->siglock); 1462 spin_unlock_irq(&p->sighand->siglock);
1455 if (!exit_code) 1463 if (!exit_code)
@@ -1523,10 +1531,10 @@ static int wait_task_continued(struct task_struct *p, int options,
1523 } 1531 }
1524 if (!unlikely(options & WNOWAIT)) 1532 if (!unlikely(options & WNOWAIT))
1525 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1533 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1534 uid = __task_cred(p)->uid;
1526 spin_unlock_irq(&p->sighand->siglock); 1535 spin_unlock_irq(&p->sighand->siglock);
1527 1536
1528 pid = task_pid_vnr(p); 1537 pid = task_pid_vnr(p);
1529 uid = p->uid;
1530 get_task_struct(p); 1538 get_task_struct(p);
1531 read_unlock(&tasklist_lock); 1539 read_unlock(&tasklist_lock);
1532 1540
@@ -1672,6 +1680,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options,
1672 struct task_struct *tsk; 1680 struct task_struct *tsk;
1673 int retval; 1681 int retval;
1674 1682
1683 trace_sched_process_wait(pid);
1684
1675 add_wait_queue(&current->signal->wait_chldexit,&wait); 1685 add_wait_queue(&current->signal->wait_chldexit,&wait);
1676repeat: 1686repeat:
1677 /* 1687 /*
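The exit.c changes above consistently read another task's uid through __task_cred() instead of the old p->uid field. A rough sketch of the general pattern (the helper name is illustrative; outside the special cases noted in the hunks, the RCU read lock is what keeps the credential structure alive):

#include <linux/cred.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static uid_t example_task_uid(struct task_struct *p)
{
	uid_t uid;

	rcu_read_lock();
	uid = __task_cred(p)->uid;	/* objective creds of @p */
	rcu_read_unlock();

	return uid;
}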
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
17*/ 17*/
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/ftrace.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
22 23
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 41 return e;
41} 42}
42 43
43int core_kernel_text(unsigned long addr) 44__notrace_funcgraph int core_kernel_text(unsigned long addr)
44{ 45{
45 if (addr >= (unsigned long)_stext && 46 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 47 addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
53 return 0; 54 return 0;
54} 55}
55 56
56int __kernel_text_address(unsigned long addr) 57__notrace_funcgraph int __kernel_text_address(unsigned long addr)
57{ 58{
58 if (core_kernel_text(addr)) 59 if (core_kernel_text(addr))
59 return 1; 60 return 1;
@@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr)
66 return 1; 67 return 1;
67 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
68} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
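func_ptr_is_kernel_text() above exists because on PPC64 and IA64 a C function pointer is a descriptor rather than a direct text address. A hedged example of the kind of sanity check it enables (the caller and message are illustrative, and the extern declaration is assumed to be visible through a kernel header):

#include <linux/errno.h>
#include <linux/kernel.h>

/* Sketch: refuse to register a callback whose (dereferenced) address
 * lies in neither kernel nor module text. */
static int example_check_callback(void *fn)
{
	if (!func_ptr_is_kernel_text(fn)) {
		printk(KERN_ERR "callback %p is not in kernel text\n", fn);
		return -EINVAL;
	}
	return 0;
}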
diff --git a/kernel/fork.c b/kernel/fork.c
index 99c5c655b098..913284e3db14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,12 +40,14 @@
40#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h> 41#include <linux/tracehook.h>
42#include <linux/futex.h> 42#include <linux/futex.h>
43#include <linux/compat.h>
43#include <linux/task_io_accounting_ops.h> 44#include <linux/task_io_accounting_ops.h>
44#include <linux/rcupdate.h> 45#include <linux/rcupdate.h>
45#include <linux/ptrace.h> 46#include <linux/ptrace.h>
46#include <linux/mount.h> 47#include <linux/mount.h>
47#include <linux/audit.h> 48#include <linux/audit.h>
48#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
50#include <linux/ftrace.h>
49#include <linux/profile.h> 51#include <linux/profile.h>
50#include <linux/rmap.h> 52#include <linux/rmap.h>
51#include <linux/acct.h> 53#include <linux/acct.h>
@@ -58,6 +60,7 @@
58#include <linux/tty.h> 60#include <linux/tty.h>
59#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
60#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <trace/sched.h>
61#include <linux/magic.h> 64#include <linux/magic.h>
62 65
63#include <asm/pgtable.h> 66#include <asm/pgtable.h>
@@ -79,6 +82,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
79 82
80__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 83__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
81 84
85DEFINE_TRACE(sched_process_fork);
86
82int nr_processes(void) 87int nr_processes(void)
83{ 88{
84 int cpu; 89 int cpu;
@@ -136,6 +141,7 @@ void free_task(struct task_struct *tsk)
136 prop_local_destroy_single(&tsk->dirties); 141 prop_local_destroy_single(&tsk->dirties);
137 free_thread_info(tsk->stack); 142 free_thread_info(tsk->stack);
138 rt_mutex_debug_task_free(tsk); 143 rt_mutex_debug_task_free(tsk);
144 ftrace_graph_exit_task(tsk);
139 free_task_struct(tsk); 145 free_task_struct(tsk);
140} 146}
141EXPORT_SYMBOL(free_task); 147EXPORT_SYMBOL(free_task);
@@ -146,9 +152,8 @@ void __put_task_struct(struct task_struct *tsk)
146 WARN_ON(atomic_read(&tsk->usage)); 152 WARN_ON(atomic_read(&tsk->usage));
147 WARN_ON(tsk == current); 153 WARN_ON(tsk == current);
148 154
149 security_task_free(tsk); 155 put_cred(tsk->real_cred);
150 free_uid(tsk->user); 156 put_cred(tsk->cred);
151 put_group_info(tsk->group_info);
152 delayacct_tsk_free(tsk); 157 delayacct_tsk_free(tsk);
153 158
154 if (!profile_handoff_task(tsk)) 159 if (!profile_handoff_task(tsk))
@@ -318,17 +323,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
318 file = tmp->vm_file; 323 file = tmp->vm_file;
319 if (file) { 324 if (file) {
320 struct inode *inode = file->f_path.dentry->d_inode; 325 struct inode *inode = file->f_path.dentry->d_inode;
326 struct address_space *mapping = file->f_mapping;
327
321 get_file(file); 328 get_file(file);
322 if (tmp->vm_flags & VM_DENYWRITE) 329 if (tmp->vm_flags & VM_DENYWRITE)
323 atomic_dec(&inode->i_writecount); 330 atomic_dec(&inode->i_writecount);
324 331 spin_lock(&mapping->i_mmap_lock);
325 /* insert tmp into the share list, just after mpnt */ 332 if (tmp->vm_flags & VM_SHARED)
326 spin_lock(&file->f_mapping->i_mmap_lock); 333 mapping->i_mmap_writable++;
327 tmp->vm_truncate_count = mpnt->vm_truncate_count; 334 tmp->vm_truncate_count = mpnt->vm_truncate_count;
328 flush_dcache_mmap_lock(file->f_mapping); 335 flush_dcache_mmap_lock(mapping);
336 /* insert tmp into the share list, just after mpnt */
329 vma_prio_tree_add(tmp, mpnt); 337 vma_prio_tree_add(tmp, mpnt);
330 flush_dcache_mmap_unlock(file->f_mapping); 338 flush_dcache_mmap_unlock(mapping);
331 spin_unlock(&file->f_mapping->i_mmap_lock); 339 spin_unlock(&mapping->i_mmap_lock);
332 } 340 }
333 341
334 /* 342 /*
@@ -412,8 +420,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
412 set_mm_counter(mm, file_rss, 0); 420 set_mm_counter(mm, file_rss, 0);
413 set_mm_counter(mm, anon_rss, 0); 421 set_mm_counter(mm, anon_rss, 0);
414 spin_lock_init(&mm->page_table_lock); 422 spin_lock_init(&mm->page_table_lock);
415 rwlock_init(&mm->ioctx_list_lock); 423 spin_lock_init(&mm->ioctx_lock);
416 mm->ioctx_list = NULL; 424 INIT_HLIST_HEAD(&mm->ioctx_list);
417 mm->free_area_cache = TASK_UNMAPPED_BASE; 425 mm->free_area_cache = TASK_UNMAPPED_BASE;
418 mm->cached_hole_size = ~0UL; 426 mm->cached_hole_size = ~0UL;
419 mm_init_owner(mm, p); 427 mm_init_owner(mm, p);
@@ -523,6 +531,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
523{ 531{
524 struct completion *vfork_done = tsk->vfork_done; 532 struct completion *vfork_done = tsk->vfork_done;
525 533
534 /* Get rid of any futexes when releasing the mm */
535#ifdef CONFIG_FUTEX
536 if (unlikely(tsk->robust_list))
537 exit_robust_list(tsk);
538#ifdef CONFIG_COMPAT
539 if (unlikely(tsk->compat_robust_list))
540 compat_exit_robust_list(tsk);
541#endif
542#endif
543
526 /* Get rid of any cached register state */ 544 /* Get rid of any cached register state */
527 deactivate_mm(tsk, mm); 545 deactivate_mm(tsk, mm);
528 546
@@ -764,27 +782,50 @@ void __cleanup_sighand(struct sighand_struct *sighand)
764 kmem_cache_free(sighand_cachep, sighand); 782 kmem_cache_free(sighand_cachep, sighand);
765} 783}
766 784
785
786/*
787 * Initialize POSIX timer handling for a thread group.
788 */
789static void posix_cpu_timers_init_group(struct signal_struct *sig)
790{
791 /* Thread group counters. */
792 thread_group_cputime_init(sig);
793
794 /* Expiration times and increments. */
795 sig->it_virt_expires = cputime_zero;
796 sig->it_virt_incr = cputime_zero;
797 sig->it_prof_expires = cputime_zero;
798 sig->it_prof_incr = cputime_zero;
799
800 /* Cached expiration times. */
801 sig->cputime_expires.prof_exp = cputime_zero;
802 sig->cputime_expires.virt_exp = cputime_zero;
803 sig->cputime_expires.sched_exp = 0;
804
805 /* The timer lists. */
806 INIT_LIST_HEAD(&sig->cpu_timers[0]);
807 INIT_LIST_HEAD(&sig->cpu_timers[1]);
808 INIT_LIST_HEAD(&sig->cpu_timers[2]);
809}
810
767static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 811static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
768{ 812{
769 struct signal_struct *sig; 813 struct signal_struct *sig;
770 int ret; 814 int ret;
771 815
772 if (clone_flags & CLONE_THREAD) { 816 if (clone_flags & CLONE_THREAD) {
773 atomic_inc(&current->signal->count); 817 ret = thread_group_cputime_clone_thread(current);
774 atomic_inc(&current->signal->live); 818 if (likely(!ret)) {
775 return 0; 819 atomic_inc(&current->signal->count);
820 atomic_inc(&current->signal->live);
821 }
822 return ret;
776 } 823 }
777 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 824 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
778 tsk->signal = sig; 825 tsk->signal = sig;
779 if (!sig) 826 if (!sig)
780 return -ENOMEM; 827 return -ENOMEM;
781 828
782 ret = copy_thread_group_keys(tsk);
783 if (ret < 0) {
784 kmem_cache_free(signal_cachep, sig);
785 return ret;
786 }
787
788 atomic_set(&sig->count, 1); 829 atomic_set(&sig->count, 1);
789 atomic_set(&sig->live, 1); 830 atomic_set(&sig->live, 1);
790 init_waitqueue_head(&sig->wait_chldexit); 831 init_waitqueue_head(&sig->wait_chldexit);
@@ -800,40 +841,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
800 sig->it_real_incr.tv64 = 0; 841 sig->it_real_incr.tv64 = 0;
801 sig->real_timer.function = it_real_fn; 842 sig->real_timer.function = it_real_fn;
802 843
803 sig->it_virt_expires = cputime_zero;
804 sig->it_virt_incr = cputime_zero;
805 sig->it_prof_expires = cputime_zero;
806 sig->it_prof_incr = cputime_zero;
807
808 sig->leader = 0; /* session leadership doesn't inherit */ 844 sig->leader = 0; /* session leadership doesn't inherit */
809 sig->tty_old_pgrp = NULL; 845 sig->tty_old_pgrp = NULL;
810 sig->tty = NULL; 846 sig->tty = NULL;
811 847
812 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 848 sig->cutime = sig->cstime = cputime_zero;
813 sig->gtime = cputime_zero; 849 sig->gtime = cputime_zero;
814 sig->cgtime = cputime_zero; 850 sig->cgtime = cputime_zero;
815 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 851 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
816 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 852 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
817 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 853 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
818 task_io_accounting_init(&sig->ioac); 854 task_io_accounting_init(&sig->ioac);
819 sig->sum_sched_runtime = 0;
820 INIT_LIST_HEAD(&sig->cpu_timers[0]);
821 INIT_LIST_HEAD(&sig->cpu_timers[1]);
822 INIT_LIST_HEAD(&sig->cpu_timers[2]);
823 taskstats_tgid_init(sig); 855 taskstats_tgid_init(sig);
824 856
825 task_lock(current->group_leader); 857 task_lock(current->group_leader);
826 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 858 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
827 task_unlock(current->group_leader); 859 task_unlock(current->group_leader);
828 860
829 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 861 posix_cpu_timers_init_group(sig);
830 /* 862
831 * New sole thread in the process gets an expiry time
832 * of the whole CPU time limit.
833 */
834 tsk->it_prof_expires =
835 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
836 }
837 acct_init_pacct(&sig->pacct); 863 acct_init_pacct(&sig->pacct);
838 864
839 tty_audit_fork(sig); 865 tty_audit_fork(sig);
@@ -843,7 +869,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
843 869
844void __cleanup_signal(struct signal_struct *sig) 870void __cleanup_signal(struct signal_struct *sig)
845{ 871{
846 exit_thread_group_keys(sig); 872 thread_group_cputime_free(sig);
847 tty_kref_put(sig->tty); 873 tty_kref_put(sig->tty);
848 kmem_cache_free(signal_cachep, sig); 874 kmem_cache_free(signal_cachep, sig);
849} 875}
@@ -893,6 +919,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
893#endif /* CONFIG_MM_OWNER */ 919#endif /* CONFIG_MM_OWNER */
894 920
895/* 921/*
922 * Initialize POSIX timer handling for a single task.
923 */
924static void posix_cpu_timers_init(struct task_struct *tsk)
925{
926 tsk->cputime_expires.prof_exp = cputime_zero;
927 tsk->cputime_expires.virt_exp = cputime_zero;
928 tsk->cputime_expires.sched_exp = 0;
929 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
930 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
931 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
932}
933
934/*
896 * This creates a new process as a copy of the old one, 935 * This creates a new process as a copy of the old one,
897 * but does not actually start it yet. 936 * but does not actually start it yet.
898 * 937 *
@@ -946,16 +985,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
946 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 985 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
947#endif 986#endif
948 retval = -EAGAIN; 987 retval = -EAGAIN;
949 if (atomic_read(&p->user->processes) >= 988 if (atomic_read(&p->real_cred->user->processes) >=
950 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 989 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
951 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 990 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
952 p->user != current->nsproxy->user_ns->root_user) 991 p->real_cred->user != INIT_USER)
953 goto bad_fork_free; 992 goto bad_fork_free;
954 } 993 }
955 994
956 atomic_inc(&p->user->__count); 995 retval = copy_creds(p, clone_flags);
957 atomic_inc(&p->user->processes); 996 if (retval < 0)
958 get_group_info(p->group_info); 997 goto bad_fork_free;
959 998
960 /* 999 /*
961 * If multiple threads are within copy_process(), then this check 1000 * If multiple threads are within copy_process(), then this check
@@ -994,6 +1033,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
994 p->prev_utime = cputime_zero; 1033 p->prev_utime = cputime_zero;
995 p->prev_stime = cputime_zero; 1034 p->prev_stime = cputime_zero;
996 1035
1036 p->default_timer_slack_ns = current->timer_slack_ns;
1037
997#ifdef CONFIG_DETECT_SOFTLOCKUP 1038#ifdef CONFIG_DETECT_SOFTLOCKUP
998 p->last_switch_count = 0; 1039 p->last_switch_count = 0;
999 p->last_switch_timestamp = 0; 1040 p->last_switch_timestamp = 0;
@@ -1002,21 +1043,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1002 task_io_accounting_init(&p->ioac); 1043 task_io_accounting_init(&p->ioac);
1003 acct_clear_integrals(p); 1044 acct_clear_integrals(p);
1004 1045
1005 p->it_virt_expires = cputime_zero; 1046 posix_cpu_timers_init(p);
1006 p->it_prof_expires = cputime_zero;
1007 p->it_sched_expires = 0;
1008 INIT_LIST_HEAD(&p->cpu_timers[0]);
1009 INIT_LIST_HEAD(&p->cpu_timers[1]);
1010 INIT_LIST_HEAD(&p->cpu_timers[2]);
1011 1047
1012 p->lock_depth = -1; /* -1 = no lock */ 1048 p->lock_depth = -1; /* -1 = no lock */
1013 do_posix_clock_monotonic_gettime(&p->start_time); 1049 do_posix_clock_monotonic_gettime(&p->start_time);
1014 p->real_start_time = p->start_time; 1050 p->real_start_time = p->start_time;
1015 monotonic_to_bootbased(&p->real_start_time); 1051 monotonic_to_bootbased(&p->real_start_time);
1016#ifdef CONFIG_SECURITY
1017 p->security = NULL;
1018#endif
1019 p->cap_bset = current->cap_bset;
1020 p->io_context = NULL; 1052 p->io_context = NULL;
1021 p->audit_context = NULL; 1053 p->audit_context = NULL;
1022 cgroup_fork(p); 1054 cgroup_fork(p);
@@ -1057,14 +1089,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1057#ifdef CONFIG_DEBUG_MUTEXES 1089#ifdef CONFIG_DEBUG_MUTEXES
1058 p->blocked_on = NULL; /* not blocked yet */ 1090 p->blocked_on = NULL; /* not blocked yet */
1059#endif 1091#endif
1092 if (unlikely(ptrace_reparented(current)))
1093 ptrace_fork(p, clone_flags);
1060 1094
1061 /* Perform scheduler related setup. Assign this task to a CPU. */ 1095 /* Perform scheduler related setup. Assign this task to a CPU. */
1062 sched_fork(p, clone_flags); 1096 sched_fork(p, clone_flags);
1063 1097
1064 if ((retval = security_task_alloc(p)))
1065 goto bad_fork_cleanup_policy;
1066 if ((retval = audit_alloc(p))) 1098 if ((retval = audit_alloc(p)))
1067 goto bad_fork_cleanup_security; 1099 goto bad_fork_cleanup_policy;
1068 /* copy all the process information */ 1100 /* copy all the process information */
1069 if ((retval = copy_semundo(clone_flags, p))) 1101 if ((retval = copy_semundo(clone_flags, p)))
1070 goto bad_fork_cleanup_audit; 1102 goto bad_fork_cleanup_audit;
@@ -1078,10 +1110,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1078 goto bad_fork_cleanup_sighand; 1110 goto bad_fork_cleanup_sighand;
1079 if ((retval = copy_mm(clone_flags, p))) 1111 if ((retval = copy_mm(clone_flags, p)))
1080 goto bad_fork_cleanup_signal; 1112 goto bad_fork_cleanup_signal;
1081 if ((retval = copy_keys(clone_flags, p)))
1082 goto bad_fork_cleanup_mm;
1083 if ((retval = copy_namespaces(clone_flags, p))) 1113 if ((retval = copy_namespaces(clone_flags, p)))
1084 goto bad_fork_cleanup_keys; 1114 goto bad_fork_cleanup_mm;
1085 if ((retval = copy_io(clone_flags, p))) 1115 if ((retval = copy_io(clone_flags, p)))
1086 goto bad_fork_cleanup_namespaces; 1116 goto bad_fork_cleanup_namespaces;
1087 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1117 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1101,6 +1131,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1101 } 1131 }
1102 } 1132 }
1103 1133
1134 ftrace_graph_init_task(p);
1135
1104 p->pid = pid_nr(pid); 1136 p->pid = pid_nr(pid);
1105 p->tgid = p->pid; 1137 p->tgid = p->pid;
1106 if (clone_flags & CLONE_THREAD) 1138 if (clone_flags & CLONE_THREAD)
@@ -1109,7 +1141,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1109 if (current->nsproxy != p->nsproxy) { 1141 if (current->nsproxy != p->nsproxy) {
1110 retval = ns_cgroup_clone(p, pid); 1142 retval = ns_cgroup_clone(p, pid);
1111 if (retval) 1143 if (retval)
1112 goto bad_fork_free_pid; 1144 goto bad_fork_free_graph;
1113 } 1145 }
1114 1146
1115 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1147 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1202,27 +1234,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1202 spin_unlock(&current->sighand->siglock); 1234 spin_unlock(&current->sighand->siglock);
1203 write_unlock_irq(&tasklist_lock); 1235 write_unlock_irq(&tasklist_lock);
1204 retval = -ERESTARTNOINTR; 1236 retval = -ERESTARTNOINTR;
1205 goto bad_fork_free_pid; 1237 goto bad_fork_free_graph;
1206 } 1238 }
1207 1239
1208 if (clone_flags & CLONE_THREAD) { 1240 if (clone_flags & CLONE_THREAD) {
1209 p->group_leader = current->group_leader; 1241 p->group_leader = current->group_leader;
1210 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1242 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1211
1212 if (!cputime_eq(current->signal->it_virt_expires,
1213 cputime_zero) ||
1214 !cputime_eq(current->signal->it_prof_expires,
1215 cputime_zero) ||
1216 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1217 !list_empty(&current->signal->cpu_timers[0]) ||
1218 !list_empty(&current->signal->cpu_timers[1]) ||
1219 !list_empty(&current->signal->cpu_timers[2])) {
1220 /*
1221 * Have child wake up on its first tick to check
1222 * for process CPU timers.
1223 */
1224 p->it_prof_expires = jiffies_to_cputime(1);
1225 }
1226 } 1243 }
1227 1244
1228 if (likely(p->pid)) { 1245 if (likely(p->pid)) {
@@ -1254,6 +1271,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1254 cgroup_post_fork(p); 1271 cgroup_post_fork(p);
1255 return p; 1272 return p;
1256 1273
1274bad_fork_free_graph:
1275 ftrace_graph_exit_task(p);
1257bad_fork_free_pid: 1276bad_fork_free_pid:
1258 if (pid != &init_struct_pid) 1277 if (pid != &init_struct_pid)
1259 free_pid(pid); 1278 free_pid(pid);
@@ -1261,8 +1280,6 @@ bad_fork_cleanup_io:
1261 put_io_context(p->io_context); 1280 put_io_context(p->io_context);
1262bad_fork_cleanup_namespaces: 1281bad_fork_cleanup_namespaces:
1263 exit_task_namespaces(p); 1282 exit_task_namespaces(p);
1264bad_fork_cleanup_keys:
1265 exit_keys(p);
1266bad_fork_cleanup_mm: 1283bad_fork_cleanup_mm:
1267 if (p->mm) 1284 if (p->mm)
1268 mmput(p->mm); 1285 mmput(p->mm);
@@ -1278,8 +1295,6 @@ bad_fork_cleanup_semundo:
1278 exit_sem(p); 1295 exit_sem(p);
1279bad_fork_cleanup_audit: 1296bad_fork_cleanup_audit:
1280 audit_free(p); 1297 audit_free(p);
1281bad_fork_cleanup_security:
1282 security_task_free(p);
1283bad_fork_cleanup_policy: 1298bad_fork_cleanup_policy:
1284#ifdef CONFIG_NUMA 1299#ifdef CONFIG_NUMA
1285 mpol_put(p->mempolicy); 1300 mpol_put(p->mempolicy);
@@ -1292,9 +1307,9 @@ bad_fork_cleanup_cgroup:
1292bad_fork_cleanup_put_domain: 1307bad_fork_cleanup_put_domain:
1293 module_put(task_thread_info(p)->exec_domain->module); 1308 module_put(task_thread_info(p)->exec_domain->module);
1294bad_fork_cleanup_count: 1309bad_fork_cleanup_count:
1295 put_group_info(p->group_info); 1310 atomic_dec(&p->cred->user->processes);
1296 atomic_dec(&p->user->processes); 1311 put_cred(p->real_cred);
1297 free_uid(p->user); 1312 put_cred(p->cred);
1298bad_fork_free: 1313bad_fork_free:
1299 free_task(p); 1314 free_task(p);
1300fork_out: 1315fork_out:
@@ -1338,6 +1353,21 @@ long do_fork(unsigned long clone_flags,
1338 long nr; 1353 long nr;
1339 1354
1340 /* 1355 /*
1356 * Do some preliminary argument and permissions checking before we
1357 * actually start allocating stuff
1358 */
1359 if (clone_flags & CLONE_NEWUSER) {
1360 if (clone_flags & CLONE_THREAD)
1361 return -EINVAL;
1362 /* hopefully this check will go away when userns support is
1363 * complete
1364 */
1365 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1366 !capable(CAP_SETGID))
1367 return -EPERM;
1368 }
1369
1370 /*
1341 * We hope to recycle these flags after 2.6.26 1371 * We hope to recycle these flags after 2.6.26
1342 */ 1372 */
1343 if (unlikely(clone_flags & CLONE_STOPPED)) { 1373 if (unlikely(clone_flags & CLONE_STOPPED)) {
@@ -1369,6 +1399,8 @@ long do_fork(unsigned long clone_flags,
1369 if (!IS_ERR(p)) { 1399 if (!IS_ERR(p)) {
1370 struct completion vfork; 1400 struct completion vfork;
1371 1401
1402 trace_sched_process_fork(current, p);
1403
1372 nr = task_pid_vnr(p); 1404 nr = task_pid_vnr(p);
1373 1405
1374 if (clone_flags & CLONE_PARENT_SETTID) 1406 if (clone_flags & CLONE_PARENT_SETTID)
@@ -1379,6 +1411,7 @@ long do_fork(unsigned long clone_flags,
1379 init_completion(&vfork); 1411 init_completion(&vfork);
1380 } 1412 }
1381 1413
1414 audit_finish_fork(p);
1382 tracehook_report_clone(trace, regs, clone_flags, nr, p); 1415 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1383 1416
1384 /* 1417 /*
@@ -1582,8 +1615,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1582 err = -EINVAL; 1615 err = -EINVAL;
1583 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1616 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1584 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1617 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1585 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| 1618 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1586 CLONE_NEWNET))
1587 goto bad_unshare_out; 1619 goto bad_unshare_out;
1588 1620
1589 /* 1621 /*
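The do_fork() hunk above gates CLONE_NEWUSER behind CAP_SYS_ADMIN, CAP_SETUID and CAP_SETGID, and forbids combining it with CLONE_THREAD. From userspace this surfaces as EPERM or EINVAL on clone(2); a minimal, illustrative caller (not part of the patch) might look like:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
	printf("child running in a new user namespace\n");
	return 0;
}

int main(void)
{
	static char stack[64 * 1024];
	int pid;

	/* the stack grows down on the usual architectures, so pass the top */
	pid = clone(child_fn, stack + sizeof(stack),
		    CLONE_NEWUSER | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");	/* EPERM without the capabilities above */
		return EXIT_FAILURE;
	}
	waitpid(pid, NULL, 0);
	return EXIT_SUCCESS;
}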
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000000..2f4936cf7083
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,154 @@
1/*
2 * kernel/freezer.c - Function to freeze a process
3 *
4 * Originally from kernel/power/process.c
5 */
6
7#include <linux/interrupt.h>
8#include <linux/suspend.h>
9#include <linux/module.h>
10#include <linux/syscalls.h>
11#include <linux/freezer.h>
12
13/*
14 * freezing is complete, mark current process as frozen
15 */
16static inline void frozen_process(void)
17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN;
20 wmb();
21 }
22 clear_freeze_flag(current);
23}
24
25/* Refrigerator is the place where frozen processes are stored :-). */
26void refrigerator(void)
27{
28 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */
30 long save;
31
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm);
42
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current))
50 break;
51 schedule();
52 }
53 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save);
55}
56EXPORT_SYMBOL(refrigerator);
57
58static void fake_signal_wake_up(struct task_struct *p)
59{
60 unsigned long flags;
61
62 spin_lock_irqsave(&p->sighand->siglock, flags);
63 signal_wake_up(p, 0);
64 spin_unlock_irqrestore(&p->sighand->siglock, flags);
65}
66
67/**
68 * freeze_task - send a freeze request to given task
69 * @p: task to send the request to
70 * @sig_only: if set, the request will only be sent if the task has the
71 * PF_FREEZER_NOSIG flag unset
72 * Return value: 'false', if @sig_only is set and the task has
73 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
74 *
75 * The freeze request is sent by setting the task's TIF_FREEZE flag and
76 * either sending a fake signal to it or waking it up, depending on whether
77 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
78 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
79 * TIF_FREEZE flag will not be set.
80 */
81bool freeze_task(struct task_struct *p, bool sig_only)
82{
83 /*
84 * We first check if the task is freezing and next if it has already
85 * been frozen to avoid the race with frozen_process() which first marks
86 * the task as frozen and next clears its TIF_FREEZE.
87 */
88 if (!freezing(p)) {
89 rmb();
90 if (frozen(p))
91 return false;
92
93 if (!sig_only || should_send_signal(p))
94 set_freeze_flag(p);
95 else
96 return false;
97 }
98
99 if (should_send_signal(p)) {
100 if (!signal_pending(p))
101 fake_signal_wake_up(p);
102 } else if (sig_only) {
103 return false;
104 } else {
105 wake_up_state(p, TASK_INTERRUPTIBLE);
106 }
107
108 return true;
109}
110
111void cancel_freezing(struct task_struct *p)
112{
113 unsigned long flags;
114
115 if (freezing(p)) {
116 pr_debug(" clean up: %s\n", p->comm);
117 clear_freeze_flag(p);
118 spin_lock_irqsave(&p->sighand->siglock, flags);
119 recalc_sigpending_and_wake(p);
120 spin_unlock_irqrestore(&p->sighand->siglock, flags);
121 }
122}
123
124static int __thaw_process(struct task_struct *p)
125{
126 if (frozen(p)) {
127 p->flags &= ~PF_FROZEN;
128 return 1;
129 }
130 clear_freeze_flag(p);
131 return 0;
132}
133
134/*
135 * Wake up a frozen process
136 *
137 * task_lock() is needed to prevent the race with refrigerator() which may
138 * occur if the freezing of tasks fails. Namely, without the lock, if the
139 * freezing of tasks failed, thaw_tasks() might have run before a task in
140 * refrigerator() could call frozen_process(), in which case the task would be
141 * frozen and no one would thaw it.
142 */
143int thaw_process(struct task_struct *p)
144{
145 task_lock(p);
146 if (__thaw_process(p) == 1) {
147 task_unlock(p);
148 wake_up_process(p);
149 return 1;
150 }
151 task_unlock(p);
152 return 0;
153}
154EXPORT_SYMBOL(thaw_process);
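For context, freeze_task() above only marks the target with TIF_FREEZE and nudges it; the task itself has to notice the flag and park in refrigerator(). A rough sketch of the pattern a freezable kernel thread follows, using the set_freezable()/try_to_freeze() helpers from <linux/freezer.h> (the work loop itself is illustrative, not from this patch):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread(void *unused)
{
	set_freezable();	/* opt in: clear PF_NOFREEZE for this kthread */

	while (!kthread_should_stop()) {
		/*
		 * If freezing(current) is set, this enters refrigerator()
		 * and only returns once the thread has been thawed.
		 */
		try_to_freeze();

		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}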
diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c14..7c6cbabe52b3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakeup is always to make the first condition true, then 94 * The order of wakeup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -123,24 +124,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 125
125/* 126/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 127 * We hash on the keys returned from get_futex_key (see below).
145 */ 128 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 129static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +144,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 144 && key1->both.offset == key2->both.offset);
162} 145}
163 146
147/*
148 * Take a reference to the resource addressed by a key.
149 * Can be called while holding spinlocks.
150 *
151 */
152static void get_futex_key_refs(union futex_key *key)
153{
154 if (!key->both.ptr)
155 return;
156
157 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
158 case FUT_OFF_INODE:
159 atomic_inc(&key->shared.inode->i_count);
160 break;
161 case FUT_OFF_MMSHARED:
162 atomic_inc(&key->private.mm->mm_count);
163 break;
164 }
165}
166
167/*
168 * Drop a reference to the resource addressed by a key.
169 * The hash bucket spinlock must not be held.
170 */
171static void drop_futex_key_refs(union futex_key *key)
172{
173 if (!key->both.ptr)
174 return;
175
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE:
178 iput(key->shared.inode);
179 break;
180 case FUT_OFF_MMSHARED:
181 mmdrop(key->private.mm);
182 break;
183 }
184}
185
164/** 186/**
165 * get_futex_key - Get parameters which are the keys for a futex. 187 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 188 * @uaddr: virtual address of the futex
@@ -179,12 +201,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 201 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 202 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 203 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 204static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 205{
185 unsigned long address = (unsigned long)uaddr; 206 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 207 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 208 struct page *page;
189 int err; 209 int err;
190 210
@@ -208,100 +228,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 228 return -EFAULT;
209 key->private.mm = mm; 229 key->private.mm = mm;
210 key->private.address = address; 230 key->private.address = address;
231 get_futex_key_refs(key);
211 return 0; 232 return 0;
212 } 233 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 234
221 /* 235again:
222 * Permissions. 236 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 237 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 238 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 239
240 lock_page(page);
241 if (!page->mapping) {
242 unlock_page(page);
243 put_page(page);
244 goto again;
245 }
226 246
227 /* 247 /*
228 * Private mappings are handled in a simple way. 248 * Private mappings are handled in a simple way.
229 * 249 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 250 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 251 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 252 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 253 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 254 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 255 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 256 key->private.mm = mm;
239 key->private.address = address; 257 key->private.address = address;
240 return 0; 258 } else {
259 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
260 key->shared.inode = page->mapping->host;
261 key->shared.pgoff = page->index;
241 } 262 }
242 263
243 /* 264 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 265
254 /* 266 unlock_page(page);
255 * We could walk the page table to read the non-linear 267 put_page(page);
256 * pte, and get the page index without fetching the page 268 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 269}
288 270
289/* 271static inline
290 * Drop a reference to the resource addressed by a key. 272void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 273{
295 if (!key->both.ptr) 274 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 275}
306 276
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 277static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
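With the mmap_sem serialization gone, the reference taken inside get_futex_key() (on the inode for shared futexes, on the mm for private ones) is what keeps the key's backing object alive, and every caller now pairs it with put_futex_key(). Condensed from futex_wake() above, the calling pattern looks like this (example_futex_op is just an illustrative name):

static int example_futex_op(u32 __user *uaddr, int fshared)
{
	union futex_key key = FUTEX_KEY_INIT;
	int ret;

	ret = get_futex_key(uaddr, fshared, &key);	/* pins the inode or the mm */
	if (unlikely(ret != 0))
		return ret;

	/* ... hash the key, take the bucket lock, wake or queue waiters ... */

	put_futex_key(fshared, &key);			/* drop the reference again */
	return 0;
}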
@@ -328,10 +298,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 298
329/* 299/*
330 * Fault handling. 300 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 301 */
333static int futex_handle_fault(unsigned long address, 302static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 303{
336 struct vm_area_struct * vma; 304 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 305 struct mm_struct *mm = current->mm;
@@ -340,8 +308,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 308 if (attempt > 2)
341 return ret; 309 return ret;
342 310
343 if (!fshared) 311 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 312 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 313 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 314 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +328,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 328 current->min_flt++;
362 } 329 }
363 } 330 }
364 if (!fshared) 331 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 332 return ret;
367} 333}
368 334
@@ -385,6 +351,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 351 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 352 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 353 atomic_set(&pi_state->refcount, 1);
354 pi_state->key = FUTEX_KEY_INIT;
388 355
389 current->pi_state_cache = pi_state; 356 current->pi_state_cache = pi_state;
390 357
@@ -439,13 +406,20 @@ static void free_pi_state(struct futex_pi_state *pi_state)
439static struct task_struct * futex_find_get_task(pid_t pid) 406static struct task_struct * futex_find_get_task(pid_t pid)
440{ 407{
441 struct task_struct *p; 408 struct task_struct *p;
409 const struct cred *cred = current_cred(), *pcred;
442 410
443 rcu_read_lock(); 411 rcu_read_lock();
444 p = find_task_by_vpid(pid); 412 p = find_task_by_vpid(pid);
445 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 413 if (!p) {
446 p = ERR_PTR(-ESRCH); 414 p = ERR_PTR(-ESRCH);
447 else 415 } else {
448 get_task_struct(p); 416 pcred = __task_cred(p);
417 if (cred->euid != pcred->euid &&
418 cred->euid != pcred->uid)
419 p = ERR_PTR(-ESRCH);
420 else
421 get_task_struct(p);
422 }
449 423
450 rcu_read_unlock(); 424 rcu_read_unlock();
451 425
@@ -462,7 +436,7 @@ void exit_pi_state_list(struct task_struct *curr)
462 struct list_head *next, *head = &curr->pi_state_list; 436 struct list_head *next, *head = &curr->pi_state_list;
463 struct futex_pi_state *pi_state; 437 struct futex_pi_state *pi_state;
464 struct futex_hash_bucket *hb; 438 struct futex_hash_bucket *hb;
465 union futex_key key; 439 union futex_key key = FUTEX_KEY_INIT;
466 440
467 if (!futex_cmpxchg_enabled) 441 if (!futex_cmpxchg_enabled)
468 return; 442 return;
@@ -607,7 +581,7 @@ static void wake_futex(struct futex_q *q)
607 * The lock in wake_up_all() is a crucial memory barrier after the 581 * The lock in wake_up_all() is a crucial memory barrier after the
608 * plist_del() and also before assigning to q->lock_ptr. 582 * plist_del() and also before assigning to q->lock_ptr.
609 */ 583 */
610 wake_up_all(&q->waiters); 584 wake_up(&q->waiter);
611 /* 585 /*
612 * The waiting task can free the futex_q as soon as this is written, 586 * The waiting task can free the futex_q as soon as this is written,
613 * without taking any locks. This must come last. 587 * without taking any locks. This must come last.
@@ -719,20 +693,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
719 * Wake up all waiters hashed on the physical page that is mapped 693 * Wake up all waiters hashed on the physical page that is mapped
720 * to this virtual address: 694 * to this virtual address:
721 */ 695 */
722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 696static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
723 int nr_wake, u32 bitset)
724{ 697{
725 struct futex_hash_bucket *hb; 698 struct futex_hash_bucket *hb;
726 struct futex_q *this, *next; 699 struct futex_q *this, *next;
727 struct plist_head *head; 700 struct plist_head *head;
728 union futex_key key; 701 union futex_key key = FUTEX_KEY_INIT;
729 int ret; 702 int ret;
730 703
731 if (!bitset) 704 if (!bitset)
732 return -EINVAL; 705 return -EINVAL;
733 706
734 futex_lock_mm(fshared);
735
736 ret = get_futex_key(uaddr, fshared, &key); 707 ret = get_futex_key(uaddr, fshared, &key);
737 if (unlikely(ret != 0)) 708 if (unlikely(ret != 0))
738 goto out; 709 goto out;
@@ -760,7 +731,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
760 731
761 spin_unlock(&hb->lock); 732 spin_unlock(&hb->lock);
762out: 733out:
763 futex_unlock_mm(fshared); 734 put_futex_key(fshared, &key);
764 return ret; 735 return ret;
765} 736}
766 737
@@ -769,19 +740,16 @@ out:
769 * to this virtual address: 740 * to this virtual address:
770 */ 741 */
771static int 742static int
772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 743futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
773 u32 __user *uaddr2,
774 int nr_wake, int nr_wake2, int op) 744 int nr_wake, int nr_wake2, int op)
775{ 745{
776 union futex_key key1, key2; 746 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
777 struct futex_hash_bucket *hb1, *hb2; 747 struct futex_hash_bucket *hb1, *hb2;
778 struct plist_head *head; 748 struct plist_head *head;
779 struct futex_q *this, *next; 749 struct futex_q *this, *next;
780 int ret, op_ret, attempt = 0; 750 int ret, op_ret, attempt = 0;
781 751
782retryfull: 752retryfull:
783 futex_lock_mm(fshared);
784
785 ret = get_futex_key(uaddr1, fshared, &key1); 753 ret = get_futex_key(uaddr1, fshared, &key1);
786 if (unlikely(ret != 0)) 754 if (unlikely(ret != 0))
787 goto out; 755 goto out;
@@ -826,18 +794,12 @@ retry:
826 */ 794 */
827 if (attempt++) { 795 if (attempt++) {
828 ret = futex_handle_fault((unsigned long)uaddr2, 796 ret = futex_handle_fault((unsigned long)uaddr2,
829 fshared, attempt); 797 attempt);
830 if (ret) 798 if (ret)
831 goto out; 799 goto out;
832 goto retry; 800 goto retry;
833 } 801 }
834 802
835 /*
836 * If we would have faulted, release mmap_sem,
837 * fault it in and start all over again.
838 */
839 futex_unlock_mm(fshared);
840
841 ret = get_user(dummy, uaddr2); 803 ret = get_user(dummy, uaddr2);
842 if (ret) 804 if (ret)
843 return ret; 805 return ret;
@@ -873,7 +835,8 @@ retry:
873 if (hb1 != hb2) 835 if (hb1 != hb2)
874 spin_unlock(&hb2->lock); 836 spin_unlock(&hb2->lock);
875out: 837out:
876 futex_unlock_mm(fshared); 838 put_futex_key(fshared, &key2);
839 put_futex_key(fshared, &key1);
877 840
878 return ret; 841 return ret;
879} 842}
@@ -882,19 +845,16 @@ out:
882 * Requeue all waiters hashed on one physical page to another 845 * Requeue all waiters hashed on one physical page to another
883 * physical page. 846 * physical page.
884 */ 847 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 848static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
886 u32 __user *uaddr2,
887 int nr_wake, int nr_requeue, u32 *cmpval) 849 int nr_wake, int nr_requeue, u32 *cmpval)
888{ 850{
889 union futex_key key1, key2; 851 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
890 struct futex_hash_bucket *hb1, *hb2; 852 struct futex_hash_bucket *hb1, *hb2;
891 struct plist_head *head1; 853 struct plist_head *head1;
892 struct futex_q *this, *next; 854 struct futex_q *this, *next;
893 int ret, drop_count = 0; 855 int ret, drop_count = 0;
894 856
895 retry: 857 retry:
896 futex_lock_mm(fshared);
897
898 ret = get_futex_key(uaddr1, fshared, &key1); 858 ret = get_futex_key(uaddr1, fshared, &key1);
899 if (unlikely(ret != 0)) 859 if (unlikely(ret != 0))
900 goto out; 860 goto out;
@@ -917,12 +877,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
917 if (hb1 != hb2) 877 if (hb1 != hb2)
918 spin_unlock(&hb2->lock); 878 spin_unlock(&hb2->lock);
919 879
920 /*
921 * If we would have faulted, release mmap_sem, fault
922 * it in and start all over again.
923 */
924 futex_unlock_mm(fshared);
925
926 ret = get_user(curval, uaddr1); 880 ret = get_user(curval, uaddr1);
927 881
928 if (!ret) 882 if (!ret)
@@ -974,7 +928,8 @@ out_unlock:
974 drop_futex_key_refs(&key1); 928 drop_futex_key_refs(&key1);
975 929
976out: 930out:
977 futex_unlock_mm(fshared); 931 put_futex_key(fshared, &key2);
932 put_futex_key(fshared, &key1);
978 return ret; 933 return ret;
979} 934}
980 935
@@ -983,7 +938,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
983{ 938{
984 struct futex_hash_bucket *hb; 939 struct futex_hash_bucket *hb;
985 940
986 init_waitqueue_head(&q->waiters); 941 init_waitqueue_head(&q->waiter);
987 942
988 get_futex_key_refs(&q->key); 943 get_futex_key_refs(&q->key);
989 hb = hash_futex(&q->key); 944 hb = hash_futex(&q->key);
@@ -1096,8 +1051,7 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1051 * private futexes.
1097 */ 1052 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1053static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner, 1054 struct task_struct *newowner, int fshared)
1100 struct rw_semaphore *fshared)
1101{ 1055{
1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1056 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103 struct futex_pi_state *pi_state = q->pi_state; 1057 struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1130,7 @@ retry:
1176handle_fault: 1130handle_fault:
1177 spin_unlock(q->lock_ptr); 1131 spin_unlock(q->lock_ptr);
1178 1132
1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1133 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1180 1134
1181 spin_lock(q->lock_ptr); 1135 spin_lock(q->lock_ptr);
1182 1136
@@ -1196,12 +1150,13 @@ handle_fault:
1196 * In case we must use restart_block to restart a futex_wait, 1150 * In case we must use restart_block to restart a futex_wait,
1197 * we encode in the 'flags' shared capability 1151 * we encode in the 'flags' shared capability
1198 */ 1152 */
1199#define FLAGS_SHARED 1 1153#define FLAGS_SHARED 0x01
1154#define FLAGS_CLOCKRT 0x02
1200 1155
1201static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1202 1157
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1158static int futex_wait(u32 __user *uaddr, int fshared,
1204 u32 val, ktime_t *abs_time, u32 bitset) 1159 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1205{ 1160{
1206 struct task_struct *curr = current; 1161 struct task_struct *curr = current;
1207 DECLARE_WAITQUEUE(wait, curr); 1162 DECLARE_WAITQUEUE(wait, curr);
@@ -1218,8 +1173,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1218 q.pi_state = NULL; 1173 q.pi_state = NULL;
1219 q.bitset = bitset; 1174 q.bitset = bitset;
1220 retry: 1175 retry:
1221 futex_lock_mm(fshared); 1176 q.key = FUTEX_KEY_INIT;
1222
1223 ret = get_futex_key(uaddr, fshared, &q.key); 1177 ret = get_futex_key(uaddr, fshared, &q.key);
1224 if (unlikely(ret != 0)) 1178 if (unlikely(ret != 0))
1225 goto out_release_sem; 1179 goto out_release_sem;
@@ -1251,12 +1205,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1251 if (unlikely(ret)) { 1205 if (unlikely(ret)) {
1252 queue_unlock(&q, hb); 1206 queue_unlock(&q, hb);
1253 1207
1254 /*
1255 * If we would have faulted, release mmap_sem, fault it in and
1256 * start all over again.
1257 */
1258 futex_unlock_mm(fshared);
1259
1260 ret = get_user(uval, uaddr); 1208 ret = get_user(uval, uaddr);
1261 1209
1262 if (!ret) 1210 if (!ret)
@@ -1271,12 +1219,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1271 queue_me(&q, hb); 1219 queue_me(&q, hb);
1272 1220
1273 /* 1221 /*
1274 * Now the futex is queued and we have checked the data, we
1275 * don't want to hold mmap_sem while we sleep.
1276 */
1277 futex_unlock_mm(fshared);
1278
1279 /*
1280 * There might have been scheduling since the queue_me(), as we 1222 * There might have been scheduling since the queue_me(), as we
1281 * cannot hold a spinlock across the get_user() in case it 1223 * cannot hold a spinlock across the get_user() in case it
1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1224 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1287,7 +1229,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1287 1229
1288 /* add_wait_queue is the barrier after __set_current_state. */ 1230 /* add_wait_queue is the barrier after __set_current_state. */
1289 __set_current_state(TASK_INTERRUPTIBLE); 1231 __set_current_state(TASK_INTERRUPTIBLE);
1290 add_wait_queue(&q.waiters, &wait); 1232 add_wait_queue(&q.waiter, &wait);
1291 /* 1233 /*
1292 * !plist_node_empty() is safe here without any lock. 1234 * !plist_node_empty() is safe here without any lock.
1293 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1235 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1296,13 +1238,18 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1296 if (!abs_time) 1238 if (!abs_time)
1297 schedule(); 1239 schedule();
1298 else { 1240 else {
1299 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1241 unsigned long slack;
1300 HRTIMER_MODE_ABS); 1242 slack = current->timer_slack_ns;
1243 if (rt_task(current))
1244 slack = 0;
1245 hrtimer_init_on_stack(&t.timer,
1246 clockrt ? CLOCK_REALTIME :
1247 CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1301 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1302 t.timer.expires = *abs_time; 1250 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1303 1251
1304 hrtimer_start(&t.timer, t.timer.expires, 1252 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1305 HRTIMER_MODE_ABS);
1306 if (!hrtimer_active(&t.timer)) 1253 if (!hrtimer_active(&t.timer))
1307 t.task = NULL; 1254 t.task = NULL;
1308 1255
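The timed wait now honours the caller's per-task timer slack: normal tasks get their timer_slack_ns (50 us by default) as the allowed expiry range, while realtime tasks keep exact expiry. From userspace the slack can be adjusted with prctl(); a sketch, assuming the PR_SET_TIMERSLACK constant from the 2.6.28 prctl additions:

#include <sys/prctl.h>
#include <stdio.h>

#ifndef PR_SET_TIMERSLACK
#define PR_SET_TIMERSLACK 29	/* added in 2.6.28 */
#endif

int main(void)
{
	/* Let timed waits that honour slack (this futex path included) fire up to 2 ms late. */
	if (prctl(PR_SET_TIMERSLACK, 2 * 1000 * 1000, 0, 0, 0) == -1)
		perror("PR_SET_TIMERSLACK");
	return 0;
}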
@@ -1353,6 +1300,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1353 1300
1354 if (fshared) 1301 if (fshared)
1355 restart->futex.flags |= FLAGS_SHARED; 1302 restart->futex.flags |= FLAGS_SHARED;
1303 if (clockrt)
1304 restart->futex.flags |= FLAGS_CLOCKRT;
1356 return -ERESTART_RESTARTBLOCK; 1305 return -ERESTART_RESTARTBLOCK;
1357 } 1306 }
1358 1307
@@ -1360,7 +1309,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1360 queue_unlock(&q, hb); 1309 queue_unlock(&q, hb);
1361 1310
1362 out_release_sem: 1311 out_release_sem:
1363 futex_unlock_mm(fshared); 1312 put_futex_key(fshared, &q.key);
1364 return ret; 1313 return ret;
1365} 1314}
1366 1315
@@ -1368,15 +1317,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1368static long futex_wait_restart(struct restart_block *restart) 1317static long futex_wait_restart(struct restart_block *restart)
1369{ 1318{
1370 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1319 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1371 struct rw_semaphore *fshared = NULL; 1320 int fshared = 0;
1372 ktime_t t; 1321 ktime_t t;
1373 1322
1374 t.tv64 = restart->futex.time; 1323 t.tv64 = restart->futex.time;
1375 restart->fn = do_no_restart_syscall; 1324 restart->fn = do_no_restart_syscall;
1376 if (restart->futex.flags & FLAGS_SHARED) 1325 if (restart->futex.flags & FLAGS_SHARED)
1377 fshared = &current->mm->mmap_sem; 1326 fshared = 1;
1378 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1327 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1379 restart->futex.bitset); 1328 restart->futex.bitset,
1329 restart->futex.flags & FLAGS_CLOCKRT);
1380} 1330}
1381 1331
1382 1332
@@ -1386,7 +1336,7 @@ static long futex_wait_restart(struct restart_block *restart)
1386 * if there are waiters then it will block, it does PI, etc. (Due to 1336 * if there are waiters then it will block, it does PI, etc. (Due to
1387 * races the kernel might see a 0 value of the futex too.) 1337 * races the kernel might see a 0 value of the futex too.)
1388 */ 1338 */
1389static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1339static int futex_lock_pi(u32 __user *uaddr, int fshared,
1390 int detect, ktime_t *time, int trylock) 1340 int detect, ktime_t *time, int trylock)
1391{ 1341{
1392 struct hrtimer_sleeper timeout, *to = NULL; 1342 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1404,13 +1354,12 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1404 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 1354 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1405 HRTIMER_MODE_ABS); 1355 HRTIMER_MODE_ABS);
1406 hrtimer_init_sleeper(to, current); 1356 hrtimer_init_sleeper(to, current);
1407 to->timer.expires = *time; 1357 hrtimer_set_expires(&to->timer, *time);
1408 } 1358 }
1409 1359
1410 q.pi_state = NULL; 1360 q.pi_state = NULL;
1411 retry: 1361 retry:
1412 futex_lock_mm(fshared); 1362 q.key = FUTEX_KEY_INIT;
1413
1414 ret = get_futex_key(uaddr, fshared, &q.key); 1363 ret = get_futex_key(uaddr, fshared, &q.key);
1415 if (unlikely(ret != 0)) 1364 if (unlikely(ret != 0))
1416 goto out_release_sem; 1365 goto out_release_sem;
@@ -1499,7 +1448,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1499 * exit to complete. 1448 * exit to complete.
1500 */ 1449 */
1501 queue_unlock(&q, hb); 1450 queue_unlock(&q, hb);
1502 futex_unlock_mm(fshared);
1503 cond_resched(); 1451 cond_resched();
1504 goto retry; 1452 goto retry;
1505 1453
@@ -1531,12 +1479,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1531 */ 1479 */
1532 queue_me(&q, hb); 1480 queue_me(&q, hb);
1533 1481
1534 /*
1535 * Now the futex is queued and we have checked the data, we
1536 * don't want to hold mmap_sem while we sleep.
1537 */
1538 futex_unlock_mm(fshared);
1539
1540 WARN_ON(!q.pi_state); 1482 WARN_ON(!q.pi_state);
1541 /* 1483 /*
1542 * Block on the PI mutex: 1484 * Block on the PI mutex:
@@ -1549,7 +1491,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1549 ret = ret ? 0 : -EWOULDBLOCK; 1491 ret = ret ? 0 : -EWOULDBLOCK;
1550 } 1492 }
1551 1493
1552 futex_lock_mm(fshared);
1553 spin_lock(q.lock_ptr); 1494 spin_lock(q.lock_ptr);
1554 1495
1555 if (!ret) { 1496 if (!ret) {
@@ -1615,7 +1556,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1615 1556
1616 /* Unqueue and drop the lock */ 1557 /* Unqueue and drop the lock */
1617 unqueue_me_pi(&q); 1558 unqueue_me_pi(&q);
1618 futex_unlock_mm(fshared);
1619 1559
1620 if (to) 1560 if (to)
1621 destroy_hrtimer_on_stack(&to->timer); 1561 destroy_hrtimer_on_stack(&to->timer);
@@ -1625,34 +1565,30 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1625 queue_unlock(&q, hb); 1565 queue_unlock(&q, hb);
1626 1566
1627 out_release_sem: 1567 out_release_sem:
1628 futex_unlock_mm(fshared); 1568 put_futex_key(fshared, &q.key);
1629 if (to) 1569 if (to)
1630 destroy_hrtimer_on_stack(&to->timer); 1570 destroy_hrtimer_on_stack(&to->timer);
1631 return ret; 1571 return ret;
1632 1572
1633 uaddr_faulted: 1573 uaddr_faulted:
1634 /* 1574 /*
1635 * We have to r/w *(int __user *)uaddr, but we can't modify it 1575 * We have to r/w *(int __user *)uaddr, and we have to modify it
1636 * non-atomically. Therefore, if get_user below is not 1576 * atomically. Therefore, if we continue to fault after get_user()
1637 * enough, we need to handle the fault ourselves, while 1577 * below, we need to handle the fault ourselves, while still holding
1638 * still holding the mmap_sem. 1578 * the mmap_sem. This can occur if the uaddr is under contention as
1639 * 1579 * we have to drop the mmap_sem in order to call get_user().
1640 * ... and hb->lock. :-) --ANK
1641 */ 1580 */
1642 queue_unlock(&q, hb); 1581 queue_unlock(&q, hb);
1643 1582
1644 if (attempt++) { 1583 if (attempt++) {
1645 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1584 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1646 attempt);
1647 if (ret) 1585 if (ret)
1648 goto out_release_sem; 1586 goto out_release_sem;
1649 goto retry_unlocked; 1587 goto retry_unlocked;
1650 } 1588 }
1651 1589
1652 futex_unlock_mm(fshared);
1653
1654 ret = get_user(uval, uaddr); 1590 ret = get_user(uval, uaddr);
1655 if (!ret && (uval != -EFAULT)) 1591 if (!ret)
1656 goto retry; 1592 goto retry;
1657 1593
1658 if (to) 1594 if (to)
@@ -1665,13 +1601,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1665 * This is the in-kernel slowpath: we look up the PI state (if any), 1601 * This is the in-kernel slowpath: we look up the PI state (if any),
1666 * and do the rt-mutex unlock. 1602 * and do the rt-mutex unlock.
1667 */ 1603 */
1668static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1604static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1669{ 1605{
1670 struct futex_hash_bucket *hb; 1606 struct futex_hash_bucket *hb;
1671 struct futex_q *this, *next; 1607 struct futex_q *this, *next;
1672 u32 uval; 1608 u32 uval;
1673 struct plist_head *head; 1609 struct plist_head *head;
1674 union futex_key key; 1610 union futex_key key = FUTEX_KEY_INIT;
1675 int ret, attempt = 0; 1611 int ret, attempt = 0;
1676 1612
1677retry: 1613retry:
@@ -1682,10 +1618,6 @@ retry:
1682 */ 1618 */
1683 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1619 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1684 return -EPERM; 1620 return -EPERM;
1685 /*
1686 * First take all the futex related locks:
1687 */
1688 futex_lock_mm(fshared);
1689 1621
1690 ret = get_futex_key(uaddr, fshared, &key); 1622 ret = get_futex_key(uaddr, fshared, &key);
1691 if (unlikely(ret != 0)) 1623 if (unlikely(ret != 0))
@@ -1744,34 +1676,30 @@ retry_unlocked:
1744out_unlock: 1676out_unlock:
1745 spin_unlock(&hb->lock); 1677 spin_unlock(&hb->lock);
1746out: 1678out:
1747 futex_unlock_mm(fshared); 1679 put_futex_key(fshared, &key);
1748 1680
1749 return ret; 1681 return ret;
1750 1682
1751pi_faulted: 1683pi_faulted:
1752 /* 1684 /*
1753 * We have to r/w *(int __user *)uaddr, but we can't modify it 1685 * We have to r/w *(int __user *)uaddr, and we have to modify it
1754 * non-atomically. Therefore, if get_user below is not 1686 * atomically. Therefore, if we continue to fault after get_user()
1755 * enough, we need to handle the fault ourselves, while 1687 * below, we need to handle the fault ourselves, while still holding
1756 * still holding the mmap_sem. 1688 * the mmap_sem. This can occur if the uaddr is under contention as
1757 * 1689 * we have to drop the mmap_sem in order to call get_user().
1758 * ... and hb->lock. --ANK
1759 */ 1690 */
1760 spin_unlock(&hb->lock); 1691 spin_unlock(&hb->lock);
1761 1692
1762 if (attempt++) { 1693 if (attempt++) {
1763 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1694 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1764 attempt);
1765 if (ret) 1695 if (ret)
1766 goto out; 1696 goto out;
1767 uval = 0; 1697 uval = 0;
1768 goto retry_unlocked; 1698 goto retry_unlocked;
1769 } 1699 }
1770 1700
1771 futex_unlock_mm(fshared);
1772
1773 ret = get_user(uval, uaddr); 1701 ret = get_user(uval, uaddr);
1774 if (!ret && (uval != -EFAULT)) 1702 if (!ret)
1775 goto retry; 1703 goto retry;
1776 1704
1777 return ret; 1705 return ret;
@@ -1826,6 +1754,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1826{ 1754{
1827 struct robust_list_head __user *head; 1755 struct robust_list_head __user *head;
1828 unsigned long ret; 1756 unsigned long ret;
1757 const struct cred *cred = current_cred(), *pcred;
1829 1758
1830 if (!futex_cmpxchg_enabled) 1759 if (!futex_cmpxchg_enabled)
1831 return -ENOSYS; 1760 return -ENOSYS;
@@ -1841,8 +1770,10 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1841 if (!p) 1770 if (!p)
1842 goto err_unlock; 1771 goto err_unlock;
1843 ret = -EPERM; 1772 ret = -EPERM;
1844 if ((current->euid != p->euid) && (current->euid != p->uid) && 1773 pcred = __task_cred(p);
1845 !capable(CAP_SYS_PTRACE)) 1774 if (cred->euid != pcred->euid &&
1775 cred->euid != pcred->uid &&
1776 !capable(CAP_SYS_PTRACE))
1846 goto err_unlock; 1777 goto err_unlock;
1847 head = p->robust_list; 1778 head = p->robust_list;
1848 rcu_read_unlock(); 1779 rcu_read_unlock();
@@ -1895,8 +1826,7 @@ retry:
1895 * PI futexes happens in exit_pi_state(): 1826 * PI futexes happens in exit_pi_state():
1896 */ 1827 */
1897 if (!pi && (uval & FUTEX_WAITERS)) 1828 if (!pi && (uval & FUTEX_WAITERS))
1898 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1829 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1899 FUTEX_BITSET_MATCH_ANY);
1900 } 1830 }
1901 return 0; 1831 return 0;
1902} 1832}
@@ -1990,18 +1920,22 @@ void exit_robust_list(struct task_struct *curr)
1990long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1920long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1991 u32 __user *uaddr2, u32 val2, u32 val3) 1921 u32 __user *uaddr2, u32 val2, u32 val3)
1992{ 1922{
1993 int ret = -ENOSYS; 1923 int clockrt, ret = -ENOSYS;
1994 int cmd = op & FUTEX_CMD_MASK; 1924 int cmd = op & FUTEX_CMD_MASK;
1995 struct rw_semaphore *fshared = NULL; 1925 int fshared = 0;
1996 1926
1997 if (!(op & FUTEX_PRIVATE_FLAG)) 1927 if (!(op & FUTEX_PRIVATE_FLAG))
1998 fshared = &current->mm->mmap_sem; 1928 fshared = 1;
1929
1930 clockrt = op & FUTEX_CLOCK_REALTIME;
1931 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1932 return -ENOSYS;
1999 1933
2000 switch (cmd) { 1934 switch (cmd) {
2001 case FUTEX_WAIT: 1935 case FUTEX_WAIT:
2002 val3 = FUTEX_BITSET_MATCH_ANY; 1936 val3 = FUTEX_BITSET_MATCH_ANY;
2003 case FUTEX_WAIT_BITSET: 1937 case FUTEX_WAIT_BITSET:
2004 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1938 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
2005 break; 1939 break;
2006 case FUTEX_WAKE: 1940 case FUTEX_WAKE:
2007 val3 = FUTEX_BITSET_MATCH_ANY; 1941 val3 = FUTEX_BITSET_MATCH_ANY;
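do_futex() accepts the new FUTEX_CLOCK_REALTIME bit only together with FUTEX_WAIT_BITSET; any other command carrying it returns -ENOSYS. Roughly what that looks like from userspace via the raw syscall (the constants are the ones from <linux/futex.h>; the blocking wait here simply times out because nothing wakes the word):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>
#include <stdio.h>

static int futex_word;

int main(void)
{
	struct timespec abs;

	clock_gettime(CLOCK_REALTIME, &abs);
	abs.tv_sec += 2;	/* absolute CLOCK_REALTIME deadline */

	/* Allowed: FUTEX_WAIT_BITSET may carry FUTEX_CLOCK_REALTIME. */
	if (syscall(SYS_futex, &futex_word,
		    FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
		    0, &abs, NULL, FUTEX_BITSET_MATCH_ANY) == -1)
		perror("FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME");

	/* Rejected: any other command with the bit set is -ENOSYS. */
	if (syscall(SYS_futex, &futex_word,
		    FUTEX_WAIT | FUTEX_CLOCK_REALTIME,
		    0, NULL, NULL, 0) == -1)
		perror("FUTEX_WAIT|FUTEX_CLOCK_REALTIME");

	return 0;
}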
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 04ac3a9e42cf..d607a5b9ee29 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
135{ 135{
136 struct compat_robust_list_head __user *head; 136 struct compat_robust_list_head __user *head;
137 unsigned long ret; 137 unsigned long ret;
138 const struct cred *cred = current_cred(), *pcred;
138 139
139 if (!futex_cmpxchg_enabled) 140 if (!futex_cmpxchg_enabled)
140 return -ENOSYS; 141 return -ENOSYS;
@@ -150,8 +151,10 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
150 if (!p) 151 if (!p)
151 goto err_unlock; 152 goto err_unlock;
152 ret = -EPERM; 153 ret = -EPERM;
153 if ((current->euid != p->euid) && (current->euid != p->uid) && 154 pcred = __task_cred(p);
154 !capable(CAP_SYS_PTRACE)) 155 if (cred->euid != pcred->euid &&
156 cred->euid != pcred->uid &&
157 !capable(CAP_SYS_PTRACE))
155 goto err_unlock; 158 goto err_unlock;
156 head = p->compat_robust_list; 159 head = p->compat_robust_list;
157 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
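Both the native and compat robust-list lookups now go through the credentials API instead of reading euid/uid straight out of task_struct: the caller snapshots its own credentials with current_cred() and reads the target's with __task_cred(), which is only valid while the task is pinned (RCU read lock in the native path, tasklist_lock here). Boiled down, the permission test both call sites perform is:

const struct cred *cred = current_cred(), *pcred;

rcu_read_lock();
pcred = __task_cred(p);			/* p's objective credentials */
if (cred->euid != pcred->euid &&
    cred->euid != pcred->uid &&
    !capable(CAP_SYS_PTRACE))
	ret = -EPERM;			/* may not inspect p's robust list */
rcu_read_unlock();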
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cdec83e722fa..bda9cb924276 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
443#endif 443#endif
444 444
445/*
446 * Check, whether the timer is on the callback pending list
447 */
448static inline int hrtimer_cb_pending(const struct hrtimer *timer)
449{
450 return timer->state & HRTIMER_STATE_PENDING;
451}
452
453/*
454 * Remove a timer from the callback pending list
455 */
456static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
457{
458 list_del_init(&timer->cb_entry);
459}
460
461/* High resolution timer related functions */ 445/* High resolution timer related functions */
462#ifdef CONFIG_HIGH_RES_TIMERS 446#ifdef CONFIG_HIGH_RES_TIMERS
463 447
@@ -517,7 +501,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
517 if (!base->first) 501 if (!base->first)
518 continue; 502 continue;
519 timer = rb_entry(base->first, struct hrtimer, node); 503 timer = rb_entry(base->first, struct hrtimer, node);
520 expires = ktime_sub(timer->expires, base->offset); 504 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
521 if (expires.tv64 < cpu_base->expires_next.tv64) 505 if (expires.tv64 < cpu_base->expires_next.tv64)
522 cpu_base->expires_next = expires; 506 cpu_base->expires_next = expires;
523 } 507 }
@@ -539,10 +523,10 @@ static int hrtimer_reprogram(struct hrtimer *timer,
539 struct hrtimer_clock_base *base) 523 struct hrtimer_clock_base *base)
540{ 524{
541 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 525 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
542 ktime_t expires = ktime_sub(timer->expires, base->offset); 526 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
543 int res; 527 int res;
544 528
545 WARN_ON_ONCE(timer->expires.tv64 < 0); 529 WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
546 530
547 /* 531 /*
548 * When the callback is running, we do not reprogram the clock event 532 * When the callback is running, we do not reprogram the clock event
@@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651{ 635{
652} 636}
653 637
638static void __run_hrtimer(struct hrtimer *timer);
639
654/* 640/*
655 * When High resolution timers are active, try to reprogram. Note, that in case 641 * When High resolution timers are active, try to reprogram. Note, that in case
656 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 642 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,40 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
661 struct hrtimer_clock_base *base) 647 struct hrtimer_clock_base *base)
662{ 648{
663 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 649 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
664 650 /*
665 /* Timer is expired, act upon the callback mode */ 651 * XXX: recursion check?
666 switch(timer->cb_mode) { 652 * hrtimer_forward() should round up with timer granularity
667 case HRTIMER_CB_IRQSAFE_NO_RESTART: 653 * so that we never get into inf recursion here,
668 debug_hrtimer_deactivate(timer); 654 * it doesn't do that though
669 /* 655 */
670 * We can call the callback from here. No restart 656 __run_hrtimer(timer);
671 * happens, so no danger of recursion 657 return 1;
672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1;
675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
677 /*
678 * This is solely for the sched tick emulation with
679 * dynamic tick support to ensure that we do not
680 * restart the tick right on the edge and end up with
681 * the tick timer in the softirq ! The calling site
682 * takes care of this. Also used for hrtimer sleeper !
683 */
684 debug_hrtimer_deactivate(timer);
685 return 1;
686 case HRTIMER_CB_IRQSAFE:
687 case HRTIMER_CB_SOFTIRQ:
688 /*
689 * Move everything else into the softirq pending list !
690 */
691 list_add_tail(&timer->cb_entry,
692 &base->cpu_base->cb_pending);
693 timer->state = HRTIMER_STATE_PENDING;
694 return 1;
695 default:
696 BUG();
697 }
698 } 658 }
699 return 0; 659 return 0;
700} 660}
@@ -733,11 +693,6 @@ static int hrtimer_switch_to_hres(void)
733 return 1; 693 return 1;
734} 694}
735 695
736static inline void hrtimer_raise_softirq(void)
737{
738 raise_softirq(HRTIMER_SOFTIRQ);
739}
740
741#else 696#else
742 697
743static inline int hrtimer_hres_active(void) { return 0; } 698static inline int hrtimer_hres_active(void) { return 0; }
@@ -756,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
756{ 711{
757 return 0; 712 return 0;
758} 713}
759static inline void hrtimer_raise_softirq(void) { }
760 714
761#endif /* CONFIG_HIGH_RES_TIMERS */ 715#endif /* CONFIG_HIGH_RES_TIMERS */
762 716
@@ -795,7 +749,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
795 u64 orun = 1; 749 u64 orun = 1;
796 ktime_t delta; 750 ktime_t delta;
797 751
798 delta = ktime_sub(now, timer->expires); 752 delta = ktime_sub(now, hrtimer_get_expires(timer));
799 753
800 if (delta.tv64 < 0) 754 if (delta.tv64 < 0)
801 return 0; 755 return 0;
@@ -807,8 +761,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
807 s64 incr = ktime_to_ns(interval); 761 s64 incr = ktime_to_ns(interval);
808 762
809 orun = ktime_divns(delta, incr); 763 orun = ktime_divns(delta, incr);
810 timer->expires = ktime_add_ns(timer->expires, incr * orun); 764 hrtimer_add_expires_ns(timer, incr * orun);
811 if (timer->expires.tv64 > now.tv64) 765 if (hrtimer_get_expires_tv64(timer) > now.tv64)
812 return orun; 766 return orun;
813 /* 767 /*
814 * This (and the ktime_add() below) is the 768 * This (and the ktime_add() below) is the
@@ -816,7 +770,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
816 */ 770 */
817 orun++; 771 orun++;
818 } 772 }
819 timer->expires = ktime_add_safe(timer->expires, interval); 773 hrtimer_add_expires(timer, interval);
820 774
821 return orun; 775 return orun;
822} 776}
@@ -848,7 +802,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
848 * We don't care about collisions. Nodes with 802 * We don't care about collisions. Nodes with
849 * the same expiry time stay together. 803 * the same expiry time stay together.
850 */ 804 */
851 if (timer->expires.tv64 < entry->expires.tv64) { 805 if (hrtimer_get_expires_tv64(timer) <
806 hrtimer_get_expires_tv64(entry)) {
852 link = &(*link)->rb_left; 807 link = &(*link)->rb_left;
853 } else { 808 } else {
854 link = &(*link)->rb_right; 809 link = &(*link)->rb_right;
@@ -898,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
898 struct hrtimer_clock_base *base, 853 struct hrtimer_clock_base *base,
899 unsigned long newstate, int reprogram) 854 unsigned long newstate, int reprogram)
900{ 855{
901 /* High res. callback list. NOP for !HIGHRES */ 856 if (timer->state & HRTIMER_STATE_ENQUEUED) {
902 if (hrtimer_cb_pending(timer))
903 hrtimer_remove_cb_pending(timer);
904 else {
905 /* 857 /*
906 * Remove the timer from the rbtree and replace the 858 * Remove the timer from the rbtree and replace the
907 * first entry pointer if necessary. 859 * first entry pointer if necessary.
@@ -945,9 +897,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
945} 897}
946 898
947/** 899/**
948 * hrtimer_start - (re)start an relative timer on the current CPU 900 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
949 * @timer: the timer to be added 901 * @timer: the timer to be added
950 * @tim: expiry time 902 * @tim: expiry time
903 * @delta_ns: "slack" range for the timer
951 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 904 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
952 * 905 *
953 * Returns: 906 * Returns:
@@ -955,11 +908,12 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
955 * 1 when the timer was active 908 * 1 when the timer was active
956 */ 909 */
957int 910int
958hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 911hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
912 const enum hrtimer_mode mode)
959{ 913{
960 struct hrtimer_clock_base *base, *new_base; 914 struct hrtimer_clock_base *base, *new_base;
961 unsigned long flags; 915 unsigned long flags;
962 int ret, raise; 916 int ret;
963 917
964 base = lock_hrtimer_base(timer, &flags); 918 base = lock_hrtimer_base(timer, &flags);
965 919
@@ -983,7 +937,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
983#endif 937#endif
984 } 938 }
985 939
986 timer->expires = tim; 940 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
987 941
988 timer_stats_hrtimer_set_start_info(timer); 942 timer_stats_hrtimer_set_start_info(timer);
989 943
@@ -994,30 +948,30 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
994 enqueue_hrtimer(timer, new_base, 948 enqueue_hrtimer(timer, new_base,
995 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 949 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
996 950
997 /*
998 * The timer may be expired and moved to the cb_pending
999 * list. We can not raise the softirq with base lock held due
1000 * to a possible deadlock with runqueue lock.
1001 */
1002 raise = timer->state == HRTIMER_STATE_PENDING;
1003
1004 /*
1005 * We use preempt_disable to prevent this task from migrating after
1006 * setting up the softirq and raising it. Otherwise, if me migrate
1007 * we will raise the softirq on the wrong CPU.
1008 */
1009 preempt_disable();
1010
1011 unlock_hrtimer_base(timer, &flags); 951 unlock_hrtimer_base(timer, &flags);
1012 952
1013 if (raise)
1014 hrtimer_raise_softirq();
1015 preempt_enable();
1016
1017 return ret; 953 return ret;
1018} 954}
955EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
956
957/**
958 * hrtimer_start - (re)start an hrtimer on the current CPU
959 * @timer: the timer to be added
960 * @tim: expiry time
961 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
962 *
963 * Returns:
964 * 0 on success
965 * 1 when the timer was active
966 */
967int
968hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
969{
970 return hrtimer_start_range_ns(timer, tim, 0, mode);
971}
1019EXPORT_SYMBOL_GPL(hrtimer_start); 972EXPORT_SYMBOL_GPL(hrtimer_start);
1020 973
974
1021/** 975/**
1022 * hrtimer_try_to_cancel - try to deactivate a timer 976 * hrtimer_try_to_cancel - try to deactivate a timer
1023 * @timer: hrtimer to stop 977 * @timer: hrtimer to stop
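hrtimer_start() is now just the zero-slack case of hrtimer_start_range_ns(); callers that can tolerate some latitude pass a non-zero delta_ns so that nearby expiries can be batched into one interrupt (futex_wait() above is the first such caller). A minimal sketch of arming a range timer, with an illustrative callback:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static enum hrtimer_restart example_expired(struct hrtimer *t)
{
	/* With the cb_mode removal, callbacks now run in hard irq context. */
	return HRTIMER_NORESTART;
}

static void example_arm(struct hrtimer *timer, ktime_t expires,
			unsigned long slack_ns)
{
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	timer->function = example_expired;

	/* May expire anywhere in [expires, expires + slack_ns]. */
	hrtimer_start_range_ns(timer, expires, slack_ns, HRTIMER_MODE_ABS);
}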
@@ -1077,7 +1031,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1077 ktime_t rem; 1031 ktime_t rem;
1078 1032
1079 base = lock_hrtimer_base(timer, &flags); 1033 base = lock_hrtimer_base(timer, &flags);
1080 rem = ktime_sub(timer->expires, base->get_time()); 1034 rem = hrtimer_expires_remaining(timer);
1081 unlock_hrtimer_base(timer, &flags); 1035 unlock_hrtimer_base(timer, &flags);
1082 1036
1083 return rem; 1037 return rem;
@@ -1109,7 +1063,7 @@ ktime_t hrtimer_get_next_event(void)
1109 continue; 1063 continue;
1110 1064
1111 timer = rb_entry(base->first, struct hrtimer, node); 1065 timer = rb_entry(base->first, struct hrtimer, node);
1112 delta.tv64 = timer->expires.tv64; 1066 delta.tv64 = hrtimer_get_expires_tv64(timer);
1113 delta = ktime_sub(delta, base->get_time()); 1067 delta = ktime_sub(delta, base->get_time());
1114 if (delta.tv64 < mindelta.tv64) 1068 if (delta.tv64 < mindelta.tv64)
1115 mindelta.tv64 = delta.tv64; 1069 mindelta.tv64 = delta.tv64;
@@ -1180,60 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1180} 1134}
1181EXPORT_SYMBOL_GPL(hrtimer_get_res); 1135EXPORT_SYMBOL_GPL(hrtimer_get_res);
1182 1136
1183static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1184{
1185 spin_lock_irq(&cpu_base->lock);
1186
1187 while (!list_empty(&cpu_base->cb_pending)) {
1188 enum hrtimer_restart (*fn)(struct hrtimer *);
1189 struct hrtimer *timer;
1190 int restart;
1191
1192 timer = list_entry(cpu_base->cb_pending.next,
1193 struct hrtimer, cb_entry);
1194
1195 debug_hrtimer_deactivate(timer);
1196 timer_stats_account_hrtimer(timer);
1197
1198 fn = timer->function;
1199 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1200 spin_unlock_irq(&cpu_base->lock);
1201
1202 restart = fn(timer);
1203
1204 spin_lock_irq(&cpu_base->lock);
1205
1206 timer->state &= ~HRTIMER_STATE_CALLBACK;
1207 if (restart == HRTIMER_RESTART) {
1208 BUG_ON(hrtimer_active(timer));
1209 /*
1210 * Enqueue the timer, allow reprogramming of the event
1211 * device
1212 */
1213 enqueue_hrtimer(timer, timer->base, 1);
1214 } else if (hrtimer_active(timer)) {
1215 /*
1216 * If the timer was rearmed on another CPU, reprogram
1217 * the event device.
1218 */
1219 struct hrtimer_clock_base *base = timer->base;
1220
1221 if (base->first == &timer->node &&
1222 hrtimer_reprogram(timer, base)) {
1223 /*
1224 * Timer is expired. Thus move it from tree to
1225 * pending list again.
1226 */
1227 __remove_hrtimer(timer, base,
1228 HRTIMER_STATE_PENDING, 0);
1229 list_add_tail(&timer->cb_entry,
1230 &base->cpu_base->cb_pending);
1231 }
1232 }
1233 }
1234 spin_unlock_irq(&cpu_base->lock);
1235}
1236
1237static void __run_hrtimer(struct hrtimer *timer) 1137static void __run_hrtimer(struct hrtimer *timer)
1238{ 1138{
1239 struct hrtimer_clock_base *base = timer->base; 1139 struct hrtimer_clock_base *base = timer->base;
@@ -1241,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer)
1241 enum hrtimer_restart (*fn)(struct hrtimer *); 1141 enum hrtimer_restart (*fn)(struct hrtimer *);
1242 int restart; 1142 int restart;
1243 1143
1144 WARN_ON(!irqs_disabled());
1145
1244 debug_hrtimer_deactivate(timer); 1146 debug_hrtimer_deactivate(timer);
1245 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1147 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1246 timer_stats_account_hrtimer(timer); 1148 timer_stats_account_hrtimer(timer);
1247
1248 fn = timer->function; 1149 fn = timer->function;
1249 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || 1150
1250 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { 1151 /*
1251 /* 1152 * Because we run timers from hardirq context, there is no chance
1252 * Used for scheduler timers, avoid lock inversion with 1153 * they get migrated to another cpu, therefore its safe to unlock
1253 * rq->lock and tasklist_lock. 1154 * the timer base.
1254 * 1155 */
1255 * These timers are required to deal with enqueue expiry 1156 spin_unlock(&cpu_base->lock);
1256 * themselves and are not allowed to migrate. 1157 restart = fn(timer);
1257 */ 1158 spin_lock(&cpu_base->lock);
1258 spin_unlock(&cpu_base->lock);
1259 restart = fn(timer);
1260 spin_lock(&cpu_base->lock);
1261 } else
1262 restart = fn(timer);
1263 1159
1264 /* 1160 /*
1265 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1161 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
@@ -1284,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1284 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1180 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1285 struct hrtimer_clock_base *base; 1181 struct hrtimer_clock_base *base;
1286 ktime_t expires_next, now; 1182 ktime_t expires_next, now;
1287 int i, raise = 0; 1183 int i;
1288 1184
1289 BUG_ON(!cpu_base->hres_active); 1185 BUG_ON(!cpu_base->hres_active);
1290 cpu_base->nr_events++; 1186 cpu_base->nr_events++;
@@ -1310,26 +1206,29 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1310 1206
1311 timer = rb_entry(node, struct hrtimer, node); 1207 timer = rb_entry(node, struct hrtimer, node);
1312 1208
1313 if (basenow.tv64 < timer->expires.tv64) { 1209 /*
1210 * The immediate goal for using the softexpires is
1211 * minimizing wakeups, not running timers at the
1212 * earliest interrupt after their soft expiration.
1213 * This allows us to avoid using a Priority Search
1214 * Tree, which can answer a stabbing query for
1215 * overlapping intervals and instead use the simple
1216 * BST we already have.
1217 * We don't add extra wakeups by delaying timers that
1218 * are right-of a not yet expired timer, because that
1219 * timer will have to trigger a wakeup anyway.
1220 */
1221
1222 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1314 ktime_t expires; 1223 ktime_t expires;
1315 1224
1316 expires = ktime_sub(timer->expires, 1225 expires = ktime_sub(hrtimer_get_expires(timer),
1317 base->offset); 1226 base->offset);
1318 if (expires.tv64 < expires_next.tv64) 1227 if (expires.tv64 < expires_next.tv64)
1319 expires_next = expires; 1228 expires_next = expires;
1320 break; 1229 break;
1321 } 1230 }
1322 1231
1323 /* Move softirq callbacks to the pending list */
1324 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1325 __remove_hrtimer(timer, base,
1326 HRTIMER_STATE_PENDING, 0);
1327 list_add_tail(&timer->cb_entry,
1328 &base->cpu_base->cb_pending);
1329 raise = 1;
1330 continue;
1331 }
1332
1333 __run_hrtimer(timer); 1232 __run_hrtimer(timer);
1334 } 1233 }
1335 spin_unlock(&cpu_base->lock); 1234 spin_unlock(&cpu_base->lock);
@@ -1343,15 +1242,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1343 if (tick_program_event(expires_next, 0)) 1242 if (tick_program_event(expires_next, 0))
1344 goto retry; 1243 goto retry;
1345 } 1244 }
1346
1347 /* Raise softirq ? */
1348 if (raise)
1349 raise_softirq(HRTIMER_SOFTIRQ);
1350} 1245}
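The expiry loop above now tests the soft expiry to decide whether a timer runs, while the clock event device keeps being programmed from the hard expiry, so timers whose [soft, hard] windows overlap can share a single interrupt. A small userspace sketch of that split; struct range_timer and the helpers are invented for illustration:

#include <stdint.h>
#include <stdbool.h>

struct range_timer {
    int64_t soft_expires;   /* never fire before this */
    int64_t hard_expires;   /* soft_expires plus the allowed slack */
};

/* A timer is due once "now" has passed its soft deadline. */
static bool timer_due(const struct range_timer *t, int64_t now)
{
    return now >= t->soft_expires;
}

/* The next hardware event is programmed against the hard deadline of the
 * first timer that is not yet due, so nearby timers coalesce into one wakeup. */
static int64_t pick_next_event(const struct range_timer *t, int64_t next)
{
    return t->hard_expires < next ? t->hard_expires : next;
}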
1351 1246
1352static void run_hrtimer_softirq(struct softirq_action *h) 1247/**
1248 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1249 *
1250 * hrtimer_peek_ahead_timers will peek at the timer queue of
1251 * the current cpu and check if there are any timers for which
1252 * the soft expires time has passed. If any such timers exist,
1253 * they are run immediately and then removed from the timer queue.
1254 *
1255 */
1256void hrtimer_peek_ahead_timers(void)
1353{ 1257{
1354 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); 1258 struct tick_device *td;
1259 unsigned long flags;
1260
1261 if (!hrtimer_hres_active())
1262 return;
1263
1264 local_irq_save(flags);
1265 td = &__get_cpu_var(tick_cpu_device);
1266 if (td && td->evtdev)
1267 hrtimer_interrupt(td->evtdev);
1268 local_irq_restore(flags);
1355} 1269}
1356 1270
1357#endif /* CONFIG_HIGH_RES_TIMERS */ 1271#endif /* CONFIG_HIGH_RES_TIMERS */
@@ -1365,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
1365 */ 1279 */
1366void hrtimer_run_pending(void) 1280void hrtimer_run_pending(void)
1367{ 1281{
1368 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1369
1370 if (hrtimer_hres_active()) 1282 if (hrtimer_hres_active())
1371 return; 1283 return;
1372 1284
@@ -1380,8 +1292,6 @@ void hrtimer_run_pending(void)
1380 */ 1292 */
1381 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) 1293 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1382 hrtimer_switch_to_hres(); 1294 hrtimer_switch_to_hres();
1383
1384 run_hrtimer_pending(cpu_base);
1385} 1295}
1386 1296
1387/* 1297/*
@@ -1403,9 +1313,7 @@ void hrtimer_run_queues(void)
1403 if (!base->first) 1313 if (!base->first)
1404 continue; 1314 continue;
1405 1315
1406 if (base->get_softirq_time) 1316 if (gettime) {
1407 base->softirq_time = base->get_softirq_time();
1408 else if (gettime) {
1409 hrtimer_get_softirq_time(cpu_base); 1317 hrtimer_get_softirq_time(cpu_base);
1410 gettime = 0; 1318 gettime = 0;
1411 } 1319 }
@@ -1416,17 +1324,10 @@ void hrtimer_run_queues(void)
1416 struct hrtimer *timer; 1324 struct hrtimer *timer;
1417 1325
1418 timer = rb_entry(node, struct hrtimer, node); 1326 timer = rb_entry(node, struct hrtimer, node);
1419 if (base->softirq_time.tv64 <= timer->expires.tv64) 1327 if (base->softirq_time.tv64 <=
1328 hrtimer_get_expires_tv64(timer))
1420 break; 1329 break;
1421 1330
1422 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1423 __remove_hrtimer(timer, base,
1424 HRTIMER_STATE_PENDING, 0);
1425 list_add_tail(&timer->cb_entry,
1426 &base->cpu_base->cb_pending);
1427 continue;
1428 }
1429
1430 __run_hrtimer(timer); 1331 __run_hrtimer(timer);
1431 } 1332 }
1432 spin_unlock(&cpu_base->lock); 1333 spin_unlock(&cpu_base->lock);
@@ -1453,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1453{ 1354{
1454 sl->timer.function = hrtimer_wakeup; 1355 sl->timer.function = hrtimer_wakeup;
1455 sl->task = task; 1356 sl->task = task;
1456#ifdef CONFIG_HIGH_RES_TIMERS
1457 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1458#endif
1459} 1357}
1460 1358
1461static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1359static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1464,7 +1362,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1464 1362
1465 do { 1363 do {
1466 set_current_state(TASK_INTERRUPTIBLE); 1364 set_current_state(TASK_INTERRUPTIBLE);
1467 hrtimer_start(&t->timer, t->timer.expires, mode); 1365 hrtimer_start_expires(&t->timer, mode);
1468 if (!hrtimer_active(&t->timer)) 1366 if (!hrtimer_active(&t->timer))
1469 t->task = NULL; 1367 t->task = NULL;
1470 1368
@@ -1486,7 +1384,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1486 struct timespec rmt; 1384 struct timespec rmt;
1487 ktime_t rem; 1385 ktime_t rem;
1488 1386
1489 rem = ktime_sub(timer->expires, timer->base->get_time()); 1387 rem = hrtimer_expires_remaining(timer);
1490 if (rem.tv64 <= 0) 1388 if (rem.tv64 <= 0)
1491 return 0; 1389 return 0;
1492 rmt = ktime_to_timespec(rem); 1390 rmt = ktime_to_timespec(rem);
@@ -1505,7 +1403,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1505 1403
1506 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1404 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1507 HRTIMER_MODE_ABS); 1405 HRTIMER_MODE_ABS);
1508 t.timer.expires.tv64 = restart->nanosleep.expires; 1406 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1509 1407
1510 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1408 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1511 goto out; 1409 goto out;
@@ -1530,9 +1428,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1530 struct restart_block *restart; 1428 struct restart_block *restart;
1531 struct hrtimer_sleeper t; 1429 struct hrtimer_sleeper t;
1532 int ret = 0; 1430 int ret = 0;
1431 unsigned long slack;
1432
1433 slack = current->timer_slack_ns;
1434 if (rt_task(current))
1435 slack = 0;
1533 1436
1534 hrtimer_init_on_stack(&t.timer, clockid, mode); 1437 hrtimer_init_on_stack(&t.timer, clockid, mode);
1535 t.timer.expires = timespec_to_ktime(*rqtp); 1438 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
1536 if (do_nanosleep(&t, mode)) 1439 if (do_nanosleep(&t, mode))
1537 goto out; 1440 goto out;
1538 1441
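The slack applied to the nanosleep expiry above comes from the task's timer_slack_ns and is forced to zero for real-time tasks. Assuming the PR_SET_TIMERSLACK prctl that accompanies this field in the same kernel series, a small runnable userspace illustration of widening the slack:

#include <stdio.h>
#include <time.h>
#include <sys/prctl.h>

#ifndef PR_SET_TIMERSLACK
#define PR_SET_TIMERSLACK 29    /* assumed value from this kernel series */
#endif

int main(void)
{
    struct timespec req = { .tv_sec = 0, .tv_nsec = 2 * 1000 * 1000 };

    /* allow the kernel to defer our wakeup by up to 1 ms */
    if (prctl(PR_SET_TIMERSLACK, 1000000UL, 0, 0, 0) != 0)
        perror("prctl(PR_SET_TIMERSLACK)");

    /* the wakeup may now land anywhere in the [2 ms, 2 ms + slack] window */
    nanosleep(&req, NULL);
    return 0;
}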
@@ -1552,7 +1455,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1552 restart->fn = hrtimer_nanosleep_restart; 1455 restart->fn = hrtimer_nanosleep_restart;
1553 restart->nanosleep.index = t.timer.base->index; 1456 restart->nanosleep.index = t.timer.base->index;
1554 restart->nanosleep.rmtp = rmtp; 1457 restart->nanosleep.rmtp = rmtp;
1555 restart->nanosleep.expires = t.timer.expires.tv64; 1458 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1556 1459
1557 ret = -ERESTART_RESTARTBLOCK; 1460 ret = -ERESTART_RESTARTBLOCK;
1558out: 1461out:
@@ -1587,18 +1490,16 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1587 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1490 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1588 cpu_base->clock_base[i].cpu_base = cpu_base; 1491 cpu_base->clock_base[i].cpu_base = cpu_base;
1589 1492
1590 INIT_LIST_HEAD(&cpu_base->cb_pending);
1591 hrtimer_init_hres(cpu_base); 1493 hrtimer_init_hres(cpu_base);
1592} 1494}
1593 1495
1594#ifdef CONFIG_HOTPLUG_CPU 1496#ifdef CONFIG_HOTPLUG_CPU
1595 1497
1596static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1498static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1597 struct hrtimer_clock_base *new_base, int dcpu) 1499 struct hrtimer_clock_base *new_base)
1598{ 1500{
1599 struct hrtimer *timer; 1501 struct hrtimer *timer;
1600 struct rb_node *node; 1502 struct rb_node *node;
1601 int raise = 0;
1602 1503
1603 while ((node = rb_first(&old_base->active))) { 1504 while ((node = rb_first(&old_base->active))) {
1604 timer = rb_entry(node, struct hrtimer, node); 1505 timer = rb_entry(node, struct hrtimer, node);
@@ -1606,18 +1507,6 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1606 debug_hrtimer_deactivate(timer); 1507 debug_hrtimer_deactivate(timer);
1607 1508
1608 /* 1509 /*
1609 * Should not happen. Per CPU timers should be
1610 * canceled _before_ the migration code is called
1611 */
1612 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1613 __remove_hrtimer(timer, old_base,
1614 HRTIMER_STATE_INACTIVE, 0);
1615 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1616 timer, timer->function, dcpu);
1617 continue;
1618 }
1619
1620 /*
1621 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1510 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1622 * timer could be seen as !active and just vanish away 1511 * timer could be seen as !active and just vanish away
1623 * under us on another CPU 1512 * under us on another CPU
@@ -1625,111 +1514,83 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1625 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1514 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1626 timer->base = new_base; 1515 timer->base = new_base;
1627 /* 1516 /*
1628 * Enqueue the timer. Allow reprogramming of the event device 1517 * Enqueue the timers on the new cpu, but do not reprogram
 1518 * the timer as that would cause a deadlock between
 1519 * hrtimer_enqueue_reprogram() running the timer and us still
1520 * holding a nested base lock.
1521 *
1522 * Instead we tickle the hrtimer interrupt after the migration
 1523 * is done, which will run all expired timers and re-program
1524 * the timer device.
1629 */ 1525 */
1630 enqueue_hrtimer(timer, new_base, 1); 1526 enqueue_hrtimer(timer, new_base, 0);
1631 1527
1632#ifdef CONFIG_HIGH_RES_TIMERS
1633 /*
1634 * Happens with high res enabled when the timer was
1635 * already expired and the callback mode is
1636 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1637 * enqueue code does not move them to the soft irq
1638 * pending list for performance/latency reasons, but
1639 * in the migration state, we need to do that
1640 * otherwise we end up with a stale timer.
1641 */
1642 if (timer->state == HRTIMER_STATE_MIGRATE) {
1643 timer->state = HRTIMER_STATE_PENDING;
1644 list_add_tail(&timer->cb_entry,
1645 &new_base->cpu_base->cb_pending);
1646 raise = 1;
1647 }
1648#endif
1649 /* Clear the migration state bit */ 1528 /* Clear the migration state bit */
1650 timer->state &= ~HRTIMER_STATE_MIGRATE; 1529 timer->state &= ~HRTIMER_STATE_MIGRATE;
1651 } 1530 }
1652 return raise;
1653} 1531}
1654 1532
1655#ifdef CONFIG_HIGH_RES_TIMERS 1533static int migrate_hrtimers(int scpu)
1656static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1657 struct hrtimer_cpu_base *new_base)
1658{
1659 struct hrtimer *timer;
1660 int raise = 0;
1661
1662 while (!list_empty(&old_base->cb_pending)) {
1663 timer = list_entry(old_base->cb_pending.next,
1664 struct hrtimer, cb_entry);
1665
1666 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1667 timer->base = &new_base->clock_base[timer->base->index];
1668 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1669 raise = 1;
1670 }
1671 return raise;
1672}
1673#else
1674static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1675 struct hrtimer_cpu_base *new_base)
1676{
1677 return 0;
1678}
1679#endif
1680
1681static void migrate_hrtimers(int cpu)
1682{ 1534{
1683 struct hrtimer_cpu_base *old_base, *new_base; 1535 struct hrtimer_cpu_base *old_base, *new_base;
1684 int i, raise = 0; 1536 int dcpu, i;
1685 1537
1686 BUG_ON(cpu_online(cpu)); 1538 BUG_ON(cpu_online(scpu));
1687 old_base = &per_cpu(hrtimer_bases, cpu); 1539 old_base = &per_cpu(hrtimer_bases, scpu);
1688 new_base = &get_cpu_var(hrtimer_bases); 1540 new_base = &get_cpu_var(hrtimer_bases);
1689 1541
1690 tick_cancel_sched_timer(cpu); 1542 dcpu = smp_processor_id();
1691 1543
1692 local_irq_disable(); 1544 tick_cancel_sched_timer(scpu);
1693 spin_lock(&new_base->lock); 1545 /*
1546 * The caller is globally serialized and nobody else
 1547 * takes two locks at once, so deadlock is not possible.
1548 */
1549 spin_lock_irq(&new_base->lock);
1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1550 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1695 1551
1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1552 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1697 if (migrate_hrtimer_list(&old_base->clock_base[i], 1553 migrate_hrtimer_list(&old_base->clock_base[i],
1698 &new_base->clock_base[i], cpu)) 1554 &new_base->clock_base[i]);
1699 raise = 1;
1700 } 1555 }
1701 1556
1702 if (migrate_hrtimer_pending(old_base, new_base))
1703 raise = 1;
1704
1705 spin_unlock(&old_base->lock); 1557 spin_unlock(&old_base->lock);
1706 spin_unlock(&new_base->lock); 1558 spin_unlock_irq(&new_base->lock);
1707 local_irq_enable();
1708 put_cpu_var(hrtimer_bases); 1559 put_cpu_var(hrtimer_bases);
1709 1560
1710 if (raise) 1561 return dcpu;
1711 hrtimer_raise_softirq();
1712} 1562}
1563
1564static void tickle_timers(void *arg)
1565{
1566 hrtimer_peek_ahead_timers();
1567}
1568
1713#endif /* CONFIG_HOTPLUG_CPU */ 1569#endif /* CONFIG_HOTPLUG_CPU */
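Condensing the hotplug path above: the dead CPU's timers are moved under both base locks, taken in a fixed nested order, without being run or reprogrammed, and the destination CPU is only kicked afterwards (via tickle_timers(), wired up in the notifier below). A userspace sketch of that move-then-kick pattern with invented types; entry order is not preserved in this simplified version:

#include <pthread.h>

struct item {
    struct item *next;
};

struct base {
    pthread_mutex_t lock;
    struct item *head;
    pthread_cond_t kick;    /* stands in for "tickle the timer interrupt" */
};

static void migrate(struct base *dead, struct base *alive)
{
    struct item *it;

    pthread_mutex_lock(&alive->lock);
    pthread_mutex_lock(&dead->lock);        /* nested, fixed order */

    while ((it = dead->head) != NULL) {     /* move entries, never run them here */
        dead->head = it->next;
        it->next = alive->head;
        alive->head = it;
    }

    pthread_mutex_unlock(&dead->lock);
    pthread_mutex_unlock(&alive->lock);

    pthread_cond_signal(&alive->kick);      /* re-evaluate the queue after unlock */
}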
1714 1570
1715static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1571static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1716 unsigned long action, void *hcpu) 1572 unsigned long action, void *hcpu)
1717{ 1573{
1718 unsigned int cpu = (long)hcpu; 1574 int scpu = (long)hcpu;
1719 1575
1720 switch (action) { 1576 switch (action) {
1721 1577
1722 case CPU_UP_PREPARE: 1578 case CPU_UP_PREPARE:
1723 case CPU_UP_PREPARE_FROZEN: 1579 case CPU_UP_PREPARE_FROZEN:
1724 init_hrtimers_cpu(cpu); 1580 init_hrtimers_cpu(scpu);
1725 break; 1581 break;
1726 1582
1727#ifdef CONFIG_HOTPLUG_CPU 1583#ifdef CONFIG_HOTPLUG_CPU
1728 case CPU_DEAD: 1584 case CPU_DEAD:
1729 case CPU_DEAD_FROZEN: 1585 case CPU_DEAD_FROZEN:
1730 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1586 {
1731 migrate_hrtimers(cpu); 1587 int dcpu;
1588
1589 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1590 dcpu = migrate_hrtimers(scpu);
1591 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1732 break; 1592 break;
1593 }
1733#endif 1594#endif
1734 1595
1735 default: 1596 default:
@@ -1748,8 +1609,105 @@ void __init hrtimers_init(void)
1748 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1609 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1749 (void *)(long)smp_processor_id()); 1610 (void *)(long)smp_processor_id());
1750 register_cpu_notifier(&hrtimers_nb); 1611 register_cpu_notifier(&hrtimers_nb);
1751#ifdef CONFIG_HIGH_RES_TIMERS
1752 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1753#endif
1754} 1612}
1755 1613
1614/**
1615 * schedule_hrtimeout_range - sleep until timeout
1616 * @expires: timeout value (ktime_t)
1617 * @delta: slack in expires timeout (ktime_t)
1618 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1619 *
1620 * Make the current task sleep until the given expiry time has
1621 * elapsed. The routine will return immediately unless
1622 * the current task state has been set (see set_current_state()).
1623 *
1624 * The @delta argument gives the kernel the freedom to schedule the
1625 * actual wakeup to a time that is both power and performance friendly.
 1626 * The kernel gives the normal best effort behavior for "@expires+@delta",
 1627 * but may decide to fire the timer earlier, though no earlier than @expires.
1628 *
1629 * You can set the task state as follows -
1630 *
1631 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1632 * pass before the routine returns.
1633 *
1634 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1635 * delivered to the current task.
1636 *
1637 * The current task state is guaranteed to be TASK_RUNNING when this
1638 * routine returns.
1639 *
 1640 * Returns 0 when the timer has expired, otherwise -EINTR
1641 */
1642int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1643 const enum hrtimer_mode mode)
1644{
1645 struct hrtimer_sleeper t;
1646
1647 /*
1648 * Optimize when a zero timeout value is given. It does not
1649 * matter whether this is an absolute or a relative time.
1650 */
1651 if (expires && !expires->tv64) {
1652 __set_current_state(TASK_RUNNING);
1653 return 0;
1654 }
1655
1656 /*
 1657 * A NULL parameter means "infinite"
1658 */
1659 if (!expires) {
1660 schedule();
1661 __set_current_state(TASK_RUNNING);
1662 return -EINTR;
1663 }
1664
1665 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
1666 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1667
1668 hrtimer_init_sleeper(&t, current);
1669
1670 hrtimer_start_expires(&t.timer, mode);
1671 if (!hrtimer_active(&t.timer))
1672 t.task = NULL;
1673
1674 if (likely(t.task))
1675 schedule();
1676
1677 hrtimer_cancel(&t.timer);
1678 destroy_hrtimer_on_stack(&t.timer);
1679
1680 __set_current_state(TASK_RUNNING);
1681
1682 return !t.task ? 0 : -EINTR;
1683}
1684EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
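A hedged sketch of how a caller might use the helper above; struct my_dev and its ready flag are invented, and only the schedule_hrtimeout_range() call itself reflects the interface added here:

/* Wait up to 10 ms for a (hypothetical) device flag, giving the scheduler
 * 1 ms of slack so the wakeup can be coalesced with others. */
static int wait_for_ready(struct my_dev *dev)
{
    ktime_t timeout = ktime_set(0, 10 * NSEC_PER_MSEC);

    set_current_state(TASK_INTERRUPTIBLE);
    if (!dev->ready)
        schedule_hrtimeout_range(&timeout, NSEC_PER_MSEC,
                                 HRTIMER_MODE_REL);
    __set_current_state(TASK_RUNNING);

    return dev->ready ? 0 : -ETIMEDOUT;
}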
1685
1686/**
1687 * schedule_hrtimeout - sleep until timeout
1688 * @expires: timeout value (ktime_t)
1689 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1690 *
1691 * Make the current task sleep until the given expiry time has
1692 * elapsed. The routine will return immediately unless
1693 * the current task state has been set (see set_current_state()).
1694 *
1695 * You can set the task state as follows -
1696 *
1697 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1698 * pass before the routine returns.
1699 *
1700 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1701 * delivered to the current task.
1702 *
1703 * The current task state is guaranteed to be TASK_RUNNING when this
1704 * routine returns.
1705 *
 1706 * Returns 0 when the timer has expired, otherwise -EINTR
1707 */
1708int __sched schedule_hrtimeout(ktime_t *expires,
1709 const enum hrtimer_mode mode)
1710{
1711 return schedule_hrtimeout_range(expires, 0, mode);
1712}
1713EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 681c52dbfe22..4dd5b1edac98 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 533068cfb607..650ce4102a63 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -30,16 +30,18 @@ static DEFINE_MUTEX(probing_active);
30unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
31{ 31{
32 struct irq_desc *desc; 32 struct irq_desc *desc;
33 unsigned long mask; 33 unsigned long mask = 0;
34 unsigned int i; 34 unsigned int status;
35 int i;
35 36
36 mutex_lock(&probing_active); 37 mutex_lock(&probing_active);
37 /* 38 /*
38 * something may have generated an irq long ago and we want to 39 * something may have generated an irq long ago and we want to
39 * flush such a longstanding irq before considering it as spurious. 40 * flush such a longstanding irq before considering it as spurious.
40 */ 41 */
41 for (i = NR_IRQS-1; i > 0; i--) { 42 for_each_irq_desc_reverse(i, desc) {
42 desc = irq_desc + i; 43 if (!desc)
44 continue;
43 45
44 spin_lock_irq(&desc->lock); 46 spin_lock_irq(&desc->lock);
45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 47 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
@@ -68,8 +70,9 @@ unsigned long probe_irq_on(void)
68 * (we must startup again here because if a longstanding irq 70 * (we must startup again here because if a longstanding irq
69 * happened in the previous stage, it may have masked itself) 71 * happened in the previous stage, it may have masked itself)
70 */ 72 */
71 for (i = NR_IRQS-1; i > 0; i--) { 73 for_each_irq_desc_reverse(i, desc) {
72 desc = irq_desc + i; 74 if (!desc)
75 continue;
73 76
74 spin_lock_irq(&desc->lock); 77 spin_lock_irq(&desc->lock);
75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
@@ -88,11 +91,10 @@ unsigned long probe_irq_on(void)
88 /* 91 /*
89 * Now filter out any obviously spurious interrupts 92 * Now filter out any obviously spurious interrupts
90 */ 93 */
91 mask = 0; 94 for_each_irq_desc(i, desc) {
92 for (i = 0; i < NR_IRQS; i++) { 95 if (!desc)
93 unsigned int status; 96 continue;
94 97
95 desc = irq_desc + i;
96 spin_lock_irq(&desc->lock); 98 spin_lock_irq(&desc->lock);
97 status = desc->status; 99 status = desc->status;
98 100
@@ -126,13 +128,13 @@ EXPORT_SYMBOL(probe_irq_on);
126 */ 128 */
127unsigned int probe_irq_mask(unsigned long val) 129unsigned int probe_irq_mask(unsigned long val)
128{ 130{
129 unsigned int mask; 131 unsigned int status, mask = 0;
132 struct irq_desc *desc;
130 int i; 133 int i;
131 134
132 mask = 0; 135 for_each_irq_desc(i, desc) {
133 for (i = 0; i < NR_IRQS; i++) { 136 if (!desc)
134 struct irq_desc *desc = irq_desc + i; 137 continue;
135 unsigned int status;
136 138
137 spin_lock_irq(&desc->lock); 139 spin_lock_irq(&desc->lock);
138 status = desc->status; 140 status = desc->status;
@@ -171,20 +173,22 @@ EXPORT_SYMBOL(probe_irq_mask);
171 */ 173 */
172int probe_irq_off(unsigned long val) 174int probe_irq_off(unsigned long val)
173{ 175{
174 int i, irq_found = 0, nr_irqs = 0; 176 int i, irq_found = 0, nr_of_irqs = 0;
177 struct irq_desc *desc;
178 unsigned int status;
175 179
176 for (i = 0; i < NR_IRQS; i++) { 180 for_each_irq_desc(i, desc) {
177 struct irq_desc *desc = irq_desc + i; 181 if (!desc)
178 unsigned int status; 182 continue;
179 183
180 spin_lock_irq(&desc->lock); 184 spin_lock_irq(&desc->lock);
181 status = desc->status; 185 status = desc->status;
182 186
183 if (status & IRQ_AUTODETECT) { 187 if (status & IRQ_AUTODETECT) {
184 if (!(status & IRQ_WAITING)) { 188 if (!(status & IRQ_WAITING)) {
185 if (!nr_irqs) 189 if (!nr_of_irqs)
186 irq_found = i; 190 irq_found = i;
187 nr_irqs++; 191 nr_of_irqs++;
188 } 192 }
189 desc->status = status & ~IRQ_AUTODETECT; 193 desc->status = status & ~IRQ_AUTODETECT;
190 desc->chip->shutdown(i); 194 desc->chip->shutdown(i);
@@ -193,7 +197,7 @@ int probe_irq_off(unsigned long val)
193 } 197 }
194 mutex_unlock(&probing_active); 198 mutex_unlock(&probing_active);
195 199
196 if (nr_irqs > 1) 200 if (nr_of_irqs > 1)
197 irq_found = -irq_found; 201 irq_found = -irq_found;
198 202
199 return irq_found; 203 return irq_found;
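With sparse IRQ support the descriptor table may contain holes, so the probe loops above stop indexing a flat irq_desc[NR_IRQS] array and instead walk the allocated descriptors, skipping NULL entries. An illustrative, non-upstream helper in the same style:

/* Count how many allocated interrupts currently have a handler attached. */
static unsigned int count_requested_irqs(void)
{
    struct irq_desc *desc;
    unsigned int n = 0;
    int i;

    for_each_irq_desc(i, desc) {
        if (!desc)              /* hole in the sparse descriptor table */
            continue;
        if (desc->action)       /* someone has requested this irq */
            n++;
    }
    return n;
}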
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3cd441ebf5d2..6eb3c7952b64 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -27,13 +27,13 @@ void dynamic_irq_init(unsigned int irq)
27 struct irq_desc *desc; 27 struct irq_desc *desc;
28 unsigned long flags; 28 unsigned long flags;
29 29
30 if (irq >= NR_IRQS) { 30 desc = irq_to_desc(irq);
31 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 32 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 33 return;
33 } 34 }
34 35
35 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
36 desc = irq_desc + irq;
37 spin_lock_irqsave(&desc->lock, flags); 37 spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
@@ -57,15 +57,14 @@ void dynamic_irq_init(unsigned int irq)
57 */ 57 */
58void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_cleanup(unsigned int irq)
59{ 59{
60 struct irq_desc *desc; 60 struct irq_desc *desc = irq_to_desc(irq);
61 unsigned long flags; 61 unsigned long flags;
62 62
63 if (irq >= NR_IRQS) { 63 if (!desc) {
64 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); 64 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
65 return; 65 return;
66 } 66 }
67 67
68 desc = irq_desc + irq;
69 spin_lock_irqsave(&desc->lock, flags); 68 spin_lock_irqsave(&desc->lock, flags);
70 if (desc->action) { 69 if (desc->action) {
71 spin_unlock_irqrestore(&desc->lock, flags); 70 spin_unlock_irqrestore(&desc->lock, flags);
@@ -78,6 +77,7 @@ void dynamic_irq_cleanup(unsigned int irq)
78 desc->chip_data = NULL; 77 desc->chip_data = NULL;
79 desc->handle_irq = handle_bad_irq; 78 desc->handle_irq = handle_bad_irq;
80 desc->chip = &no_irq_chip; 79 desc->chip = &no_irq_chip;
80 desc->name = NULL;
81 spin_unlock_irqrestore(&desc->lock, flags); 81 spin_unlock_irqrestore(&desc->lock, flags);
82} 82}
83 83
@@ -89,10 +89,10 @@ void dynamic_irq_cleanup(unsigned int irq)
89 */ 89 */
90int set_irq_chip(unsigned int irq, struct irq_chip *chip) 90int set_irq_chip(unsigned int irq, struct irq_chip *chip)
91{ 91{
92 struct irq_desc *desc; 92 struct irq_desc *desc = irq_to_desc(irq);
93 unsigned long flags; 93 unsigned long flags;
94 94
95 if (irq >= NR_IRQS) { 95 if (!desc) {
96 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); 96 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
97 return -EINVAL; 97 return -EINVAL;
98 } 98 }
@@ -100,7 +100,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
100 if (!chip) 100 if (!chip)
101 chip = &no_irq_chip; 101 chip = &no_irq_chip;
102 102
103 desc = irq_desc + irq;
104 spin_lock_irqsave(&desc->lock, flags); 103 spin_lock_irqsave(&desc->lock, flags);
105 irq_chip_set_defaults(chip); 104 irq_chip_set_defaults(chip);
106 desc->chip = chip; 105 desc->chip = chip;
@@ -111,27 +110,28 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
111EXPORT_SYMBOL(set_irq_chip); 110EXPORT_SYMBOL(set_irq_chip);
112 111
113/** 112/**
114 * set_irq_type - set the irq type for an irq 113 * set_irq_type - set the irq trigger type for an irq
115 * @irq: irq number 114 * @irq: irq number
116 * @type: interrupt type - see include/linux/interrupt.h 115 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
117 */ 116 */
118int set_irq_type(unsigned int irq, unsigned int type) 117int set_irq_type(unsigned int irq, unsigned int type)
119{ 118{
120 struct irq_desc *desc; 119 struct irq_desc *desc = irq_to_desc(irq);
121 unsigned long flags; 120 unsigned long flags;
122 int ret = -ENXIO; 121 int ret = -ENXIO;
123 122
124 if (irq >= NR_IRQS) { 123 if (!desc) {
125 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 124 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
126 return -ENODEV; 125 return -ENODEV;
127 } 126 }
128 127
129 desc = irq_desc + irq; 128 type &= IRQ_TYPE_SENSE_MASK;
130 if (desc->chip->set_type) { 129 if (type == IRQ_TYPE_NONE)
131 spin_lock_irqsave(&desc->lock, flags); 130 return 0;
132 ret = desc->chip->set_type(irq, type); 131
133 spin_unlock_irqrestore(&desc->lock, flags); 132 spin_lock_irqsave(&desc->lock, flags);
134 } 133 ret = __irq_set_trigger(desc, irq, type);
134 spin_unlock_irqrestore(&desc->lock, flags);
135 return ret; 135 return ret;
136} 136}
137EXPORT_SYMBOL(set_irq_type); 137EXPORT_SYMBOL(set_irq_type);
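The chip helpers above all take the same shape now: translate the number with irq_to_desc(), bail out when no descriptor exists, then do the work under desc->lock. A sketch of that shape; my_set_irq_private() is an invented name, not a kernel interface:

static int my_set_irq_private(unsigned int irq, void *data)
{
    struct irq_desc *desc = irq_to_desc(irq);
    unsigned long flags;

    if (!desc)                  /* out of range or not allocated (sparse IRQ) */
        return -EINVAL;

    spin_lock_irqsave(&desc->lock, flags);
    desc->handler_data = data;  /* whatever the real helper would modify */
    spin_unlock_irqrestore(&desc->lock, flags);
    return 0;
}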
@@ -145,16 +145,15 @@ EXPORT_SYMBOL(set_irq_type);
145 */ 145 */
146int set_irq_data(unsigned int irq, void *data) 146int set_irq_data(unsigned int irq, void *data)
147{ 147{
148 struct irq_desc *desc; 148 struct irq_desc *desc = irq_to_desc(irq);
149 unsigned long flags; 149 unsigned long flags;
150 150
151 if (irq >= NR_IRQS) { 151 if (!desc) {
152 printk(KERN_ERR 152 printk(KERN_ERR
153 "Trying to install controller data for IRQ%d\n", irq); 153 "Trying to install controller data for IRQ%d\n", irq);
154 return -EINVAL; 154 return -EINVAL;
155 } 155 }
156 156
157 desc = irq_desc + irq;
158 spin_lock_irqsave(&desc->lock, flags); 157 spin_lock_irqsave(&desc->lock, flags);
159 desc->handler_data = data; 158 desc->handler_data = data;
160 spin_unlock_irqrestore(&desc->lock, flags); 159 spin_unlock_irqrestore(&desc->lock, flags);
@@ -171,15 +170,15 @@ EXPORT_SYMBOL(set_irq_data);
171 */ 170 */
172int set_irq_msi(unsigned int irq, struct msi_desc *entry) 171int set_irq_msi(unsigned int irq, struct msi_desc *entry)
173{ 172{
174 struct irq_desc *desc; 173 struct irq_desc *desc = irq_to_desc(irq);
175 unsigned long flags; 174 unsigned long flags;
176 175
177 if (irq >= NR_IRQS) { 176 if (!desc) {
178 printk(KERN_ERR 177 printk(KERN_ERR
179 "Trying to install msi data for IRQ%d\n", irq); 178 "Trying to install msi data for IRQ%d\n", irq);
180 return -EINVAL; 179 return -EINVAL;
181 } 180 }
182 desc = irq_desc + irq; 181
183 spin_lock_irqsave(&desc->lock, flags); 182 spin_lock_irqsave(&desc->lock, flags);
184 desc->msi_desc = entry; 183 desc->msi_desc = entry;
185 if (entry) 184 if (entry)
@@ -197,10 +196,16 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
197 */ 196 */
198int set_irq_chip_data(unsigned int irq, void *data) 197int set_irq_chip_data(unsigned int irq, void *data)
199{ 198{
200 struct irq_desc *desc = irq_desc + irq; 199 struct irq_desc *desc = irq_to_desc(irq);
201 unsigned long flags; 200 unsigned long flags;
202 201
203 if (irq >= NR_IRQS || !desc->chip) { 202 if (!desc) {
203 printk(KERN_ERR
204 "Trying to install chip data for IRQ%d\n", irq);
205 return -EINVAL;
206 }
207
208 if (!desc->chip) {
204 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 209 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
205 return -EINVAL; 210 return -EINVAL;
206 } 211 }
@@ -218,7 +223,7 @@ EXPORT_SYMBOL(set_irq_chip_data);
218 */ 223 */
219static void default_enable(unsigned int irq) 224static void default_enable(unsigned int irq)
220{ 225{
221 struct irq_desc *desc = irq_desc + irq; 226 struct irq_desc *desc = irq_to_desc(irq);
222 227
223 desc->chip->unmask(irq); 228 desc->chip->unmask(irq);
224 desc->status &= ~IRQ_MASKED; 229 desc->status &= ~IRQ_MASKED;
@@ -236,8 +241,9 @@ static void default_disable(unsigned int irq)
236 */ 241 */
237static unsigned int default_startup(unsigned int irq) 242static unsigned int default_startup(unsigned int irq)
238{ 243{
239 irq_desc[irq].chip->enable(irq); 244 struct irq_desc *desc = irq_to_desc(irq);
240 245
246 desc->chip->enable(irq);
241 return 0; 247 return 0;
242} 248}
243 249
@@ -246,7 +252,7 @@ static unsigned int default_startup(unsigned int irq)
246 */ 252 */
247static void default_shutdown(unsigned int irq) 253static void default_shutdown(unsigned int irq)
248{ 254{
249 struct irq_desc *desc = irq_desc + irq; 255 struct irq_desc *desc = irq_to_desc(irq);
250 256
251 desc->chip->mask(irq); 257 desc->chip->mask(irq);
252 desc->status |= IRQ_MASKED; 258 desc->status |= IRQ_MASKED;
@@ -305,14 +311,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
305{ 311{
306 struct irqaction *action; 312 struct irqaction *action;
307 irqreturn_t action_ret; 313 irqreturn_t action_ret;
308 const unsigned int cpu = smp_processor_id();
309 314
310 spin_lock(&desc->lock); 315 spin_lock(&desc->lock);
311 316
312 if (unlikely(desc->status & IRQ_INPROGRESS)) 317 if (unlikely(desc->status & IRQ_INPROGRESS))
313 goto out_unlock; 318 goto out_unlock;
314 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 319 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
315 kstat_cpu(cpu).irqs[irq]++; 320 kstat_incr_irqs_this_cpu(irq, desc);
316 321
317 action = desc->action; 322 action = desc->action;
318 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 323 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -344,17 +349,17 @@ out_unlock:
344void 349void
345handle_level_irq(unsigned int irq, struct irq_desc *desc) 350handle_level_irq(unsigned int irq, struct irq_desc *desc)
346{ 351{
347 unsigned int cpu = smp_processor_id();
348 struct irqaction *action; 352 struct irqaction *action;
349 irqreturn_t action_ret; 353 irqreturn_t action_ret;
350 354
351 spin_lock(&desc->lock); 355 spin_lock(&desc->lock);
352 mask_ack_irq(desc, irq); 356 mask_ack_irq(desc, irq);
357 desc = irq_remap_to_desc(irq, desc);
353 358
354 if (unlikely(desc->status & IRQ_INPROGRESS)) 359 if (unlikely(desc->status & IRQ_INPROGRESS))
355 goto out_unlock; 360 goto out_unlock;
356 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 361 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
357 kstat_cpu(cpu).irqs[irq]++; 362 kstat_incr_irqs_this_cpu(irq, desc);
358 363
359 /* 364 /*
 360 * If it's disabled or no action available 365
@@ -392,7 +397,6 @@ out_unlock:
392void 397void
393handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 398handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
394{ 399{
395 unsigned int cpu = smp_processor_id();
396 struct irqaction *action; 400 struct irqaction *action;
397 irqreturn_t action_ret; 401 irqreturn_t action_ret;
398 402
@@ -402,7 +406,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
402 goto out; 406 goto out;
403 407
404 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 408 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
405 kstat_cpu(cpu).irqs[irq]++; 409 kstat_incr_irqs_this_cpu(irq, desc);
406 410
407 /* 411 /*
 408 * If it's disabled or no action available 412
@@ -428,6 +432,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
428 desc->status &= ~IRQ_INPROGRESS; 432 desc->status &= ~IRQ_INPROGRESS;
429out: 433out:
430 desc->chip->eoi(irq); 434 desc->chip->eoi(irq);
435 desc = irq_remap_to_desc(irq, desc);
431 436
432 spin_unlock(&desc->lock); 437 spin_unlock(&desc->lock);
433} 438}
@@ -451,8 +456,6 @@ out:
451void 456void
452handle_edge_irq(unsigned int irq, struct irq_desc *desc) 457handle_edge_irq(unsigned int irq, struct irq_desc *desc)
453{ 458{
454 const unsigned int cpu = smp_processor_id();
455
456 spin_lock(&desc->lock); 459 spin_lock(&desc->lock);
457 460
458 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 461 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
@@ -466,13 +469,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
466 !desc->action)) { 469 !desc->action)) {
467 desc->status |= (IRQ_PENDING | IRQ_MASKED); 470 desc->status |= (IRQ_PENDING | IRQ_MASKED);
468 mask_ack_irq(desc, irq); 471 mask_ack_irq(desc, irq);
472 desc = irq_remap_to_desc(irq, desc);
469 goto out_unlock; 473 goto out_unlock;
470 } 474 }
471 475 kstat_incr_irqs_this_cpu(irq, desc);
472 kstat_cpu(cpu).irqs[irq]++;
473 476
474 /* Start handling the irq */ 477 /* Start handling the irq */
475 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 desc = irq_remap_to_desc(irq, desc);
476 480
477 /* Mark the IRQ currently in progress.*/ 481 /* Mark the IRQ currently in progress.*/
478 desc->status |= IRQ_INPROGRESS; 482 desc->status |= IRQ_INPROGRESS;
@@ -524,7 +528,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
524{ 528{
525 irqreturn_t action_ret; 529 irqreturn_t action_ret;
526 530
527 kstat_this_cpu.irqs[irq]++; 531 kstat_incr_irqs_this_cpu(irq, desc);
528 532
529 if (desc->chip->ack) 533 if (desc->chip->ack)
530 desc->chip->ack(irq); 534 desc->chip->ack(irq);
@@ -533,25 +537,25 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
533 if (!noirqdebug) 537 if (!noirqdebug)
534 note_interrupt(irq, desc, action_ret); 538 note_interrupt(irq, desc, action_ret);
535 539
536 if (desc->chip->eoi) 540 if (desc->chip->eoi) {
537 desc->chip->eoi(irq); 541 desc->chip->eoi(irq);
542 desc = irq_remap_to_desc(irq, desc);
543 }
538} 544}
539 545
540void 546void
541__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 547__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
542 const char *name) 548 const char *name)
543{ 549{
544 struct irq_desc *desc; 550 struct irq_desc *desc = irq_to_desc(irq);
545 unsigned long flags; 551 unsigned long flags;
546 552
547 if (irq >= NR_IRQS) { 553 if (!desc) {
548 printk(KERN_ERR 554 printk(KERN_ERR
549 "Trying to install type control for IRQ%d\n", irq); 555 "Trying to install type control for IRQ%d\n", irq);
550 return; 556 return;
551 } 557 }
552 558
553 desc = irq_desc + irq;
554
555 if (!handle) 559 if (!handle)
556 handle = handle_bad_irq; 560 handle = handle_bad_irq;
557 else if (desc->chip == &no_irq_chip) { 561 else if (desc->chip == &no_irq_chip) {
@@ -571,8 +575,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
571 575
572 /* Uninstall? */ 576 /* Uninstall? */
573 if (handle == handle_bad_irq) { 577 if (handle == handle_bad_irq) {
574 if (desc->chip != &no_irq_chip) 578 if (desc->chip != &no_irq_chip) {
575 mask_ack_irq(desc, irq); 579 mask_ack_irq(desc, irq);
580 desc = irq_remap_to_desc(irq, desc);
581 }
576 desc->status |= IRQ_DISABLED; 582 desc->status |= IRQ_DISABLED;
577 desc->depth = 1; 583 desc->depth = 1;
578 } 584 }
@@ -583,7 +589,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
583 desc->status &= ~IRQ_DISABLED; 589 desc->status &= ~IRQ_DISABLED;
584 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 590 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
585 desc->depth = 0; 591 desc->depth = 0;
586 desc->chip->unmask(irq); 592 desc->chip->startup(irq);
587 } 593 }
588 spin_unlock_irqrestore(&desc->lock, flags); 594 spin_unlock_irqrestore(&desc->lock, flags);
589} 595}
@@ -606,17 +612,14 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
606 612
607void __init set_irq_noprobe(unsigned int irq) 613void __init set_irq_noprobe(unsigned int irq)
608{ 614{
609 struct irq_desc *desc; 615 struct irq_desc *desc = irq_to_desc(irq);
610 unsigned long flags; 616 unsigned long flags;
611 617
612 if (irq >= NR_IRQS) { 618 if (!desc) {
613 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); 619 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
614
615 return; 620 return;
616 } 621 }
617 622
618 desc = irq_desc + irq;
619
620 spin_lock_irqsave(&desc->lock, flags); 623 spin_lock_irqsave(&desc->lock, flags);
621 desc->status |= IRQ_NOPROBE; 624 desc->status |= IRQ_NOPROBE;
622 spin_unlock_irqrestore(&desc->lock, flags); 625 spin_unlock_irqrestore(&desc->lock, flags);
@@ -624,17 +627,14 @@ void __init set_irq_noprobe(unsigned int irq)
624 627
625void __init set_irq_probe(unsigned int irq) 628void __init set_irq_probe(unsigned int irq)
626{ 629{
627 struct irq_desc *desc; 630 struct irq_desc *desc = irq_to_desc(irq);
628 unsigned long flags; 631 unsigned long flags;
629 632
630 if (irq >= NR_IRQS) { 633 if (!desc) {
631 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 634 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
632
633 return; 635 return;
634 } 636 }
635 637
636 desc = irq_desc + irq;
637
638 spin_lock_irqsave(&desc->lock, flags); 638 spin_lock_irqsave(&desc->lock, flags);
639 desc->status &= ~IRQ_NOPROBE; 639 desc->status &= ~IRQ_NOPROBE;
640 spin_unlock_irqrestore(&desc->lock, flags); 640 spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 5fa6198e9139..6492400cb50d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h>
19#include <linux/hash.h>
18 20
19#include "internals.h" 21#include "internals.h"
20 22
23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */
26struct lock_class_key irq_desc_lock_class;
27
21/** 28/**
22 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 30 * @irq: the interrupt number
@@ -25,11 +32,10 @@
25 * 32 *
26 * Handles spurious and unhandled IRQs. It also prints a debug message. 33
27 */ 34 */
28void 35void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
29handle_bad_irq(unsigned int irq, struct irq_desc *desc)
30{ 36{
31 print_irq_desc(irq, desc); 37 print_irq_desc(irq, desc);
32 kstat_this_cpu.irqs[irq]++; 38 kstat_incr_irqs_this_cpu(irq, desc);
33 ack_bad_irq(irq); 39 ack_bad_irq(irq);
34} 40}
35 41
@@ -47,6 +53,158 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
47 * 53 *
48 * Controller mappings for all interrupt sources: 54 * Controller mappings for all interrupt sources:
49 */ 55 */
56int nr_irqs = NR_IRQS;
57EXPORT_SYMBOL_GPL(nr_irqs);
58
59void __init __attribute__((weak)) arch_early_irq_init(void)
60{
61}
62
63#ifdef CONFIG_SPARSE_IRQ
64static struct irq_desc irq_desc_init = {
65 .irq = -1,
66 .status = IRQ_DISABLED,
67 .chip = &no_irq_chip,
68 .handle_irq = handle_bad_irq,
69 .depth = 1,
70 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
71#ifdef CONFIG_SMP
72 .affinity = CPU_MASK_ALL
73#endif
74};
75
76void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
77{
78 unsigned long bytes;
79 char *ptr;
80 int node;
81
82 /* Compute how many bytes we need per irq and allocate them */
83 bytes = nr * sizeof(unsigned int);
84
85 node = cpu_to_node(cpu);
86 ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
87 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
88
89 if (ptr)
90 desc->kstat_irqs = (unsigned int *)ptr;
91}
92
93void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
94{
95}
96
97static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
98{
99 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
100 desc->irq = irq;
101#ifdef CONFIG_SMP
102 desc->cpu = cpu;
103#endif
104 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
105 init_kstat_irqs(desc, cpu, nr_cpu_ids);
106 if (!desc->kstat_irqs) {
107 printk(KERN_ERR "can not alloc kstat_irqs\n");
108 BUG_ON(1);
109 }
110 arch_init_chip_data(desc, cpu);
111}
112
113/*
114 * Protect the sparse_irqs:
115 */
116DEFINE_SPINLOCK(sparse_irq_lock);
117
118struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
119
120static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
121 [0 ... NR_IRQS_LEGACY-1] = {
122 .irq = -1,
123 .status = IRQ_DISABLED,
124 .chip = &no_irq_chip,
125 .handle_irq = handle_bad_irq,
126 .depth = 1,
127 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
128#ifdef CONFIG_SMP
129 .affinity = CPU_MASK_ALL
130#endif
131 }
132};
133
134/* FIXME: use bootmem alloc ...*/
135static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
136
137void __init early_irq_init(void)
138{
139 struct irq_desc *desc;
140 int legacy_count;
141 int i;
142
143 desc = irq_desc_legacy;
144 legacy_count = ARRAY_SIZE(irq_desc_legacy);
145
146 for (i = 0; i < legacy_count; i++) {
147 desc[i].irq = i;
148 desc[i].kstat_irqs = kstat_irqs_legacy[i];
149
150 irq_desc_ptrs[i] = desc + i;
151 }
152
153 for (i = legacy_count; i < NR_IRQS; i++)
154 irq_desc_ptrs[i] = NULL;
155
156 arch_early_irq_init();
157}
158
159struct irq_desc *irq_to_desc(unsigned int irq)
160{
161 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
162}
163
164struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
165{
166 struct irq_desc *desc;
167 unsigned long flags;
168 int node;
169
170 if (irq >= NR_IRQS) {
171 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
172 irq, NR_IRQS);
173 WARN_ON(1);
174 return NULL;
175 }
176
177 desc = irq_desc_ptrs[irq];
178 if (desc)
179 return desc;
180
181 spin_lock_irqsave(&sparse_irq_lock, flags);
182
183 /* We have to check it to avoid races with another CPU */
184 desc = irq_desc_ptrs[irq];
185 if (desc)
186 goto out_unlock;
187
188 node = cpu_to_node(cpu);
189 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
190 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
191 irq, cpu, node);
192 if (!desc) {
193 printk(KERN_ERR "can not alloc irq_desc\n");
194 BUG_ON(1);
195 }
196 init_one_irq_desc(irq, desc, cpu);
197
198 irq_desc_ptrs[irq] = desc;
199
200out_unlock:
201 spin_unlock_irqrestore(&sparse_irq_lock, flags);
202
203 return desc;
204}
205
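irq_to_desc_alloc_cpu() above follows the classic check, lock, re-check, publish sequence, so two CPUs racing to use the same interrupt number end up sharing one descriptor. A runnable userspace analogue of that allocation pattern, with a pthread mutex standing in for sparse_irq_lock:

#include <pthread.h>
#include <stdlib.h>

#define NSLOTS 1024

static struct obj { int id; } *slots[NSLOTS];
static pthread_mutex_t slots_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *get_or_alloc(unsigned int i)
{
    struct obj *o;

    if (i >= NSLOTS)
        return NULL;

    /* NOTE: a truly lock-free fast path would need an atomic load;
     * kept simple here since the point is the re-check under the lock. */
    o = slots[i];
    if (o)                      /* fast path: already published */
        return o;

    pthread_mutex_lock(&slots_lock);
    o = slots[i];               /* re-check: another thread may have won the race */
    if (!o) {
        o = calloc(1, sizeof(*o));
        if (o) {
            o->id = (int)i;
            slots[i] = o;       /* publish under the lock */
        }
    }
    pthread_mutex_unlock(&slots_lock);
    return o;
}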
206#else
207
50struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 208struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
51 [0 ... NR_IRQS-1] = { 209 [0 ... NR_IRQS-1] = {
52 .status = IRQ_DISABLED, 210 .status = IRQ_DISABLED,
@@ -60,13 +218,17 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
60 } 218 }
61}; 219};
62 220
221#endif
222
63/* 223/*
64 * What should we do if we get a hw irq event on an illegal vector? 224 * What should we do if we get a hw irq event on an illegal vector?
65 * Each architecture has to answer this itself. 225
66 */ 226 */
67static void ack_bad(unsigned int irq) 227static void ack_bad(unsigned int irq)
68{ 228{
69 print_irq_desc(irq, irq_desc + irq); 229 struct irq_desc *desc = irq_to_desc(irq);
230
231 print_irq_desc(irq, desc);
70 ack_bad_irq(irq); 232 ack_bad_irq(irq);
71} 233}
72 234
@@ -131,8 +293,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
131 irqreturn_t ret, retval = IRQ_NONE; 293 irqreturn_t ret, retval = IRQ_NONE;
132 unsigned int status = 0; 294 unsigned int status = 0;
133 295
134 handle_dynamic_tick(action);
135
136 if (!(action->flags & IRQF_DISABLED)) 296 if (!(action->flags & IRQF_DISABLED))
137 local_irq_enable_in_hardirq(); 297 local_irq_enable_in_hardirq();
138 298
@@ -165,19 +325,23 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
165 */ 325 */
166unsigned int __do_IRQ(unsigned int irq) 326unsigned int __do_IRQ(unsigned int irq)
167{ 327{
168 struct irq_desc *desc = irq_desc + irq; 328 struct irq_desc *desc = irq_to_desc(irq);
169 struct irqaction *action; 329 struct irqaction *action;
170 unsigned int status; 330 unsigned int status;
171 331
172 kstat_this_cpu.irqs[irq]++; 332 kstat_incr_irqs_this_cpu(irq, desc);
333
173 if (CHECK_IRQ_PER_CPU(desc->status)) { 334 if (CHECK_IRQ_PER_CPU(desc->status)) {
174 irqreturn_t action_ret; 335 irqreturn_t action_ret;
175 336
176 /* 337 /*
177 * No locking required for CPU-local interrupts: 338 * No locking required for CPU-local interrupts:
178 */ 339 */
179 if (desc->chip->ack) 340 if (desc->chip->ack) {
180 desc->chip->ack(irq); 341 desc->chip->ack(irq);
342 /* get new one */
343 desc = irq_remap_to_desc(irq, desc);
344 }
181 if (likely(!(desc->status & IRQ_DISABLED))) { 345 if (likely(!(desc->status & IRQ_DISABLED))) {
182 action_ret = handle_IRQ_event(irq, desc->action); 346 action_ret = handle_IRQ_event(irq, desc->action);
183 if (!noirqdebug) 347 if (!noirqdebug)
@@ -188,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
188 } 352 }
189 353
190 spin_lock(&desc->lock); 354 spin_lock(&desc->lock);
191 if (desc->chip->ack) 355 if (desc->chip->ack) {
192 desc->chip->ack(irq); 356 desc->chip->ack(irq);
357 desc = irq_remap_to_desc(irq, desc);
358 }
193 /* 359 /*
194 * REPLAY is when Linux resends an IRQ that was dropped earlier 360 * REPLAY is when Linux resends an IRQ that was dropped earlier
195 * WAITING is used by probe to mark irqs that are being tested 361 * WAITING is used by probe to mark irqs that are being tested
@@ -256,19 +422,25 @@ out:
256} 422}
257#endif 423#endif
258 424
259#ifdef CONFIG_TRACE_IRQFLAGS
260
261/*
262 * lockdep: we want to handle all irq_desc locks as a single lock-class:
263 */
264static struct lock_class_key irq_desc_lock_class;
265
266void early_init_irq_lock_class(void) 425void early_init_irq_lock_class(void)
267{ 426{
427 struct irq_desc *desc;
268 int i; 428 int i;
269 429
270 for (i = 0; i < NR_IRQS; i++) 430 for_each_irq_desc(i, desc) {
271 lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); 431 if (!desc)
432 continue;
433
434 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
435 }
272} 436}
273 437
438#ifdef CONFIG_SPARSE_IRQ
439unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
440{
441 struct irq_desc *desc = irq_to_desc(irq);
442 return desc->kstat_irqs[cpu];
443}
274#endif 444#endif
445EXPORT_SYMBOL(kstat_irqs_cpu);
446
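With CONFIG_SPARSE_IRQ the per-interrupt counters move out of the global kstat and into the descriptor, hence the new kstat_irqs_cpu() accessor above. A hedged sketch of a consumer totalling them across CPUs; the helper name is invented:

static unsigned int total_irq_count(unsigned int irq)
{
    unsigned int sum = 0;
    int cpu;

    for_each_online_cpu(cpu)
        sum += kstat_irqs_cpu(irq, cpu);

    return sum;
}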
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 08a849a22447..e6d0a43cc125 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -10,18 +10,28 @@ extern void irq_chip_set_defaults(struct irq_chip *chip);
10/* Set default handler: */ 10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); 11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12 12
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags);
15
16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock;
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
20
13#ifdef CONFIG_PROC_FS 21#ifdef CONFIG_PROC_FS
14extern void register_irq_proc(unsigned int irq); 22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
15extern void register_handler_proc(unsigned int irq, struct irqaction *action); 23extern void register_handler_proc(unsigned int irq, struct irqaction *action);
16extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 24extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
17#else 25#else
18static inline void register_irq_proc(unsigned int irq) { } 26static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
19static inline void register_handler_proc(unsigned int irq, 27static inline void register_handler_proc(unsigned int irq,
20 struct irqaction *action) { } 28 struct irqaction *action) { }
21static inline void unregister_handler_proc(unsigned int irq, 29static inline void unregister_handler_proc(unsigned int irq,
22 struct irqaction *action) { } 30 struct irqaction *action) { }
23#endif 31#endif
24 32
33extern int irq_select_affinity_usr(unsigned int irq);
34
25/* 35/*
26 * Debugging printout: 36 * Debugging printout:
27 */ 37 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 60c49e324390..540f6c49f3fa 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL;
31 */ 31 */
32void synchronize_irq(unsigned int irq) 32void synchronize_irq(unsigned int irq)
33{ 33{
34 struct irq_desc *desc = irq_desc + irq; 34 struct irq_desc *desc = irq_to_desc(irq);
35 unsigned int status; 35 unsigned int status;
36 36
37 if (irq >= NR_IRQS) 37 if (!desc)
38 return; 38 return;
39 39
40 do { 40 do {
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq);
64 */ 64 */
65int irq_can_set_affinity(unsigned int irq) 65int irq_can_set_affinity(unsigned int irq)
66{ 66{
67 struct irq_desc *desc = irq_desc + irq; 67 struct irq_desc *desc = irq_to_desc(irq);
68 68
69 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 69 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
70 !desc->chip->set_affinity) 70 !desc->chip->set_affinity)
@@ -81,26 +81,28 @@ int irq_can_set_affinity(unsigned int irq)
81 */ 81 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 82int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
83{ 83{
84 struct irq_desc *desc = irq_desc + irq; 84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags;
85 86
86 if (!desc->chip->set_affinity) 87 if (!desc->chip->set_affinity)
87 return -EINVAL; 88 return -EINVAL;
88 89
89 set_balance_irq_affinity(irq, cpumask); 90 spin_lock_irqsave(&desc->lock, flags);
90 91
91#ifdef CONFIG_GENERIC_PENDING_IRQ 92#ifdef CONFIG_GENERIC_PENDING_IRQ
92 if (desc->status & IRQ_MOVE_PCNTXT) { 93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
93 unsigned long flags; 94 desc->affinity = cpumask;
94
95 spin_lock_irqsave(&desc->lock, flags);
96 desc->chip->set_affinity(irq, cpumask); 95 desc->chip->set_affinity(irq, cpumask);
97 spin_unlock_irqrestore(&desc->lock, flags); 96 } else {
98 } else 97 desc->status |= IRQ_MOVE_PENDING;
99 set_pending_irq(irq, cpumask); 98 desc->pending_mask = cpumask;
99 }
100#else 100#else
101 desc->affinity = cpumask; 101 desc->affinity = cpumask;
102 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
103#endif 103#endif
104 desc->status |= IRQ_AFFINITY_SET;
105 spin_unlock_irqrestore(&desc->lock, flags);
104 return 0; 106 return 0;
105} 107}
106 108
@@ -108,7 +110,7 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
108/* 110/*
109 * Generic version of the affinity autoselector. 111 * Generic version of the affinity autoselector.
110 */ 112 */
111int irq_select_affinity(unsigned int irq) 113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
112{ 114{
113 cpumask_t mask; 115 cpumask_t mask;
114 116
@@ -117,14 +119,50 @@ int irq_select_affinity(unsigned int irq)
117 119
118 cpus_and(mask, cpu_online_map, irq_default_affinity); 120 cpus_and(mask, cpu_online_map, irq_default_affinity);
119 121
120 irq_desc[irq].affinity = mask; 122 /*
121 irq_desc[irq].chip->set_affinity(irq, mask); 123 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online.
125 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map))
128 mask = desc->affinity;
129 else
130 desc->status &= ~IRQ_AFFINITY_SET;
131 }
132
133 desc->affinity = mask;
134 desc->chip->set_affinity(irq, mask);
122 135
123 set_balance_irq_affinity(irq, mask);
124 return 0; 136 return 0;
125} 137}
138#else
139static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
140{
141 return irq_select_affinity(irq);
142}
126#endif 143#endif
127 144
145/*
146 * Called when affinity is set via /proc/irq
147 */
148int irq_select_affinity_usr(unsigned int irq)
149{
150 struct irq_desc *desc = irq_to_desc(irq);
151 unsigned long flags;
152 int ret;
153
154 spin_lock_irqsave(&desc->lock, flags);
155 ret = do_irq_select_affinity(irq, desc);
156 spin_unlock_irqrestore(&desc->lock, flags);
157
158 return ret;
159}
160
161#else
162static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
163{
164 return 0;
165}
128#endif 166#endif
129 167
130/** 168/**
@@ -140,10 +178,10 @@ int irq_select_affinity(unsigned int irq)
140 */ 178 */
141void disable_irq_nosync(unsigned int irq) 179void disable_irq_nosync(unsigned int irq)
142{ 180{
143 struct irq_desc *desc = irq_desc + irq; 181 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags; 182 unsigned long flags;
145 183
146 if (irq >= NR_IRQS) 184 if (!desc)
147 return; 185 return;
148 186
149 spin_lock_irqsave(&desc->lock, flags); 187 spin_lock_irqsave(&desc->lock, flags);
@@ -169,9 +207,9 @@ EXPORT_SYMBOL(disable_irq_nosync);
169 */ 207 */
170void disable_irq(unsigned int irq) 208void disable_irq(unsigned int irq)
171{ 209{
172 struct irq_desc *desc = irq_desc + irq; 210 struct irq_desc *desc = irq_to_desc(irq);
173 211
174 if (irq >= NR_IRQS) 212 if (!desc)
175 return; 213 return;
176 214
177 disable_irq_nosync(irq); 215 disable_irq_nosync(irq);
@@ -211,10 +249,10 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
211 */ 249 */
212void enable_irq(unsigned int irq) 250void enable_irq(unsigned int irq)
213{ 251{
214 struct irq_desc *desc = irq_desc + irq; 252 struct irq_desc *desc = irq_to_desc(irq);
215 unsigned long flags; 253 unsigned long flags;
216 254
217 if (irq >= NR_IRQS) 255 if (!desc)
218 return; 256 return;
219 257
220 spin_lock_irqsave(&desc->lock, flags); 258 spin_lock_irqsave(&desc->lock, flags);
@@ -223,9 +261,9 @@ void enable_irq(unsigned int irq)
223} 261}
224EXPORT_SYMBOL(enable_irq); 262EXPORT_SYMBOL(enable_irq);
225 263
226int set_irq_wake_real(unsigned int irq, unsigned int on) 264static int set_irq_wake_real(unsigned int irq, unsigned int on)
227{ 265{
228 struct irq_desc *desc = irq_desc + irq; 266 struct irq_desc *desc = irq_to_desc(irq);
229 int ret = -ENXIO; 267 int ret = -ENXIO;
230 268
231 if (desc->chip->set_wake) 269 if (desc->chip->set_wake)
@@ -248,7 +286,7 @@ int set_irq_wake_real(unsigned int irq, unsigned int on)
248 */ 286 */
249int set_irq_wake(unsigned int irq, unsigned int on) 287int set_irq_wake(unsigned int irq, unsigned int on)
250{ 288{
251 struct irq_desc *desc = irq_desc + irq; 289 struct irq_desc *desc = irq_to_desc(irq);
252 unsigned long flags; 290 unsigned long flags;
253 int ret = 0; 291 int ret = 0;
254 292
@@ -288,12 +326,16 @@ EXPORT_SYMBOL(set_irq_wake);
288 */ 326 */
289int can_request_irq(unsigned int irq, unsigned long irqflags) 327int can_request_irq(unsigned int irq, unsigned long irqflags)
290{ 328{
329 struct irq_desc *desc = irq_to_desc(irq);
291 struct irqaction *action; 330 struct irqaction *action;
292 331
293 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) 332 if (!desc)
333 return 0;
334
335 if (desc->status & IRQ_NOREQUEST)
294 return 0; 336 return 0;
295 337
296 action = irq_desc[irq].action; 338 action = desc->action;
297 if (action) 339 if (action)
298 if (irqflags & action->flags & IRQF_SHARED) 340 if (irqflags & action->flags & IRQF_SHARED)
299 action = NULL; 341 action = NULL;
@@ -312,27 +354,35 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
312 desc->handle_irq = NULL; 354 desc->handle_irq = NULL;
313} 355}
314 356
315static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, 357int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
316 unsigned long flags) 358 unsigned long flags)
317{ 359{
318 int ret; 360 int ret;
361 struct irq_chip *chip = desc->chip;
319 362
320 if (!chip || !chip->set_type) { 363 if (!chip || !chip->set_type) {
321 /* 364 /*
322 * IRQF_TRIGGER_* but the PIC does not support multiple 365 * IRQF_TRIGGER_* but the PIC does not support multiple
323 * flow-types? 366 * flow-types?
324 */ 367 */
325 pr_warning("No set_type function for IRQ %d (%s)\n", irq, 368 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
326 chip ? (chip->name ? : "unknown") : "unknown"); 369 chip ? (chip->name ? : "unknown") : "unknown");
327 return 0; 370 return 0;
328 } 371 }
329 372
330 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); 373 /* caller masked out all except trigger mode flags */
374 ret = chip->set_type(irq, flags);
331 375
332 if (ret) 376 if (ret)
333 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 377 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
334 (int)(flags & IRQF_TRIGGER_MASK), 378 (int)flags, irq, chip->set_type);
335 irq, chip->set_type); 379 else {
380 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
381 flags |= IRQ_LEVEL;
382 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
383 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
384 desc->status |= flags;
385 }
336 386
337 return ret; 387 return ret;
338} 388}
@@ -341,16 +391,16 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
341 * Internal function to register an irqaction - typically used to 391 * Internal function to register an irqaction - typically used to
342 * allocate special interrupts that are part of the architecture. 392 * allocate special interrupts that are part of the architecture.
343 */ 393 */
344int setup_irq(unsigned int irq, struct irqaction *new) 394static int
395__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
345{ 396{
346 struct irq_desc *desc = irq_desc + irq;
347 struct irqaction *old, **p; 397 struct irqaction *old, **p;
348 const char *old_name = NULL; 398 const char *old_name = NULL;
349 unsigned long flags; 399 unsigned long flags;
350 int shared = 0; 400 int shared = 0;
351 int ret; 401 int ret;
352 402
353 if (irq >= NR_IRQS) 403 if (!desc)
354 return -EINVAL; 404 return -EINVAL;
355 405
356 if (desc->chip == &no_irq_chip) 406 if (desc->chip == &no_irq_chip)
@@ -411,7 +461,8 @@ int setup_irq(unsigned int irq, struct irqaction *new)
411 461
412 /* Setup the type (level, edge polarity) if configured: */ 462 /* Setup the type (level, edge polarity) if configured: */
413 if (new->flags & IRQF_TRIGGER_MASK) { 463 if (new->flags & IRQF_TRIGGER_MASK) {
414 ret = __irq_set_trigger(desc->chip, irq, new->flags); 464 ret = __irq_set_trigger(desc, irq,
465 new->flags & IRQF_TRIGGER_MASK);
415 466
416 if (ret) { 467 if (ret) {
417 spin_unlock_irqrestore(&desc->lock, flags); 468 spin_unlock_irqrestore(&desc->lock, flags);
@@ -430,24 +481,29 @@ int setup_irq(unsigned int irq, struct irqaction *new)
430 if (!(desc->status & IRQ_NOAUTOEN)) { 481 if (!(desc->status & IRQ_NOAUTOEN)) {
431 desc->depth = 0; 482 desc->depth = 0;
432 desc->status &= ~IRQ_DISABLED; 483 desc->status &= ~IRQ_DISABLED;
433 if (desc->chip->startup) 484 desc->chip->startup(irq);
434 desc->chip->startup(irq);
435 else
436 desc->chip->enable(irq);
437 } else 485 } else
438 /* Undo nested disables: */ 486 /* Undo nested disables: */
439 desc->depth = 1; 487 desc->depth = 1;
440 488
489 /* Exclude IRQ from balancing if requested */
490 if (new->flags & IRQF_NOBALANCING)
491 desc->status |= IRQ_NO_BALANCING;
492
441 /* Set default affinity mask once everything is setup */ 493 /* Set default affinity mask once everything is setup */
442 irq_select_affinity(irq); 494 do_irq_select_affinity(irq, desc);
495
496 } else if ((new->flags & IRQF_TRIGGER_MASK)
497 && (new->flags & IRQF_TRIGGER_MASK)
498 != (desc->status & IRQ_TYPE_SENSE_MASK)) {
499 /* hope the handler works with the actual trigger mode... */
500 pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
501 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
502 (int)(new->flags & IRQF_TRIGGER_MASK));
443 } 503 }
444 504
445 *p = new; 505 *p = new;
446 506
447 /* Exclude IRQ from balancing */
448 if (new->flags & IRQF_NOBALANCING)
449 desc->status |= IRQ_NO_BALANCING;
450
451 /* Reset broken irq detection when installing new handler */ 507 /* Reset broken irq detection when installing new handler */
452 desc->irq_count = 0; 508 desc->irq_count = 0;
453 desc->irqs_unhandled = 0; 509 desc->irqs_unhandled = 0;
@@ -464,7 +520,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
464 spin_unlock_irqrestore(&desc->lock, flags); 520 spin_unlock_irqrestore(&desc->lock, flags);
465 521
466 new->irq = irq; 522 new->irq = irq;
467 register_irq_proc(irq); 523 register_irq_proc(irq, desc);
468 new->dir = NULL; 524 new->dir = NULL;
469 register_handler_proc(irq, new); 525 register_handler_proc(irq, new);
470 526
@@ -484,6 +540,20 @@ mismatch:
484} 540}
485 541
486/** 542/**
543 * setup_irq - setup an interrupt
544 * @irq: Interrupt line to setup
545 * @act: irqaction for the interrupt
546 *
547 * Used to statically setup interrupts in the early boot process.
548 */
549int setup_irq(unsigned int irq, struct irqaction *act)
550{
551 struct irq_desc *desc = irq_to_desc(irq);
552
553 return __setup_irq(irq, desc, act);
554}
555
556/**
487 * free_irq - free an interrupt 557 * free_irq - free an interrupt
488 * @irq: Interrupt line to free 558 * @irq: Interrupt line to free
489 * @dev_id: Device identity to free 559 * @dev_id: Device identity to free
@@ -499,15 +569,15 @@ mismatch:
499 */ 569 */
500void free_irq(unsigned int irq, void *dev_id) 570void free_irq(unsigned int irq, void *dev_id)
501{ 571{
502 struct irq_desc *desc; 572 struct irq_desc *desc = irq_to_desc(irq);
503 struct irqaction **p; 573 struct irqaction **p;
504 unsigned long flags; 574 unsigned long flags;
505 575
506 WARN_ON(in_interrupt()); 576 WARN_ON(in_interrupt());
507 if (irq >= NR_IRQS) 577
578 if (!desc)
508 return; 579 return;
509 580
510 desc = irq_desc + irq;
511 spin_lock_irqsave(&desc->lock, flags); 581 spin_lock_irqsave(&desc->lock, flags);
512 p = &desc->action; 582 p = &desc->action;
513 for (;;) { 583 for (;;) {
@@ -596,14 +666,28 @@ EXPORT_SYMBOL(free_irq);
596 * IRQF_SHARED Interrupt is shared 666 * IRQF_SHARED Interrupt is shared
597 * IRQF_DISABLED Disable local interrupts while processing 667 * IRQF_DISABLED Disable local interrupts while processing
598 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 668 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
669 * IRQF_TRIGGER_* Specify active edge(s) or level
599 * 670 *
600 */ 671 */
601int request_irq(unsigned int irq, irq_handler_t handler, 672int request_irq(unsigned int irq, irq_handler_t handler,
602 unsigned long irqflags, const char *devname, void *dev_id) 673 unsigned long irqflags, const char *devname, void *dev_id)
603{ 674{
604 struct irqaction *action; 675 struct irqaction *action;
676 struct irq_desc *desc;
605 int retval; 677 int retval;
606 678
679 /*
680 * handle_IRQ_event() always ignores IRQF_DISABLED except for
681 * the _first_ irqaction (sigh). That can cause oopsing, but
682 * the behavior is classified as "will not fix" so we need to
683 * start nudging drivers away from using that idiom.
684 */
685 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
686 == (IRQF_SHARED|IRQF_DISABLED))
687 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
688 "guaranteed on shared IRQs\n",
689 irq, devname);
690
607#ifdef CONFIG_LOCKDEP 691#ifdef CONFIG_LOCKDEP
608 /* 692 /*
609 * Lockdep wants atomic interrupt handlers: 693 * Lockdep wants atomic interrupt handlers:
@@ -618,9 +702,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
618 */ 702 */
619 if ((irqflags & IRQF_SHARED) && !dev_id) 703 if ((irqflags & IRQF_SHARED) && !dev_id)
620 return -EINVAL; 704 return -EINVAL;
621 if (irq >= NR_IRQS) 705
706 desc = irq_to_desc(irq);
707 if (!desc)
622 return -EINVAL; 708 return -EINVAL;
623 if (irq_desc[irq].status & IRQ_NOREQUEST) 709
710 if (desc->status & IRQ_NOREQUEST)
624 return -EINVAL; 711 return -EINVAL;
625 if (!handler) 712 if (!handler)
626 return -EINVAL; 713 return -EINVAL;
@@ -636,26 +723,29 @@ int request_irq(unsigned int irq, irq_handler_t handler,
636 action->next = NULL; 723 action->next = NULL;
637 action->dev_id = dev_id; 724 action->dev_id = dev_id;
638 725
726 retval = __setup_irq(irq, desc, action);
727 if (retval)
728 kfree(action);
729
639#ifdef CONFIG_DEBUG_SHIRQ 730#ifdef CONFIG_DEBUG_SHIRQ
640 if (irqflags & IRQF_SHARED) { 731 if (irqflags & IRQF_SHARED) {
641 /* 732 /*
642 * It's a shared IRQ -- the driver ought to be prepared for it 733 * It's a shared IRQ -- the driver ought to be prepared for it
643 * to happen immediately, so let's make sure.... 734 * to happen immediately, so let's make sure....
644 * We do this before actually registering it, to make sure that 735 * We disable the irq to make sure that a 'real' IRQ doesn't
645 * a 'real' IRQ doesn't run in parallel with our fake 736 * run in parallel with our fake.
646 */ 737 */
647 unsigned long flags; 738 unsigned long flags;
648 739
740 disable_irq(irq);
649 local_irq_save(flags); 741 local_irq_save(flags);
742
650 handler(irq, dev_id); 743 handler(irq, dev_id);
744
651 local_irq_restore(flags); 745 local_irq_restore(flags);
746 enable_irq(irq);
652 } 747 }
653#endif 748#endif
654
655 retval = setup_irq(irq, action);
656 if (retval)
657 kfree(action);
658
659 return retval; 749 return retval;
660} 750}
661EXPORT_SYMBOL(request_irq); 751EXPORT_SYMBOL(request_irq);
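[Editor's note] The manage.c hunks above replace direct irq_desc[] indexing with irq_to_desc() lookups plus a NULL check, and request_irq() now warns when IRQF_SHARED and IRQF_DISABLED are combined. The driver-side contract is otherwise unchanged; below is a minimal, hedged sketch of a well-behaved caller. The foo_* names and the "foo" device string are illustrative, not part of the patch.

#include <linux/kernel.h>
#include <linux/interrupt.h>

/*
 * Hypothetical per-device cookie; any unique, non-NULL pointer works
 * as the dev_id for a shared line.
 */
struct foo_dev {
	unsigned int irq;
};

static irqreturn_t foo_isr(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	pr_debug("foo: interrupt on line %u\n", foo->irq);
	return IRQ_HANDLED;
}

static int foo_attach_irq(struct foo_dev *foo)
{
	/*
	 * Plain IRQF_SHARED: adding IRQF_DISABLED here would trigger the
	 * pr_warning() that the hunk above adds to request_irq().
	 */
	return request_irq(foo->irq, foo_isr, IRQF_SHARED, "foo", foo);
}

static void foo_detach_irq(struct foo_dev *foo)
{
	free_irq(foo->irq, foo);
}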
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 77b7acc875c5..9db681d95814 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,20 +1,9 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{
6 struct irq_desc *desc = irq_desc + irq;
7 unsigned long flags;
8
9 spin_lock_irqsave(&desc->lock, flags);
10 desc->status |= IRQ_MOVE_PENDING;
11 irq_desc[irq].pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags);
13}
14
15void move_masked_irq(int irq) 4void move_masked_irq(int irq)
16{ 5{
17 struct irq_desc *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_to_desc(irq);
18 cpumask_t tmp; 7 cpumask_t tmp;
19 8
20 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 9 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -30,7 +19,7 @@ void move_masked_irq(int irq)
30 19
31 desc->status &= ~IRQ_MOVE_PENDING; 20 desc->status &= ~IRQ_MOVE_PENDING;
32 21
33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) 22 if (unlikely(cpus_empty(desc->pending_mask)))
34 return; 23 return;
35 24
36 if (!desc->chip->set_affinity) 25 if (!desc->chip->set_affinity)
@@ -38,7 +27,7 @@ void move_masked_irq(int irq)
38 27
39 assert_spin_locked(&desc->lock); 28 assert_spin_locked(&desc->lock);
40 29
41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); 30 cpus_and(tmp, desc->pending_mask, cpu_online_map);
42 31
43 /* 32 /*
44 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
@@ -55,12 +44,12 @@ void move_masked_irq(int irq)
55 if (likely(!cpus_empty(tmp))) { 44 if (likely(!cpus_empty(tmp))) {
56 desc->chip->set_affinity(irq,tmp); 45 desc->chip->set_affinity(irq,tmp);
57 } 46 }
58 cpus_clear(irq_desc[irq].pending_mask); 47 cpus_clear(desc->pending_mask);
59} 48}
60 49
61void move_native_irq(int irq) 50void move_native_irq(int irq)
62{ 51{
63 struct irq_desc *desc = irq_desc + irq; 52 struct irq_desc *desc = irq_to_desc(irq);
64 53
65 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 54 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
66 return; 55 return;
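[Editor's note] The migration.c hunks drop set_pending_irq() and read the pending mask through the descriptor pointer. For orientation only, a hedged sketch of the caller side: the generic flow handlers invoke move_native_irq() with desc->lock held, and it applies desc->pending_mask if an affinity change was deferred. foo_flow_handler() and the exact call ordering are illustrative; the real callers are the flow handlers and arch code.

#include <linux/irq.h>

static void foo_flow_handler(unsigned int irq, struct irq_desc *desc)
{
	spin_lock(&desc->lock);
	desc->chip->ack(irq);

	move_native_irq(irq);	/* no-op unless IRQ_MOVE_PENDING is set */

	/* ... mark IRQ_INPROGRESS, run desc->action handlers, etc. ... */
	spin_unlock(&desc->lock);
}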
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644
index 000000000000..089c3746358a
--- /dev/null
+++ b/kernel/irq/numa_migrate.c
@@ -0,0 +1,122 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/module.h>
10#include <linux/random.h>
11#include <linux/interrupt.h>
12#include <linux/kernel_stat.h>
13
14#include "internals.h"
15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc,
18 int cpu, int nr)
19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr);
23
24 if (desc->kstat_irqs != old_desc->kstat_irqs) {
25 /* Compute how many bytes we need per irq and allocate them */
26 bytes = nr * sizeof(unsigned int);
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30}
31
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
33{
34 if (old_desc->kstat_irqs == desc->kstat_irqs)
35 return;
36
37 kfree(old_desc->kstat_irqs);
38 old_desc->kstat_irqs = NULL;
39}
40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu)
43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 desc->cpu = cpu;
46 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
47 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
48 arch_init_copy_chip_data(old_desc, desc, cpu);
49}
50
51static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
52{
53 free_kstat_irqs(old_desc, desc);
54 arch_free_chip_data(old_desc, desc);
55}
56
57static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
58 int cpu)
59{
60 struct irq_desc *desc;
61 unsigned int irq;
62 unsigned long flags;
63 int node;
64
65 irq = old_desc->irq;
66
67 spin_lock_irqsave(&sparse_irq_lock, flags);
68
69 /* We have to check it to avoid races with another CPU */
70 desc = irq_desc_ptrs[irq];
71
72 if (desc && old_desc != desc)
73 goto out_unlock;
74
75 node = cpu_to_node(cpu);
76 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
77 printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n",
78 irq, cpu, node);
79 if (!desc) {
80 printk(KERN_ERR "can not get new irq_desc for moving\n");
81 /* still use old one */
82 desc = old_desc;
83 goto out_unlock;
84 }
85 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
86
87 irq_desc_ptrs[irq] = desc;
88
89 /* free the old one */
90 free_one_irq_desc(old_desc, desc);
91 kfree(old_desc);
92
93out_unlock:
94 spin_unlock_irqrestore(&sparse_irq_lock, flags);
95
96 return desc;
97}
98
99struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
100{
101 int old_cpu;
102 int node, old_node;
103
104 /* those all static, do move them */
105 if (desc->irq < NR_IRQS_LEGACY)
106 return desc;
107
108 old_cpu = desc->cpu;
109 printk(KERN_DEBUG
110 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
111 if (old_cpu != cpu) {
112 node = cpu_to_node(cpu);
113 old_node = cpu_to_node(old_cpu);
114 if (old_node != node)
115 desc = __real_move_irq_desc(desc, cpu);
116 else
117 desc->cpu = cpu;
118 }
119
120 return desc;
121}
122
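[Editor's note] numa_migrate.c is new in this merge: it clones an irq_desc (including kstat counters and arch chip data) onto the node that now services the interrupt. A hedged sketch of the expected call shape, assuming sparse IRQs and the NUMA migration support are configured in; foo_retarget_irq() is hypothetical and the arch-specific locking around it is not shown.

#include <linux/irq.h>

static struct irq_desc *foo_retarget_irq(unsigned int irq, int cpu)
{
	struct irq_desc *desc = irq_to_desc(irq);

	if (!desc)
		return NULL;

	/*
	 * move_irq_desc() hands back either the old descriptor (legacy
	 * IRQ, same node, or allocation failure) or a new one allocated
	 * on cpu_to_node(cpu); the caller must keep using the returned
	 * pointer from here on.
	 */
	return move_irq_desc(desc, cpu);
}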
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a09dd29c2fd7..f6b3440f05bc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
19 19
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_desc + (long)m->private; 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 cpumask_t *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
43 cpumask_t new_value; 43 cpumask_t new_value;
44 int err; 44 int err;
45 45
46 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
47 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
48 return -EIO; 48 return -EIO;
49 49
@@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
62 if (!cpus_intersects(new_value, cpu_online_map)) 62 if (!cpus_intersects(new_value, cpu_online_map))
63 /* Special case for empty set - allow the architecture 63 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 64 code to set default SMP affinity. */
65 return irq_select_affinity(irq) ? -EINVAL : count; 65 return irq_select_affinity_usr(irq) ? -EINVAL : count;
66 66
67 irq_set_affinity(irq, new_value); 67 irq_set_affinity(irq, new_value);
68 68
@@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = {
132static int irq_spurious_read(char *page, char **start, off_t off, 132static int irq_spurious_read(char *page, char **start, off_t off,
133 int count, int *eof, void *data) 133 int count, int *eof, void *data)
134{ 134{
135 struct irq_desc *d = &irq_desc[(long) data]; 135 struct irq_desc *desc = irq_to_desc((long) data);
136 return sprintf(page, "count %u\n" 136 return sprintf(page, "count %u\n"
137 "unhandled %u\n" 137 "unhandled %u\n"
138 "last_unhandled %u ms\n", 138 "last_unhandled %u ms\n",
139 d->irq_count, 139 desc->irq_count,
140 d->irqs_unhandled, 140 desc->irqs_unhandled,
141 jiffies_to_msecs(d->last_unhandled)); 141 jiffies_to_msecs(desc->last_unhandled));
142} 142}
143 143
144#define MAX_NAMELEN 128 144#define MAX_NAMELEN 128
145 145
146static int name_unique(unsigned int irq, struct irqaction *new_action) 146static int name_unique(unsigned int irq, struct irqaction *new_action)
147{ 147{
148 struct irq_desc *desc = irq_desc + irq; 148 struct irq_desc *desc = irq_to_desc(irq);
149 struct irqaction *action; 149 struct irqaction *action;
150 unsigned long flags; 150 unsigned long flags;
151 int ret = 1; 151 int ret = 1;
@@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
165void register_handler_proc(unsigned int irq, struct irqaction *action) 165void register_handler_proc(unsigned int irq, struct irqaction *action)
166{ 166{
167 char name [MAX_NAMELEN]; 167 char name [MAX_NAMELEN];
168 struct irq_desc *desc = irq_to_desc(irq);
168 169
169 if (!irq_desc[irq].dir || action->dir || !action->name || 170 if (!desc->dir || action->dir || !action->name ||
170 !name_unique(irq, action)) 171 !name_unique(irq, action))
171 return; 172 return;
172 173
@@ -174,36 +175,34 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
174 snprintf(name, MAX_NAMELEN, "%s", action->name); 175 snprintf(name, MAX_NAMELEN, "%s", action->name);
175 176
176 /* create /proc/irq/1234/handler/ */ 177 /* create /proc/irq/1234/handler/ */
177 action->dir = proc_mkdir(name, irq_desc[irq].dir); 178 action->dir = proc_mkdir(name, desc->dir);
178} 179}
179 180
180#undef MAX_NAMELEN 181#undef MAX_NAMELEN
181 182
182#define MAX_NAMELEN 10 183#define MAX_NAMELEN 10
183 184
184void register_irq_proc(unsigned int irq) 185void register_irq_proc(unsigned int irq, struct irq_desc *desc)
185{ 186{
186 char name [MAX_NAMELEN]; 187 char name [MAX_NAMELEN];
187 struct proc_dir_entry *entry; 188 struct proc_dir_entry *entry;
188 189
189 if (!root_irq_dir || 190 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
190 (irq_desc[irq].chip == &no_irq_chip) ||
191 irq_desc[irq].dir)
192 return; 191 return;
193 192
194 memset(name, 0, MAX_NAMELEN); 193 memset(name, 0, MAX_NAMELEN);
195 sprintf(name, "%d", irq); 194 sprintf(name, "%d", irq);
196 195
197 /* create /proc/irq/1234 */ 196 /* create /proc/irq/1234 */
198 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); 197 desc->dir = proc_mkdir(name, root_irq_dir);
199 198
200#ifdef CONFIG_SMP 199#ifdef CONFIG_SMP
201 /* create /proc/irq/<irq>/smp_affinity */ 200 /* create /proc/irq/<irq>/smp_affinity */
202 proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, 201 proc_create_data("smp_affinity", 0600, desc->dir,
203 &irq_affinity_proc_fops, (void *)(long)irq); 202 &irq_affinity_proc_fops, (void *)(long)irq);
204#endif 203#endif
205 204
206 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); 205 entry = create_proc_entry("spurious", 0444, desc->dir);
207 if (entry) { 206 if (entry) {
208 entry->data = (void *)(long)irq; 207 entry->data = (void *)(long)irq;
209 entry->read_proc = irq_spurious_read; 208 entry->read_proc = irq_spurious_read;
@@ -214,11 +213,14 @@ void register_irq_proc(unsigned int irq)
214 213
215void unregister_handler_proc(unsigned int irq, struct irqaction *action) 214void unregister_handler_proc(unsigned int irq, struct irqaction *action)
216{ 215{
217 if (action->dir) 216 if (action->dir) {
218 remove_proc_entry(action->dir->name, irq_desc[irq].dir); 217 struct irq_desc *desc = irq_to_desc(irq);
218
219 remove_proc_entry(action->dir->name, desc->dir);
220 }
219} 221}
220 222
221void register_default_affinity_proc(void) 223static void register_default_affinity_proc(void)
222{ 224{
223#ifdef CONFIG_SMP 225#ifdef CONFIG_SMP
224 proc_create("irq/default_smp_affinity", 0600, NULL, 226 proc_create("irq/default_smp_affinity", 0600, NULL,
@@ -228,7 +230,8 @@ void register_default_affinity_proc(void)
228 230
229void init_irq_proc(void) 231void init_irq_proc(void)
230{ 232{
231 int i; 233 unsigned int irq;
234 struct irq_desc *desc;
232 235
233 /* create /proc/irq */ 236 /* create /proc/irq */
234 root_irq_dir = proc_mkdir("irq", NULL); 237 root_irq_dir = proc_mkdir("irq", NULL);
@@ -240,7 +243,11 @@ void init_irq_proc(void)
240 /* 243 /*
241 * Create entries for all existing IRQs. 244 * Create entries for all existing IRQs.
242 */ 245 */
243 for (i = 0; i < NR_IRQS; i++) 246 for_each_irq_desc(irq, desc) {
244 register_irq_proc(i); 247 if (!desc)
248 continue;
249
250 register_irq_proc(irq, desc);
251 }
245} 252}
246 253
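[Editor's note] The proc.c changes route every lookup through irq_to_desc() and walk descriptors with for_each_irq_desc(), skipping holes left by sparse IRQ allocation. A short sketch of that iteration pattern, with a hypothetical debug printout standing in for the real per-descriptor work:

#include <linux/kernel.h>
#include <linux/irq.h>

static void foo_dump_requested_irqs(void)
{
	struct irq_desc *desc;
	unsigned int irq;

	for_each_irq_desc(irq, desc) {
		if (!desc)		/* hole in the sparse irq space */
			continue;
		if (!desc->action)	/* nothing requested on this line */
			continue;
		printk(KERN_DEBUG "irq %u: depth %u count %u\n",
		       irq, desc->depth, desc->irq_count);
	}
}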
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index a8046791ba2d..89c7117acf2b 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -33,10 +33,10 @@ static void resend_irqs(unsigned long arg)
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 int irq; 34 int irq;
35 35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) { 36 while (!bitmap_empty(irqs_resend, nr_irqs)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS); 37 irq = find_first_bit(irqs_resend, nr_irqs);
38 clear_bit(irq, irqs_resend); 38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq; 39 desc = irq_to_desc(irq);
40 local_irq_disable(); 40 local_irq_disable();
41 desc->handle_irq(irq, desc); 41 desc->handle_irq(irq, desc);
42 local_irq_enable(); 42 local_irq_enable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c66d3f10e853..3738107531fd 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -12,83 +12,127 @@
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h>
15 16
16static int irqfixup __read_mostly; 17static int irqfixup __read_mostly;
17 18
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
20static void poll_spurious_irqs(unsigned long dummy);
21static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
22
18/* 23/*
19 * Recovery handler for misrouted interrupts. 24 * Recovery handler for misrouted interrupts.
20 */ 25 */
21static int misrouted_irq(int irq) 26static int try_one_irq(int irq, struct irq_desc *desc)
22{ 27{
23 int i; 28 struct irqaction *action;
24 int ok = 0; 29 int ok = 0, work = 0;
25 int work = 0; /* Did we do work for a real IRQ */
26
27 for (i = 1; i < NR_IRQS; i++) {
28 struct irq_desc *desc = irq_desc + i;
29 struct irqaction *action;
30
31 if (i == irq) /* Already tried */
32 continue;
33 30
34 spin_lock(&desc->lock); 31 spin_lock(&desc->lock);
35 /* Already running on another processor */ 32 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
37 /* 34 /*
38 * Already running: If it is shared get the other 35 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too 36 * CPU to go looking for our mystery interrupt too
40 */ 37 */
41 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
42 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock);
44 continue;
45 }
46 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS;
48 action = desc->action;
49 spin_unlock(&desc->lock); 40 spin_unlock(&desc->lock);
41 return ok;
42 }
43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action;
46 spin_unlock(&desc->lock);
50 47
51 while (action) { 48 while (action) {
52 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
53 if (action->flags & IRQF_SHARED) { 50 if (action->flags & IRQF_SHARED) {
54 if (action->handler(i, action->dev_id) == 51 if (action->handler(irq, action->dev_id) ==
55 IRQ_HANDLED) 52 IRQ_HANDLED)
56 ok = 1; 53 ok = 1;
57 }
58 action = action->next;
59 } 54 }
60 local_irq_disable(); 55 action = action->next;
61 /* Now clean up the flags */ 56 }
62 spin_lock(&desc->lock); 57 local_irq_disable();
63 action = desc->action; 58 /* Now clean up the flags */
59 spin_lock(&desc->lock);
60 action = desc->action;
64 61
62 /*
63 * While we were looking for a fixup someone queued a real
64 * IRQ clashing with our walk:
65 */
66 while ((desc->status & IRQ_PENDING) && action) {
65 /* 67 /*
66 * While we were looking for a fixup someone queued a real 68 * Perform real IRQ processing for the IRQ we deferred
67 * IRQ clashing with our walk:
68 */
69 while ((desc->status & IRQ_PENDING) && action) {
70 /*
71 * Perform real IRQ processing for the IRQ we deferred
72 */
73 work = 1;
74 spin_unlock(&desc->lock);
75 handle_IRQ_event(i, action);
76 spin_lock(&desc->lock);
77 desc->status &= ~IRQ_PENDING;
78 }
79 desc->status &= ~IRQ_INPROGRESS;
80 /*
81 * If we did actual work for the real IRQ line we must let the
82 * IRQ controller clean up too
83 */ 69 */
84 if (work && desc->chip && desc->chip->end) 70 work = 1;
85 desc->chip->end(i);
86 spin_unlock(&desc->lock); 71 spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING;
75 }
76 desc->status &= ~IRQ_INPROGRESS;
77 /*
78 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too
80 */
81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq);
83 spin_unlock(&desc->lock);
84
85 return ok;
86}
87
88static int misrouted_irq(int irq)
89{
90 struct irq_desc *desc;
91 int i, ok = 0;
92
93 for_each_irq_desc(i, desc) {
94 if (!desc)
95 continue;
96
97 if (!i)
98 continue;
99
100 if (i == irq) /* Already tried */
101 continue;
102
103 if (try_one_irq(i, desc))
104 ok = 1;
87 } 105 }
88 /* So the caller can adjust the irq error counts */ 106 /* So the caller can adjust the irq error counts */
89 return ok; 107 return ok;
90} 108}
91 109
110static void poll_spurious_irqs(unsigned long dummy)
111{
112 struct irq_desc *desc;
113 int i;
114
115 for_each_irq_desc(i, desc) {
116 unsigned int status;
117
118 if (!desc)
119 continue;
120 if (!i)
121 continue;
122
123 /* Racy but it doesn't matter */
124 status = desc->status;
125 barrier();
126 if (!(status & IRQ_SPURIOUS_DISABLED))
127 continue;
128
129 try_one_irq(i, desc);
130 }
131
132 mod_timer(&poll_spurious_irq_timer,
133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
134}
135
92/* 136/*
93 * If 99,900 of the previous 100,000 interrupts have not been handled 137 * If 99,900 of the previous 100,000 interrupts have not been handled
94 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 138 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -137,7 +181,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
137 } 181 }
138} 182}
139 183
140static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) 184static inline int
185try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
186 irqreturn_t action_ret)
141{ 187{
142 struct irqaction *action; 188 struct irqaction *action;
143 189
@@ -212,6 +258,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 258 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth++; 259 desc->depth++;
214 desc->chip->disable(irq); 260 desc->chip->disable(irq);
261
262 mod_timer(&poll_spurious_irq_timer,
263 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
215 } 264 }
216 desc->irqs_unhandled = 0; 265 desc->irqs_unhandled = 0;
217} 266}
@@ -241,7 +290,7 @@ static int __init irqfixup_setup(char *str)
241 290
242__setup("irqfixup", irqfixup_setup); 291__setup("irqfixup", irqfixup_setup);
243module_param(irqfixup, int, 0644); 292module_param(irqfixup, int, 0644);
244MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); 293MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
245 294
246static int __init irqpoll_setup(char *str) 295static int __init irqpoll_setup(char *str)
247{ 296{
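[Editor's note] Besides folding the per-descriptor walk into try_one_irq(), the spurious.c rework arms a timer so that lines disabled as spurious get re-polled instead of staying dead. The DEFINE_TIMER()/mod_timer() pattern it uses is generic; here is a self-contained sketch of the same idiom with a hypothetical callback, assuming the pre-timer_setup() timer API of this kernel generation.

#include <linux/timer.h>
#include <linux/jiffies.h>

#define FOO_POLL_INTERVAL	(HZ / 10)

static void foo_poll(unsigned long data);
/* old-style static timer: name, callback, initial expiry, callback data */
static DEFINE_TIMER(foo_poll_timer, foo_poll, 0, 0);

static void foo_poll(unsigned long data)
{
	/* ... periodic work goes here ... */

	/* re-arm, exactly as poll_spurious_irqs() does above */
	mod_timer(&foo_poll_timer, jiffies + FOO_POLL_INTERVAL);
}

static void foo_start_polling(void)
{
	mod_timer(&foo_poll_timer, jiffies + FOO_POLL_INTERVAL);
}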
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 55 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 56 break;
57 case ITIMER_VIRTUAL: 57 case ITIMER_VIRTUAL:
58 read_lock(&tasklist_lock);
59 spin_lock_irq(&tsk->sighand->siglock); 58 spin_lock_irq(&tsk->sighand->siglock);
60 cval = tsk->signal->it_virt_expires; 59 cval = tsk->signal->it_virt_expires;
61 cinterval = tsk->signal->it_virt_incr; 60 cinterval = tsk->signal->it_virt_incr;
62 if (!cputime_eq(cval, cputime_zero)) { 61 if (!cputime_eq(cval, cputime_zero)) {
63 struct task_struct *t = tsk; 62 struct task_cputime cputime;
64 cputime_t utime = tsk->signal->utime; 63 cputime_t utime;
65 do { 64
66 utime = cputime_add(utime, t->utime); 65 thread_group_cputime(tsk, &cputime);
67 t = next_thread(t); 66 utime = cputime.utime;
68 } while (t != tsk);
69 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
70 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
71 } else { 69 } else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
73 } 71 }
74 } 72 }
75 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
76 read_unlock(&tasklist_lock);
77 cputime_to_timeval(cval, &value->it_value); 74 cputime_to_timeval(cval, &value->it_value);
78 cputime_to_timeval(cinterval, &value->it_interval); 75 cputime_to_timeval(cinterval, &value->it_interval);
79 break; 76 break;
80 case ITIMER_PROF: 77 case ITIMER_PROF:
81 read_lock(&tasklist_lock);
82 spin_lock_irq(&tsk->sighand->siglock); 78 spin_lock_irq(&tsk->sighand->siglock);
83 cval = tsk->signal->it_prof_expires; 79 cval = tsk->signal->it_prof_expires;
84 cinterval = tsk->signal->it_prof_incr; 80 cinterval = tsk->signal->it_prof_incr;
85 if (!cputime_eq(cval, cputime_zero)) { 81 if (!cputime_eq(cval, cputime_zero)) {
86 struct task_struct *t = tsk; 82 struct task_cputime times;
87 cputime_t ptime = cputime_add(tsk->signal->utime, 83 cputime_t ptime;
88 tsk->signal->stime); 84
89 do { 85 thread_group_cputime(tsk, &times);
90 ptime = cputime_add(ptime, 86 ptime = cputime_add(times.utime, times.stime);
91 cputime_add(t->utime,
92 t->stime));
93 t = next_thread(t);
94 } while (t != tsk);
95 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
96 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
97 } else { 89 } else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
99 } 91 }
100 } 92 }
101 spin_unlock_irq(&tsk->sighand->siglock); 93 spin_unlock_irq(&tsk->sighand->siglock);
102 read_unlock(&tasklist_lock);
103 cputime_to_timeval(cval, &value->it_value); 94 cputime_to_timeval(cval, &value->it_value);
104 cputime_to_timeval(cinterval, &value->it_interval); 95 cputime_to_timeval(cinterval, &value->it_interval);
105 break; 96 break;
@@ -185,7 +176,6 @@ again:
185 case ITIMER_VIRTUAL: 176 case ITIMER_VIRTUAL:
186 nval = timeval_to_cputime(&value->it_value); 177 nval = timeval_to_cputime(&value->it_value);
187 ninterval = timeval_to_cputime(&value->it_interval); 178 ninterval = timeval_to_cputime(&value->it_interval);
188 read_lock(&tasklist_lock);
189 spin_lock_irq(&tsk->sighand->siglock); 179 spin_lock_irq(&tsk->sighand->siglock);
190 cval = tsk->signal->it_virt_expires; 180 cval = tsk->signal->it_virt_expires;
191 cinterval = tsk->signal->it_virt_incr; 181 cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
200 tsk->signal->it_virt_expires = nval; 190 tsk->signal->it_virt_expires = nval;
201 tsk->signal->it_virt_incr = ninterval; 191 tsk->signal->it_virt_incr = ninterval;
202 spin_unlock_irq(&tsk->sighand->siglock); 192 spin_unlock_irq(&tsk->sighand->siglock);
203 read_unlock(&tasklist_lock);
204 if (ovalue) { 193 if (ovalue) {
205 cputime_to_timeval(cval, &ovalue->it_value); 194 cputime_to_timeval(cval, &ovalue->it_value);
206 cputime_to_timeval(cinterval, &ovalue->it_interval); 195 cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
209 case ITIMER_PROF: 198 case ITIMER_PROF:
210 nval = timeval_to_cputime(&value->it_value); 199 nval = timeval_to_cputime(&value->it_value);
211 ninterval = timeval_to_cputime(&value->it_interval); 200 ninterval = timeval_to_cputime(&value->it_interval);
212 read_lock(&tasklist_lock);
213 spin_lock_irq(&tsk->sighand->siglock); 201 spin_lock_irq(&tsk->sighand->siglock);
214 cval = tsk->signal->it_prof_expires; 202 cval = tsk->signal->it_prof_expires;
215 cinterval = tsk->signal->it_prof_incr; 203 cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
224 tsk->signal->it_prof_expires = nval; 212 tsk->signal->it_prof_expires = nval;
225 tsk->signal->it_prof_incr = ninterval; 213 tsk->signal->it_prof_incr = ninterval;
226 spin_unlock_irq(&tsk->sighand->siglock); 214 spin_unlock_irq(&tsk->sighand->siglock);
227 read_unlock(&tasklist_lock);
228 if (ovalue) { 215 if (ovalue) {
229 cputime_to_timeval(cval, &ovalue->it_value); 216 cputime_to_timeval(cval, &ovalue->it_value);
230 cputime_to_timeval(cinterval, &ovalue->it_interval); 217 cputime_to_timeval(cinterval, &ovalue->it_interval);
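[Editor's note] The itimer.c hunks replace the tasklist_lock-protected do/while walk over every thread with a single thread_group_cputime() call made under siglock alone. In isolation the replacement pattern looks like the sketch below; foo_group_cpu_time() is illustrative and the siglock context of the real code is assumed rather than shown.

#include <linux/sched.h>

static cputime_t foo_group_cpu_time(struct task_struct *tsk)
{
	struct task_cputime times;

	/* accumulated user + system time for the whole thread group */
	thread_group_cputime(tsk, &times);
	return cputime_add(times.utime, times.stime);
}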
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 38fc10ac7541..e694afa0eb8c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,20 +30,19 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33extern const unsigned long kallsyms_addresses[];
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const u8 kallsyms_names[];
35extern const u8 kallsyms_names[] __attribute__((weak));
36 35
37/* tell the compiler that the count isn't in the small data section if the arch 36/* tell the compiler that the count isn't in the small data section if the arch
38 * has one (eg: FRV) 37 * has one (eg: FRV)
39 */ 38 */
40extern const unsigned long kallsyms_num_syms 39extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 40 __attribute__((__section__(".rodata")));
42 41
43extern const u8 kallsyms_token_table[] __attribute__((weak)); 42extern const u8 kallsyms_token_table[];
44extern const u16 kallsyms_token_index[] __attribute__((weak)); 43extern const u16 kallsyms_token_index[];
45 44
46extern const unsigned long kallsyms_markers[] __attribute__((weak)); 45extern const unsigned long kallsyms_markers[];
47 46
48static inline int is_kernel_inittext(unsigned long addr) 47static inline int is_kernel_inittext(unsigned long addr)
49{ 48{
@@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr,
168 unsigned long symbol_start = 0, symbol_end = 0; 167 unsigned long symbol_start = 0, symbol_end = 0;
169 unsigned long i, low, high, mid; 168 unsigned long i, low, high, mid;
170 169
171 /* This kernel should never had been booted. */
172 BUG_ON(!kallsyms_addresses);
173
174 /* do a binary search on the sorted kallsyms_addresses array */ 170 /* do a binary search on the sorted kallsyms_addresses array */
175 low = 0; 171 low = 0;
176 high = kallsyms_num_syms; 172 high = kallsyms_num_syms;
@@ -260,7 +256,6 @@ const char *kallsyms_lookup(unsigned long addr,
260 /* see if it's in a module */ 256 /* see if it's in a module */
261 return module_address_lookup(addr, symbolsize, offset, modname, 257 return module_address_lookup(addr, symbolsize, offset, modname,
262 namebuf); 258 namebuf);
263 return NULL;
264} 259}
265 260
266int lookup_symbol_name(unsigned long addr, char *symname) 261int lookup_symbol_name(unsigned long addr, char *symname)
@@ -305,17 +300,24 @@ int sprint_symbol(char *buffer, unsigned long address)
305 char *modname; 300 char *modname;
306 const char *name; 301 const char *name;
307 unsigned long offset, size; 302 unsigned long offset, size;
308 char namebuf[KSYM_NAME_LEN]; 303 int len;
309 304
310 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 305 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
311 if (!name) 306 if (!name)
312 return sprintf(buffer, "0x%lx", address); 307 return sprintf(buffer, "0x%lx", address);
313 308
309 if (name != buffer)
310 strcpy(buffer, name);
311 len = strlen(buffer);
312 buffer += len;
313
314 if (modname) 314 if (modname)
315 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 315 len += sprintf(buffer, "+%#lx/%#lx [%s]",
316 size, modname); 316 offset, size, modname);
317 else 317 else
318 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 318 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
319
320 return len;
319} 321}
320 322
321/* Look up a kernel symbol and print it to the kernel messages. */ 323/* Look up a kernel symbol and print it to the kernel messages. */
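[Editor's note] With the kallsyms.c change, sprint_symbol() formats straight into the caller's buffer and returns the string length instead of bouncing through a private namebuf. A minimal usage sketch; only KSYM_SYMBOL_LEN and sprint_symbol() come from the kernel, the rest is illustrative.

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void foo_report_address(unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];

	/* yields "name+0xoff/0xsize [module]" or a raw "0x..." fallback */
	sprint_symbol(sym, addr);
	printk(KERN_INFO "foo: called from %s\n", sym);
}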
diff --git a/kernel/kexec.c b/kernel/kexec.c
index aef265325cd3..ac0fde7b54d0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -30,6 +30,7 @@
30#include <linux/pm.h> 30#include <linux/pm.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h>
33 34
34#include <asm/page.h> 35#include <asm/page.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
@@ -1371,6 +1372,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1371 VMCOREINFO_SYMBOL(node_online_map); 1372 VMCOREINFO_SYMBOL(node_online_map);
1372 VMCOREINFO_SYMBOL(swapper_pg_dir); 1373 VMCOREINFO_SYMBOL(swapper_pg_dir);
1373 VMCOREINFO_SYMBOL(_stext); 1374 VMCOREINFO_SYMBOL(_stext);
1375 VMCOREINFO_SYMBOL(vmlist);
1374 1376
1375#ifndef CONFIG_NEED_MULTIPLE_NODES 1377#ifndef CONFIG_NEED_MULTIPLE_NODES
1376 VMCOREINFO_SYMBOL(mem_map); 1378 VMCOREINFO_SYMBOL(mem_map);
@@ -1406,6 +1408,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1406 VMCOREINFO_OFFSET(free_area, free_list); 1408 VMCOREINFO_OFFSET(free_area, free_list);
1407 VMCOREINFO_OFFSET(list_head, next); 1409 VMCOREINFO_OFFSET(list_head, next);
1408 VMCOREINFO_OFFSET(list_head, prev); 1410 VMCOREINFO_OFFSET(list_head, prev);
1411 VMCOREINFO_OFFSET(vm_struct, addr);
1409 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1412 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1410 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1411 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2456d1a0befb..b46dbb908669 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -113,15 +113,15 @@ int request_module(const char *fmt, ...)
113 return ret; 113 return ret;
114} 114}
115EXPORT_SYMBOL(request_module); 115EXPORT_SYMBOL(request_module);
116#endif /* CONFIG_KMOD */ 116#endif /* CONFIG_MODULES */
117 117
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
120 struct completion *complete; 120 struct completion *complete;
121 struct cred *cred;
121 char *path; 122 char *path;
122 char **argv; 123 char **argv;
123 char **envp; 124 char **envp;
124 struct key *ring;
125 enum umh_wait wait; 125 enum umh_wait wait;
126 int retval; 126 int retval;
127 struct file *stdin; 127 struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
134static int ____call_usermodehelper(void *data) 134static int ____call_usermodehelper(void *data)
135{ 135{
136 struct subprocess_info *sub_info = data; 136 struct subprocess_info *sub_info = data;
137 struct key *new_session, *old_session;
138 int retval; 137 int retval;
139 138
140 /* Unblock all signals and set the session keyring. */ 139 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
141 new_session = key_get(sub_info->ring); 140
141 /* Unblock all signals */
142 spin_lock_irq(&current->sighand->siglock); 142 spin_lock_irq(&current->sighand->siglock);
143 old_session = __install_session_keyring(current, new_session);
144 flush_signal_handlers(current, 1); 143 flush_signal_handlers(current, 1);
145 sigemptyset(&current->blocked); 144 sigemptyset(&current->blocked);
146 recalc_sigpending(); 145 recalc_sigpending();
147 spin_unlock_irq(&current->sighand->siglock); 146 spin_unlock_irq(&current->sighand->siglock);
148 147
149 key_put(old_session); 148 /* Install the credentials */
149 commit_creds(sub_info->cred);
150 sub_info->cred = NULL;
150 151
151 /* Install input pipe when needed */ 152 /* Install input pipe when needed */
152 if (sub_info->stdin) { 153 if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{ 186{
186 if (info->cleanup) 187 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp); 188 (*info->cleanup)(info->argv, info->envp);
189 if (info->cred)
190 put_cred(info->cred);
188 kfree(info); 191 kfree(info);
189} 192}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo); 193EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
240 pid_t pid; 243 pid_t pid;
241 enum umh_wait wait = sub_info->wait; 244 enum umh_wait wait = sub_info->wait;
242 245
246 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
247
243 /* CLONE_VFORK: wait until the usermode helper has execve'd 248 /* CLONE_VFORK: wait until the usermode helper has execve'd
244 * successfully We need the data structures to stay around 249 * successfully We need the data structures to stay around
245 * until that is done. */ 250 * until that is done. */
@@ -265,7 +270,7 @@ static void __call_usermodehelper(struct work_struct *work)
265 } 270 }
266} 271}
267 272
268#ifdef CONFIG_PM 273#ifdef CONFIG_PM_SLEEP
269/* 274/*
270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
271 * (used for preventing user land processes from being created after the user 276 * (used for preventing user land processes from being created after the user
@@ -288,39 +293,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
288 */ 293 */
289#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 294#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
290 295
291static int usermodehelper_pm_callback(struct notifier_block *nfb, 296/**
292 unsigned long action, 297 * usermodehelper_disable - prevent new helpers from being started
293 void *ignored) 298 */
299int usermodehelper_disable(void)
294{ 300{
295 long retval; 301 long retval;
296 302
297 switch (action) { 303 usermodehelper_disabled = 1;
298 case PM_HIBERNATION_PREPARE: 304 smp_mb();
299 case PM_SUSPEND_PREPARE: 305 /*
300 usermodehelper_disabled = 1; 306 * From now on call_usermodehelper_exec() won't start any new
301 smp_mb(); 307 * helpers, so it is sufficient if running_helpers turns out to
302 /* 308 * be zero at one point (it may be increased later, but that
303 * From now on call_usermodehelper_exec() won't start any new 309 * doesn't matter).
304 * helpers, so it is sufficient if running_helpers turns out to 310 */
305 * be zero at one point (it may be increased later, but that 311 retval = wait_event_timeout(running_helpers_waitq,
306 * doesn't matter).
307 */
308 retval = wait_event_timeout(running_helpers_waitq,
309 atomic_read(&running_helpers) == 0, 312 atomic_read(&running_helpers) == 0,
310 RUNNING_HELPERS_TIMEOUT); 313 RUNNING_HELPERS_TIMEOUT);
311 if (retval) { 314 if (retval)
312 return NOTIFY_OK; 315 return 0;
313 } else {
314 usermodehelper_disabled = 0;
315 return NOTIFY_BAD;
316 }
317 case PM_POST_HIBERNATION:
318 case PM_POST_SUSPEND:
319 usermodehelper_disabled = 0;
320 return NOTIFY_OK;
321 }
322 316
323 return NOTIFY_DONE; 317 usermodehelper_disabled = 0;
318 return -EAGAIN;
319}
320
321/**
322 * usermodehelper_enable - allow new helpers to be started again
323 */
324void usermodehelper_enable(void)
325{
326 usermodehelper_disabled = 0;
324} 327}
325 328
326static void helper_lock(void) 329static void helper_lock(void)
@@ -334,18 +337,12 @@ static void helper_unlock(void)
334 if (atomic_dec_and_test(&running_helpers)) 337 if (atomic_dec_and_test(&running_helpers))
335 wake_up(&running_helpers_waitq); 338 wake_up(&running_helpers_waitq);
336} 339}
337 340#else /* CONFIG_PM_SLEEP */
338static void register_pm_notifier_callback(void)
339{
340 pm_notifier(usermodehelper_pm_callback, 0);
341}
342#else /* CONFIG_PM */
343#define usermodehelper_disabled 0 341#define usermodehelper_disabled 0
344 342
345static inline void helper_lock(void) {} 343static inline void helper_lock(void) {}
346static inline void helper_unlock(void) {} 344static inline void helper_unlock(void) {}
347static inline void register_pm_notifier_callback(void) {} 345#endif /* CONFIG_PM_SLEEP */
348#endif /* CONFIG_PM */
349 346
350/** 347/**
351 * call_usermodehelper_setup - prepare to call a usermode helper 348 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -370,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
370 sub_info->path = path; 367 sub_info->path = path;
371 sub_info->argv = argv; 368 sub_info->argv = argv;
372 sub_info->envp = envp; 369 sub_info->envp = envp;
370 sub_info->cred = prepare_usermodehelper_creds();
371 if (!sub_info->cred)
372 return NULL;
373 373
374 out: 374 out:
375 return sub_info; 375 return sub_info;
@@ -384,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
384void call_usermodehelper_setkeys(struct subprocess_info *info, 384void call_usermodehelper_setkeys(struct subprocess_info *info,
385 struct key *session_keyring) 385 struct key *session_keyring)
386{ 386{
387 info->ring = session_keyring; 387#ifdef CONFIG_KEYS
388 struct thread_group_cred *tgcred = info->cred->tgcred;
389 key_put(tgcred->session_keyring);
390 tgcred->session_keyring = key_get(session_keyring);
391#else
392 BUG();
393#endif
388} 394}
389EXPORT_SYMBOL(call_usermodehelper_setkeys); 395EXPORT_SYMBOL(call_usermodehelper_setkeys);
390 396
@@ -452,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
452 DECLARE_COMPLETION_ONSTACK(done); 458 DECLARE_COMPLETION_ONSTACK(done);
453 int retval = 0; 459 int retval = 0;
454 460
461 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
462
455 helper_lock(); 463 helper_lock();
456 if (sub_info->path[0] == '\0') 464 if (sub_info->path[0] == '\0')
457 goto out; 465 goto out;
@@ -515,5 +523,4 @@ void __init usermodehelper_init(void)
515{ 523{
516 khelper_wq = create_singlethread_workqueue("khelper"); 524 khelper_wq = create_singlethread_workqueue("khelper");
517 BUG_ON(!khelper_wq); 525 BUG_ON(!khelper_wq);
518 register_pm_notifier_callback();
519} 526}
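[Editor's note] The kmod.c rework drops the PM notifier and exposes usermodehelper_disable()/usermodehelper_enable() directly, while each subprocess_info now carries a prepared struct cred. A hedged sketch of how suspend/hibernation code is expected to bracket its critical region with the new pair; the declarations are assumed to live in linux/kmod.h as in mainline, and the freeze/thaw steps are placeholders.

#include <linux/kmod.h>

static int foo_enter_critical_section(void)
{
	int error;

	/* returns -EAGAIN if running helpers do not drain within the timeout */
	error = usermodehelper_disable();
	if (error)
		return error;

	/* ... freeze tasks, create the image, etc. ... */
	return 0;
}

static void foo_leave_critical_section(void)
{
	/* ... thaw tasks ... */
	usermodehelper_enable();
}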
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 75bc2cd9ebc6..9f8a3f25259a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned; 75 spinlock_t lock ____cacheline_aligned_in_smp;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77 77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 404 spin_lock_irqsave(hlist_lock, *flags);
405} 405}
406 406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{ 408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 410 spin_lock_irqsave(hlist_lock, *flags);
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
613 return -EINVAL; 613 return -EINVAL;
614 p->addr = addr; 614 p->addr = addr;
615 615
616 if (!kernel_text_address((unsigned long) p->addr) || 616 preempt_disable();
617 in_kprobes_functions((unsigned long) p->addr)) 617 if (!__kernel_text_address((unsigned long) p->addr) ||
618 in_kprobes_functions((unsigned long) p->addr)) {
619 preempt_enable();
618 return -EINVAL; 620 return -EINVAL;
621 }
619 622
620 p->mod_refcounted = 0; 623 p->mod_refcounted = 0;
621 624
622 /* 625 /*
623 * Check if are we probing a module. 626 * Check if are we probing a module.
624 */ 627 */
625 probed_mod = module_text_address((unsigned long) p->addr); 628 probed_mod = __module_text_address((unsigned long) p->addr);
626 if (probed_mod) { 629 if (probed_mod) {
627 struct module *calling_mod = module_text_address(called_from); 630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
628 /* 632 /*
629 * We must allow modules to probe themself and in this case 633 * We must allow modules to probe themself and in this case
630 * avoid incrementing the module refcount, so as to allow 634 * avoid incrementing the module refcount, so as to allow
631 * unloading of self probing modules. 635 * unloading of self probing modules.
632 */ 636 */
633 if (calling_mod && calling_mod != probed_mod) { 637 if (calling_mod && calling_mod != probed_mod) {
634 if (unlikely(!try_module_get(probed_mod))) 638 if (unlikely(!try_module_get(probed_mod))) {
639 preempt_enable();
635 return -EINVAL; 640 return -EINVAL;
641 }
636 p->mod_refcounted = 1; 642 p->mod_refcounted = 1;
637 } else 643 } else
638 probed_mod = NULL; 644 probed_mod = NULL;
639 } 645 }
646 preempt_enable();
640 647
641 p->nmissed = 0; 648 p->nmissed = 0;
642 INIT_LIST_HEAD(&p->list); 649 INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
718 struct kprobe *old_p; 725 struct kprobe *old_p;
719 726
720 if (p->mod_refcounted) { 727 if (p->mod_refcounted) {
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
721 mod = module_text_address((unsigned long)p->addr); 732 mod = module_text_address((unsigned long)p->addr);
722 if (mod) 733 if (mod)
723 module_put(mod); 734 module_put(mod);
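[Editor's note] The kprobes hunks switch to __kernel_text_address()/__module_text_address() under preempt_disable() so the module containing the probe point cannot disappear between the text check and the refcount grab. None of this changes the registration API; a minimal, hedged kprobe module sketch follows, with "do_fork" chosen purely as an example target.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int foo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_DEBUG "foo: hit %s\n", p->symbol_name);
	return 0;	/* let the probed instruction run */
}

static struct kprobe foo_kp = {
	.symbol_name	= "do_fork",	/* illustrative probe point */
	.pre_handler	= foo_pre_handler,
};

static int __init foo_kprobe_init(void)
{
	return register_kprobe(&foo_kp);
}

static void __exit foo_kprobe_exit(void)
{
	unregister_kprobe(&foo_kp);
}

module_init(foo_kprobe_init);
module_exit(foo_kprobe_exit);
MODULE_LICENSE("GPL");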
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e53bc30e9ba5..08dd8ed86c77 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18 19
19#define KERNEL_ATTR_RO(_name) \ 20#define KERNEL_ATTR_RO(_name) \
@@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53KERNEL_ATTR_RW(uevent_helper); 54KERNEL_ATTR_RW(uevent_helper);
54#endif 55#endif
55 56
57#ifdef CONFIG_PROFILING
58static ssize_t profiling_show(struct kobject *kobj,
59 struct kobj_attribute *attr, char *buf)
60{
61 return sprintf(buf, "%d\n", prof_on);
62}
63static ssize_t profiling_store(struct kobject *kobj,
64 struct kobj_attribute *attr,
65 const char *buf, size_t count)
66{
67 int ret;
68
69 if (prof_on)
70 return -EEXIST;
71 /*
72 * This eventually calls into get_option() which
73 * has a ton of callers and is not const. It is
74 * easiest to cast it away here.
75 */
76 profile_setup((char *)buf);
77 ret = profile_init();
78 if (ret)
79 return ret;
80 ret = create_proc_profile();
81 if (ret)
82 return ret;
83 return count;
84}
85KERNEL_ATTR_RW(profiling);
86#endif
87
56#ifdef CONFIG_KEXEC 88#ifdef CONFIG_KEXEC
57static ssize_t kexec_loaded_show(struct kobject *kobj, 89static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf) 90 struct kobj_attribute *attr, char *buf)
@@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = {
109 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
110 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
111#endif 143#endif
144#ifdef CONFIG_PROFILING
145 &profiling_attr.attr,
146#endif
112#ifdef CONFIG_KEXEC 147#ifdef CONFIG_KEXEC
113 &kexec_loaded_attr.attr, 148 &kexec_loaded_attr.attr,
114 &kexec_crash_loaded_attr.attr, 149 &kexec_crash_loaded_attr.attr,
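[Editor's note] The new /sys/kernel/profiling file in ksysfs.c is built from the usual kobj_attribute show/store pair wrapped by the file-local KERNEL_ATTR_RW() macro. The same pattern written out with plain __ATTR(), as a hedged sketch; "foo" and foo_enabled are hypothetical and the attribute would land next to the profiling file under /sys/kernel/.

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static int foo_enabled;

static ssize_t foo_show(struct kobject *kobj,
			struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", foo_enabled);
}

static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	foo_enabled = simple_strtoul(buf, NULL, 10) ? 1 : 0;
	return count;
}

/* equivalent of KERNEL_ATTR_RW(foo): mode 0644, foo_show/foo_store */
static struct kobj_attribute foo_attr = __ATTR(foo, 0644, foo_show, foo_store);

static int __init foo_sysfs_init(void)
{
	return sysfs_create_file(kernel_kobj, &foo_attr.attr);
}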
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 96cff2f8710b..4fbc456f393d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <trace/sched.h>
16 17
17#define KTHREAD_NICE_LEVEL (-5) 18#define KTHREAD_NICE_LEVEL (-5)
18 19
@@ -20,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
20static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
21struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
22 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
23struct kthread_create_info 27struct kthread_create_info
24{ 28{
25 /* Information passed to kthread() from kthreadd. */ 29 /* Information passed to kthread() from kthreadd. */
@@ -171,12 +175,11 @@ EXPORT_SYMBOL(kthread_create);
171 */ 175 */
172void kthread_bind(struct task_struct *k, unsigned int cpu) 176void kthread_bind(struct task_struct *k, unsigned int cpu)
173{ 177{
174 if (k->state != TASK_UNINTERRUPTIBLE) { 178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
175 WARN_ON(1); 180 WARN_ON(1);
176 return; 181 return;
177 } 182 }
178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 wait_task_inactive(k, 0);
180 set_task_cpu(k, cpu); 183 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 184 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 185 k->rt.nr_cpus_allowed = 1;
@@ -206,6 +209,8 @@ int kthread_stop(struct task_struct *k)
206 /* It could exit after stop_info.k set, but before wake_up_process. */ 209 /* It could exit after stop_info.k set, but before wake_up_process. */
207 get_task_struct(k); 210 get_task_struct(k);
208 211
212 trace_sched_kthread_stop(k);
213
209 /* Must init completion *before* thread sees kthread_stop_info.k */ 214 /* Must init completion *before* thread sees kthread_stop_info.k */
210 init_completion(&kthread_stop_info.done); 215 init_completion(&kthread_stop_info.done);
211 smp_wmb(); 216 smp_wmb();
@@ -221,6 +226,8 @@ int kthread_stop(struct task_struct *k)
221 ret = kthread_stop_info.err; 226 ret = kthread_stop_info.err;
222 mutex_unlock(&kthread_stop_lock); 227 mutex_unlock(&kthread_stop_lock);
223 228
229 trace_sched_kthread_stop_ret(ret);
230
224 return ret; 231 return ret;
225} 232}
226EXPORT_SYMBOL(kthread_stop); 233EXPORT_SYMBOL(kthread_stop);
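[Editor's note] kthread_bind() now leans on wait_task_inactive(k, TASK_UNINTERRUPTIBLE) and kthread_stop() gained tracepoints, but the caller-visible kthread API is untouched. For reference, the standard create/bind/stop sequence looks like this sketch; the thread body, naming and CPU choice are illustrative.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int foo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... per-iteration work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static struct task_struct *foo_start_on_cpu(unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(foo_thread_fn, NULL, "foo/%u", cpu);
	if (IS_ERR(tsk))
		return tsk;

	/* must run before the thread is woken; see kthread_bind() above */
	kthread_bind(tsk, cpu);
	wake_up_process(tsk);
	return tsk;
}

static void foo_stop(struct task_struct *tsk)
{
	/* fires the new sched_kthread_stop/_ret tracepoints */
	kthread_stop(tsk);
}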
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c56923..449db466bdbc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
191 latency_record[i].time, 191 latency_record[i].time,
192 latency_record[i].max); 192 latency_record[i].max);
193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 193 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
194 char sym[KSYM_NAME_LEN]; 194 char sym[KSYM_SYMBOL_LEN];
195 char *c; 195 char *c;
196 if (!latency_record[i].backtrace[q]) 196 if (!latency_record[i].backtrace[q])
197 break; 197 break;
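The buffer-size change above matters because sprint_symbol() does not print just the symbol name: its output has the form name+0xoff/0xsize [module], and KSYM_SYMBOL_LEN is sized for that full string while KSYM_NAME_LEN only covers the bare name. A minimal sketch of the usage pattern (the seq_printf() line is illustrative, not the exact lstats_show() output):

	char sym[KSYM_SYMBOL_LEN];	/* room for "name+0xoff/0xsize [module]" */

	sprint_symbol(sym, latency_record[i].backtrace[q]);
	seq_printf(m, " %s", sym);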
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dbda475b13bd..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -25,6 +25,7 @@
25 * Thanks to Arjan van de Ven for coming up with the initial idea of 25 * Thanks to Arjan van de Ven for coming up with the initial idea of
26 * mapping lock dependencies runtime. 26 * mapping lock dependencies runtime.
27 */ 27 */
28#define DISABLE_BRANCH_PROFILING
28#include <linux/mutex.h> 29#include <linux/mutex.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/delay.h> 31#include <linux/delay.h>
@@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
136#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
138 139
139static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
140{ 141{
141 int i; 142 int i;
142 143
143 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
144 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
145 class->contention_point[i] = ip; 146 points[i] = ip;
146 break; 147 break;
147 } 148 }
148 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
149 break; 150 break;
150 } 151 }
151 152
@@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
186 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
187 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
188 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
189 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
190 194
@@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
209 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
210 } 214 }
211 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
212} 217}
213 218
214static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -287,14 +292,12 @@ void lockdep_off(void)
287{ 292{
288 current->lockdep_recursion++; 293 current->lockdep_recursion++;
289} 294}
290
291EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
292 296
293void lockdep_on(void) 297void lockdep_on(void)
294{ 298{
295 current->lockdep_recursion--; 299 current->lockdep_recursion--;
296} 300}
297
298EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
299 302
300/* 303/*
@@ -576,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
576/* 579/*
577 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
578 */ 581 */
579static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
580{ 584{
581 struct lock_list *entry; 585 struct lock_list *entry;
582 586
@@ -2169,12 +2173,11 @@ void early_boot_irqs_on(void)
2169/* 2173/*
2170 * Hardirqs will be enabled: 2174 * Hardirqs will be enabled:
2171 */ 2175 */
2172void trace_hardirqs_on_caller(unsigned long a0) 2176void trace_hardirqs_on_caller(unsigned long ip)
2173{ 2177{
2174 struct task_struct *curr = current; 2178 struct task_struct *curr = current;
2175 unsigned long ip;
2176 2179
2177 time_hardirqs_on(CALLER_ADDR0, a0); 2180 time_hardirqs_on(CALLER_ADDR0, ip);
2178 2181
2179 if (unlikely(!debug_locks || current->lockdep_recursion)) 2182 if (unlikely(!debug_locks || current->lockdep_recursion))
2180 return; 2183 return;
@@ -2188,7 +2191,6 @@ void trace_hardirqs_on_caller(unsigned long a0)
2188 } 2191 }
2189 /* we'll do an OFF -> ON transition: */ 2192 /* we'll do an OFF -> ON transition: */
2190 curr->hardirqs_enabled = 1; 2193 curr->hardirqs_enabled = 1;
2191 ip = (unsigned long) __builtin_return_address(0);
2192 2194
2193 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2195 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2194 return; 2196 return;
@@ -2224,11 +2226,11 @@ EXPORT_SYMBOL(trace_hardirqs_on);
2224/* 2226/*
2225 * Hardirqs were disabled: 2227 * Hardirqs were disabled:
2226 */ 2228 */
2227void trace_hardirqs_off_caller(unsigned long a0) 2229void trace_hardirqs_off_caller(unsigned long ip)
2228{ 2230{
2229 struct task_struct *curr = current; 2231 struct task_struct *curr = current;
2230 2232
2231 time_hardirqs_off(CALLER_ADDR0, a0); 2233 time_hardirqs_off(CALLER_ADDR0, ip);
2232 2234
2233 if (unlikely(!debug_locks || current->lockdep_recursion)) 2235 if (unlikely(!debug_locks || current->lockdep_recursion))
2234 return; 2236 return;
@@ -2241,7 +2243,7 @@ void trace_hardirqs_off_caller(unsigned long a0)
2241 * We have done an ON -> OFF transition: 2243 * We have done an ON -> OFF transition:
2242 */ 2244 */
2243 curr->hardirqs_enabled = 0; 2245 curr->hardirqs_enabled = 0;
2244 curr->hardirq_disable_ip = _RET_IP_; 2246 curr->hardirq_disable_ip = ip;
2245 curr->hardirq_disable_event = ++curr->irq_events; 2247 curr->hardirq_disable_event = ++curr->irq_events;
2246 debug_atomic_inc(&hardirqs_off_events); 2248 debug_atomic_inc(&hardirqs_off_events);
2247 } else 2249 } else
@@ -2510,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2510 if (subclass) 2512 if (subclass)
2511 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2512} 2514}
2513
2514EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2515 2516
2516/* 2517/*
@@ -2691,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2691} 2692}
2692 2693
2693static int 2694static int
2694__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2695 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2696{ 2698{
2697 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2698 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2719,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2719 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2720 2722
2721found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2722 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2723 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2724 2727
@@ -2903,9 +2906,9 @@ static void check_flags(unsigned long flags)
2903#endif 2906#endif
2904} 2907}
2905 2908
2906void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2907lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2908 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2909{ 2912{
2910 unsigned long flags; 2913 unsigned long flags;
2911 2914
@@ -2915,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2915 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2916 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2917 check_flags(flags); 2920 check_flags(flags);
2918 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2919 check_chain_key(current); 2922 check_chain_key(current);
2920 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2921 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2922} 2925}
2923 2926EXPORT_SYMBOL_GPL(lock_set_class);
2924EXPORT_SYMBOL_GPL(lock_set_subclass);
2925 2927
2926/* 2928/*
2927 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2945,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2945 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2946 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2947} 2949}
2948
2949EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2950 2951
2951void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2963,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2963 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2964 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2965} 2966}
2966
2967EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2968 2968
2969#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -3001,7 +3001,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3001 struct held_lock *hlock, *prev_hlock; 3001 struct held_lock *hlock, *prev_hlock;
3002 struct lock_class_stats *stats; 3002 struct lock_class_stats *stats;
3003 unsigned int depth; 3003 unsigned int depth;
3004 int i, point; 3004 int i, contention_point, contending_point;
3005 3005
3006 depth = curr->lockdep_depth; 3006 depth = curr->lockdep_depth;
3007 if (DEBUG_LOCKS_WARN_ON(!depth)) 3007 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3025,18 +3025,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3025found_it: 3025found_it:
3026 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
3027 3027
3028 point = lock_contention_point(hlock_class(hlock), ip); 3028 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3029 contending_point = lock_point(hlock_class(hlock)->contending_point,
3030 lock->ip);
3029 3031
3030 stats = get_lock_stats(hlock_class(hlock)); 3032 stats = get_lock_stats(hlock_class(hlock));
3031 if (point < ARRAY_SIZE(stats->contention_point)) 3033 if (contention_point < LOCKSTAT_POINTS)
3032 stats->contention_point[point]++; 3034 stats->contention_point[contention_point]++;
3035 if (contending_point < LOCKSTAT_POINTS)
3036 stats->contending_point[contending_point]++;
3033 if (lock->cpu != smp_processor_id()) 3037 if (lock->cpu != smp_processor_id())
3034 stats->bounces[bounce_contended + !!hlock->read]++; 3038 stats->bounces[bounce_contended + !!hlock->read]++;
3035 put_lock_stats(stats); 3039 put_lock_stats(stats);
3036} 3040}
3037 3041
3038static void 3042static void
3039__lock_acquired(struct lockdep_map *lock) 3043__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3040{ 3044{
3041 struct task_struct *curr = current; 3045 struct task_struct *curr = current;
3042 struct held_lock *hlock, *prev_hlock; 3046 struct held_lock *hlock, *prev_hlock;
@@ -3085,6 +3089,7 @@ found_it:
3085 put_lock_stats(stats); 3089 put_lock_stats(stats);
3086 3090
3087 lock->cpu = cpu; 3091 lock->cpu = cpu;
3092 lock->ip = ip;
3088} 3093}
3089 3094
3090void lock_contended(struct lockdep_map *lock, unsigned long ip) 3095void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3106,7 +3111,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3106} 3111}
3107EXPORT_SYMBOL_GPL(lock_contended); 3112EXPORT_SYMBOL_GPL(lock_contended);
3108 3113
3109void lock_acquired(struct lockdep_map *lock) 3114void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3110{ 3115{
3111 unsigned long flags; 3116 unsigned long flags;
3112 3117
@@ -3119,7 +3124,7 @@ void lock_acquired(struct lockdep_map *lock)
3119 raw_local_irq_save(flags); 3124 raw_local_irq_save(flags);
3120 check_flags(flags); 3125 check_flags(flags);
3121 current->lockdep_recursion = 1; 3126 current->lockdep_recursion = 1;
3122 __lock_acquired(lock); 3127 __lock_acquired(lock, ip);
3123 current->lockdep_recursion = 0; 3128 current->lockdep_recursion = 0;
3124 raw_local_irq_restore(flags); 3129 raw_local_irq_restore(flags);
3125} 3130}
@@ -3278,10 +3283,10 @@ void __init lockdep_info(void)
3278{ 3283{
3279 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); 3284 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
3280 3285
3281 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); 3286 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
3282 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); 3287 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
3283 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); 3288 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
3284 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); 3289 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
3285 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); 3290 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
3286 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); 3291 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
3287 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); 3292 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
@@ -3417,9 +3422,10 @@ retry:
3417 } 3422 }
3418 printk(" ignoring it.\n"); 3423 printk(" ignoring it.\n");
3419 unlock = 0; 3424 unlock = 0;
3425 } else {
3426 if (count != 10)
3427 printk(KERN_CONT " locked it.\n");
3420 } 3428 }
3421 if (count != 10)
3422 printk(" locked it.\n");
3423 3429
3424 do_each_thread(g, p) { 3430 do_each_thread(g, p) {
3425 /* 3431 /*
@@ -3442,7 +3448,6 @@ retry:
3442 if (unlock) 3448 if (unlock)
3443 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3444} 3450}
3445
3446EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3447 3452
3448/* 3453/*
@@ -3463,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3463{ 3468{
3464 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3465} 3470}
3466
3467EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3468 3472
3469void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
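The lock-stat changes above generalize the old lock_contention_point() into lock_point(), which now feeds two per-class tables: contention_point[] records the instruction pointer where a waiter hit contention, and the new contending_point[] is indexed by lock->ip, roughly the call site at which the current holder last acquired the lock. The dedup logic itself is simple enough to lift out of kernel context; a self-contained sketch, with a small table size chosen for illustration:

#include <stdio.h>

#define LOCKSTAT_POINTS 4

/* Record ip in the first free slot, or return the slot that already
 * holds it; returns LOCKSTAT_POINTS when the table is full. */
static int lock_point(unsigned long points[], unsigned long ip)
{
	int i;

	for (i = 0; i < LOCKSTAT_POINTS; i++) {
		if (points[i] == 0) {
			points[i] = ip;
			break;
		}
		if (points[i] == ip)
			break;
	}
	return i;
}

int main(void)
{
	unsigned long points[LOCKSTAT_POINTS] = { 0 };

	printf("%d\n", lock_point(points, 0x1000));	/* 0: new entry    */
	printf("%d\n", lock_point(points, 0x2000));	/* 1: new entry    */
	printf("%d\n", lock_point(points, 0x1000));	/* 0: deduplicated */
	return 0;
}

Callers only count the hit when the returned index is below the table size, so once all slots are taken, new call sites are silently dropped; that is exactly what the "if (contention_point < LOCKSTAT_POINTS)" checks above do.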
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/marker.c b/kernel/marker.c
index 7d1faecd7a51..ea54f2647868 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
43 */ 43 */
44#define MARKER_HASH_BITS 6 44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -62,13 +63,12 @@ struct marker_entry {
62 int refcount; /* Number of times armed. 0 if disarmed. */ 63 int refcount; /* Number of times armed. 0 if disarmed. */
63 struct rcu_head rcu; 64 struct rcu_head rcu;
64 void *oldptr; 65 void *oldptr;
65 unsigned char rcu_pending:1; 66 int rcu_pending;
66 unsigned char ptype:1; 67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 69 char name[0]; /* Contains name'\0'format'\0' */
68}; 70};
69 71
70static struct hlist_head marker_table[MARKER_TABLE_SIZE];
71
72/** 72/**
73 * __mark_empty_function - Empty probe callback 73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data 74 * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
81 * though the function pointer change and the marker enabling are two distinct 81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code. 82 * operations that modifies the execution flow of preemptible code.
83 */ 83 */
84void __mark_empty_function(void *probe_private, void *call_private, 84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args) 85 const char *fmt, va_list *args)
86{ 86{
87} 87}
@@ -97,17 +97,18 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
97 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
99 */ 99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
101{ 102{
102 va_list args; 103 va_list args;
103 char ptype; 104 char ptype;
104 105
105 /* 106 /*
106 * preempt_disable does two things : disabling preemption to make sure 107 * rcu_read_lock_sched does two things : disabling preemption to make
107 * the teardown of the callbacks can be done correctly when they are in 108 * sure the teardown of the callbacks can be done correctly when they
108 * modules and they insure RCU read coherency. 109 * are in modules and they insure RCU read coherency.
109 */ 110 */
110 preempt_disable(); 111 rcu_read_lock_sched_notrace();
111 ptype = mdata->ptype; 112 ptype = mdata->ptype;
112 if (likely(!ptype)) { 113 if (likely(!ptype)) {
113 marker_probe_func *func; 114 marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 146 va_end(args);
146 } 147 }
147 } 148 }
148 preempt_enable(); 149 rcu_read_unlock_sched_notrace();
149} 150}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 151EXPORT_SYMBOL_GPL(marker_probe_cb);
151 152
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
157 * 158 *
158 * Should be connected to markers "MARK_NOARGS". 159 * Should be connected to markers "MARK_NOARGS".
159 */ 160 */
160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
161{ 163{
162 va_list args; /* not initialized */ 164 va_list args; /* not initialized */
163 char ptype; 165 char ptype;
164 166
165 preempt_disable(); 167 rcu_read_lock_sched_notrace();
166 ptype = mdata->ptype; 168 ptype = mdata->ptype;
167 if (likely(!ptype)) { 169 if (likely(!ptype)) {
168 marker_probe_func *func; 170 marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 197 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 198 mdata->format, &args);
197 } 199 }
198 preempt_enable(); 200 rcu_read_unlock_sched_notrace();
199} 201}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 202
202static void free_old_closure(struct rcu_head *head) 203static void free_old_closure(struct rcu_head *head)
203{ 204{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
416 e->single.probe_private = NULL; 417 e->single.probe_private = NULL;
417 e->multi = NULL; 418 e->multi = NULL;
418 e->ptype = 0; 419 e->ptype = 0;
420 e->format_allocated = 0;
419 e->refcount = 0; 421 e->refcount = 0;
420 e->rcu_pending = 0; 422 e->rcu_pending = 0;
421 hlist_add_head(&e->hlist, head); 423 hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
447 if (e->single.func != __mark_empty_function) 449 if (e->single.func != __mark_empty_function)
448 return -EBUSY; 450 return -EBUSY;
449 hlist_del(&e->hlist); 451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
450 /* Make sure the call_rcu has been executed */ 454 /* Make sure the call_rcu has been executed */
451 if (e->rcu_pending) 455 if (e->rcu_pending)
452 rcu_barrier_sched(); 456 rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
457/* 461/*
458 * Set the mark_entry format to the format found in the element. 462 * Set the mark_entry format to the format found in the element.
459 */ 463 */
460static int marker_set_format(struct marker_entry **entry, const char *format) 464static int marker_set_format(struct marker_entry *entry, const char *format)
461{ 465{
462 struct marker_entry *e; 466 entry->format = kstrdup(format, GFP_KERNEL);
463 size_t name_len = strlen((*entry)->name) + 1; 467 if (!entry->format)
464 size_t format_len = strlen(format) + 1;
465
466
467 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
468 GFP_KERNEL);
469 if (!e)
470 return -ENOMEM; 468 return -ENOMEM;
471 memcpy(&e->name[0], (*entry)->name, name_len); 469 entry->format_allocated = 1;
472 e->format = &e->name[name_len]; 470
473 memcpy(e->format, format, format_len);
474 if (strcmp(e->format, MARK_NOARGS) == 0)
475 e->call = marker_probe_cb_noarg;
476 else
477 e->call = marker_probe_cb;
478 e->single = (*entry)->single;
479 e->multi = (*entry)->multi;
480 e->ptype = (*entry)->ptype;
481 e->refcount = (*entry)->refcount;
482 e->rcu_pending = 0;
483 hlist_add_before(&e->hlist, &(*entry)->hlist);
484 hlist_del(&(*entry)->hlist);
485 /* Make sure the call_rcu has been executed */
486 if ((*entry)->rcu_pending)
487 rcu_barrier_sched();
488 kfree(*entry);
489 *entry = e;
490 trace_mark(core_marker_format, "name %s format %s", 471 trace_mark(core_marker_format, "name %s format %s",
491 e->name, e->format); 472 entry->name, entry->format);
492 return 0; 473 return 0;
493} 474}
494 475
495/* 476/*
496 * Sets the probe callback corresponding to one marker. 477 * Sets the probe callback corresponding to one marker.
497 */ 478 */
498static int set_marker(struct marker_entry **entry, struct marker *elem, 479static int set_marker(struct marker_entry *entry, struct marker *elem,
499 int active) 480 int active)
500{ 481{
501 int ret; 482 int ret = 0;
502 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 483 WARN_ON(strcmp(entry->name, elem->name) != 0);
503 484
504 if ((*entry)->format) { 485 if (entry->format) {
505 if (strcmp((*entry)->format, elem->format) != 0) { 486 if (strcmp(entry->format, elem->format) != 0) {
506 printk(KERN_NOTICE 487 printk(KERN_NOTICE
507 "Format mismatch for probe %s " 488 "Format mismatch for probe %s "
508 "(%s), marker (%s)\n", 489 "(%s), marker (%s)\n",
509 (*entry)->name, 490 entry->name,
510 (*entry)->format, 491 entry->format,
511 elem->format); 492 elem->format);
512 return -EPERM; 493 return -EPERM;
513 } 494 }
@@ -523,48 +504,95 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
523 * pass from a "safe" callback (with argument) to an "unsafe" 504 * pass from a "safe" callback (with argument) to an "unsafe"
524 * callback (does not set arguments). 505 * callback (does not set arguments).
525 */ 506 */
526 elem->call = (*entry)->call; 507 elem->call = entry->call;
527 /* 508 /*
528 * Sanity check : 509 * Sanity check :
529 * We only update the single probe private data when the ptr is 510 * We only update the single probe private data when the ptr is
530 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
531 */ 512 */
532 WARN_ON(elem->single.func != __mark_empty_function 513 WARN_ON(elem->single.func != __mark_empty_function
533 && elem->single.probe_private 514 && elem->single.probe_private != entry->single.probe_private
534 != (*entry)->single.probe_private && 515 && !elem->ptype);
535 !elem->ptype); 516 elem->single.probe_private = entry->single.probe_private;
536 elem->single.probe_private = (*entry)->single.probe_private;
537 /* 517 /*
538 * Make sure the private data is valid when we update the 518 * Make sure the private data is valid when we update the
539 * single probe ptr. 519 * single probe ptr.
540 */ 520 */
541 smp_wmb(); 521 smp_wmb();
542 elem->single.func = (*entry)->single.func; 522 elem->single.func = entry->single.func;
543 /* 523 /*
544 * We also make sure that the new probe callbacks array is consistent 524 * We also make sure that the new probe callbacks array is consistent
545 * before setting a pointer to it. 525 * before setting a pointer to it.
546 */ 526 */
547 rcu_assign_pointer(elem->multi, (*entry)->multi); 527 rcu_assign_pointer(elem->multi, entry->multi);
548 /* 528 /*
549 * Update the function or multi probe array pointer before setting the 529 * Update the function or multi probe array pointer before setting the
550 * ptype. 530 * ptype.
551 */ 531 */
552 smp_wmb(); 532 smp_wmb();
553 elem->ptype = (*entry)->ptype; 533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
554 elem->state = active; 565 elem->state = active;
555 566
556 return 0; 567 return ret;
557} 568}
558 569
559/* 570/*
560 * Disable a marker and its probe callback. 571 * Disable a marker and its probe callback.
561 * Note: only waiting an RCU period after setting elem->call to the empty 572 * Note: only waiting an RCU period after setting elem->call to the empty
562 * function insures that the original callback is not used anymore. This insured 573 * function insures that the original callback is not used anymore. This insured
563 * by preempt_disable around the call site. 574 * by rcu_read_lock_sched around the call site.
564 */ 575 */
565static void disable_marker(struct marker *elem) 576static void disable_marker(struct marker *elem)
566{ 577{
578 int ret;
579
567 /* leave "call" as is. It is known statically. */ 580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
568 elem->state = 0; 596 elem->state = 0;
569 elem->single.func = __mark_empty_function; 597 elem->single.func = __mark_empty_function;
570 /* Update the function before setting the ptype */ 598 /* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
594 for (iter = begin; iter < end; iter++) { 622 for (iter = begin; iter < end; iter++) {
595 mark_entry = get_marker(iter->name); 623 mark_entry = get_marker(iter->name);
596 if (mark_entry) { 624 if (mark_entry) {
597 set_marker(&mark_entry, iter, 625 set_marker(mark_entry, iter, !!mark_entry->refcount);
598 !!mark_entry->refcount);
599 /* 626 /*
600 * ignore error, continue 627 * ignore error, continue
601 */ 628 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
629 marker_update_probe_range(__start___markers, __stop___markers); 656 marker_update_probe_range(__start___markers, __stop___markers);
630 /* Markers in modules. */ 657 /* Markers in modules. */
631 module_update_markers(); 658 module_update_markers();
659 tracepoint_probe_update_all();
632} 660}
633 661
634/** 662/**
@@ -653,11 +681,17 @@ int marker_probe_register(const char *name, const char *format,
653 entry = get_marker(name); 681 entry = get_marker(name);
654 if (!entry) { 682 if (!entry) {
655 entry = add_marker(name, format); 683 entry = add_marker(name, format);
656 if (IS_ERR(entry)) { 684 if (IS_ERR(entry))
657 ret = PTR_ERR(entry); 685 ret = PTR_ERR(entry);
658 goto end; 686 } else if (format) {
659 } 687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
660 } 691 }
692 if (ret)
693 goto end;
694
661 /* 695 /*
662 * If we detect that a call_rcu is pending for this marker, 696 * If we detect that a call_rcu is pending for this marker,
663 * make sure it's executed now. 697 * make sure it's executed now.
@@ -670,10 +704,13 @@ int marker_probe_register(const char *name, const char *format,
670 goto end; 704 goto end;
671 } 705 }
672 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
673 marker_update_probes(); /* may update entry */ 707 marker_update_probes();
674 mutex_lock(&markers_mutex); 708 mutex_lock(&markers_mutex);
675 entry = get_marker(name); 709 entry = get_marker(name);
676 WARN_ON(!entry); 710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
677 entry->oldptr = old; 714 entry->oldptr = old;
678 entry->rcu_pending = 1; 715 entry->rcu_pending = 1;
679 /* write rcu_pending before calling the RCU callback */ 716 /* write rcu_pending before calling the RCU callback */
@@ -712,11 +749,13 @@ int marker_probe_unregister(const char *name,
712 rcu_barrier_sched(); 749 rcu_barrier_sched();
713 old = marker_entry_remove_probe(entry, probe, probe_private); 750 old = marker_entry_remove_probe(entry, probe, probe_private);
714 mutex_unlock(&markers_mutex); 751 mutex_unlock(&markers_mutex);
715 marker_update_probes(); /* may update entry */ 752 marker_update_probes();
716 mutex_lock(&markers_mutex); 753 mutex_lock(&markers_mutex);
717 entry = get_marker(name); 754 entry = get_marker(name);
718 if (!entry) 755 if (!entry)
719 goto end; 756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
720 entry->oldptr = old; 759 entry->oldptr = old;
721 entry->rcu_pending = 1; 760 entry->rcu_pending = 1;
722 /* write rcu_pending before calling the RCU callback */ 761 /* write rcu_pending before calling the RCU callback */
@@ -791,10 +830,13 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
791 rcu_barrier_sched(); 830 rcu_barrier_sched();
792 old = marker_entry_remove_probe(entry, NULL, probe_private); 831 old = marker_entry_remove_probe(entry, NULL, probe_private);
793 mutex_unlock(&markers_mutex); 832 mutex_unlock(&markers_mutex);
794 marker_update_probes(); /* may update entry */ 833 marker_update_probes();
795 mutex_lock(&markers_mutex); 834 mutex_lock(&markers_mutex);
796 entry = get_marker_from_private_data(probe, probe_private); 835 entry = get_marker_from_private_data(probe, probe_private);
797 WARN_ON(!entry); 836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
798 entry->oldptr = old; 840 entry->oldptr = old;
799 entry->rcu_pending = 1; 841 entry->rcu_pending = 1;
800 /* write rcu_pending before calling the RCU callback */ 842 /* write rcu_pending before calling the RCU callback */
@@ -836,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
836 if (!e->ptype) { 878 if (!e->ptype) {
837 if (num == 0 && e->single.func == probe) 879 if (num == 0 && e->single.func == probe)
838 return e->single.probe_private; 880 return e->single.probe_private;
839 else
840 break;
841 } else { 881 } else {
842 struct marker_probe_closure *closure; 882 struct marker_probe_closure *closure;
843 int match = 0; 883 int match = 0;
@@ -849,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
849 return closure[i].probe_private; 889 return closure[i].probe_private;
850 } 890 }
851 } 891 }
892 break;
852 } 893 }
853 } 894 }
854 return ERR_PTR(-ENOENT); 895 return ERR_PTR(-ENOENT);
855} 896}
856EXPORT_SYMBOL_GPL(marker_get_private_data); 897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
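For reference, the marker API touched throughout this file has two sides: instrumentation points call trace_mark(), and consumers register a probe whose signature matches marker_probe_func, the same shape as __mark_empty_function above. A hedged sketch of a consumer module (the marker name "subsys_event" and its format string are made up for illustration):

#include <linux/init.h>
#include <linux/marker.h>
#include <linux/module.h>

/* Probe signature must match marker_probe_func. */
static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	/* A real probe pulls its arguments out of *args according
	 * to fmt; omitted here. */
}

static int __init my_probe_init(void)
{
	return marker_probe_register("subsys_event", "value %d",
				     my_probe, NULL);
}

static void __exit my_probe_exit(void)
{
	marker_probe_unregister("subsys_event", my_probe, NULL);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");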
diff --git a/kernel/module.c b/kernel/module.c
index 9db11911e04b..dd2a54155b54 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,11 +20,13 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kallsyms.h> 22#include <linux/kallsyms.h>
23#include <linux/fs.h>
23#include <linux/sysfs.h> 24#include <linux/sysfs.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/elf.h> 28#include <linux/elf.h>
29#include <linux/proc_fs.h>
28#include <linux/seq_file.h> 30#include <linux/seq_file.h>
29#include <linux/syscalls.h> 31#include <linux/syscalls.h>
30#include <linux/fcntl.h> 32#include <linux/fcntl.h>
@@ -42,10 +44,13 @@
42#include <linux/string.h> 44#include <linux/string.h>
43#include <linux/mutex.h> 45#include <linux/mutex.h>
44#include <linux/unwind.h> 46#include <linux/unwind.h>
47#include <linux/rculist.h>
45#include <asm/uaccess.h> 48#include <asm/uaccess.h>
46#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
47#include <linux/license.h> 50#include <linux/license.h>
48#include <asm/sections.h> 51#include <asm/sections.h>
52#include <linux/tracepoint.h>
53#include <linux/ftrace.h>
49 54
50#if 0 55#if 0
51#define DEBUGP printk 56#define DEBUGP printk
@@ -61,7 +66,7 @@
61#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 66#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
62 67
63/* List of modules, protected by module_mutex or preempt_disable 68/* List of modules, protected by module_mutex or preempt_disable
64 * (add/delete uses stop_machine). */ 69 * (delete uses stop_machine/add uses RCU list operations). */
65static DEFINE_MUTEX(module_mutex); 70static DEFINE_MUTEX(module_mutex);
66static LIST_HEAD(modules); 71static LIST_HEAD(modules);
67 72
@@ -100,7 +105,7 @@ static inline int strong_try_module_get(struct module *mod)
100static inline void add_taint_module(struct module *mod, unsigned flag) 105static inline void add_taint_module(struct module *mod, unsigned flag)
101{ 106{
102 add_taint(flag); 107 add_taint(flag);
103 mod->taints |= flag; 108 mod->taints |= (1U << flag);
104} 109}
105 110
106/* 111/*
@@ -130,6 +135,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr,
130 return 0; 135 return 0;
131} 136}
132 137
138/* Find a module section, or NULL. */
139static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
140 const char *secstrings, const char *name)
141{
142 /* Section 0 has sh_addr 0. */
143 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
144}
145
146/* Find a module section, or NULL. Fill in number of "objects" in section. */
147static void *section_objs(Elf_Ehdr *hdr,
148 Elf_Shdr *sechdrs,
149 const char *secstrings,
150 const char *name,
151 size_t object_size,
152 unsigned int *num)
153{
154 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
155
156 /* Section 0 has sh_addr 0 and sh_size 0. */
157 *num = sechdrs[sec].sh_size / object_size;
158 return (void *)sechdrs[sec].sh_addr;
159}
160
133/* Provided by the linker */ 161/* Provided by the linker */
134extern const struct kernel_symbol __start___ksymtab[]; 162extern const struct kernel_symbol __start___ksymtab[];
135extern const struct kernel_symbol __stop___ksymtab[]; 163extern const struct kernel_symbol __stop___ksymtab[];
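The section_addr()/section_objs() helpers introduced above replace the long list of hand-maintained find_sec() index variables that load_module() used to carry. Because find_sec() returns 0 for a missing section, and section 0 has sh_addr == 0 and sh_size == 0, a missing section simply comes back as a NULL pointer with a count of 0. A sketch of the calling pattern used later in this patch (names as in the diff):

	struct kernel_param *kp;
	unsigned int num_kp;

	kp = section_objs(hdr, sechdrs, secstrings, "__param",
			  sizeof(*kp), &num_kp);
	/* No "__param" section: kp == NULL and num_kp == 0, so the
	 * later parse_args(mod->name, mod->args, kp, num_kp, NULL)
	 * simply iterates over zero parameters. */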
@@ -216,7 +244,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
216 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) 244 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
217 return true; 245 return true;
218 246
219 list_for_each_entry(mod, &modules, list) { 247 list_for_each_entry_rcu(mod, &modules, list) {
220 struct symsearch arr[] = { 248 struct symsearch arr[] = {
221 { mod->syms, mod->syms + mod->num_syms, mod->crcs, 249 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
222 NOT_GPL_ONLY, false }, 250 NOT_GPL_ONLY, false },
@@ -784,6 +812,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
784 mutex_lock(&module_mutex); 812 mutex_lock(&module_mutex);
785 /* Store the name of the last unloaded module for diagnostic purposes */ 813 /* Store the name of the last unloaded module for diagnostic purposes */
786 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
815 unregister_dynamic_debug_module(mod->name);
787 free_module(mod); 816 free_module(mod);
788 817
789 out: 818 out:
@@ -923,7 +952,7 @@ static const char vermagic[] = VERMAGIC_STRING;
923static int try_to_force_load(struct module *mod, const char *symname) 952static int try_to_force_load(struct module *mod, const char *symname)
924{ 953{
925#ifdef CONFIG_MODULE_FORCE_LOAD 954#ifdef CONFIG_MODULE_FORCE_LOAD
926 if (!(tainted & TAINT_FORCED_MODULE)) 955 if (!test_taint(TAINT_FORCED_MODULE))
927 printk("%s: no version for \"%s\" found: kernel tainted.\n", 956 printk("%s: no version for \"%s\" found: kernel tainted.\n",
928 mod->name, symname); 957 mod->name, symname);
929 add_taint_module(mod, TAINT_FORCED_MODULE); 958 add_taint_module(mod, TAINT_FORCED_MODULE);
@@ -1033,7 +1062,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
1033 const unsigned long *crc; 1062 const unsigned long *crc;
1034 1063
1035 ret = find_symbol(name, &owner, &crc, 1064 ret = find_symbol(name, &owner, &crc,
1036 !(mod->taints & TAINT_PROPRIETARY_MODULE), true); 1065 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1037 if (!IS_ERR_VALUE(ret)) { 1066 if (!IS_ERR_VALUE(ret)) {
1038 /* use_module can fail due to OOM, 1067 /* use_module can fail due to OOM,
1039 or module initialization or unloading */ 1068 or module initialization or unloading */
@@ -1173,7 +1202,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1173 while (i-- > 0) 1202 while (i-- > 0)
1174 sysfs_remove_bin_file(notes_attrs->dir, 1203 sysfs_remove_bin_file(notes_attrs->dir,
1175 &notes_attrs->attrs[i]); 1204 &notes_attrs->attrs[i]);
1176 kobject_del(notes_attrs->dir); 1205 kobject_put(notes_attrs->dir);
1177 } 1206 }
1178 kfree(notes_attrs); 1207 kfree(notes_attrs);
1179} 1208}
@@ -1391,17 +1420,6 @@ static void mod_kobject_remove(struct module *mod)
1391} 1420}
1392 1421
1393/* 1422/*
1394 * link the module with the whole machine is stopped with interrupts off
1395 * - this defends against kallsyms not taking locks
1396 */
1397static int __link_module(void *_mod)
1398{
1399 struct module *mod = _mod;
1400 list_add(&mod->list, &modules);
1401 return 0;
1402}
1403
1404/*
1405 * unlink the module with the whole machine is stopped with interrupts off 1423 * unlink the module with the whole machine is stopped with interrupts off
1406 * - this defends against kallsyms not taking locks 1424 * - this defends against kallsyms not taking locks
1407 */ 1425 */
@@ -1429,6 +1447,9 @@ static void free_module(struct module *mod)
1429 /* Module unload stuff */ 1447 /* Module unload stuff */
1430 module_unload_free(mod); 1448 module_unload_free(mod);
1431 1449
1450 /* release any pointers to mcount in this module */
1451 ftrace_release(mod->module_core, mod->core_size);
1452
1432 /* This may be NULL, but that's OK */ 1453 /* This may be NULL, but that's OK */
1433 module_free(mod, mod->module_init); 1454 module_free(mod, mod->module_init);
1434 kfree(mod->args); 1455 kfree(mod->args);
@@ -1634,7 +1655,7 @@ static void set_license(struct module *mod, const char *license)
1634 license = "unspecified"; 1655 license = "unspecified";
1635 1656
1636 if (!license_is_gpl_compatible(license)) { 1657 if (!license_is_gpl_compatible(license)) {
1637 if (!(tainted & TAINT_PROPRIETARY_MODULE)) 1658 if (!test_taint(TAINT_PROPRIETARY_MODULE))
1638 printk(KERN_WARNING "%s: module license '%s' taints " 1659 printk(KERN_WARNING "%s: module license '%s' taints "
1639 "kernel.\n", mod->name, license); 1660 "kernel.\n", mod->name, license);
1640 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1661 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -1783,6 +1804,21 @@ static inline void add_kallsyms(struct module *mod,
1783} 1804}
1784#endif /* CONFIG_KALLSYMS */ 1805#endif /* CONFIG_KALLSYMS */
1785 1806
1807static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
1808{
1809#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
1810 unsigned int i;
1811
1812 for (i = 0; i < num; i++) {
1813 register_dynamic_debug_module(debug[i].modname,
1814 debug[i].type,
1815 debug[i].logical_modname,
1816 debug[i].flag_names,
1817 debug[i].hash, debug[i].hash2);
1818 }
1819#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1820}
1821
1786static void *module_alloc_update_bounds(unsigned long size) 1822static void *module_alloc_update_bounds(unsigned long size)
1787{ 1823{
1788 void *ret = module_alloc(size); 1824 void *ret = module_alloc(size);
@@ -1806,35 +1842,18 @@ static noinline struct module *load_module(void __user *umod,
1806 Elf_Ehdr *hdr; 1842 Elf_Ehdr *hdr;
1807 Elf_Shdr *sechdrs; 1843 Elf_Shdr *sechdrs;
1808 char *secstrings, *args, *modmagic, *strtab = NULL; 1844 char *secstrings, *args, *modmagic, *strtab = NULL;
1845 char *staging;
1809 unsigned int i; 1846 unsigned int i;
1810 unsigned int symindex = 0; 1847 unsigned int symindex = 0;
1811 unsigned int strindex = 0; 1848 unsigned int strindex = 0;
1812 unsigned int setupindex; 1849 unsigned int modindex, versindex, infoindex, pcpuindex;
1813 unsigned int exindex;
1814 unsigned int exportindex;
1815 unsigned int modindex;
1816 unsigned int obsparmindex;
1817 unsigned int infoindex;
1818 unsigned int gplindex;
1819 unsigned int crcindex;
1820 unsigned int gplcrcindex;
1821 unsigned int versindex;
1822 unsigned int pcpuindex;
1823 unsigned int gplfutureindex;
1824 unsigned int gplfuturecrcindex;
1825 unsigned int unwindex = 0; 1850 unsigned int unwindex = 0;
1826#ifdef CONFIG_UNUSED_SYMBOLS 1851 unsigned int num_kp, num_mcount;
1827 unsigned int unusedindex; 1852 struct kernel_param *kp;
1828 unsigned int unusedcrcindex;
1829 unsigned int unusedgplindex;
1830 unsigned int unusedgplcrcindex;
1831#endif
1832 unsigned int markersindex;
1833 unsigned int markersstringsindex;
1834 struct module *mod; 1853 struct module *mod;
1835 long err = 0; 1854 long err = 0;
1836 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1855 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1837 struct exception_table_entry *extable; 1856 unsigned long *mseg;
1838 mm_segment_t old_fs; 1857 mm_segment_t old_fs;
1839 1858
1840 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1859 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1898,6 +1917,7 @@ static noinline struct module *load_module(void __user *umod,
1898 err = -ENOEXEC; 1917 err = -ENOEXEC;
1899 goto free_hdr; 1918 goto free_hdr;
1900 } 1919 }
1920 /* This is temporary: point mod into copy of data. */
1901 mod = (void *)sechdrs[modindex].sh_addr; 1921 mod = (void *)sechdrs[modindex].sh_addr;
1902 1922
1903 if (symindex == 0) { 1923 if (symindex == 0) {
@@ -1907,22 +1927,6 @@ static noinline struct module *load_module(void __user *umod,
1907 goto free_hdr; 1927 goto free_hdr;
1908 } 1928 }
1909 1929
1910 /* Optional sections */
1911 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1912 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1913 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1914 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1915 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1916 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1917#ifdef CONFIG_UNUSED_SYMBOLS
1918 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1919 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1920 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1921 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1922#endif
1923 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1924 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1925 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1926 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1927 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1928 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
@@ -1960,6 +1964,14 @@ static noinline struct module *load_module(void __user *umod,
1960 goto free_hdr; 1964 goto free_hdr;
1961 } 1965 }
1962 1966
1967 staging = get_modinfo(sechdrs, infoindex, "staging");
1968 if (staging) {
1969 add_taint_module(mod, TAINT_CRAP);
1970 printk(KERN_WARNING "%s: module is from the staging directory,"
1971 " the quality is unknown, you have been warned.\n",
1972 mod->name);
1973 }
1974
1963 /* Now copy in args */ 1975 /* Now copy in args */
1964 args = strndup_user(uargs, ~0UL >> 1); 1976 args = strndup_user(uargs, ~0UL >> 1);
1965 if (IS_ERR(args)) { 1977 if (IS_ERR(args)) {
@@ -2070,42 +2082,57 @@ static noinline struct module *load_module(void __user *umod,
2070 if (err < 0) 2082 if (err < 0)
2071 goto cleanup; 2083 goto cleanup;
2072 2084
2073 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ 2085 /* Now we've got everything in the final locations, we can
2074 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); 2086 * find optional sections. */
2075 mod->syms = (void *)sechdrs[exportindex].sh_addr; 2087 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
2076 if (crcindex) 2088 &num_kp);
2077 mod->crcs = (void *)sechdrs[crcindex].sh_addr; 2089 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2078 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); 2090 sizeof(*mod->syms), &mod->num_syms);
2079 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 2091 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2080 if (gplcrcindex) 2092 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2081 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 2093 sizeof(*mod->gpl_syms),
2082 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 2094 &mod->num_gpl_syms);
2083 sizeof(*mod->gpl_future_syms); 2095 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2084 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 2096 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2085 if (gplfuturecrcindex) 2097 "__ksymtab_gpl_future",
2086 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 2098 sizeof(*mod->gpl_future_syms),
2099 &mod->num_gpl_future_syms);
2100 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2101 "__kcrctab_gpl_future");
2087 2102
2088#ifdef CONFIG_UNUSED_SYMBOLS 2103#ifdef CONFIG_UNUSED_SYMBOLS
2089 mod->num_unused_syms = sechdrs[unusedindex].sh_size / 2104 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2090 sizeof(*mod->unused_syms); 2105 "__ksymtab_unused",
2091 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / 2106 sizeof(*mod->unused_syms),
2092 sizeof(*mod->unused_gpl_syms); 2107 &mod->num_unused_syms);
2093 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; 2108 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2094 if (unusedcrcindex) 2109 "__kcrctab_unused");
2095 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 2110 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2096 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 2111 "__ksymtab_unused_gpl",
2097 if (unusedgplcrcindex) 2112 sizeof(*mod->unused_gpl_syms),
2098 mod->unused_gpl_crcs 2113 &mod->num_unused_gpl_syms);
2099 = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2114 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2115 "__kcrctab_unused_gpl");
2116#endif
2117
2118#ifdef CONFIG_MARKERS
2119 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2120 sizeof(*mod->markers), &mod->num_markers);
2121#endif
2122#ifdef CONFIG_TRACEPOINTS
2123 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2124 "__tracepoints",
2125 sizeof(*mod->tracepoints),
2126 &mod->num_tracepoints);
2100#endif 2127#endif
2101 2128
2102#ifdef CONFIG_MODVERSIONS 2129#ifdef CONFIG_MODVERSIONS
2103 if ((mod->num_syms && !crcindex) 2130 if ((mod->num_syms && !mod->crcs)
2104 || (mod->num_gpl_syms && !gplcrcindex) 2131 || (mod->num_gpl_syms && !mod->gpl_crcs)
2105 || (mod->num_gpl_future_syms && !gplfuturecrcindex) 2132 || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
2106#ifdef CONFIG_UNUSED_SYMBOLS 2133#ifdef CONFIG_UNUSED_SYMBOLS
2107 || (mod->num_unused_syms && !unusedcrcindex) 2134 || (mod->num_unused_syms && !mod->unused_crcs)
2108 || (mod->num_unused_gpl_syms && !unusedgplcrcindex) 2135 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2109#endif 2136#endif
2110 ) { 2137 ) {
2111 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2138 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
@@ -2114,9 +2141,6 @@ static noinline struct module *load_module(void __user *umod,
2114 goto cleanup; 2141 goto cleanup;
2115 } 2142 }
2116#endif 2143#endif
2117 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
2118 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
2119 "__markers_strings");
2120 2144
2121 /* Now do relocations. */ 2145 /* Now do relocations. */
2122 for (i = 1; i < hdr->e_shnum; i++) { 2146 for (i = 1; i < hdr->e_shnum; i++) {
@@ -2139,22 +2163,16 @@ static noinline struct module *load_module(void __user *umod,
2139 if (err < 0) 2163 if (err < 0)
2140 goto cleanup; 2164 goto cleanup;
2141 } 2165 }
2142#ifdef CONFIG_MARKERS
2143 mod->markers = (void *)sechdrs[markersindex].sh_addr;
2144 mod->num_markers =
2145 sechdrs[markersindex].sh_size / sizeof(*mod->markers);
2146#endif
2147 2166
2148 /* Find duplicate symbols */ 2167 /* Find duplicate symbols */
2149 err = verify_export_symbols(mod); 2168 err = verify_export_symbols(mod);
2150
2151 if (err < 0) 2169 if (err < 0)
2152 goto cleanup; 2170 goto cleanup;
2153 2171
2154 /* Set up and sort exception table */ 2172 /* Set up and sort exception table */
2155 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 2173 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2156 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 2174 sizeof(*mod->extable), &mod->num_exentries);
2157 sort_extable(extable, extable + mod->num_exentries); 2175 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2158 2176
2159 /* Finally, copy percpu area over. */ 2177 /* Finally, copy percpu area over. */
2160 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2178 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
@@ -2162,11 +2180,20 @@ static noinline struct module *load_module(void __user *umod,
2162 2180
2163 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2181 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
2164 2182
2165#ifdef CONFIG_MARKERS 2183 if (!mod->taints) {
2166 if (!mod->taints) 2184 struct mod_debug *debug;
2167 marker_update_probe_range(mod->markers, 2185 unsigned int num_debug;
2168 mod->markers + mod->num_markers); 2186
2169#endif 2187 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2188 sizeof(*debug), &num_debug);
2189 dynamic_printk_setup(debug, num_debug);
2190 }
2191
2192 /* sechdrs[0].sh_size is always zero */
2193 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2194 sizeof(*mseg), &num_mcount);
2195 ftrace_init_module(mod, mseg, mseg + num_mcount);
2196
2170 err = module_finalize(hdr, sechdrs, mod); 2197 err = module_finalize(hdr, sechdrs, mod);
2171 if (err < 0) 2198 if (err < 0)
2172 goto cleanup; 2199 goto cleanup;
@@ -2190,30 +2217,24 @@ static noinline struct module *load_module(void __user *umod,
2190 set_fs(old_fs); 2217 set_fs(old_fs);
2191 2218
2192 mod->args = args; 2219 mod->args = args;
2193 if (obsparmindex) 2220 if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
2194 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2221 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2195 mod->name); 2222 mod->name);
2196 2223
2197 /* Now sew it into the lists so we can get lockdep and oops 2224 /* Now sew it into the lists so we can get lockdep and oops
2198 * info during argument parsing. Noone should access us, since 2225 * info during argument parsing. Noone should access us, since
2199 * strong_try_module_get() will fail. */ 2226 * strong_try_module_get() will fail.
2200 stop_machine(__link_module, mod, NULL); 2227 * lockdep/oops can run asynchronous, so use the RCU list insertion
2201 2228 * function to insert in a way safe to concurrent readers.
2202 /* Size of section 0 is 0, so this works well if no params */ 2229 * The mutex protects against concurrent writers.
2203 err = parse_args(mod->name, mod->args, 2230 */
2204 (struct kernel_param *) 2231 list_add_rcu(&mod->list, &modules);
2205 sechdrs[setupindex].sh_addr, 2232
2206 sechdrs[setupindex].sh_size 2233 err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
2207 / sizeof(struct kernel_param),
2208 NULL);
2209 if (err < 0) 2234 if (err < 0)
2210 goto unlink; 2235 goto unlink;
2211 2236
2212 err = mod_sysfs_setup(mod, 2237 err = mod_sysfs_setup(mod, kp, num_kp);
2213 (struct kernel_param *)
2214 sechdrs[setupindex].sh_addr,
2215 sechdrs[setupindex].sh_size
2216 / sizeof(struct kernel_param));
2217 if (err < 0) 2238 if (err < 0)
2218 goto unlink; 2239 goto unlink;
2219 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2240 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
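
The list_add_rcu() above, together with the list_for_each_entry_rcu() conversions through the rest of this file, is the usual RCU publish/read split: module_mutex serializes writers, while readers only need rcu_read_lock() (or disabled preemption, as the lookup paths here use). A self-contained sketch of the same split; "item" and the registry_* helpers are made up, only the list/RCU primitives are real kernel APIs:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/types.h>

struct item {
        struct list_head list;
        int value;
};

static LIST_HEAD(registry);
static DEFINE_MUTEX(registry_mutex);

static void registry_add(struct item *it)
{
        mutex_lock(&registry_mutex);            /* writers serialize here */
        list_add_rcu(&it->list, &registry);     /* publish to RCU readers */
        mutex_unlock(&registry_mutex);
}

static bool registry_contains(int value)
{
        struct item *it;
        bool found = false;

        rcu_read_lock();                        /* readers take no lock */
        list_for_each_entry_rcu(it, &registry, list) {
                if (it->value == value) {
                        found = true;
                        break;
                }
        }
        rcu_read_unlock();
        return found;
}
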
@@ -2236,6 +2257,7 @@ static noinline struct module *load_module(void __user *umod,
2236 cleanup: 2257 cleanup:
2237 kobject_del(&mod->mkobj.kobj); 2258 kobject_del(&mod->mkobj.kobj);
2238 kobject_put(&mod->mkobj.kobj); 2259 kobject_put(&mod->mkobj.kobj);
2260 ftrace_release(mod->module_core, mod->core_size);
2239 free_unload: 2261 free_unload:
2240 module_unload_free(mod); 2262 module_unload_free(mod);
2241 module_free(mod, mod->module_init); 2263 module_free(mod, mod->module_init);
@@ -2401,7 +2423,7 @@ const char *module_address_lookup(unsigned long addr,
2401 const char *ret = NULL; 2423 const char *ret = NULL;
2402 2424
2403 preempt_disable(); 2425 preempt_disable();
2404 list_for_each_entry(mod, &modules, list) { 2426 list_for_each_entry_rcu(mod, &modules, list) {
2405 if (within(addr, mod->module_init, mod->init_size) 2427 if (within(addr, mod->module_init, mod->init_size)
2406 || within(addr, mod->module_core, mod->core_size)) { 2428 || within(addr, mod->module_core, mod->core_size)) {
2407 if (modname) 2429 if (modname)
@@ -2424,7 +2446,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2424 struct module *mod; 2446 struct module *mod;
2425 2447
2426 preempt_disable(); 2448 preempt_disable();
2427 list_for_each_entry(mod, &modules, list) { 2449 list_for_each_entry_rcu(mod, &modules, list) {
2428 if (within(addr, mod->module_init, mod->init_size) || 2450 if (within(addr, mod->module_init, mod->init_size) ||
2429 within(addr, mod->module_core, mod->core_size)) { 2451 within(addr, mod->module_core, mod->core_size)) {
2430 const char *sym; 2452 const char *sym;
@@ -2448,7 +2470,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2448 struct module *mod; 2470 struct module *mod;
2449 2471
2450 preempt_disable(); 2472 preempt_disable();
2451 list_for_each_entry(mod, &modules, list) { 2473 list_for_each_entry_rcu(mod, &modules, list) {
2452 if (within(addr, mod->module_init, mod->init_size) || 2474 if (within(addr, mod->module_init, mod->init_size) ||
2453 within(addr, mod->module_core, mod->core_size)) { 2475 within(addr, mod->module_core, mod->core_size)) {
2454 const char *sym; 2476 const char *sym;
@@ -2475,7 +2497,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2475 struct module *mod; 2497 struct module *mod;
2476 2498
2477 preempt_disable(); 2499 preempt_disable();
2478 list_for_each_entry(mod, &modules, list) { 2500 list_for_each_entry_rcu(mod, &modules, list) {
2479 if (symnum < mod->num_symtab) { 2501 if (symnum < mod->num_symtab) {
2480 *value = mod->symtab[symnum].st_value; 2502 *value = mod->symtab[symnum].st_value;
2481 *type = mod->symtab[symnum].st_info; 2503 *type = mod->symtab[symnum].st_info;
@@ -2518,7 +2540,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2518 ret = mod_find_symname(mod, colon+1); 2540 ret = mod_find_symname(mod, colon+1);
2519 *colon = ':'; 2541 *colon = ':';
2520 } else { 2542 } else {
2521 list_for_each_entry(mod, &modules, list) 2543 list_for_each_entry_rcu(mod, &modules, list)
2522 if ((ret = mod_find_symname(mod, name)) != 0) 2544 if ((ret = mod_find_symname(mod, name)) != 0)
2523 break; 2545 break;
2524 } 2546 }
@@ -2527,23 +2549,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2527} 2549}
2528#endif /* CONFIG_KALLSYMS */ 2550#endif /* CONFIG_KALLSYMS */
2529 2551
2530/* Called by the /proc file system to return a list of modules. */
2531static void *m_start(struct seq_file *m, loff_t *pos)
2532{
2533 mutex_lock(&module_mutex);
2534 return seq_list_start(&modules, *pos);
2535}
2536
2537static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2538{
2539 return seq_list_next(p, &modules, pos);
2540}
2541
2542static void m_stop(struct seq_file *m, void *p)
2543{
2544 mutex_unlock(&module_mutex);
2545}
2546
2547static char *module_flags(struct module *mod, char *buf) 2552static char *module_flags(struct module *mod, char *buf)
2548{ 2553{
2549 int bx = 0; 2554 int bx = 0;
@@ -2552,10 +2557,12 @@ static char *module_flags(struct module *mod, char *buf)
2552 mod->state == MODULE_STATE_GOING || 2557 mod->state == MODULE_STATE_GOING ||
2553 mod->state == MODULE_STATE_COMING) { 2558 mod->state == MODULE_STATE_COMING) {
2554 buf[bx++] = '('; 2559 buf[bx++] = '(';
2555 if (mod->taints & TAINT_PROPRIETARY_MODULE) 2560 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
2556 buf[bx++] = 'P'; 2561 buf[bx++] = 'P';
2557 if (mod->taints & TAINT_FORCED_MODULE) 2562 if (mod->taints & (1 << TAINT_FORCED_MODULE))
2558 buf[bx++] = 'F'; 2563 buf[bx++] = 'F';
2564 if (mod->taints & (1 << TAINT_CRAP))
2565 buf[bx++] = 'C';
2559 /* 2566 /*
2560 * TAINT_FORCED_RMMOD: could be added. 2567 * TAINT_FORCED_RMMOD: could be added.
2561 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2568 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -2575,6 +2582,24 @@ static char *module_flags(struct module *mod, char *buf)
2575 return buf; 2582 return buf;
2576} 2583}
2577 2584
2585#ifdef CONFIG_PROC_FS
2586/* Called by the /proc file system to return a list of modules. */
2587static void *m_start(struct seq_file *m, loff_t *pos)
2588{
2589 mutex_lock(&module_mutex);
2590 return seq_list_start(&modules, *pos);
2591}
2592
2593static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2594{
2595 return seq_list_next(p, &modules, pos);
2596}
2597
2598static void m_stop(struct seq_file *m, void *p)
2599{
2600 mutex_unlock(&module_mutex);
2601}
2602
2578static int m_show(struct seq_file *m, void *p) 2603static int m_show(struct seq_file *m, void *p)
2579{ 2604{
2580 struct module *mod = list_entry(p, struct module, list); 2605 struct module *mod = list_entry(p, struct module, list);
@@ -2605,13 +2630,33 @@ static int m_show(struct seq_file *m, void *p)
2605 Where refcount is a number or -, and deps is a comma-separated list 2630 Where refcount is a number or -, and deps is a comma-separated list
2606 of depends or -. 2631 of depends or -.
2607*/ 2632*/
2608const struct seq_operations modules_op = { 2633static const struct seq_operations modules_op = {
2609 .start = m_start, 2634 .start = m_start,
2610 .next = m_next, 2635 .next = m_next,
2611 .stop = m_stop, 2636 .stop = m_stop,
2612 .show = m_show 2637 .show = m_show
2613}; 2638};
2614 2639
2640static int modules_open(struct inode *inode, struct file *file)
2641{
2642 return seq_open(file, &modules_op);
2643}
2644
2645static const struct file_operations proc_modules_operations = {
2646 .open = modules_open,
2647 .read = seq_read,
2648 .llseek = seq_lseek,
2649 .release = seq_release,
2650};
2651
2652static int __init proc_modules_init(void)
2653{
2654 proc_create("modules", 0, NULL, &proc_modules_operations);
2655 return 0;
2656}
2657module_init(proc_modules_init);
2658#endif
2659
2615/* Given an address, look for it in the module exception tables. */ 2660/* Given an address, look for it in the module exception tables. */
2616const struct exception_table_entry *search_module_extables(unsigned long addr) 2661const struct exception_table_entry *search_module_extables(unsigned long addr)
2617{ 2662{
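
The /proc/modules registration added above (a seq_operations table wrapped by modules_open() and handed to proc_create()) is the stock seq_file recipe; for a file with a single record, single_open() trims it further. A minimal, hypothetical module using the same registration calls:

/* Illustrative module, not part of this patch. */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "hello from the demo\n");
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = demo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init demo_init(void)
{
        proc_create("seq_demo", 0, NULL, &demo_proc_fops);
        return 0;
}

static void __exit demo_exit(void)
{
        remove_proc_entry("seq_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

After insmod the entry appears as /proc/seq_demo and goes away again on rmmod.
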
@@ -2619,7 +2664,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2619 struct module *mod; 2664 struct module *mod;
2620 2665
2621 preempt_disable(); 2666 preempt_disable();
2622 list_for_each_entry(mod, &modules, list) { 2667 list_for_each_entry_rcu(mod, &modules, list) {
2623 if (mod->num_exentries == 0) 2668 if (mod->num_exentries == 0)
2624 continue; 2669 continue;
2625 2670
@@ -2645,7 +2690,7 @@ int is_module_address(unsigned long addr)
2645 2690
2646 preempt_disable(); 2691 preempt_disable();
2647 2692
2648 list_for_each_entry(mod, &modules, list) { 2693 list_for_each_entry_rcu(mod, &modules, list) {
2649 if (within(addr, mod->module_core, mod->core_size)) { 2694 if (within(addr, mod->module_core, mod->core_size)) {
2650 preempt_enable(); 2695 preempt_enable();
2651 return 1; 2696 return 1;
@@ -2659,14 +2704,14 @@ int is_module_address(unsigned long addr)
2659 2704
2660 2705
2661/* Is this a valid kernel address? */ 2706/* Is this a valid kernel address? */
2662struct module *__module_text_address(unsigned long addr) 2707__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2663{ 2708{
2664 struct module *mod; 2709 struct module *mod;
2665 2710
2666 if (addr < module_addr_min || addr > module_addr_max) 2711 if (addr < module_addr_min || addr > module_addr_max)
2667 return NULL; 2712 return NULL;
2668 2713
2669 list_for_each_entry(mod, &modules, list) 2714 list_for_each_entry_rcu(mod, &modules, list)
2670 if (within(addr, mod->module_init, mod->init_text_size) 2715 if (within(addr, mod->module_init, mod->init_text_size)
2671 || within(addr, mod->module_core, mod->core_text_size)) 2716 || within(addr, mod->module_core, mod->core_text_size))
2672 return mod; 2717 return mod;
@@ -2691,8 +2736,11 @@ void print_modules(void)
2691 char buf[8]; 2736 char buf[8];
2692 2737
2693 printk("Modules linked in:"); 2738 printk("Modules linked in:");
2694 list_for_each_entry(mod, &modules, list) 2739 /* Most callers should already have preempt disabled, but make sure */
2740 preempt_disable();
2741 list_for_each_entry_rcu(mod, &modules, list)
2695 printk(" %s%s", mod->name, module_flags(mod, buf)); 2742 printk(" %s%s", mod->name, module_flags(mod, buf));
2743 preempt_enable();
2696 if (last_unloaded_module[0]) 2744 if (last_unloaded_module[0])
2697 printk(" [last unloaded: %s]", last_unloaded_module); 2745 printk(" [last unloaded: %s]", last_unloaded_module);
2698 printk("\n"); 2746 printk("\n");
@@ -2717,3 +2765,50 @@ void module_update_markers(void)
2717 mutex_unlock(&module_mutex); 2765 mutex_unlock(&module_mutex);
2718} 2766}
2719#endif 2767#endif
2768
2769#ifdef CONFIG_TRACEPOINTS
2770void module_update_tracepoints(void)
2771{
2772 struct module *mod;
2773
2774 mutex_lock(&module_mutex);
2775 list_for_each_entry(mod, &modules, list)
2776 if (!mod->taints)
2777 tracepoint_update_probe_range(mod->tracepoints,
2778 mod->tracepoints + mod->num_tracepoints);
2779 mutex_unlock(&module_mutex);
2780}
2781
2782/*
2783 * Returns 0 if current not found.
2784 * Returns 1 if current found.
2785 */
2786int module_get_iter_tracepoints(struct tracepoint_iter *iter)
2787{
2788 struct module *iter_mod;
2789 int found = 0;
2790
2791 mutex_lock(&module_mutex);
2792 list_for_each_entry(iter_mod, &modules, list) {
2793 if (!iter_mod->taints) {
2794 /*
2795 * Sorted module list
2796 */
2797 if (iter_mod < iter->module)
2798 continue;
2799 else if (iter_mod > iter->module)
2800 iter->tracepoint = NULL;
2801 found = tracepoint_get_iter_range(&iter->tracepoint,
2802 iter_mod->tracepoints,
2803 iter_mod->tracepoints
2804 + iter_mod->num_tracepoints);
2805 if (found) {
2806 iter->module = iter_mod;
2807 break;
2808 }
2809 }
2810 }
2811 mutex_unlock(&module_mutex);
2812 return found;
2813}
2814#endif
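
module_get_iter_tracepoints() above resumes a cross-module walk by remembering which module it stopped in, skipping entries that compare below it (relying on the "Sorted module list" noted in the code), and resetting the per-module position whenever it moves on to a new module. The same resumable-cursor idea, reduced to a userspace toy with illustrative names:

#include <stdio.h>
#include <stddef.h>

struct container {
        const char *items[4];
        size_t nitems;
};

struct iter {
        const struct container *cont;   /* where the last call stopped */
        size_t idx;                     /* position within that container */
};

/* Return 1 and fill *out with the next item, 0 when exhausted. */
static int iter_next(struct iter *it, const struct container *conts,
                     size_t nconts, const char **out)
{
        for (size_t i = 0; i < nconts; i++) {
                const struct container *c = &conts[i];

                if (it->cont && c < it->cont)
                        continue;               /* already fully visited */
                if (c != it->cont)
                        it->idx = 0;            /* entering a new container */
                if (it->idx < c->nitems) {
                        it->cont = c;
                        *out = c->items[it->idx++];
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        const struct container conts[] = {
                { { "a", "b" }, 2 },
                { { "c" }, 1 },
        };
        struct iter it = { NULL, 0 };
        const char *s;

        while (iter_next(&it, conts, 2, &s))
                printf("%s\n", s);
        return 0;
}
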
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
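
The mutex slowpaths gain __used because their only callers sit inside the architecture fastpath's inline assembly, which the compiler cannot see as a reference; without the annotation an optimizing build may discard the static functions. A tiny stand-alone illustration of the attribute that __used essentially expands to (GCC/Clang syntax assumed):

/* Build: gcc -O2 -c used_demo.c && nm used_demo.o
 * kept() survives as a local symbol even though nothing in this file
 * references it; dropped() is eliminated at -O2. */
static __attribute__((used, noinline)) void kept(void)
{
}

static void dropped(void)
{
}
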
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 823be11584ef..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
@@ -550,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
550 558
551static ATOMIC_NOTIFIER_HEAD(die_chain); 559static ATOMIC_NOTIFIER_HEAD(die_chain);
552 560
553int notify_die(enum die_val val, const char *str, 561int notrace notify_die(enum die_val val, const char *str,
554 struct pt_regs *regs, long err, int trap, int sig) 562 struct pt_regs *regs, long err, int trap, int sig)
555{ 563{
556 struct die_args args = { 564 struct die_args args = {
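
The CONFIG_DEBUG_NOTIFIERS hunk above refuses to jump through a notifier whose callback does not look like kernel text, warning and skipping instead of crashing. The same defensive-call shape in plain C, with a hard-coded validity check standing in for func_ptr_is_kernel_text(); every name here is illustrative:

#include <stdio.h>
#include <stdbool.h>

typedef int (*notifier_fn)(void *data);

static int hello(void *data)
{
        printf("notified: %s\n", (const char *)data);
        return 0;
}

/* Accept only callbacks we know about; a stand-in for "is this text?". */
static bool looks_valid(notifier_fn fn)
{
        return fn == hello;
}

static void call_chain(notifier_fn *chain, int n, void *data)
{
        for (int i = 0; i < n; i++) {
                if (!looks_valid(chain[i])) {
                        fprintf(stderr, "invalid notifier %p, skipping\n",
                                (void *)chain[i]);
                        continue;
                }
                chain[i](data);
        }
}

int main(void)
{
        /* Second entry is deliberately bogus, like a stale registration. */
        notifier_fn chain[] = { hello, (notifier_fn)0x1234 };

        call_chain(chain, 2, "boot");
        return 0;
}
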
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29a2583..63598dca2d0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
80 goto out_pid; 80 goto out_pid;
81 } 81 }
82 82
83 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
84 if (IS_ERR(new_nsp->user_ns)) {
85 err = PTR_ERR(new_nsp->user_ns);
86 goto out_user;
87 }
88
89 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 83 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
90 if (IS_ERR(new_nsp->net_ns)) { 84 if (IS_ERR(new_nsp->net_ns)) {
91 err = PTR_ERR(new_nsp->net_ns); 85 err = PTR_ERR(new_nsp->net_ns);
@@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
95 return new_nsp; 89 return new_nsp;
96 90
97out_net: 91out_net:
98 if (new_nsp->user_ns)
99 put_user_ns(new_nsp->user_ns);
100out_user:
101 if (new_nsp->pid_ns) 92 if (new_nsp->pid_ns)
102 put_pid_ns(new_nsp->pid_ns); 93 put_pid_ns(new_nsp->pid_ns);
103out_pid: 94out_pid:
@@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
130 get_nsproxy(old_ns); 121 get_nsproxy(old_ns);
131 122
132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 123 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) 124 CLONE_NEWPID | CLONE_NEWNET)))
134 return 0; 125 return 0;
135 126
136 if (!capable(CAP_SYS_ADMIN)) { 127 if (!capable(CAP_SYS_ADMIN)) {
@@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns)
173 put_ipc_ns(ns->ipc_ns); 164 put_ipc_ns(ns->ipc_ns);
174 if (ns->pid_ns) 165 if (ns->pid_ns)
175 put_pid_ns(ns->pid_ns); 166 put_pid_ns(ns->pid_ns);
176 if (ns->user_ns)
177 put_user_ns(ns->user_ns);
178 put_net(ns->net_ns); 167 put_net(ns->net_ns);
179 kmem_cache_free(nsproxy_cachep, ns); 168 kmem_cache_free(nsproxy_cachep, ns);
180} 169}
@@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
189 int err = 0; 178 int err = 0;
190 179
191 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 180 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
192 CLONE_NEWUSER | CLONE_NEWNET))) 181 CLONE_NEWNET)))
193 return 0; 182 return 0;
194 183
195 if (!capable(CAP_SYS_ADMIN)) 184 if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/panic.c b/kernel/panic.c
index e0a87bb025c0..3a0b0898690a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,9 +21,10 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26int tainted; 27static unsigned long tainted_mask;
27static int pause_on_oops; 28static int pause_on_oops;
28static int pause_on_oops_flag; 29static int pause_on_oops_flag;
29static DEFINE_SPINLOCK(pause_on_oops_lock); 30static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -34,13 +35,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
34 35
35EXPORT_SYMBOL(panic_notifier_list); 36EXPORT_SYMBOL(panic_notifier_list);
36 37
37static int __init panic_setup(char *str)
38{
39 panic_timeout = simple_strtoul(str, NULL, 0);
40 return 1;
41}
42__setup("panic=", panic_setup);
43
44static long no_blink(long time) 38static long no_blink(long time)
45{ 39{
46 return 0; 40 return 0;
@@ -146,6 +140,27 @@ NORET_TYPE void panic(const char * fmt, ...)
146 140
147EXPORT_SYMBOL(panic); 141EXPORT_SYMBOL(panic);
148 142
143
144struct tnt {
145 u8 bit;
146 char true;
147 char false;
148};
149
150static const struct tnt tnts[] = {
151 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
152 { TAINT_FORCED_MODULE, 'F', ' ' },
153 { TAINT_UNSAFE_SMP, 'S', ' ' },
154 { TAINT_FORCED_RMMOD, 'R', ' ' },
155 { TAINT_MACHINE_CHECK, 'M', ' ' },
156 { TAINT_BAD_PAGE, 'B', ' ' },
157 { TAINT_USER, 'U', ' ' },
158 { TAINT_DIE, 'D', ' ' },
159 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
160 { TAINT_WARN, 'W', ' ' },
161 { TAINT_CRAP, 'C', ' ' },
162};
163
149/** 164/**
150 * print_tainted - return a string to represent the kernel taint state. 165 * print_tainted - return a string to represent the kernel taint state.
151 * 166 *
@@ -156,46 +171,50 @@ EXPORT_SYMBOL(panic);
156 * 'M' - System experienced a machine check exception. 171 * 'M' - System experienced a machine check exception.
157 * 'B' - System has hit bad_page. 172 * 'B' - System has hit bad_page.
158 * 'U' - Userspace-defined naughtiness. 173 * 'U' - Userspace-defined naughtiness.
174 * 'D' - Kernel has oopsed before
159 * 'A' - ACPI table overridden. 175 * 'A' - ACPI table overridden.
160 * 'W' - Taint on warning. 176 * 'W' - Taint on warning.
177 * 'C' - modules from drivers/staging are loaded.
161 * 178 *
162 * The string is overwritten by the next call to print_taint(). 179 * The string is overwritten by the next call to print_taint().
163 */ 180 */
164
165const char *print_tainted(void) 181const char *print_tainted(void)
166{ 182{
167 static char buf[20]; 183 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
168 if (tainted) { 184
169 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", 185 if (tainted_mask) {
170 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 186 char *s;
171 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 187 int i;
172 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 188
173 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 189 s = buf + sprintf(buf, "Tainted: ");
174 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 190 for (i = 0; i < ARRAY_SIZE(tnts); i++) {
175 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 191 const struct tnt *t = &tnts[i];
176 tainted & TAINT_USER ? 'U' : ' ', 192 *s++ = test_bit(t->bit, &tainted_mask) ?
177 tainted & TAINT_DIE ? 'D' : ' ', 193 t->true : t->false;
178 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', 194 }
179 tainted & TAINT_WARN ? 'W' : ' '); 195 *s = 0;
180 } 196 } else
181 else
182 snprintf(buf, sizeof(buf), "Not tainted"); 197 snprintf(buf, sizeof(buf), "Not tainted");
183 return(buf); 198 return(buf);
184} 199}
185 200
186void add_taint(unsigned flag) 201int test_taint(unsigned flag)
187{ 202{
188 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 203 return test_bit(flag, &tainted_mask);
189 tainted |= flag; 204}
205EXPORT_SYMBOL(test_taint);
206
207unsigned long get_taint(void)
208{
209 return tainted_mask;
190} 210}
191EXPORT_SYMBOL(add_taint);
192 211
193static int __init pause_on_oops_setup(char *str) 212void add_taint(unsigned flag)
194{ 213{
195 pause_on_oops = simple_strtoul(str, NULL, 0); 214 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
196 return 1; 215 set_bit(flag, &tainted_mask);
197} 216}
198__setup("pause_on_oops=", pause_on_oops_setup); 217EXPORT_SYMBOL(add_taint);
199 218
200static void spin_msec(int msecs) 219static void spin_msec(int msecs)
201{ 220{
@@ -306,36 +325,27 @@ void oops_exit(void)
306} 325}
307 326
308#ifdef WANT_WARN_ON_SLOWPATH 327#ifdef WANT_WARN_ON_SLOWPATH
309void warn_on_slowpath(const char *file, int line)
310{
311 char function[KSYM_SYMBOL_LEN];
312 unsigned long caller = (unsigned long) __builtin_return_address(0);
313 sprint_symbol(function, caller);
314
315 printk(KERN_WARNING "------------[ cut here ]------------\n");
316 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
317 line, function);
318 print_modules();
319 dump_stack();
320 print_oops_end_marker();
321 add_taint(TAINT_WARN);
322}
323EXPORT_SYMBOL(warn_on_slowpath);
324
325
326void warn_slowpath(const char *file, int line, const char *fmt, ...) 328void warn_slowpath(const char *file, int line, const char *fmt, ...)
327{ 329{
328 va_list args; 330 va_list args;
329 char function[KSYM_SYMBOL_LEN]; 331 char function[KSYM_SYMBOL_LEN];
330 unsigned long caller = (unsigned long)__builtin_return_address(0); 332 unsigned long caller = (unsigned long)__builtin_return_address(0);
333 const char *board;
334
331 sprint_symbol(function, caller); 335 sprint_symbol(function, caller);
332 336
333 printk(KERN_WARNING "------------[ cut here ]------------\n"); 337 printk(KERN_WARNING "------------[ cut here ]------------\n");
334 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 338 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
335 line, function); 339 line, function);
336 va_start(args, fmt); 340 board = dmi_get_system_info(DMI_PRODUCT_NAME);
337 vprintk(fmt, args); 341 if (board)
338 va_end(args); 342 printk(KERN_WARNING "Hardware name: %s\n", board);
343
344 if (fmt) {
345 va_start(args, fmt);
346 vprintk(fmt, args);
347 va_end(args);
348 }
339 349
340 print_modules(); 350 print_modules();
341 dump_stack(); 351 dump_stack();
@@ -363,3 +373,6 @@ void __stack_chk_fail(void)
363EXPORT_SYMBOL(__stack_chk_fail); 373EXPORT_SYMBOL(__stack_chk_fail);
364 374
365#endif 375#endif
376
377core_param(panic, panic_timeout, int, 0644);
378core_param(pause_on_oops, pause_on_oops, int, 0644);
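
The switch above from an int of OR-ed flags to a tainted_mask manipulated with set_bit()/test_bit(), plus the tnts[] table, makes print_tainted() purely data-driven; the old __setup() handlers are likewise replaced by core_param(). The table-driven rendering transplanted to userspace; the bit numbers below are invented for the demo and are not the kernel's TAINT_* values:

#include <stdio.h>

struct tnt {
        unsigned int bit;
        char t;                 /* letter when the bit is set */
        char f;                 /* letter when it is clear */
};

/* Same letters as the kernel table above, demo bit numbers. */
static const struct tnt tnts[] = {
        { 0, 'P', 'G' }, { 1, 'F', ' ' }, { 2, 'S', ' ' }, { 3, 'R', ' ' },
        { 4, 'M', ' ' }, { 5, 'B', ' ' }, { 6, 'U', ' ' }, { 7, 'D', ' ' },
        { 8, 'A', ' ' }, { 9, 'W', ' ' }, { 10, 'C', ' ' },
};

static const char *print_tainted(unsigned long mask)
{
        static char buf[sizeof("Tainted: ") + sizeof(tnts) / sizeof(tnts[0]) + 1];
        char *s;
        size_t i;

        if (!mask)
                return "Not tainted";

        s = buf + sprintf(buf, "Tainted: ");
        for (i = 0; i < sizeof(tnts) / sizeof(tnts[0]); i++)
                *s++ = (mask & (1UL << tnts[i].bit)) ? tnts[i].t : tnts[i].f;
        *s = '\0';
        return buf;
}

int main(void)
{
        printf("%s\n", print_tainted(0));
        printf("%s\n", print_tainted((1UL << 0) | (1UL << 7)));
        return 0;
}
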
diff --git a/kernel/params.c b/kernel/params.c
index afc46a23eb6d..a1e3025b19a9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
373} 373}
374 374
375/* sysfs output in /sys/modules/XYZ/parameters/ */ 375/* sysfs output in /sys/modules/XYZ/parameters/ */
376#define to_module_attr(n) container_of(n, struct module_attribute, attr);
377#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
376 378
377extern struct kernel_param __start___param[], __stop___param[]; 379extern struct kernel_param __start___param[], __stop___param[];
378 380
@@ -384,6 +386,7 @@ struct param_attribute
384 386
385struct module_param_attrs 387struct module_param_attrs
386{ 388{
389 unsigned int num;
387 struct attribute_group grp; 390 struct attribute_group grp;
388 struct param_attribute attrs[0]; 391 struct param_attribute attrs[0];
389}; 392};
@@ -434,93 +437,120 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
434 437
435#ifdef CONFIG_SYSFS 438#ifdef CONFIG_SYSFS
436/* 439/*
437 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME 440 * add_sysfs_param - add a parameter to sysfs
438 * @mk: struct module_kobject (contains parent kobject) 441 * @mk: struct module_kobject
439 * @kparam: array of struct kernel_param, the actual parameter definitions 442 * @kparam: the actual parameter definition to add to sysfs
440 * @num_params: number of entries in array 443 * @name: name of parameter
441 * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
442 * 444 *
443 * Create a kobject for a (per-module) group of parameters, and create files 445 * Create a kobject if for a (per-module) parameter if mp NULL, and
444 * in sysfs. A pointer to the param_kobject is returned on success, 446 * create file in sysfs. Returns an error on out of memory. Always cleans up
445 * NULL if there's no parameter to export, or other ERR_PTR(err). 447 * if there's an error.
446 */ 448 */
447static __modinit struct module_param_attrs * 449static __modinit int add_sysfs_param(struct module_kobject *mk,
448param_sysfs_setup(struct module_kobject *mk, 450 struct kernel_param *kp,
449 struct kernel_param *kparam, 451 const char *name)
450 unsigned int num_params,
451 unsigned int name_skip)
452{ 452{
453 struct module_param_attrs *mp; 453 struct module_param_attrs *new;
454 unsigned int valid_attrs = 0; 454 struct attribute **attrs;
455 unsigned int i, size[2]; 455 int err, num;
456 struct param_attribute *pattr; 456
457 struct attribute **gattr; 457 /* We don't bother calling this with invisible parameters. */
458 int err; 458 BUG_ON(!kp->perm);
459 459
460 for (i=0; i<num_params; i++) { 460 if (!mk->mp) {
461 if (kparam[i].perm) 461 num = 0;
462 valid_attrs++; 462 attrs = NULL;
463 } else {
464 num = mk->mp->num;
465 attrs = mk->mp->grp.attrs;
463 } 466 }
464 467
465 if (!valid_attrs) 468 /* Enlarge. */
466 return NULL; 469 new = krealloc(mk->mp,
467 470 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
468 size[0] = ALIGN(sizeof(*mp) + 471 GFP_KERNEL);
469 valid_attrs * sizeof(mp->attrs[0]), 472 if (!new) {
470 sizeof(mp->grp.attrs[0])); 473 kfree(mk->mp);
471 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); 474 err = -ENOMEM;
472 475 goto fail;
473 mp = kzalloc(size[0] + size[1], GFP_KERNEL);
474 if (!mp)
475 return ERR_PTR(-ENOMEM);
476
477 mp->grp.name = "parameters";
478 mp->grp.attrs = (void *)mp + size[0];
479
480 pattr = &mp->attrs[0];
481 gattr = &mp->grp.attrs[0];
482 for (i = 0; i < num_params; i++) {
483 struct kernel_param *kp = &kparam[i];
484 if (kp->perm) {
485 pattr->param = kp;
486 pattr->mattr.show = param_attr_show;
487 pattr->mattr.store = param_attr_store;
488 pattr->mattr.attr.name = (char *)&kp->name[name_skip];
489 pattr->mattr.attr.mode = kp->perm;
490 *(gattr++) = &(pattr++)->mattr.attr;
491 }
492 } 476 }
493 *gattr = NULL; 477 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
494 478 if (!attrs) {
495 if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { 479 err = -ENOMEM;
496 kfree(mp); 480 goto fail_free_new;
497 return ERR_PTR(err);
498 } 481 }
499 return mp; 482
483 /* Sysfs wants everything zeroed. */
484 memset(new, 0, sizeof(*new));
485 memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
486 memset(&attrs[num], 0, sizeof(attrs[num]));
487 new->grp.name = "parameters";
488 new->grp.attrs = attrs;
489
490 /* Tack new one on the end. */
491 new->attrs[num].param = kp;
492 new->attrs[num].mattr.show = param_attr_show;
493 new->attrs[num].mattr.store = param_attr_store;
494 new->attrs[num].mattr.attr.name = (char *)name;
495 new->attrs[num].mattr.attr.mode = kp->perm;
496 new->num = num+1;
497
498 /* Fix up all the pointers, since krealloc can move us */
499 for (num = 0; num < new->num; num++)
500 new->grp.attrs[num] = &new->attrs[num].mattr.attr;
501 new->grp.attrs[num] = NULL;
502
503 mk->mp = new;
504 return 0;
505
506fail_free_new:
507 kfree(new);
508fail:
509 mk->mp = NULL;
510 return err;
500} 511}
501 512
502#ifdef CONFIG_MODULES 513#ifdef CONFIG_MODULES
514static void free_module_param_attrs(struct module_kobject *mk)
515{
516 kfree(mk->mp->grp.attrs);
517 kfree(mk->mp);
518 mk->mp = NULL;
519}
520
503/* 521/*
504 * module_param_sysfs_setup - setup sysfs support for one module 522 * module_param_sysfs_setup - setup sysfs support for one module
505 * @mod: module 523 * @mod: module
506 * @kparam: module parameters (array) 524 * @kparam: module parameters (array)
507 * @num_params: number of module parameters 525 * @num_params: number of module parameters
508 * 526 *
509 * Adds sysfs entries for module parameters, and creates a link from 527 * Adds sysfs entries for module parameters under
510 * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ 528 * /sys/module/[mod->name]/parameters/
511 */ 529 */
512int module_param_sysfs_setup(struct module *mod, 530int module_param_sysfs_setup(struct module *mod,
513 struct kernel_param *kparam, 531 struct kernel_param *kparam,
514 unsigned int num_params) 532 unsigned int num_params)
515{ 533{
516 struct module_param_attrs *mp; 534 int i, err;
535 bool params = false;
536
537 for (i = 0; i < num_params; i++) {
538 if (kparam[i].perm == 0)
539 continue;
540 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
541 if (err)
542 return err;
543 params = true;
544 }
517 545
518 mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); 546 if (!params)
519 if (IS_ERR(mp)) 547 return 0;
520 return PTR_ERR(mp);
521 548
522 mod->param_attrs = mp; 549 /* Create the param group. */
523 return 0; 550 err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
551 if (err)
552 free_module_param_attrs(&mod->mkobj);
553 return err;
524} 554}
525 555
526/* 556/*
@@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod,
532 */ 562 */
533void module_param_sysfs_remove(struct module *mod) 563void module_param_sysfs_remove(struct module *mod)
534{ 564{
535 if (mod->param_attrs) { 565 if (mod->mkobj.mp) {
536 sysfs_remove_group(&mod->mkobj.kobj, 566 sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
537 &mod->param_attrs->grp);
538 /* We are positive that no one is using any param 567 /* We are positive that no one is using any param
539 * attrs at this point. Deallocate immediately. */ 568 * attrs at this point. Deallocate immediately. */
540 kfree(mod->param_attrs); 569 free_module_param_attrs(&mod->mkobj);
541 mod->param_attrs = NULL;
542 } 570 }
543} 571}
544#endif 572#endif
545 573
546/* 574static void __init kernel_add_sysfs_param(const char *name,
547 * kernel_param_sysfs_setup - wrapper for built-in params support 575 struct kernel_param *kparam,
548 */ 576 unsigned int name_skip)
549static void __init kernel_param_sysfs_setup(const char *name,
550 struct kernel_param *kparam,
551 unsigned int num_params,
552 unsigned int name_skip)
553{ 577{
554 struct module_kobject *mk; 578 struct module_kobject *mk;
555 int ret; 579 struct kobject *kobj;
580 int err;
556 581
557 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 582 kobj = kset_find_obj(module_kset, name);
558 BUG_ON(!mk); 583 if (kobj) {
559 584 /* We already have one. Remove params so we can add more. */
560 mk->mod = THIS_MODULE; 585 mk = to_module_kobject(kobj);
561 mk->kobj.kset = module_kset; 586 /* We need to remove it before adding parameters. */
562 ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); 587 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
563 if (ret) { 588 } else {
564 kobject_put(&mk->kobj); 589 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
565 printk(KERN_ERR "Module '%s' failed to be added to sysfs, " 590 BUG_ON(!mk);
566 "error number %d\n", name, ret); 591
567 printk(KERN_ERR "The system will be unstable now.\n"); 592 mk->mod = THIS_MODULE;
568 return; 593 mk->kobj.kset = module_kset;
594 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
595 "%s", name);
596 if (err) {
597 kobject_put(&mk->kobj);
598 printk(KERN_ERR "Module '%s' failed add to sysfs, "
599 "error number %d\n", name, err);
600 printk(KERN_ERR "The system will be unstable now.\n");
601 return;
602 }
603 /* So that exit path is even. */
604 kobject_get(&mk->kobj);
569 } 605 }
570 param_sysfs_setup(mk, kparam, num_params, name_skip); 606
607 /* These should not fail at boot. */
608 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
609 BUG_ON(err);
610 err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
611 BUG_ON(err);
571 kobject_uevent(&mk->kobj, KOBJ_ADD); 612 kobject_uevent(&mk->kobj, KOBJ_ADD);
613 kobject_put(&mk->kobj);
572} 614}
573 615
574/* 616/*
@@ -579,60 +621,36 @@ static void __init kernel_param_sysfs_setup(const char *name,
579 * The "module" name (KBUILD_MODNAME) is stored before a dot, the 621 * The "module" name (KBUILD_MODNAME) is stored before a dot, the
580 * "parameter" name is stored behind a dot in kernel_param->name. So, 622 * "parameter" name is stored behind a dot in kernel_param->name. So,
581 * extract the "module" name for all built-in kernel_param-eters, 623 * extract the "module" name for all built-in kernel_param-eters,
582 * and for all who have the same, call kernel_param_sysfs_setup. 624 * and for all who have the same, call kernel_add_sysfs_param.
583 */ 625 */
584static void __init param_sysfs_builtin(void) 626static void __init param_sysfs_builtin(void)
585{ 627{
586 struct kernel_param *kp, *kp_begin = NULL; 628 struct kernel_param *kp;
587 unsigned int i, name_len, count = 0; 629 unsigned int name_len;
588 char modname[MODULE_NAME_LEN + 1] = ""; 630 char modname[MODULE_NAME_LEN];
589 631
590 for (i=0; i < __stop___param - __start___param; i++) { 632 for (kp = __start___param; kp < __stop___param; kp++) {
591 char *dot; 633 char *dot;
592 size_t max_name_len;
593 634
594 kp = &__start___param[i]; 635 if (kp->perm == 0)
595 max_name_len = 636 continue;
596 min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
597 637
598 dot = memchr(kp->name, '.', max_name_len); 638 dot = strchr(kp->name, '.');
599 if (!dot) { 639 if (!dot) {
600 DEBUGP("couldn't find period in first %d characters " 640 /* This happens for core_param() */
601 "of %s\n", MODULE_NAME_LEN, kp->name); 641 strcpy(modname, "kernel");
602 continue; 642 name_len = 0;
603 } 643 } else {
604 name_len = dot - kp->name; 644 name_len = dot - kp->name + 1;
605 645 strlcpy(modname, kp->name, name_len);
606 /* new kbuild_modname? */
607 if (strlen(modname) != name_len
608 || strncmp(modname, kp->name, name_len) != 0) {
609 /* add a new kobject for previous kernel_params. */
610 if (count)
611 kernel_param_sysfs_setup(modname,
612 kp_begin,
613 count,
614 strlen(modname)+1);
615
616 strncpy(modname, kp->name, name_len);
617 modname[name_len] = '\0';
618 count = 0;
619 kp_begin = kp;
620 } 646 }
621 count++; 647 kernel_add_sysfs_param(modname, kp, name_len);
622 } 648 }
623
624 /* last kernel_params need to be registered as well */
625 if (count)
626 kernel_param_sysfs_setup(modname, kp_begin, count,
627 strlen(modname)+1);
628} 649}
629 650
630 651
631/* module-related sysfs stuff */ 652/* module-related sysfs stuff */
632 653
633#define to_module_attr(n) container_of(n, struct module_attribute, attr);
634#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
635
636static ssize_t module_attr_show(struct kobject *kobj, 654static ssize_t module_attr_show(struct kobject *kobj,
637 struct attribute *attr, 655 struct attribute *attr,
638 char *buf) 656 char *buf)
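
add_sysfs_param() above grows the module_param_attrs block and its grp.attrs array one entry at a time with krealloc(), then rebuilds every attribute pointer because the reallocation may have moved the element storage. The same grow-and-refix pattern with libc realloc() and hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct attr {
        const char *name;
};

struct group {
        unsigned int num;
        struct attr **ptrs;     /* NULL-terminated view, parallels items[] */
        struct attr items[];    /* grows by one element per add */
};

/* Append one attr; returns the (possibly moved) group, or NULL on error,
 * in which case the whole group has been released. */
static struct group *group_add(struct group *g, const char *name)
{
        unsigned int num = g ? g->num : 0;
        struct attr **old_ptrs = g ? g->ptrs : NULL;
        struct group *ng;
        struct attr **ptrs;
        unsigned int i;

        ng = realloc(g, sizeof(*ng) + sizeof(ng->items[0]) * (num + 1));
        if (!ng) {
                free(old_ptrs);
                free(g);
                return NULL;
        }
        ptrs = realloc(old_ptrs, sizeof(*ptrs) * (num + 2));
        if (!ptrs) {
                free(old_ptrs);
                free(ng);
                return NULL;
        }

        ng->items[num].name = name;
        ng->num = num + 1;
        ng->ptrs = ptrs;

        /* realloc() may have moved items[]: rebuild every pointer. */
        for (i = 0; i < ng->num; i++)
                ng->ptrs[i] = &ng->items[i];
        ng->ptrs[i] = NULL;
        return ng;
}

int main(void)
{
        struct group *g = group_add(NULL, "alpha");

        g = group_add(g, "beta");
        if (!g)
                return 1;
        for (struct attr **p = g->ptrs; *p; p++)
                printf("%s\n", (*p)->name);
        free(g->ptrs);
        free(g);
        return 0;
}
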
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,6 +7,93 @@
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h>
11
12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct task_cputime *totals, *tot;
62 int i;
63
64 totals = tsk->signal->cputime.totals;
65 if (!totals) {
66 times->utime = tsk->utime;
67 times->stime = tsk->stime;
68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
69 return;
70 }
71
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */
85void update_rlimit_cpu(unsigned long rlim_new)
86{
87 cputime_t cputime;
88
89 cputime = secs_to_cputime(rlim_new);
90 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
91 cputime_lt(current->signal->it_prof_expires, cputime)) {
92 spin_lock_irq(&current->sighand->siglock);
93 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
94 spin_unlock_irq(&current->sighand->siglock);
95 }
96}
10 97
 11static int check_clock(const clockid_t which_clock) 98static int check_clock(const clockid_t which_clock)
12{ 99{
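
The new thread_group_cputime() above replaces the do/while walks over every sibling thread with a per-CPU accumulator: alloc_percpu() gives each CPU its own slot, accounting touches only the local slot, and readers sum the slots on demand. Stripped of the kernel types, the shape is as follows (userspace sketch, NR_CPUS and the helper names are illustrative):

#include <stdio.h>

#define NR_CPUS 4               /* stands in for the real CPU count */

struct cputime {
        unsigned long long utime;
        unsigned long long stime;
        unsigned long long sum_exec;
};

/* One slot per CPU, the userspace stand-in for alloc_percpu(). */
static struct cputime totals[NR_CPUS];

/* Hot path: the accounting CPU touches only its own slot. */
static void account(int cpu, unsigned long long u, unsigned long long s)
{
        totals[cpu].utime += u;
        totals[cpu].stime += s;
        totals[cpu].sum_exec += u + s;
}

/* Read side: sum the slots, as thread_group_cputime() does per CPU. */
static struct cputime group_total(void)
{
        struct cputime t = { 0, 0, 0 };

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                t.utime += totals[cpu].utime;
                t.stime += totals[cpu].stime;
                t.sum_exec += totals[cpu].sum_exec;
        }
        return t;
}

int main(void)
{
        account(0, 5, 1);
        account(2, 3, 2);

        struct cputime t = group_total();
        printf("utime=%llu stime=%llu sum=%llu\n",
               t.utime, t.stime, t.sum_exec);
        return 0;
}
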
@@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
158{ 245{
159 return p->utime; 246 return p->utime;
160} 247}
161static inline unsigned long long sched_ns(struct task_struct *p)
162{
163 return task_sched_runtime(p);
164}
165 248
 166int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 249int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 167{ 250{
@@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
211 cpu->cpu = virt_ticks(p); 294 cpu->cpu = virt_ticks(p);
212 break; 295 break;
213 case CPUCLOCK_SCHED: 296 case CPUCLOCK_SCHED:
214 cpu->sched = sched_ns(p); 297 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
215 break; 298 break;
216 } 299 }
217 return 0; 300 return 0;
@@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
220/* 303/*
221 * Sample a process (thread group) clock for the given group_leader task. 304 * Sample a process (thread group) clock for the given group_leader task.
222 * Must be called with tasklist_lock held for reading. 305 * Must be called with tasklist_lock held for reading.
223 * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
224 */ 306 */
225static int cpu_clock_sample_group_locked(unsigned int clock_idx, 307static int cpu_clock_sample_group(const clockid_t which_clock,
226 struct task_struct *p, 308 struct task_struct *p,
227 union cpu_time_count *cpu) 309 union cpu_time_count *cpu)
228{ 310{
229 struct task_struct *t = p; 311 struct task_cputime cputime;
230 switch (clock_idx) { 312
313 thread_group_cputime(p, &cputime);
314 switch (CPUCLOCK_WHICH(which_clock)) {
231 default: 315 default:
232 return -EINVAL; 316 return -EINVAL;
233 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
234 cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); 318 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
235 do {
236 cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
237 t = next_thread(t);
238 } while (t != p);
239 break; 319 break;
240 case CPUCLOCK_VIRT: 320 case CPUCLOCK_VIRT:
241 cpu->cpu = p->signal->utime; 321 cpu->cpu = cputime.utime;
242 do {
243 cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
244 t = next_thread(t);
245 } while (t != p);
246 break; 322 break;
247 case CPUCLOCK_SCHED: 323 case CPUCLOCK_SCHED:
248 cpu->sched = p->signal->sum_sched_runtime; 324 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
249 /* Add in each other live thread. */
250 while ((t = next_thread(t)) != p) {
251 cpu->sched += t->se.sum_exec_runtime;
252 }
253 cpu->sched += sched_ns(p);
254 break; 325 break;
255 } 326 }
256 return 0; 327 return 0;
257} 328}
258 329
259/*
260 * Sample a process (thread group) clock for the given group_leader task.
261 * Must be called with tasklist_lock held for reading.
262 */
263static int cpu_clock_sample_group(const clockid_t which_clock,
264 struct task_struct *p,
265 union cpu_time_count *cpu)
266{
267 int ret;
268 unsigned long flags;
269 spin_lock_irqsave(&p->sighand->siglock, flags);
270 ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
271 cpu);
272 spin_unlock_irqrestore(&p->sighand->siglock, flags);
273 return ret;
274}
275
276 330
277int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 331int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
278{ 332{
@@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
471} 525}
472void posix_cpu_timers_exit_group(struct task_struct *tsk) 526void posix_cpu_timers_exit_group(struct task_struct *tsk)
473{ 527{
474 cleanup_timers(tsk->signal->cpu_timers, 528 struct task_cputime cputime;
475 cputime_add(tsk->utime, tsk->signal->utime),
476 cputime_add(tsk->stime, tsk->signal->stime),
477 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
478}
479
480
481/*
482 * Set the expiry times of all the threads in the process so one of them
483 * will go off before the process cumulative expiry total is reached.
484 */
485static void process_timer_rebalance(struct task_struct *p,
486 unsigned int clock_idx,
487 union cpu_time_count expires,
488 union cpu_time_count val)
489{
490 cputime_t ticks, left;
491 unsigned long long ns, nsleft;
492 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live);
494 529
495 if (!nthreads) 530 thread_group_cputime(tsk, &cputime);
496 return; 531 cleanup_timers(tsk->signal->cpu_timers,
497 532 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
498 switch (clock_idx) {
499 default:
500 BUG();
501 break;
502 case CPUCLOCK_PROF:
503 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
504 nthreads);
505 do {
506 if (likely(!(t->flags & PF_EXITING))) {
507 ticks = cputime_add(prof_ticks(t), left);
508 if (cputime_eq(t->it_prof_expires,
509 cputime_zero) ||
510 cputime_gt(t->it_prof_expires, ticks)) {
511 t->it_prof_expires = ticks;
512 }
513 }
514 t = next_thread(t);
515 } while (t != p);
516 break;
517 case CPUCLOCK_VIRT:
518 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
519 nthreads);
520 do {
521 if (likely(!(t->flags & PF_EXITING))) {
522 ticks = cputime_add(virt_ticks(t), left);
523 if (cputime_eq(t->it_virt_expires,
524 cputime_zero) ||
525 cputime_gt(t->it_virt_expires, ticks)) {
526 t->it_virt_expires = ticks;
527 }
528 }
529 t = next_thread(t);
530 } while (t != p);
531 break;
532 case CPUCLOCK_SCHED:
533 nsleft = expires.sched - val.sched;
534 do_div(nsleft, nthreads);
535 nsleft = max_t(unsigned long long, nsleft, 1);
536 do {
537 if (likely(!(t->flags & PF_EXITING))) {
538 ns = t->se.sum_exec_runtime + nsleft;
539 if (t->it_sched_expires == 0 ||
540 t->it_sched_expires > ns) {
541 t->it_sched_expires = ns;
542 }
543 }
544 t = next_thread(t);
545 } while (t != p);
546 break;
547 }
548} 533}
549 534
550static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 535static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
608 default: 593 default:
609 BUG(); 594 BUG();
610 case CPUCLOCK_PROF: 595 case CPUCLOCK_PROF:
611 if (cputime_eq(p->it_prof_expires, 596 if (cputime_eq(p->cputime_expires.prof_exp,
612 cputime_zero) || 597 cputime_zero) ||
613 cputime_gt(p->it_prof_expires, 598 cputime_gt(p->cputime_expires.prof_exp,
614 nt->expires.cpu)) 599 nt->expires.cpu))
615 p->it_prof_expires = nt->expires.cpu; 600 p->cputime_expires.prof_exp =
601 nt->expires.cpu;
616 break; 602 break;
617 case CPUCLOCK_VIRT: 603 case CPUCLOCK_VIRT:
618 if (cputime_eq(p->it_virt_expires, 604 if (cputime_eq(p->cputime_expires.virt_exp,
619 cputime_zero) || 605 cputime_zero) ||
620 cputime_gt(p->it_virt_expires, 606 cputime_gt(p->cputime_expires.virt_exp,
621 nt->expires.cpu)) 607 nt->expires.cpu))
622 p->it_virt_expires = nt->expires.cpu; 608 p->cputime_expires.virt_exp =
609 nt->expires.cpu;
623 break; 610 break;
624 case CPUCLOCK_SCHED: 611 case CPUCLOCK_SCHED:
625 if (p->it_sched_expires == 0 || 612 if (p->cputime_expires.sched_exp == 0 ||
626 p->it_sched_expires > nt->expires.sched) 613 p->cputime_expires.sched_exp >
627 p->it_sched_expires = nt->expires.sched; 614 nt->expires.sched)
615 p->cputime_expires.sched_exp =
616 nt->expires.sched;
628 break; 617 break;
629 } 618 }
630 } else { 619 } else {
631 /* 620 /*
632 * For a process timer, we must balance 621 * For a process timer, set the cached expiration time.
633 * all the live threads' expirations.
634 */ 622 */
635 switch (CPUCLOCK_WHICH(timer->it_clock)) { 623 switch (CPUCLOCK_WHICH(timer->it_clock)) {
636 default: 624 default:
@@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
641 cputime_lt(p->signal->it_virt_expires, 629 cputime_lt(p->signal->it_virt_expires,
642 timer->it.cpu.expires.cpu)) 630 timer->it.cpu.expires.cpu))
643 break; 631 break;
644 goto rebalance; 632 p->signal->cputime_expires.virt_exp =
633 timer->it.cpu.expires.cpu;
634 break;
645 case CPUCLOCK_PROF: 635 case CPUCLOCK_PROF:
646 if (!cputime_eq(p->signal->it_prof_expires, 636 if (!cputime_eq(p->signal->it_prof_expires,
647 cputime_zero) && 637 cputime_zero) &&
@@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
652 if (i != RLIM_INFINITY && 642 if (i != RLIM_INFINITY &&
653 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 643 i <= cputime_to_secs(timer->it.cpu.expires.cpu))
654 break; 644 break;
655 goto rebalance; 645 p->signal->cputime_expires.prof_exp =
646 timer->it.cpu.expires.cpu;
647 break;
656 case CPUCLOCK_SCHED: 648 case CPUCLOCK_SCHED:
657 rebalance: 649 p->signal->cputime_expires.sched_exp =
658 process_timer_rebalance( 650 timer->it.cpu.expires.sched;
659 timer->it.cpu.task,
660 CPUCLOCK_WHICH(timer->it_clock),
661 timer->it.cpu.expires, now);
662 break; 651 break;
663 } 652 }
664 } 653 }
@@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk,
969 struct signal_struct *const sig = tsk->signal; 958 struct signal_struct *const sig = tsk->signal;
970 959
971 maxfire = 20; 960 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 961 tsk->cputime_expires.prof_exp = cputime_zero;
973 while (!list_empty(timers)) { 962 while (!list_empty(timers)) {
974 struct cpu_timer_list *t = list_first_entry(timers, 963 struct cpu_timer_list *t = list_first_entry(timers,
975 struct cpu_timer_list, 964 struct cpu_timer_list,
976 entry); 965 entry);
977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 966 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
978 tsk->it_prof_expires = t->expires.cpu; 967 tsk->cputime_expires.prof_exp = t->expires.cpu;
979 break; 968 break;
980 } 969 }
981 t->firing = 1; 970 t->firing = 1;
@@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk,
984 973
985 ++timers; 974 ++timers;
986 maxfire = 20; 975 maxfire = 20;
987 tsk->it_virt_expires = cputime_zero; 976 tsk->cputime_expires.virt_exp = cputime_zero;
988 while (!list_empty(timers)) { 977 while (!list_empty(timers)) {
989 struct cpu_timer_list *t = list_first_entry(timers, 978 struct cpu_timer_list *t = list_first_entry(timers,
990 struct cpu_timer_list, 979 struct cpu_timer_list,
991 entry); 980 entry);
992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 981 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
993 tsk->it_virt_expires = t->expires.cpu; 982 tsk->cputime_expires.virt_exp = t->expires.cpu;
994 break; 983 break;
995 } 984 }
996 t->firing = 1; 985 t->firing = 1;
@@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk,
999 988
1000 ++timers; 989 ++timers;
1001 maxfire = 20; 990 maxfire = 20;
1002 tsk->it_sched_expires = 0; 991 tsk->cputime_expires.sched_exp = 0;
1003 while (!list_empty(timers)) { 992 while (!list_empty(timers)) {
1004 struct cpu_timer_list *t = list_first_entry(timers, 993 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 994 struct cpu_timer_list,
1006 entry); 995 entry);
1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { 996 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 997 tsk->cputime_expires.sched_exp = t->expires.sched;
1009 break; 998 break;
1010 } 999 }
1011 t->firing = 1; 1000 t->firing = 1;
@@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk,
1055{ 1044{
1056 int maxfire; 1045 int maxfire;
1057 struct signal_struct *const sig = tsk->signal; 1046 struct signal_struct *const sig = tsk->signal;
1058 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1047 cputime_t utime, ptime, virt_expires, prof_expires;
1059 unsigned long long sum_sched_runtime, sched_expires; 1048 unsigned long long sum_sched_runtime, sched_expires;
1060 struct task_struct *t;
1061 struct list_head *timers = sig->cpu_timers; 1049 struct list_head *timers = sig->cpu_timers;
1050 struct task_cputime cputime;
1062 1051
1063 /* 1052 /*
1064 * Don't sample the current process CPU clocks if there are no timers. 1053 * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk,
1074 /* 1063 /*
1075 * Collect the current process totals. 1064 * Collect the current process totals.
1076 */ 1065 */
1077 utime = sig->utime; 1066 thread_group_cputime(tsk, &cputime);
1078 stime = sig->stime; 1067 utime = cputime.utime;
1079 sum_sched_runtime = sig->sum_sched_runtime; 1068 ptime = cputime_add(utime, cputime.stime);
1080 t = tsk; 1069 sum_sched_runtime = cputime.sum_exec_runtime;
1081 do {
1082 utime = cputime_add(utime, t->utime);
1083 stime = cputime_add(stime, t->stime);
1084 sum_sched_runtime += t->se.sum_exec_runtime;
1085 t = next_thread(t);
1086 } while (t != tsk);
1087 ptime = cputime_add(utime, stime);
1088
1089 maxfire = 20; 1070 maxfire = 20;
1090 prof_expires = cputime_zero; 1071 prof_expires = cputime_zero;
1091 while (!list_empty(timers)) { 1072 while (!list_empty(timers)) {
@@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk,
1193 } 1174 }
1194 } 1175 }
1195 1176
1196 if (!cputime_eq(prof_expires, cputime_zero) || 1177 if (!cputime_eq(prof_expires, cputime_zero) &&
1197 !cputime_eq(virt_expires, cputime_zero) || 1178 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
1198 sched_expires != 0) { 1179 cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
1199 /* 1180 sig->cputime_expires.prof_exp = prof_expires;
1200 * Rebalance the threads' expiry times for the remaining 1181 if (!cputime_eq(virt_expires, cputime_zero) &&
1201 * process CPU timers. 1182 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1202 */ 1183 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1203 1184 sig->cputime_expires.virt_exp = virt_expires;
1204 cputime_t prof_left, virt_left, ticks; 1185 if (sched_expires != 0 &&
1205 unsigned long long sched_left, sched; 1186 (sig->cputime_expires.sched_exp == 0 ||
1206 const unsigned int nthreads = atomic_read(&sig->live); 1187 sig->cputime_expires.sched_exp > sched_expires))
1207 1188 sig->cputime_expires.sched_exp = sched_expires;
1208 if (!nthreads)
1209 return;
1210
1211 prof_left = cputime_sub(prof_expires, utime);
1212 prof_left = cputime_sub(prof_left, stime);
1213 prof_left = cputime_div_non_zero(prof_left, nthreads);
1214 virt_left = cputime_sub(virt_expires, utime);
1215 virt_left = cputime_div_non_zero(virt_left, nthreads);
1216 if (sched_expires) {
1217 sched_left = sched_expires - sum_sched_runtime;
1218 do_div(sched_left, nthreads);
1219 sched_left = max_t(unsigned long long, sched_left, 1);
1220 } else {
1221 sched_left = 0;
1222 }
1223 t = tsk;
1224 do {
1225 if (unlikely(t->flags & PF_EXITING))
1226 continue;
1227
1228 ticks = cputime_add(cputime_add(t->utime, t->stime),
1229 prof_left);
1230 if (!cputime_eq(prof_expires, cputime_zero) &&
1231 (cputime_eq(t->it_prof_expires, cputime_zero) ||
1232 cputime_gt(t->it_prof_expires, ticks))) {
1233 t->it_prof_expires = ticks;
1234 }
1235
1236 ticks = cputime_add(t->utime, virt_left);
1237 if (!cputime_eq(virt_expires, cputime_zero) &&
1238 (cputime_eq(t->it_virt_expires, cputime_zero) ||
1239 cputime_gt(t->it_virt_expires, ticks))) {
1240 t->it_virt_expires = ticks;
1241 }
1242
1243 sched = t->se.sum_exec_runtime + sched_left;
1244 if (sched_expires && (t->it_sched_expires == 0 ||
1245 t->it_sched_expires > sched)) {
1246 t->it_sched_expires = sched;
1247 }
1248 } while ((t = next_thread(t)) != tsk);
1249 }
1250} 1189}
1251 1190
1252/* 1191/*
@@ -1314,6 +1253,89 @@ out:
1314 ++timer->it_requeue_pending; 1253 ++timer->it_requeue_pending;
1315} 1254}
1316 1255
1256/**
1257 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1258 *
1259 * @cputime: The struct to compare.
1260 *
1261 * Checks @cputime to see if all fields are zero. Returns true if all fields
1262 * are zero, false if any field is nonzero.
1263 */
1264static inline int task_cputime_zero(const struct task_cputime *cputime)
1265{
1266 if (cputime_eq(cputime->utime, cputime_zero) &&
1267 cputime_eq(cputime->stime, cputime_zero) &&
1268 cputime->sum_exec_runtime == 0)
1269 return 1;
1270 return 0;
1271}
1272
1273/**
1274 * task_cputime_expired - Compare two task_cputime entities.
1275 *
1276 * @sample: The task_cputime structure to be checked for expiration.
1277 * @expires: Expiration times, against which @sample will be checked.
1278 *
1279 * Checks @sample against @expires to see if any field of @sample has expired.
1280 * Returns true if any field of the former is greater than the corresponding
1281 * field of the latter if the latter field is set. Otherwise returns false.
1282 */
1283static inline int task_cputime_expired(const struct task_cputime *sample,
1284 const struct task_cputime *expires)
1285{
1286 if (!cputime_eq(expires->utime, cputime_zero) &&
1287 cputime_ge(sample->utime, expires->utime))
1288 return 1;
1289 if (!cputime_eq(expires->stime, cputime_zero) &&
1290 cputime_ge(cputime_add(sample->utime, sample->stime),
1291 expires->stime))
1292 return 1;
1293 if (expires->sum_exec_runtime != 0 &&
1294 sample->sum_exec_runtime >= expires->sum_exec_runtime)
1295 return 1;
1296 return 0;
1297}
1298
1299/**
1300 * fastpath_timer_check - POSIX CPU timers fast path.
1301 *
1302 * @tsk: The task (thread) being checked.
1303 *
1304 * Check the task and thread group timers. If both are zero (there are no
1305 * timers set) return false. Otherwise snapshot the task and thread group
1306 * timers and compare them with the corresponding expiration times. Return
1307 * true if a timer has expired, else return false.
1308 */
1309static inline int fastpath_timer_check(struct task_struct *tsk)
1310{
1311 struct signal_struct *sig;
1312
1313 /* tsk == current, ensure it is safe to use ->signal/sighand */
1314 if (unlikely(tsk->exit_state))
1315 return 0;
1316
1317 if (!task_cputime_zero(&tsk->cputime_expires)) {
1318 struct task_cputime task_sample = {
1319 .utime = tsk->utime,
1320 .stime = tsk->stime,
1321 .sum_exec_runtime = tsk->se.sum_exec_runtime
1322 };
1323
1324 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1325 return 1;
1326 }
1327
1328 sig = tsk->signal;
1329 if (!task_cputime_zero(&sig->cputime_expires)) {
1330 struct task_cputime group_sample;
1331
1332 thread_group_cputime(tsk, &group_sample);
1333 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1334 return 1;
1335 }
1336 return 0;
1337}
1338
1317/* 1339/*
1318 * This is called from the timer interrupt handler. The irq handler has 1340 * This is called from the timer interrupt handler. The irq handler has
1319 * already updated our counts. We need to check if any timers fire now. 1341 * already updated our counts. We need to check if any timers fire now.
@@ -1326,42 +1348,31 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1326 1348
1327 BUG_ON(!irqs_disabled()); 1349 BUG_ON(!irqs_disabled());
1328 1350
1329#define UNEXPIRED(clock) \ 1351 /*
1330 (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ 1352 * The fast path checks that there are no expired thread or thread
1331 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) 1353 * group timers. If that's so, just return.
1332 1354 */
1333 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1355 if (!fastpath_timer_check(tsk))
1334 (tsk->it_sched_expires == 0 ||
1335 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1336 return; 1356 return;
1337 1357
1338#undef UNEXPIRED 1358 spin_lock(&tsk->sighand->siglock);
1339
1340 /* 1359 /*
1341 * Double-check with locks held. 1360 * Here we take off tsk->signal->cpu_timers[N] and
1361 * tsk->cpu_timers[N] all the timers that are firing, and
1362 * put them on the firing list.
1342 */ 1363 */
1343 read_lock(&tasklist_lock); 1364 check_thread_timers(tsk, &firing);
1344 if (likely(tsk->signal != NULL)) { 1365 check_process_timers(tsk, &firing);
1345 spin_lock(&tsk->sighand->siglock);
1346 1366
1347 /* 1367 /*
1348 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1368 * We must release these locks before taking any timer's lock.
1349 * all the timers that are firing, and put them on the firing list. 1369 * There is a potential race with timer deletion here, as the
1350 */ 1370 * siglock now protects our private firing list. We have set
1351 check_thread_timers(tsk, &firing); 1371 * the firing flag in each timer, so that a deletion attempt
1352 check_process_timers(tsk, &firing); 1372 * that gets the timer lock before we do will give it up and
1353 1373 * spin until we've taken care of that timer below.
1354 /* 1374 */
1355 * We must release these locks before taking any timer's lock. 1375 spin_unlock(&tsk->sighand->siglock);
1356 * There is a potential race with timer deletion here, as the
1357 * siglock now protects our private firing list. We have set
1358 * the firing flag in each timer, so that a deletion attempt
1359 * that gets the timer lock before we do will give it up and
1360 * spin until we've taken care of that timer below.
1361 */
1362 spin_unlock(&tsk->sighand->siglock);
1363 }
1364 read_unlock(&tasklist_lock);
1365 1376
1366 /* 1377 /*
1367 * Now that all the timers on our list have the firing flag, 1378 * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1400,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1389 1400
1390/* 1401/*
1391 * Set one of the process-wide special case CPU timers. 1402 * Set one of the process-wide special case CPU timers.
1392 * The tasklist_lock and tsk->sighand->siglock must be held by the caller. 1403 * The tsk->sighand->siglock must be held by the caller.
1393 * The oldval argument is null for the RLIMIT_CPU timer, where *newval is 1404 * The *newval argument is relative and we update it to be absolute, *oldval
1394 * absolute; non-null for ITIMER_*, where *newval is relative and we update 1405 * is absolute and we update it to be relative.
1395 * it to be absolute, *oldval is absolute and we update it to be relative.
1396 */ 1406 */
1397void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1407void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1398 cputime_t *newval, cputime_t *oldval) 1408 cputime_t *newval, cputime_t *oldval)
@@ -1401,7 +1411,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1401 struct list_head *head; 1411 struct list_head *head;
1402 1412
1403 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1413 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1404 cpu_clock_sample_group_locked(clock_idx, tsk, &now); 1414 cpu_clock_sample_group(clock_idx, tsk, &now);
1405 1415
1406 if (oldval) { 1416 if (oldval) {
1407 if (!cputime_eq(*oldval, cputime_zero)) { 1417 if (!cputime_eq(*oldval, cputime_zero)) {
@@ -1435,13 +1445,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1435 cputime_ge(list_first_entry(head, 1445 cputime_ge(list_first_entry(head,
1436 struct cpu_timer_list, entry)->expires.cpu, 1446 struct cpu_timer_list, entry)->expires.cpu,
1437 *newval)) { 1447 *newval)) {
1438 /* 1448 switch (clock_idx) {
1439 * Rejigger each thread's expiry time so that one will 1449 case CPUCLOCK_PROF:
1440 * notice before we hit the process-cumulative expiry time. 1450 tsk->signal->cputime_expires.prof_exp = *newval;
1441 */ 1451 break;
1442 union cpu_time_count expires = { .sched = 0 }; 1452 case CPUCLOCK_VIRT:
1443 expires.cpu = *newval; 1453 tsk->signal->cputime_expires.virt_exp = *newval;
1444 process_timer_rebalance(tsk, clock_idx, expires, now); 1454 break;
1455 }
1445 } 1456 }
1446} 1457}
1447 1458
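With the per-signal cputime_expires cache in place, the new fastpath_timer_check() above lets run_posix_cpu_timers() return without taking tasklist_lock or the siglock whenever neither the thread nor the thread group has an armed timer that has been reached; only when the lock-free check trips does it fall through to check_thread_timers()/check_process_timers() under the siglock. A userspace sketch of the expiry comparison, with stand-in types rather than the kernel's task_cputime:

/*
 * Userspace sketch (assumed names, not the kernel structures) of the fastpath
 * comparison: a deadline of zero means "not armed", and the fast path only
 * falls through to the locked slow path when an armed deadline was reached.
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_sample {
        unsigned long long utime, stime, sum_exec_runtime;
};

static bool sample_expired(const struct cpu_sample *sample,
                           const struct cpu_sample *expires)
{
        if (expires->utime && sample->utime >= expires->utime)
                return true;
        if (expires->stime &&
            sample->utime + sample->stime >= expires->stime)
                return true;            /* prof clock counts utime + stime */
        if (expires->sum_exec_runtime &&
            sample->sum_exec_runtime >= expires->sum_exec_runtime)
                return true;
        return false;
}

int main(void)
{
        struct cpu_sample now = { .utime = 40, .stime = 70, .sum_exec_runtime = 0 };
        struct cpu_sample exp = { .utime = 0,  .stime = 100, .sum_exec_runtime = 0 };

        printf("expired: %s\n", sample_expired(&now, &exp) ? "yes" : "no");
        return 0;
}

Built this way, the common case of armed but not-yet-expired timers costs a few compares per tick instead of a lock round trip.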
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5131e5471169..887c63787de6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
116 * must supply functions here, even if the function just returns 116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the 117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the 118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_process 119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code. 120 * fields are not modified by timer code.
121 * 121 *
122 * At this time all functions EXCEPT clock_nanosleep can be 122 * At this time all functions EXCEPT clock_nanosleep can be
@@ -197,6 +197,11 @@ static int common_timer_create(struct k_itimer *new_timer)
197 return 0; 197 return 0;
198} 198}
199 199
200static int no_timer_create(struct k_itimer *new_timer)
201{
202 return -EOPNOTSUPP;
203}
204
200/* 205/*
201 * Return nonzero if we know a priori this clockid_t value is bogus. 206 * Return nonzero if we know a priori this clockid_t value is bogus.
202 */ 207 */
@@ -223,6 +228,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
223} 228}
224 229
225/* 230/*
231 * Get monotonic time for posix timers
232 */
233static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
234{
235 getrawmonotonic(tp);
236 return 0;
237}
238
239/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 240 * Initialize everything, well, just everything in Posix clocks/timers ;)
227 */ 241 */
228static __init int init_posix_timers(void) 242static __init int init_posix_timers(void)
@@ -235,9 +249,16 @@ static __init int init_posix_timers(void)
235 .clock_get = posix_ktime_get_ts, 249 .clock_get = posix_ktime_get_ts,
236 .clock_set = do_posix_clock_nosettime, 250 .clock_set = do_posix_clock_nosettime,
237 }; 251 };
252 struct k_clock clock_monotonic_raw = {
253 .clock_getres = hrtimer_get_res,
254 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create,
257 };
238 258
239 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 259 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
240 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 260 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
261 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
241 262
242 posix_timers_cache = kmem_cache_create("posix_timers_cache", 263 posix_timers_cache = kmem_cache_create("posix_timers_cache",
243 sizeof (struct k_itimer), 0, SLAB_PANIC, 264 sizeof (struct k_itimer), 0, SLAB_PANIC,
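The hunk above registers CLOCK_MONOTONIC_RAW with a timer_create hook that returns -EOPNOTSUPP, so the raw clock can be read but cannot back a POSIX timer. A userspace check of that behaviour, assuming a kernel and libc recent enough to expose CLOCK_MONOTONIC_RAW (link with -lrt on older glibc):

/*
 * Read the raw monotonic clock, then show that timer_create() against it is
 * refused.  This is a demonstration program, not part of the kernel change.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
        struct timespec ts;
        struct sigevent sev = { .sigev_notify = SIGEV_NONE };
        timer_t tid;

        if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0)
                printf("raw monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);

        if (timer_create(CLOCK_MONOTONIC_RAW, &sev, &tid) == -1)
                printf("timer_create refused: %s\n", strerror(errno));
        return 0;
}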
@@ -298,6 +319,8 @@ void do_schedule_next_timer(struct siginfo *info)
298 319
299int posix_timer_event(struct k_itimer *timr, int si_private) 320int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 321{
322 struct task_struct *task;
323 int shared, ret = -1;
301 /* 324 /*
302 * FIXME: if ->sigq is queued we can race with 325 * FIXME: if ->sigq is queued we can race with
303 * dequeue_signal()->do_schedule_next_timer(). 326 * dequeue_signal()->do_schedule_next_timer().
@@ -311,25 +334,15 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
311 */ 334 */
312 timr->sigq->info.si_sys_private = si_private; 335 timr->sigq->info.si_sys_private = si_private;
313 336
314 timr->sigq->info.si_signo = timr->it_sigev_signo; 337 rcu_read_lock();
315 timr->sigq->info.si_code = SI_TIMER; 338 task = pid_task(timr->it_pid, PIDTYPE_PID);
316 timr->sigq->info.si_tid = timr->it_id; 339 if (task) {
317 timr->sigq->info.si_value = timr->it_sigev_value; 340 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
318 341 ret = send_sigqueue(timr->sigq, task, shared);
319 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
320 struct task_struct *leader;
321 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
322
323 if (likely(ret >= 0))
324 return ret;
325
326 timr->it_sigev_notify = SIGEV_SIGNAL;
327 leader = timr->it_process->group_leader;
328 put_task_struct(timr->it_process);
329 timr->it_process = leader;
330 } 342 }
331 343 rcu_read_unlock();
332 return send_sigqueue(timr->sigq, timr->it_process, 1); 344 /* If we failed to send the signal the timer stops. */
345 return ret > 0;
333} 346}
334EXPORT_SYMBOL_GPL(posix_timer_event); 347EXPORT_SYMBOL_GPL(posix_timer_event);
335 348
@@ -404,7 +417,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
404 return ret; 417 return ret;
405} 418}
406 419
407static struct task_struct * good_sigevent(sigevent_t * event) 420static struct pid *good_sigevent(sigevent_t * event)
408{ 421{
409 struct task_struct *rtn = current->group_leader; 422 struct task_struct *rtn = current->group_leader;
410 423
@@ -418,7 +431,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
418 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) 431 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
419 return NULL; 432 return NULL;
420 433
421 return rtn; 434 return task_pid(rtn);
422} 435}
423 436
424void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 437void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -457,6 +470,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
457 idr_remove(&posix_timers_id, tmr->it_id); 470 idr_remove(&posix_timers_id, tmr->it_id);
458 spin_unlock_irqrestore(&idr_lock, flags); 471 spin_unlock_irqrestore(&idr_lock, flags);
459 } 472 }
473 put_pid(tmr->it_pid);
460 sigqueue_free(tmr->sigq); 474 sigqueue_free(tmr->sigq);
461 kmem_cache_free(posix_timers_cache, tmr); 475 kmem_cache_free(posix_timers_cache, tmr);
462} 476}
@@ -468,11 +482,8 @@ sys_timer_create(const clockid_t which_clock,
468 struct sigevent __user *timer_event_spec, 482 struct sigevent __user *timer_event_spec,
469 timer_t __user * created_timer_id) 483 timer_t __user * created_timer_id)
470{ 484{
471 int error = 0; 485 struct k_itimer *new_timer;
472 struct k_itimer *new_timer = NULL; 486 int error, new_timer_id;
473 int new_timer_id;
474 struct task_struct *process = NULL;
475 unsigned long flags;
476 sigevent_t event; 487 sigevent_t event;
477 int it_id_set = IT_ID_NOT_SET; 488 int it_id_set = IT_ID_NOT_SET;
478 489
@@ -490,12 +501,11 @@ sys_timer_create(const clockid_t which_clock,
490 goto out; 501 goto out;
491 } 502 }
492 spin_lock_irq(&idr_lock); 503 spin_lock_irq(&idr_lock);
493 error = idr_get_new(&posix_timers_id, (void *) new_timer, 504 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
494 &new_timer_id);
495 spin_unlock_irq(&idr_lock); 505 spin_unlock_irq(&idr_lock);
496 if (error == -EAGAIN) 506 if (error) {
497 goto retry; 507 if (error == -EAGAIN)
498 else if (error) { 508 goto retry;
499 /* 509 /*
500 * Weird looking, but we return EAGAIN if the IDR is 510 * Weird looking, but we return EAGAIN if the IDR is
501 * full (proper POSIX return value for this) 511 * full (proper POSIX return value for this)
@@ -526,67 +536,40 @@ sys_timer_create(const clockid_t which_clock,
526 error = -EFAULT; 536 error = -EFAULT;
527 goto out; 537 goto out;
528 } 538 }
529 new_timer->it_sigev_notify = event.sigev_notify; 539 rcu_read_lock();
530 new_timer->it_sigev_signo = event.sigev_signo; 540 new_timer->it_pid = get_pid(good_sigevent(&event));
531 new_timer->it_sigev_value = event.sigev_value; 541 rcu_read_unlock();
532 542 if (!new_timer->it_pid) {
533 read_lock(&tasklist_lock);
534 if ((process = good_sigevent(&event))) {
535 /*
536 * We may be setting up this process for another
537 * thread. It may be exiting. To catch this
538 * case the we check the PF_EXITING flag. If
539 * the flag is not set, the siglock will catch
540 * him before it is too late (in exit_itimers).
541 *
542 * The exec case is a bit more invloved but easy
543 * to code. If the process is in our thread
544 * group (and it must be or we would not allow
545 * it here) and is doing an exec, it will cause
546 * us to be killed. In this case it will wait
547 * for us to die which means we can finish this
548 * linkage with our last gasp. I.e. no code :)
549 */
550 spin_lock_irqsave(&process->sighand->siglock, flags);
551 if (!(process->flags & PF_EXITING)) {
552 new_timer->it_process = process;
553 list_add(&new_timer->list,
554 &process->signal->posix_timers);
555 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
556 get_task_struct(process);
557 spin_unlock_irqrestore(&process->sighand->siglock, flags);
558 } else {
559 spin_unlock_irqrestore(&process->sighand->siglock, flags);
560 process = NULL;
561 }
562 }
563 read_unlock(&tasklist_lock);
564 if (!process) {
565 error = -EINVAL; 543 error = -EINVAL;
566 goto out; 544 goto out;
567 } 545 }
568 } else { 546 } else {
569 new_timer->it_sigev_notify = SIGEV_SIGNAL; 547 event.sigev_notify = SIGEV_SIGNAL;
570 new_timer->it_sigev_signo = SIGALRM; 548 event.sigev_signo = SIGALRM;
571 new_timer->it_sigev_value.sival_int = new_timer->it_id; 549 event.sigev_value.sival_int = new_timer->it_id;
572 process = current->group_leader; 550 new_timer->it_pid = get_pid(task_tgid(current));
573 spin_lock_irqsave(&process->sighand->siglock, flags);
574 new_timer->it_process = process;
575 list_add(&new_timer->list, &process->signal->posix_timers);
576 spin_unlock_irqrestore(&process->sighand->siglock, flags);
577 } 551 }
578 552
553 new_timer->it_sigev_notify = event.sigev_notify;
554 new_timer->sigq->info.si_signo = event.sigev_signo;
555 new_timer->sigq->info.si_value = event.sigev_value;
556 new_timer->sigq->info.si_tid = new_timer->it_id;
557 new_timer->sigq->info.si_code = SI_TIMER;
558
559 spin_lock_irq(&current->sighand->siglock);
560 new_timer->it_signal = current->signal;
561 list_add(&new_timer->list, &current->signal->posix_timers);
562 spin_unlock_irq(&current->sighand->siglock);
563
564 return 0;
579 /* 565 /*
580 * In the case of the timer belonging to another task, after 566 * In the case of the timer belonging to another task, after
581 * the task is unlocked, the timer is owned by the other task 567 * the task is unlocked, the timer is owned by the other task
582 * and may cease to exist at any time. Don't use or modify 568 * and may cease to exist at any time. Don't use or modify
583 * new_timer after the unlock call. 569 * new_timer after the unlock call.
584 */ 570 */
585
586out: 571out:
587 if (error) 572 release_posix_timer(new_timer, it_id_set);
588 release_posix_timer(new_timer, it_id_set);
589
590 return error; 573 return error;
591} 574}
592 575
@@ -597,7 +580,7 @@ out:
597 * the find to the timer lock. To avoid a dead lock, the timer id MUST 580 * the find to the timer lock. To avoid a dead lock, the timer id MUST
598 * be release with out holding the timer lock. 581 * be release with out holding the timer lock.
599 */ 582 */
600static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) 583static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
601{ 584{
602 struct k_itimer *timr; 585 struct k_itimer *timr;
603 /* 586 /*
@@ -605,23 +588,19 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
605 * flags part over to the timer lock. Must not let interrupts in 588 * flags part over to the timer lock. Must not let interrupts in
606 * while we are moving the lock. 589 * while we are moving the lock.
607 */ 590 */
608
609 spin_lock_irqsave(&idr_lock, *flags); 591 spin_lock_irqsave(&idr_lock, *flags);
610 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); 592 timr = idr_find(&posix_timers_id, (int)timer_id);
611 if (timr) { 593 if (timr) {
612 spin_lock(&timr->it_lock); 594 spin_lock(&timr->it_lock);
613 595 if (timr->it_signal == current->signal) {
614 if ((timr->it_id != timer_id) || !(timr->it_process) ||
615 !same_thread_group(timr->it_process, current)) {
616 spin_unlock(&timr->it_lock);
617 spin_unlock_irqrestore(&idr_lock, *flags);
618 timr = NULL;
619 } else
620 spin_unlock(&idr_lock); 596 spin_unlock(&idr_lock);
621 } else 597 return timr;
622 spin_unlock_irqrestore(&idr_lock, *flags); 598 }
599 spin_unlock(&timr->it_lock);
600 }
601 spin_unlock_irqrestore(&idr_lock, *flags);
623 602
624 return timr; 603 return NULL;
625} 604}
626 605
627/* 606/*
@@ -668,7 +647,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
668 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) 647 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
669 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); 648 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
670 649
671 remaining = ktime_sub(timer->expires, now); 650 remaining = ktime_sub(hrtimer_get_expires(timer), now);
672 /* Return 0 only, when the timer is expired and not pending */ 651 /* Return 0 only, when the timer is expired and not pending */
673 if (remaining.tv64 <= 0) { 652 if (remaining.tv64 <= 0) {
674 /* 653 /*
@@ -762,7 +741,7 @@ common_timer_set(struct k_itimer *timr, int flags,
762 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 741 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
763 timr->it.real.timer.function = posix_timer_fn; 742 timr->it.real.timer.function = posix_timer_fn;
764 743
765 timer->expires = timespec_to_ktime(new_setting->it_value); 744 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
766 745
767 /* Convert interval */ 746 /* Convert interval */
768 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); 747 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
@@ -771,14 +750,12 @@ common_timer_set(struct k_itimer *timr, int flags,
771 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 750 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
772 /* Setup correct expiry time for relative timers */ 751 /* Setup correct expiry time for relative timers */
773 if (mode == HRTIMER_MODE_REL) { 752 if (mode == HRTIMER_MODE_REL) {
774 timer->expires = 753 hrtimer_add_expires(timer, timer->base->get_time());
775 ktime_add_safe(timer->expires,
776 timer->base->get_time());
777 } 754 }
778 return 0; 755 return 0;
779 } 756 }
780 757
781 hrtimer_start(timer, timer->expires, mode); 758 hrtimer_start_expires(timer, mode);
782 return 0; 759 return 0;
783} 760}
784 761
@@ -862,9 +839,7 @@ retry_delete:
862 * This keeps any tasks waiting on the spin lock from thinking 839 * This keeps any tasks waiting on the spin lock from thinking
863 * they got something (see the lock code above). 840 * they got something (see the lock code above).
864 */ 841 */
865 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 842 timer->it_signal = NULL;
866 put_task_struct(timer->it_process);
867 timer->it_process = NULL;
868 843
869 unlock_timer(timer, flags); 844 unlock_timer(timer, flags);
870 release_posix_timer(timer, IT_ID_SET); 845 release_posix_timer(timer, IT_ID_SET);
@@ -890,9 +865,7 @@ retry_delete:
890 * This keeps any tasks waiting on the spin lock from thinking 865 * This keeps any tasks waiting on the spin lock from thinking
891 * they got something (see the lock code above). 866 * they got something (see the lock code above).
892 */ 867 */
893 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 868 timer->it_signal = NULL;
894 put_task_struct(timer->it_process);
895 timer->it_process = NULL;
896 869
897 unlock_timer(timer, flags); 870 unlock_timer(timer, flags);
898 release_posix_timer(timer, IT_ID_SET); 871 release_posix_timer(timer, IT_ID_SET);
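Taken together, the posix-timers.c hunks above switch timer ownership from a task_struct pointer to a struct pid plus the creator's signal_struct: good_sigevent() now returns a struct pid, sys_timer_create() pins it with get_pid() and records the owner in it_signal, posix_timer_event() resolves the pid under rcu_read_lock() and no longer falls back to the group leader when a SIGEV_THREAD_ID target has exited, and lock_timer() validates ownership by comparing it_signal with current->signal while still holding the idr lock. A standalone sketch of that lookup-then-validate pattern, using pthread mutexes in place of spinlocks and illustrative names:

/*
 * Find the object under the table lock, take the object lock, and only hand
 * it out if the caller still owns it, mirroring the shape of lock_timer().
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct timer_obj {
        pthread_mutex_t lock;
        void *owner;                    /* plays the role of it_signal */
};

static struct timer_obj table[4];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct timer_obj *lock_timer(int id, void *me)
{
        struct timer_obj *t;

        pthread_mutex_lock(&table_lock);
        if (id >= 0 && id < 4 && table[id].owner) {
                t = &table[id];
                pthread_mutex_lock(&t->lock);
                if (t->owner == me) {
                        pthread_mutex_unlock(&table_lock);
                        return t;       /* caller now holds t->lock */
                }
                pthread_mutex_unlock(&t->lock);
        }
        pthread_mutex_unlock(&table_lock);
        return NULL;
}

int main(void)
{
        int me;

        pthread_mutex_init(&table[1].lock, NULL);
        table[1].owner = &me;

        struct timer_obj *t = lock_timer(1, &me);
        printf("lookup %s\n", t ? "succeeded" : "failed");
        if (t)
                pthread_mutex_unlock(&t->lock);
        return 0;
}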
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dcd165f92a88..23bd4daeb96b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -96,7 +96,7 @@ config SUSPEND
96 96
97config PM_TEST_SUSPEND 97config PM_TEST_SUSPEND
98 bool "Test suspend/resume and wakealarm during bootup" 98 bool "Test suspend/resume and wakealarm during bootup"
99 depends on SUSPEND && PM_DEBUG && RTC_LIB=y 99 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
100 ---help--- 100 ---help---
101 This option will let you suspend your machine during bootup, and 101 This option will let you suspend your machine during bootup, and
102 make it wake up a few seconds later using an RTC wakeup alarm. 102 make it wake up a few seconds later using an RTC wakeup alarm.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index bbd85c60f741..f77d3819ef57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -14,6 +14,7 @@
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/kmod.h>
17#include <linux/delay.h> 18#include <linux/delay.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/mount.h> 20#include <linux/mount.h>
@@ -21,7 +22,6 @@
21#include <linux/console.h> 22#include <linux/console.h>
22#include <linux/cpu.h> 23#include <linux/cpu.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
24#include <linux/ftrace.h>
25 25
26#include "power.h" 26#include "power.h"
27 27
@@ -256,7 +256,7 @@ static int create_image(int platform_mode)
256 256
257int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
258{ 258{
259 int error, ftrace_save; 259 int error;
260 260
261 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
262 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -268,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
268 goto Close; 268 goto Close;
269 269
270 suspend_console(); 270 suspend_console();
271 ftrace_save = __ftrace_enabled_save();
272 error = device_suspend(PMSG_FREEZE); 271 error = device_suspend(PMSG_FREEZE);
273 if (error) 272 if (error)
274 goto Recover_platform; 273 goto Recover_platform;
@@ -298,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
298 Resume_devices: 297 Resume_devices:
299 device_resume(in_suspend ? 298 device_resume(in_suspend ?
300 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
301 __ftrace_enabled_restore(ftrace_save);
302 resume_console(); 300 resume_console();
303 Close: 301 Close:
304 platform_end(platform_mode); 302 platform_end(platform_mode);
@@ -369,11 +367,10 @@ static int resume_target_kernel(void)
369 367
370int hibernation_restore(int platform_mode) 368int hibernation_restore(int platform_mode)
371{ 369{
372 int error, ftrace_save; 370 int error;
373 371
374 pm_prepare_console(); 372 pm_prepare_console();
375 suspend_console(); 373 suspend_console();
376 ftrace_save = __ftrace_enabled_save();
377 error = device_suspend(PMSG_QUIESCE); 374 error = device_suspend(PMSG_QUIESCE);
378 if (error) 375 if (error)
379 goto Finish; 376 goto Finish;
@@ -388,7 +385,6 @@ int hibernation_restore(int platform_mode)
388 platform_restore_cleanup(platform_mode); 385 platform_restore_cleanup(platform_mode);
389 device_resume(PMSG_RECOVER); 386 device_resume(PMSG_RECOVER);
390 Finish: 387 Finish:
391 __ftrace_enabled_restore(ftrace_save);
392 resume_console(); 388 resume_console();
393 pm_restore_console(); 389 pm_restore_console();
394 return error; 390 return error;
@@ -401,7 +397,7 @@ int hibernation_restore(int platform_mode)
401 397
402int hibernation_platform_enter(void) 398int hibernation_platform_enter(void)
403{ 399{
404 int error, ftrace_save; 400 int error;
405 401
406 if (!hibernation_ops) 402 if (!hibernation_ops)
407 return -ENOSYS; 403 return -ENOSYS;
@@ -416,7 +412,6 @@ int hibernation_platform_enter(void)
416 goto Close; 412 goto Close;
417 413
418 suspend_console(); 414 suspend_console();
419 ftrace_save = __ftrace_enabled_save();
420 error = device_suspend(PMSG_HIBERNATE); 415 error = device_suspend(PMSG_HIBERNATE);
421 if (error) { 416 if (error) {
422 if (hibernation_ops->recover) 417 if (hibernation_ops->recover)
@@ -451,7 +446,6 @@ int hibernation_platform_enter(void)
451 hibernation_ops->finish(); 446 hibernation_ops->finish();
452 Resume_devices: 447 Resume_devices:
453 device_resume(PMSG_RESTORE); 448 device_resume(PMSG_RESTORE);
454 __ftrace_enabled_restore(ftrace_save);
455 resume_console(); 449 resume_console();
456 Close: 450 Close:
457 hibernation_ops->end(); 451 hibernation_ops->end();
@@ -520,6 +514,10 @@ int hibernate(void)
520 if (error) 514 if (error)
521 goto Exit; 515 goto Exit;
522 516
517 error = usermodehelper_disable();
518 if (error)
519 goto Exit;
520
523 /* Allocate memory management structures */ 521 /* Allocate memory management structures */
524 error = create_basic_memory_bitmaps(); 522 error = create_basic_memory_bitmaps();
525 if (error) 523 if (error)
@@ -558,6 +556,7 @@ int hibernate(void)
558 thaw_processes(); 556 thaw_processes();
559 Finish: 557 Finish:
560 free_basic_memory_bitmaps(); 558 free_basic_memory_bitmaps();
559 usermodehelper_enable();
561 Exit: 560 Exit:
562 pm_notifier_call_chain(PM_POST_HIBERNATION); 561 pm_notifier_call_chain(PM_POST_HIBERNATION);
563 pm_restore_console(); 562 pm_restore_console();
@@ -634,6 +633,10 @@ static int software_resume(void)
634 if (error) 633 if (error)
635 goto Finish; 634 goto Finish;
636 635
636 error = usermodehelper_disable();
637 if (error)
638 goto Finish;
639
637 error = create_basic_memory_bitmaps(); 640 error = create_basic_memory_bitmaps();
638 if (error) 641 if (error)
639 goto Finish; 642 goto Finish;
@@ -641,7 +644,7 @@ static int software_resume(void)
641 pr_debug("PM: Preparing processes for restore.\n"); 644 pr_debug("PM: Preparing processes for restore.\n");
642 error = prepare_processes(); 645 error = prepare_processes();
643 if (error) { 646 if (error) {
644 swsusp_close(); 647 swsusp_close(FMODE_READ);
645 goto Done; 648 goto Done;
646 } 649 }
647 650
@@ -656,6 +659,7 @@ static int software_resume(void)
656 thaw_processes(); 659 thaw_processes();
657 Done: 660 Done:
658 free_basic_memory_bitmaps(); 661 free_basic_memory_bitmaps();
662 usermodehelper_enable();
659 Finish: 663 Finish:
660 pm_notifier_call_chain(PM_POST_RESTORE); 664 pm_notifier_call_chain(PM_POST_RESTORE);
661 pm_restore_console(); 665 pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 540b16b68565..613f16941b85 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -14,6 +14,7 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/kmod.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
@@ -21,7 +22,6 @@
21#include <linux/freezer.h> 22#include <linux/freezer.h>
22#include <linux/vmstat.h> 23#include <linux/vmstat.h>
23#include <linux/syscalls.h> 24#include <linux/syscalls.h>
24#include <linux/ftrace.h>
25 25
26#include "power.h" 26#include "power.h"
27 27
@@ -173,7 +173,7 @@ static void suspend_test_finish(const char *label)
173 * has some performance issues. The stack dump of a WARN_ON 173 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk... 174 * is more likely to get the right attention than a printk...
175 */ 175 */
176 WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); 176 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
177} 177}
178 178
179#else 179#else
@@ -237,6 +237,10 @@ static int suspend_prepare(void)
237 if (error) 237 if (error)
238 goto Finish; 238 goto Finish;
239 239
240 error = usermodehelper_disable();
241 if (error)
242 goto Finish;
243
240 if (suspend_freeze_processes()) { 244 if (suspend_freeze_processes()) {
241 error = -EAGAIN; 245 error = -EAGAIN;
242 goto Thaw; 246 goto Thaw;
@@ -256,6 +260,7 @@ static int suspend_prepare(void)
256 260
257 Thaw: 261 Thaw:
258 suspend_thaw_processes(); 262 suspend_thaw_processes();
263 usermodehelper_enable();
259 Finish: 264 Finish:
260 pm_notifier_call_chain(PM_POST_SUSPEND); 265 pm_notifier_call_chain(PM_POST_SUSPEND);
261 pm_restore_console(); 266 pm_restore_console();
@@ -311,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
311 */ 316 */
312int suspend_devices_and_enter(suspend_state_t state) 317int suspend_devices_and_enter(suspend_state_t state)
313{ 318{
314 int error, ftrace_save; 319 int error;
315 320
316 if (!suspend_ops) 321 if (!suspend_ops)
317 return -ENOSYS; 322 return -ENOSYS;
@@ -322,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
322 goto Close; 327 goto Close;
323 } 328 }
324 suspend_console(); 329 suspend_console();
325 ftrace_save = __ftrace_enabled_save();
326 suspend_test_start(); 330 suspend_test_start();
327 error = device_suspend(PMSG_SUSPEND); 331 error = device_suspend(PMSG_SUSPEND);
328 if (error) { 332 if (error) {
@@ -354,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
354 suspend_test_start(); 358 suspend_test_start();
355 device_resume(PMSG_RESUME); 359 device_resume(PMSG_RESUME);
356 suspend_test_finish("resume devices"); 360 suspend_test_finish("resume devices");
357 __ftrace_enabled_restore(ftrace_save);
358 resume_console(); 361 resume_console();
359 Close: 362 Close:
360 if (suspend_ops->end) 363 if (suspend_ops->end)
@@ -376,6 +379,7 @@ int suspend_devices_and_enter(suspend_state_t state)
376static void suspend_finish(void) 379static void suspend_finish(void)
377{ 380{
378 suspend_thaw_processes(); 381 suspend_thaw_processes();
382 usermodehelper_enable();
379 pm_notifier_call_chain(PM_POST_SUSPEND); 383 pm_notifier_call_chain(PM_POST_SUSPEND);
380 pm_restore_console(); 384 pm_restore_console();
381} 385}
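Both the hibernate and suspend entry points now call usermodehelper_disable() before freezing tasks and usermodehelper_enable() after thawing them, so no usermode helper can be spawned while userspace is frozen. A sketch of the bracketing in the kernel's usual goto-unwind style; the helpers below are stubs standing in for the real PM entry points, not the actual implementations:

/*
 * Illustrative-only: disable helpers first, and make sure every failure path
 * re-enables them and thaws whatever was frozen.
 */
#include <stdio.h>

static int helpers_disabled;

static int usermodehelper_disable(void) { helpers_disabled = 1; return 0; }
static void usermodehelper_enable(void) { helpers_disabled = 0; }
static int freeze_processes(void)       { return -1; /* pretend it failed */ }
static void thaw_processes(void)        { }

static int suspend_prepare(void)
{
        int error;

        error = usermodehelper_disable();
        if (error)
                goto out;

        error = freeze_processes();
        if (error)
                goto thaw;              /* failure must undo both steps */

        return 0;

thaw:
        thaw_processes();
        usermodehelper_enable();
out:
        return error;
}

int main(void)
{
        printf("suspend_prepare: %d, helpers disabled: %d\n",
               suspend_prepare(), helpers_disabled);
        return 0;
}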
diff --git a/kernel/power/power.h b/kernel/power/power.h
index acc0c101dbd5..46b5ec7a3afb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -153,7 +153,7 @@ extern int swsusp_shrink_memory(void);
153extern void swsusp_free(void); 153extern void swsusp_free(void);
154extern int swsusp_read(unsigned int *flags_p); 154extern int swsusp_read(unsigned int *flags_p);
155extern int swsusp_write(unsigned int flags); 155extern int swsusp_write(unsigned int flags);
156extern void swsusp_close(void); 156extern void swsusp_close(fmode_t);
157 157
158struct timeval; 158struct timeval;
159/* kernel/power/swsusp.c */ 159/* kernel/power/swsusp.c */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 278946aecaf0..ca634019497a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p)
28 return 1; 28 return 1;
29} 29}
30 30
31/*
32 * freezing is complete, mark current process as frozen
33 */
34static inline void frozen_process(void)
35{
36 if (!unlikely(current->flags & PF_NOFREEZE)) {
37 current->flags |= PF_FROZEN;
38 wmb();
39 }
40 clear_freeze_flag(current);
41}
42
43/* Refrigerator is place where frozen processes are stored :-). */
44void refrigerator(void)
45{
46 /* Hmm, should we be allowed to suspend when there are realtime
47 processes around? */
48 long save;
49
50 task_lock(current);
51 if (freezing(current)) {
52 frozen_process();
53 task_unlock(current);
54 } else {
55 task_unlock(current);
56 return;
57 }
58 save = current->state;
59 pr_debug("%s entered refrigerator\n", current->comm);
60
61 spin_lock_irq(&current->sighand->siglock);
62 recalc_sigpending(); /* We sent fake signal, clean it up */
63 spin_unlock_irq(&current->sighand->siglock);
64
65 for (;;) {
66 set_current_state(TASK_UNINTERRUPTIBLE);
67 if (!frozen(current))
68 break;
69 schedule();
70 }
71 pr_debug("%s left refrigerator\n", current->comm);
72 __set_current_state(save);
73}
74
75static void fake_signal_wake_up(struct task_struct *p)
76{
77 unsigned long flags;
78
79 spin_lock_irqsave(&p->sighand->siglock, flags);
80 signal_wake_up(p, 0);
81 spin_unlock_irqrestore(&p->sighand->siglock, flags);
82}
83
84static inline bool should_send_signal(struct task_struct *p)
85{
86 return !(p->flags & PF_FREEZER_NOSIG);
87}
88
89/**
90 * freeze_task - send a freeze request to given task
91 * @p: task to send the request to
92 * @sig_only: if set, the request will only be sent if the task has the
93 * PF_FREEZER_NOSIG flag unset
94 * Return value: 'false', if @sig_only is set and the task has
95 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
96 *
97 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
98 * either sending a fake signal to it or waking it up, depending on whether
99 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
100 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
101 * TIF_FREEZE flag will not be set.
102 */
103static bool freeze_task(struct task_struct *p, bool sig_only)
104{
105 /*
106 * We first check if the task is freezing and next if it has already
107 * been frozen to avoid the race with frozen_process() which first marks
108 * the task as frozen and next clears its TIF_FREEZE.
109 */
110 if (!freezing(p)) {
111 rmb();
112 if (frozen(p))
113 return false;
114
115 if (!sig_only || should_send_signal(p))
116 set_freeze_flag(p);
117 else
118 return false;
119 }
120
121 if (should_send_signal(p)) {
122 if (!signal_pending(p))
123 fake_signal_wake_up(p);
124 } else if (sig_only) {
125 return false;
126 } else {
127 wake_up_state(p, TASK_INTERRUPTIBLE);
128 }
129
130 return true;
131}
132
133static void cancel_freezing(struct task_struct *p)
134{
135 unsigned long flags;
136
137 if (freezing(p)) {
138 pr_debug(" clean up: %s\n", p->comm);
139 clear_freeze_flag(p);
140 spin_lock_irqsave(&p->sighand->siglock, flags);
141 recalc_sigpending_and_wake(p);
142 spin_unlock_irqrestore(&p->sighand->siglock, flags);
143 }
144}
145
146static int try_to_freeze_tasks(bool sig_only) 31static int try_to_freeze_tasks(bool sig_only)
147{ 32{
148 struct task_struct *g, *p; 33 struct task_struct *g, *p;
@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
250 if (nosig_only && should_send_signal(p)) 135 if (nosig_only && should_send_signal(p))
251 continue; 136 continue;
252 137
138 if (cgroup_frozen(p))
139 continue;
140
253 thaw_process(p); 141 thaw_process(p);
254 } while_each_thread(g, p); 142 } while_each_thread(g, p);
255 read_unlock(&tasklist_lock); 143 read_unlock(&tasklist_lock);
@@ -264,4 +152,3 @@ void thaw_processes(void)
264 printk("done.\n"); 152 printk("done.\n");
265} 153}
266 154
267EXPORT_SYMBOL(refrigerator);
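The generic refrigerator/freeze_task machinery is removed from the PM code here, presumably into a shared freezer implementation that the new cgroup freezer also uses, and thaw_tasks() learns to skip tasks frozen by the freezer cgroup so a resume does not thaw them behind the user's back. A minimal sketch of that filter, with an illustrative task list rather than the kernel's tasklist walk:

/*
 * Tasks frozen by the cgroup freezer stay frozen when power management thaws
 * everything else.  Types and flags below are stand-ins only.
 */
#include <stdbool.h>
#include <stdio.h>

struct task {
        const char *comm;
        bool frozen;
        bool cgroup_frozen;     /* frozen by the freezer cgroup, not by PM */
};

static void thaw_tasks(struct task *tasks, int n)
{
        for (int i = 0; i < n; i++) {
                if (tasks[i].cgroup_frozen)
                        continue;       /* leave cgroup-frozen tasks alone */
                tasks[i].frozen = false;
        }
}

int main(void)
{
        struct task tasks[] = {
                { "worker",   true, false },
                { "batchjob", true, true  },
        };

        thaw_tasks(tasks, 2);
        for (int i = 0; i < 2; i++)
                printf("%s: %s\n", tasks[i].comm,
                       tasks[i].frozen ? "still frozen" : "thawed");
        return 0;
}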
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ccac849e46..6da14358537c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -172,13 +172,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
172 return res; 172 return res;
173 173
174 root_swap = res; 174 root_swap = res;
175 res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); 175 res = blkdev_get(resume_bdev, FMODE_WRITE);
176 if (res) 176 if (res)
177 return res; 177 return res;
178 178
179 res = set_blocksize(resume_bdev, PAGE_SIZE); 179 res = set_blocksize(resume_bdev, PAGE_SIZE);
180 if (res < 0) 180 if (res < 0)
181 blkdev_put(resume_bdev); 181 blkdev_put(resume_bdev, FMODE_WRITE);
182 182
183 return res; 183 return res;
184} 184}
@@ -426,7 +426,7 @@ int swsusp_write(unsigned int flags)
426 426
427 release_swap_writer(&handle); 427 release_swap_writer(&handle);
428 out: 428 out:
429 swsusp_close(); 429 swsusp_close(FMODE_WRITE);
430 return error; 430 return error;
431} 431}
432 432
@@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p)
574 error = load_image(&handle, &snapshot, header->pages - 1); 574 error = load_image(&handle, &snapshot, header->pages - 1);
575 release_swap_reader(&handle); 575 release_swap_reader(&handle);
576 576
577 blkdev_put(resume_bdev); 577 blkdev_put(resume_bdev, FMODE_READ);
578 578
579 if (!error) 579 if (!error)
580 pr_debug("PM: Image successfully loaded\n"); 580 pr_debug("PM: Image successfully loaded\n");
@@ -609,7 +609,7 @@ int swsusp_check(void)
609 return -EINVAL; 609 return -EINVAL;
610 } 610 }
611 if (error) 611 if (error)
612 blkdev_put(resume_bdev); 612 blkdev_put(resume_bdev, FMODE_READ);
613 else 613 else
614 pr_debug("PM: Signature found, resuming\n"); 614 pr_debug("PM: Signature found, resuming\n");
615 } else { 615 } else {
@@ -626,14 +626,14 @@ int swsusp_check(void)
626 * swsusp_close - close swap device. 626 * swsusp_close - close swap device.
627 */ 627 */
628 628
629void swsusp_close(void) 629void swsusp_close(fmode_t mode)
630{ 630{
631 if (IS_ERR(resume_bdev)) { 631 if (IS_ERR(resume_bdev)) {
632 pr_debug("PM: Image device not initialised\n"); 632 pr_debug("PM: Image device not initialised\n");
633 return; 633 return;
634 } 634 }
635 635
636 blkdev_put(resume_bdev); 636 blkdev_put(resume_bdev, mode);
637} 637}
638 638
639static int swsusp_header_init(void) 639static int swsusp_header_init(void)
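In the swap.c hunks above, blkdev_put() now takes the fmode_t that was used on blkdev_get(), so swsusp_close() gains a mode argument and each caller releases the resume device with the mode it opened it with (FMODE_READ on the resume path, FMODE_WRITE when writing the image). A toy illustration of why the release has to echo the acquisition mode; the types and counters are stand-ins, not the block layer API:

#include <stdio.h>

typedef unsigned int fmode_t;           /* stand-in for the kernel type */
#define FMODE_READ  0x1u
#define FMODE_WRITE 0x2u

static int readers, writers;

static int blkdev_get(fmode_t mode)
{
        if (mode & FMODE_WRITE) writers++;
        if (mode & FMODE_READ)  readers++;
        return 0;
}

static void blkdev_put(fmode_t mode)
{
        /* the release must mirror the mode used on acquisition */
        if (mode & FMODE_WRITE) writers--;
        if (mode & FMODE_READ)  readers--;
}

int main(void)
{
        blkdev_get(FMODE_WRITE);        /* image writer */
        blkdev_get(FMODE_READ);         /* resume reader */
        blkdev_put(FMODE_WRITE);
        blkdev_put(FMODE_READ);
        printf("readers=%d writers=%d\n", readers, writers);
        return 0;
}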
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a6332a313262..005b93d839ba 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
212 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
213 if (data->frozen) 213 if (data->frozen)
214 break; 214 break;
215
215 printk("Syncing filesystems ... "); 216 printk("Syncing filesystems ... ");
216 sys_sync(); 217 sys_sync();
217 printk("done.\n"); 218 printk("done.\n");
218 219
219 error = freeze_processes(); 220 error = usermodehelper_disable();
220 if (error) 221 if (error)
222 break;
223
224 error = freeze_processes();
225 if (error) {
221 thaw_processes(); 226 thaw_processes();
227 usermodehelper_enable();
228 }
222 if (!error) 229 if (!error)
223 data->frozen = 1; 230 data->frozen = 1;
224 break; 231 break;
@@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
227 if (!data->frozen || data->ready) 234 if (!data->frozen || data->ready)
228 break; 235 break;
229 thaw_processes(); 236 thaw_processes();
237 usermodehelper_enable();
230 data->frozen = 0; 238 data->frozen = 0;
231 break; 239 break;
232 240
diff --git a/kernel/printk.c b/kernel/printk.c
index a430fd04008b..e651ab05655f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -13,7 +13,7 @@
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfred@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton
17 */ 17 */
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
@@ -233,45 +233,6 @@ static inline void boot_delay_msec(void)
233#endif 233#endif
234 234
235/* 235/*
236 * Return the number of unread characters in the log buffer.
237 */
238static int log_buf_get_len(void)
239{
240 return logged_chars;
241}
242
243/*
244 * Copy a range of characters from the log buffer.
245 */
246int log_buf_copy(char *dest, int idx, int len)
247{
248 int ret, max;
249 bool took_lock = false;
250
251 if (!oops_in_progress) {
252 spin_lock_irq(&logbuf_lock);
253 took_lock = true;
254 }
255
256 max = log_buf_get_len();
257 if (idx < 0 || idx >= max) {
258 ret = -1;
259 } else {
260 if (len > max)
261 len = max;
262 ret = len;
263 idx += (log_end - max);
264 while (len-- > 0)
265 dest[len] = LOG_BUF(idx + len);
266 }
267
268 if (took_lock)
269 spin_unlock_irq(&logbuf_lock);
270
271 return ret;
272}
273
274/*
275 * Commands to do_syslog: 236 * Commands to do_syslog:
276 * 237 *
277 * 0 -- Close the log. Currently a NOP. 238 * 0 -- Close the log. Currently a NOP.
@@ -577,9 +538,6 @@ static int have_callable_console(void)
577 * @fmt: format string 538 * @fmt: format string
578 * 539 *
579 * This is printk(). It can be called from any context. We want it to work. 540 * This is printk(). It can be called from any context. We want it to work.
580 * Be aware of the fact that if oops_in_progress is not set, we might try to
581 * wake klogd up which could deadlock on runqueue lock if printk() is called
582 * from scheduler code.
583 * 541 *
584 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 542 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
585 * call the console drivers. If we fail to get the semaphore we place the output 543 * call the console drivers. If we fail to get the semaphore we place the output
@@ -593,6 +551,8 @@ static int have_callable_console(void)
593 * 551 *
594 * See also: 552 * See also:
595 * printf(3) 553 * printf(3)
554 *
555 * See the vsnprintf() documentation for format string extensions over C99.
596 */ 556 */
597 557
598asmlinkage int printk(const char *fmt, ...) 558asmlinkage int printk(const char *fmt, ...)
@@ -702,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
702 if (recursion_bug) { 662 if (recursion_bug) {
703 recursion_bug = 0; 663 recursion_bug = 0;
704 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
705 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
706 } 666 }
707 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
708 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
@@ -982,10 +942,25 @@ int is_console_locked(void)
982 return console_locked; 942 return console_locked;
983} 943}
984 944
985void wake_up_klogd(void) 945static DEFINE_PER_CPU(int, printk_pending);
946
947void printk_tick(void)
986{ 948{
987 if (!oops_in_progress && waitqueue_active(&log_wait)) 949 if (__get_cpu_var(printk_pending)) {
950 __get_cpu_var(printk_pending) = 0;
988 wake_up_interruptible(&log_wait); 951 wake_up_interruptible(&log_wait);
952 }
953}
954
955int printk_needs_cpu(int cpu)
956{
957 return per_cpu(printk_pending, cpu);
958}
959
960void wake_up_klogd(void)
961{
962 if (waitqueue_active(&log_wait))
963 __raw_get_cpu_var(printk_pending) = 1;
989} 964}
990 965
991/** 966/**
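wake_up_klogd() no longer wakes the log waitqueue directly, which could deadlock on the runqueue lock when printk() is called from scheduler code; it just sets a per-CPU printk_pending flag, printk_tick() performs the wakeup from the timer tick, and printk_needs_cpu() keeps the tick alive while a wakeup is pending. (The recursion-message hunk also switches from sizeof() to strlen(), so printed_len no longer counts the terminating NUL.) A userspace sketch of the deferral pattern, with a C11 atomic standing in for the per-CPU variable:

/*
 * Contexts that must not take the wait-queue lock only set a pending flag;
 * a periodic tick performs the actual wakeup later.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int printk_pending;

static void wake_up_klogd(void)         /* safe from "atomic" context */
{
        atomic_store(&printk_pending, 1);
}

static void printk_tick(void)           /* run from the periodic tick */
{
        if (atomic_exchange(&printk_pending, 0))
                printf("waking log readers\n");
}

int main(void)
{
        wake_up_klogd();
        printk_tick();                  /* performs the deferred wakeup */
        printk_tick();                  /* nothing pending, no wakeup */
        return 0;
}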
diff --git a/kernel/profile.c b/kernel/profile.c
index cd26bed4cc26..60adefb59b5e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,6 +22,8 @@
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
25#include <asm/sections.h> 27#include <asm/sections.h>
26#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
27#include <asm/ptrace.h> 29#include <asm/ptrace.h>
@@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
50static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
51#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
52 54
53static int __init profile_setup(char *str) 55int profile_setup(char *str)
54{ 56{
55 static char __initdata schedstr[] = "schedule"; 57 static char schedstr[] = "schedule";
56 static char __initdata sleepstr[] = "sleep"; 58 static char sleepstr[] = "sleep";
57 static char __initdata kvmstr[] = "kvm"; 59 static char kvmstr[] = "kvm";
58 int par; 60 int par;
59 61
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 62 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -100,14 +102,33 @@ static int __init profile_setup(char *str)
100__setup("profile=", profile_setup); 102__setup("profile=", profile_setup);
101 103
102 104
103void __init profile_init(void) 105int __ref profile_init(void)
104{ 106{
107 int buffer_bytes;
105 if (!prof_on) 108 if (!prof_on)
106 return; 109 return 0;
107 110
108 /* only text is profiled */ 111 /* only text is profiled */
109 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
110 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 return 0;
117 }
118
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer)
121 return 0;
122
123 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
124 if (prof_buffer)
125 return 0;
126
127 prof_buffer = vmalloc(buffer_bytes);
128 if (prof_buffer)
129 return 0;
130
131 return -ENOMEM;
111} 132}
112 133
113/* Profile event notifications */ 134/* Profile event notifications */
@@ -330,7 +351,7 @@ out:
330 put_cpu(); 351 put_cpu();
331} 352}
332 353
333static int __devinit profile_cpu_callback(struct notifier_block *info, 354static int __cpuinit profile_cpu_callback(struct notifier_block *info,
334 unsigned long action, void *__cpu) 355 unsigned long action, void *__cpu)
335{ 356{
336 int node, cpu = (unsigned long)__cpu; 357 int node, cpu = (unsigned long)__cpu;
@@ -523,11 +544,11 @@ static const struct file_operations proc_profile_operations = {
523}; 544};
524 545
525#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
526static void __init profile_nop(void *unused) 547static void profile_nop(void *unused)
527{ 548{
528} 549}
529 550
530static int __init create_hash_tables(void) 551static int create_hash_tables(void)
531{ 552{
532 int cpu; 553 int cpu;
533 554
@@ -575,14 +596,14 @@ out_cleanup:
575#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
576#endif 597#endif
577 598
578static int __init create_proc_profile(void) 599int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
579{ 600{
580 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
581 602
582 if (!prof_on) 603 if (!prof_on)
583 return 0; 604 return 0;
584 if (create_hash_tables()) 605 if (create_hash_tables())
585 return -1; 606 return -ENOMEM;
586 entry = proc_create("profile", S_IWUSR | S_IRUGO, 607 entry = proc_create("profile", S_IWUSR | S_IRUGO,
587 NULL, &proc_profile_operations); 608 NULL, &proc_profile_operations);
588 if (!entry) 609 if (!entry)
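profile_init() and friends lose their __init/__initdata markings above, so profiling can now be set up after early boot; the allocation falls back from the bootmem allocator to kzalloc(), then alloc_pages_exact(), then vmalloc(), and only returns -ENOMEM if all of them fail. A small sketch of that try-in-order idea, with ordinary userspace allocators standing in for the kernel ones:

/*
 * Try the cheapest allocator first and fall back to progressively more
 * permissive ones; the first success wins.
 */
#include <stdio.h>
#include <stdlib.h>

typedef void *(*alloc_fn)(size_t);

static void *failing_alloc(size_t n) { (void)n; return NULL; }  /* simulated kzalloc failure */
static void *backup_alloc(size_t n)  { return calloc(1, n); }   /* simulated vmalloc */

static void *alloc_with_fallback(size_t n, const alloc_fn *fns, int count)
{
        for (int i = 0; i < count; i++) {
                void *p = fns[i](n);
                if (p)
                        return p;       /* first allocator that succeeds wins */
        }
        return NULL;
}

int main(void)
{
        const alloc_fn order[] = { failing_alloc, backup_alloc };
        void *buf = alloc_with_fallback(1 << 20, order, 2);

        printf("allocation %s\n", buf ? "succeeded via fallback" : "failed");
        free(buf);
        return 0;
}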
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 356699a96d56..29dc700e198c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,17 @@
25#include <asm/pgtable.h> 25#include <asm/pgtable.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28
29/*
30 * Initialize a new task whose father had been ptraced.
31 *
32 * Called from copy_process().
33 */
34void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
35{
36 arch_ptrace_fork(child, clone_flags);
37}
38
28/* 39/*
29 * ptrace a task: make the debugger its new parent and 40 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 41 * move it to the ptrace list.
@@ -45,7 +56,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
45 * TASK_TRACED, resume it now. 56 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled. 57 * Requires that irqs be disabled.
47 */ 58 */
48void ptrace_untrace(struct task_struct *child) 59static void ptrace_untrace(struct task_struct *child)
49{ 60{
50 spin_lock(&child->sighand->siglock); 61 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) { 62 if (task_is_traced(child)) {
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
72 child->parent = child->real_parent; 83 child->parent = child->real_parent;
73 list_del_init(&child->ptrace_entry); 84 list_del_init(&child->ptrace_entry);
74 85
86 arch_ptrace_untrace(child);
75 if (task_is_traced(child)) 87 if (task_is_traced(child))
76 ptrace_untrace(child); 88 ptrace_untrace(child);
77} 89}
@@ -115,6 +127,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
115 127
116int __ptrace_may_access(struct task_struct *task, unsigned int mode) 128int __ptrace_may_access(struct task_struct *task, unsigned int mode)
117{ 129{
130 const struct cred *cred = current_cred(), *tcred;
131
118 /* May we inspect the given task? 132 /* May we inspect the given task?
119 * This check is used both for attaching with ptrace 133 * This check is used both for attaching with ptrace
120 * and for allowing access to sensitive information in /proc. 134 * and for allowing access to sensitive information in /proc.
@@ -127,13 +141,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
127 /* Don't let security modules deny introspection */ 141 /* Don't let security modules deny introspection */
128 if (task == current) 142 if (task == current)
129 return 0; 143 return 0;
130 if (((current->uid != task->euid) || 144 rcu_read_lock();
131 (current->uid != task->suid) || 145 tcred = __task_cred(task);
132 (current->uid != task->uid) || 146 if ((cred->uid != tcred->euid ||
133 (current->gid != task->egid) || 147 cred->uid != tcred->suid ||
134 (current->gid != task->sgid) || 148 cred->uid != tcred->uid ||
135 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 149 cred->gid != tcred->egid ||
150 cred->gid != tcred->sgid ||
151 cred->gid != tcred->gid) &&
152 !capable(CAP_SYS_PTRACE)) {
153 rcu_read_unlock();
136 return -EPERM; 154 return -EPERM;
155 }
156 rcu_read_unlock();
137 smp_rmb(); 157 smp_rmb();
138 if (task->mm) 158 if (task->mm)
139 dumpable = get_dumpable(task->mm); 159 dumpable = get_dumpable(task->mm);
@@ -163,6 +183,14 @@ int ptrace_attach(struct task_struct *task)
163 if (same_thread_group(task, current)) 183 if (same_thread_group(task, current))
164 goto out; 184 goto out;
165 185
186 /* Protect exec's credential calculations against our interference;
187 * SUID, SGID and LSM creds get determined differently under ptrace.
188 */
189 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
190 if (retval < 0)
191 goto out;
192
193 retval = -EPERM;
166repeat: 194repeat:
167 /* 195 /*
168 * Nasty, nasty. 196 * Nasty, nasty.
@@ -202,6 +230,7 @@ repeat:
202bad: 230bad:
203 write_unlock_irqrestore(&tasklist_lock, flags); 231 write_unlock_irqrestore(&tasklist_lock, flags);
204 task_unlock(task); 232 task_unlock(task);
233 mutex_unlock(&current->cred_exec_mutex);
205out: 234out:
206 return retval; 235 return retval;
207} 236}
@@ -612,7 +641,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
612 return (copied == sizeof(data)) ? 0 : -EIO; 641 return (copied == sizeof(data)) ? 0 : -EIO;
613} 642}
614 643
615#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE 644#if defined CONFIG_COMPAT
616#include <linux/compat.h> 645#include <linux/compat.h>
617 646
618int compat_ptrace_request(struct task_struct *child, compat_long_t request, 647int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -709,4 +738,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
709 unlock_kernel(); 738 unlock_kernel();
710 return ret; 739 return ret;
711} 740}
712#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ 741#endif /* CONFIG_COMPAT */
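__ptrace_may_access() now reads the target's uid/gid triples from its credential structure under rcu_read_lock() instead of poking at task_struct fields, and ptrace_attach() serializes against execve's credential calculation via current->cred_exec_mutex. The policy itself is unchanged: every id in the target's triples must equal the tracer's corresponding id unless the tracer has CAP_SYS_PTRACE. A reduced userspace sketch of that check, with a local struct cred stand-in rather than the kernel structure:

#include <stdbool.h>
#include <stdio.h>

struct cred { unsigned uid, euid, suid, gid, egid, sgid; };

static bool may_ptrace(const struct cred *tracer, const struct cred *target,
                       bool capable_sys_ptrace)
{
        if (capable_sys_ptrace)
                return true;
        return tracer->uid == target->uid  &&
               tracer->uid == target->euid &&
               tracer->uid == target->suid &&
               tracer->gid == target->gid  &&
               tracer->gid == target->egid &&
               tracer->gid == target->sgid;
}

int main(void)
{
        struct cred me  = { .uid = 1000, .euid = 1000, .suid = 1000,
                            .gid = 1000, .egid = 1000, .sgid = 1000 };
        struct cred set = me;

        set.suid = 0;                   /* e.g. a setuid binary that dropped back */
        printf("attach allowed: %s\n",
               may_ptrace(&me, &set, false) ? "yes" : "no");
        return 0;
}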
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..e503a002f330 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 467d5940f624..ad63af8b2521 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type)
119 /* Take cpucontrol mutex to protect against CPU hotplug */ 119 /* Take cpucontrol mutex to protect against CPU hotplug */
120 mutex_lock(&rcu_barrier_mutex); 120 mutex_lock(&rcu_barrier_mutex);
121 init_completion(&rcu_barrier_completion); 121 init_completion(&rcu_barrier_completion);
122 atomic_set(&rcu_barrier_cpu_count, 0);
123 /* 122 /*
124 * The queueing of callbacks in all CPUs must be atomic with 123 * Initialize rcu_barrier_cpu_count to 1, then invoke
125 * respect to RCU, otherwise one CPU may queue a callback, 124 * rcu_barrier_func() on each CPU, so that each CPU also has
126 * wait for a grace period, decrement barrier count and call 125 * incremented rcu_barrier_cpu_count. Only then is it safe to
127 * complete(), while other CPUs have not yet queued anything. 126 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
128 * So, we need to make sure that grace periods cannot complete 127 * might complete its grace period before all of the other CPUs
129 * until all the callbacks are queued. 128 * did their increment, causing this function to return too
129 * early.
130 */ 130 */
131 rcu_read_lock(); 131 atomic_set(&rcu_barrier_cpu_count, 1);
132 on_each_cpu(rcu_barrier_func, (void *)type, 1); 132 on_each_cpu(rcu_barrier_func, (void *)type, 1);
133 rcu_read_unlock(); 133 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
134 complete(&rcu_barrier_completion);
134 wait_for_completion(&rcu_barrier_completion); 135 wait_for_completion(&rcu_barrier_completion);
135 mutex_unlock(&rcu_barrier_mutex); 136 mutex_unlock(&rcu_barrier_mutex);
136} 137}
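
The comment above describes a reference-counting pattern worth seeing in isolation: bias the counter to 1 before any participant can decrement it, let each participant add its own reference, and have the coordinator drop the initial reference last. Below is a minimal user-space sketch of that pattern using C11 atomics and POSIX threads; every name in it (pending, put_ref, worker) is invented for illustration and is not a kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_int pending;
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;

/* Drop one reference; the last one signals the waiter (like complete()). */
static void put_ref(void)
{
        if (atomic_fetch_sub(&pending, 1) == 1) {
                pthread_mutex_lock(&done_lock);
                pthread_cond_signal(&done_cv);
                pthread_mutex_unlock(&done_lock);
        }
}

static void *worker(void *arg)
{
        (void)arg;
        /* ... per-CPU work would go here ... */
        put_ref();
        return NULL;
}

int main(void)
{
        pthread_t tid[NWORKERS];
        int i;

        atomic_store(&pending, 1);              /* bias: hold one reference */
        for (i = 0; i < NWORKERS; i++) {
                atomic_fetch_add(&pending, 1);  /* one reference per worker */
                pthread_create(&tid[i], NULL, worker, NULL);
        }
        put_ref();                              /* drop the initial reference */
        pthread_mutex_lock(&done_lock);
        while (atomic_load(&pending) > 0)       /* wait_for_completion() analogue */
                pthread_cond_wait(&done_cv, &done_lock);
        pthread_mutex_unlock(&done_lock);
        for (i = 0; i < NWORKERS; i++)
                pthread_join(tid[i], NULL);
        printf("all %d workers accounted for\n", NWORKERS);
        return 0;
}

The initial reference guarantees that the count cannot reach zero, and so the waiter cannot be released, until the coordinator has accounted for every participant.
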
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ca4bbbe04aa4..04982659875a 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -54,9 +54,9 @@
54#include <linux/cpu.h> 54#include <linux/cpu.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
57#include <linux/byteorder/swabb.h>
58#include <linux/cpumask.h> 57#include <linux/cpumask.h>
59#include <linux/rcupreempt_trace.h> 58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60 60
61/* 61/*
62 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
551 } 551 }
552} 552}
553 553
554void rcu_nmi_enter(void)
555{
556 rcu_irq_enter();
557}
558
559void rcu_nmi_exit(void)
560{
561 rcu_irq_exit();
562}
563
554static void dyntick_save_progress_counter(int cpu) 564static void dyntick_save_progress_counter(int cpu)
555{ 565{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 566 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
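
The switch from atomic_set() to atomic_add() in the hunk above is the whole fix: summing per-CPU counters with atomic_set() keeps only the last CPU's value, while atomic_add() accumulates every contribution. A trivial user-space illustration (C11 atomics; per_cpu[] is a made-up stand-in for the per-CPU trace counters):

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
        int per_cpu[4] = { 3, 5, 7, 11 };       /* pretend per-CPU counters */
        atomic_int sum_wrong = 0, sum_right = 0;
        int i;

        for (i = 0; i < 4; i++) {
                atomic_store(&sum_wrong, per_cpu[i]);           /* keeps only the last value */
                atomic_fetch_add(&sum_right, per_cpu[i]);       /* accumulates all values */
        }
        printf("set: %d, add: %d\n",
               atomic_load(&sum_wrong), atomic_load(&sum_right));
        return 0;       /* prints "set: 11, add: 26" */
}
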
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 90b5b123f7a1..b31065522104 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,13 +39,14 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h>
49 50
50MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,30 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 if (signal_pending(current)) {
155 mutex_lock(&fullstop_mutex);
156 if (!ACCESS_ONCE(fullstop))
157 fullstop = FULLSTOP_SIGNALED;
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE;
161}
162
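
The fullstop/fullstop_mutex pair added here is a latch-once shutdown flag: transitions are serialized by the mutex so the first requester wins, while the torture threads only ever poll the flag. A rough user-space analogue of the pattern (POSIX threads; STOP_SIGNALED, request_stop and worker are invented names):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define STOP_SIGNALED 1                 /* bail due to a signal */
#define STOP_CLEANUP  2                 /* orderly shutdown */

static int fullstop;                    /* 0 while the test is running */
static pthread_mutex_t fullstop_mutex = PTHREAD_MUTEX_INITIALIZER;

static void request_stop(int reason)
{
        pthread_mutex_lock(&fullstop_mutex);
        if (!fullstop)                  /* only the first requester sets it */
                fullstop = reason;
        pthread_mutex_unlock(&fullstop_mutex);
}

static void *worker(void *arg)
{
        (void)arg;
        /* plays the role of the "while (... && !fullstop)" loops below */
        while (!__atomic_load_n(&fullstop, __ATOMIC_RELAXED))
                usleep(1000);           /* pretend to torture something */
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        usleep(10000);
        request_stop(STOP_CLEANUP);     /* cleanup path, like rcu_torture_cleanup() */
        pthread_join(tid, NULL);
        printf("stopped, reason %d\n", fullstop);
        return 0;
}

The atomic load in the worker plays the role of ACCESS_ONCE(): readers never take the mutex, they just need a fresh view of the flag.
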
139/* 163/*
140 * Allocate an element from the rcu_tortures pool. 164 * Allocate an element from the rcu_tortures pool.
141 */ 165 */
@@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 223static void
200rcu_stutter_wait(void) 224rcu_stutter_wait(void)
201{ 225{
202 while (stutter_pause_test || !rcutorture_runnable) 226 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 227 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 228 schedule_timeout_interruptible(1);
205 else 229 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 230 schedule_timeout_interruptible(round_jiffies_relative(HZ));
231 }
207} 232}
208 233
209/* 234/*
@@ -599,7 +624,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 624 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 625 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
603 schedule_timeout_uninterruptible(1); 628 schedule_timeout_uninterruptible(1);
604 return 0; 629 return 0;
605} 630}
@@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 649 } while (!kthread_should_stop() && !fullstop);
625 650
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
628 schedule_timeout_uninterruptible(1); 653 schedule_timeout_uninterruptible(1);
629 return 0; 654 return 0;
630} 655}
@@ -734,7 +759,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 760 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 761 del_timer_sync(&t);
737 while (!kthread_should_stop()) 762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
738 schedule_timeout_uninterruptible(1); 763 schedule_timeout_uninterruptible(1);
739 return 0; 764 return 0;
740} 765}
@@ -831,7 +856,7 @@ rcu_torture_stats(void *arg)
831 do { 856 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 857 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 858 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 859 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 860 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 861 return 0;
837} 862}
@@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg)
899 do { 924 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 925 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 926 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 927 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 928 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 929 return 0;
905} 930}
@@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg)
914 do { 939 do {
915 schedule_timeout_interruptible(stutter * HZ); 940 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 941 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 942 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 943 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 944 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 945 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 946 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 947 return 0;
923} 948}
@@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 959 stutter, irqreader);
935} 960}
936 961
962static struct notifier_block rcutorture_nb = {
963 .notifier_call = rcutorture_shutdown_notify,
964};
965
937static void 966static void
938rcu_torture_cleanup(void) 967rcu_torture_cleanup(void)
939{ 968{
940 int i; 969 int i;
941 970
942 fullstop = 1; 971 mutex_lock(&fullstop_mutex);
972 if (!fullstop) {
973 /* If being signaled, let it happen, then exit. */
974 mutex_unlock(&fullstop_mutex);
975 schedule_timeout_interruptible(10 * HZ);
976 if (cur_ops->cb_barrier != NULL)
977 cur_ops->cb_barrier();
978 return;
979 }
980 fullstop = FULLSTOP_CLEANUP;
981 mutex_unlock(&fullstop_mutex);
982 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 983 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 984 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 985 kthread_stop(stutter_task);
@@ -1015,6 +1055,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1055 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1056 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1057
1058 mutex_lock(&fullstop_mutex);
1059
1018 /* Process args and tell the world that the torturer is on the job. */ 1060 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1061 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1062 cur_ops = torture_ops[i];
@@ -1024,6 +1066,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1066 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1067 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1068 torture_type);
1069 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1070 return (-EINVAL);
1028 } 1071 }
1029 if (cur_ops->init) 1072 if (cur_ops->init)
@@ -1146,9 +1189,12 @@ rcu_torture_init(void)
1146 goto unwind; 1189 goto unwind;
1147 } 1190 }
1148 } 1191 }
1192 register_reboot_notifier(&rcutorture_nb);
1193 mutex_unlock(&fullstop_mutex);
1149 return 0; 1194 return 0;
1150 1195
1151unwind: 1196unwind:
1197 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1198 rcu_torture_cleanup();
1153 return firsterr; 1199 return firsterr;
1154} 1200}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..a342b032112c
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
83#endif /* #ifdef CONFIG_NO_HZ */
84
85static int blimit = 10; /* Maximum callbacks per softirq. */
86static int qhimark = 10000; /* If this many pending, ignore blimit. */
87static int qlowmark = 100; /* Once only this many pending, use blimit. */
88
89static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
90
91/*
92 * Return the number of RCU batches processed thus far for debug & stats.
93 */
94long rcu_batches_completed(void)
95{
96 return rcu_state.completed;
97}
98EXPORT_SYMBOL_GPL(rcu_batches_completed);
99
100/*
101 * Return the number of RCU BH batches processed thus far for debug & stats.
102 */
103long rcu_batches_completed_bh(void)
104{
105 return rcu_bh_state.completed;
106}
107EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
108
109/*
110 * Does the CPU have callbacks ready to be invoked?
111 */
112static int
113cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
114{
115 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
116}
117
118/*
119 * Does the current CPU require a not-yet-started grace period?
120 */
121static int
122cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
123{
124 /* ACCESS_ONCE() because we are accessing outside of lock. */
125 return *rdp->nxttail[RCU_DONE_TAIL] &&
126 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
127}
128
129/*
130 * Return the root node of the specified rcu_state structure.
131 */
132static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
133{
134 return &rsp->node[0];
135}
136
137#ifdef CONFIG_SMP
138
139/*
140 * If the specified CPU is offline, tell the caller that it is in
141 * a quiescent state. Otherwise, whack it with a reschedule IPI.
142 * Grace periods can end up waiting on an offline CPU when that
143 * CPU is in the process of coming online -- it will be added to the
144 * rcu_node bitmasks before it actually makes it online. The same thing
145 * can happen while a CPU is in the process of going offline. Because this
146 * race is quite rare, we check for it after detecting that the grace
147 * period has been delayed rather than checking each and every CPU
148 * each and every time we start a new grace period.
149 */
150static int rcu_implicit_offline_qs(struct rcu_data *rdp)
151{
152 /*
153 * If the CPU is offline, it is in a quiescent state. We can
154 * trust its state not to change because interrupts are disabled.
155 */
156 if (cpu_is_offline(rdp->cpu)) {
157 rdp->offline_fqs++;
158 return 1;
159 }
160
161 /* The CPU is online, so send it a reschedule IPI. */
162 if (rdp->cpu != smp_processor_id())
163 smp_send_reschedule(rdp->cpu);
164 else
165 set_need_resched();
166 rdp->resched_ipi++;
167 return 0;
168}
169
170#endif /* #ifdef CONFIG_SMP */
171
172#ifdef CONFIG_NO_HZ
173static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
174
175/**
176 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
177 *
178 * Enter nohz mode, in other words, -leave- the mode in which RCU
179 * read-side critical sections can occur. (Though RCU read-side
180 * critical sections can occur in irq handlers in nohz mode, a possibility
181 * handled by rcu_irq_enter() and rcu_irq_exit()).
182 */
183void rcu_enter_nohz(void)
184{
185 unsigned long flags;
186 struct rcu_dynticks *rdtp;
187
188 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
189 local_irq_save(flags);
190 rdtp = &__get_cpu_var(rcu_dynticks);
191 rdtp->dynticks++;
192 rdtp->dynticks_nesting--;
193 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
194 local_irq_restore(flags);
195}
196
197/*
198 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
199 *
200 * Exit nohz mode, in other words, -enter- the mode in which RCU
201 * read-side critical sections normally occur.
202 */
203void rcu_exit_nohz(void)
204{
205 unsigned long flags;
206 struct rcu_dynticks *rdtp;
207
208 local_irq_save(flags);
209 rdtp = &__get_cpu_var(rcu_dynticks);
210 rdtp->dynticks++;
211 rdtp->dynticks_nesting++;
212 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
213 local_irq_restore(flags);
214 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
215}
216
217/**
218 * rcu_nmi_enter - inform RCU of entry to NMI context
219 *
220 * If the CPU was idle with dynamic ticks active, and there is no
221 * irq handler running, this updates rdtp->dynticks_nmi to let the
222 * RCU grace-period handling know that the CPU is active.
223 */
224void rcu_nmi_enter(void)
225{
226 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
227
228 if (rdtp->dynticks & 0x1)
229 return;
230 rdtp->dynticks_nmi++;
231 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
232 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
233}
234
235/**
236 * rcu_nmi_exit - inform RCU of exit from NMI context
237 *
238 * If the CPU was idle with dynamic ticks active, and there is no
239 * irq handler running, this updates rdtp->dynticks_nmi to let the
240 * RCU grace-period handling know that the CPU is no longer active.
241 */
242void rcu_nmi_exit(void)
243{
244 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
245
246 if (rdtp->dynticks & 0x1)
247 return;
248 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
249 rdtp->dynticks_nmi++;
250 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
251}
252
253/**
254 * rcu_irq_enter - inform RCU of entry to hard irq context
255 *
256 * If the CPU was idle with dynamic ticks active, this updates the
257 * rdtp->dynticks to let the RCU handling know that the CPU is active.
258 */
259void rcu_irq_enter(void)
260{
261 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
262
263 if (rdtp->dynticks_nesting++)
264 return;
265 rdtp->dynticks++;
266 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
267 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
268}
269
270/**
271 * rcu_irq_exit - inform RCU of exit from hard irq context
272 *
273 * If the CPU was idle with dynamic ticks active, update rdtp->dynticks
274 * to let the RCU handling know that the CPU is going back to idle
275 * with no ticks.
276 */
277void rcu_irq_exit(void)
278{
279 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
280
281 if (--rdtp->dynticks_nesting)
282 return;
283 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
284 rdtp->dynticks++;
285 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
286
287 /* If the interrupt queued a callback, get out of dyntick mode. */
288 if (__get_cpu_var(rcu_data).nxtlist ||
289 __get_cpu_var(rcu_bh_data).nxtlist)
290 set_need_resched();
291}
292
293/*
294 * Record the specified "completed" value, which is later used to validate
295 * dynticks counter manipulations. Specify "rsp->completed - 1" to
296 * unconditionally invalidate any future dynticks manipulations (which is
297 * useful at the beginning of a grace period).
298 */
299static void dyntick_record_completed(struct rcu_state *rsp, long comp)
300{
301 rsp->dynticks_completed = comp;
302}
303
304#ifdef CONFIG_SMP
305
306/*
307 * Recall the previously recorded value of the completion for dynticks.
308 */
309static long dyntick_recall_completed(struct rcu_state *rsp)
310{
311 return rsp->dynticks_completed;
312}
313
314/*
315 * Snapshot the specified CPU's dynticks counter so that we can later
316 * credit them with an implicit quiescent state. Return 1 if this CPU
317 * is already in a quiescent state courtesy of dynticks idle mode.
318 */
319static int dyntick_save_progress_counter(struct rcu_data *rdp)
320{
321 int ret;
322 int snap;
323 int snap_nmi;
324
325 snap = rdp->dynticks->dynticks;
326 snap_nmi = rdp->dynticks->dynticks_nmi;
327 smp_mb(); /* Order sampling of snap with end of grace period. */
328 rdp->dynticks_snap = snap;
329 rdp->dynticks_nmi_snap = snap_nmi;
330 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
331 if (ret)
332 rdp->dynticks_fqs++;
333 return ret;
334}
335
336/*
337 * Return true if the specified CPU has passed through a quiescent
338 * state by virtue of being in or having passed through a dynticks
339 * idle state since the last call to dyntick_save_progress_counter()
340 * for this same CPU.
341 */
342static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
343{
344 long curr;
345 long curr_nmi;
346 long snap;
347 long snap_nmi;
348
349 curr = rdp->dynticks->dynticks;
350 snap = rdp->dynticks_snap;
351 curr_nmi = rdp->dynticks->dynticks_nmi;
352 snap_nmi = rdp->dynticks_nmi_snap;
353 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
354
355 /*
356 * If the CPU passed through or entered a dynticks idle phase with
357 * no active irq/NMI handlers, then we can safely pretend that the CPU
358 * already acknowledged the request to pass through a quiescent
359 * state. Either way, that CPU cannot possibly be in an RCU
360 * read-side critical section that started before the beginning
361 * of the current RCU grace period.
362 */
363 if ((curr != snap || (curr & 0x1) == 0) &&
364 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
365 rdp->dynticks_fqs++;
366 return 1;
367 }
368
369 /* Go check for the CPU being offline. */
370 return rcu_implicit_offline_qs(rdp);
371}
372
373#endif /* #ifdef CONFIG_SMP */
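
The two dynticks counters implement a simple parity protocol: the counter is incremented on every transition into or out of dynticks-idle, so an even value means "idle right now" and a value that differs from an earlier snapshot means the CPU has been through at least one transition since then. The sampling side, reduced to a single counter, looks roughly like this (user-space C; cpu_counter, save_snapshot and was_quiescent are invented names):

#include <stdio.h>

/* One counter per CPU: odd while the CPU is active, even while it idles. */
static long cpu_counter;
static long snap;

static void save_snapshot(void)
{
        snap = cpu_counter;
}

/* Quiescent if the CPU is idle now, or has transitioned since the snapshot. */
static int was_quiescent(void)
{
        long curr = cpu_counter;

        return curr != snap || (curr & 0x1) == 0;
}

int main(void)
{
        cpu_counter = 1;                        /* CPU active */
        save_snapshot();
        printf("%d\n", was_quiescent());        /* 0: same active period */

        cpu_counter++;                          /* enter idle (even) */
        printf("%d\n", was_quiescent());        /* 1: idle is a quiescent state */

        cpu_counter++;                          /* leave idle (odd again) */
        printf("%d\n", was_quiescent());        /* 1: passed through idle */
        return 0;
}
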
374
375#else /* #ifdef CONFIG_NO_HZ */
376
377static void dyntick_record_completed(struct rcu_state *rsp, long comp)
378{
379}
380
381#ifdef CONFIG_SMP
382
383/*
384 * If there are no dynticks, then the only way that a CPU can passively
385 * be in a quiescent state is to be offline. Unlike dynticks idle, which
386 * is a point in time during the prior (already finished) grace period,
387 * an offline CPU is always in a quiescent state, and thus its quiescent
388 * state can be applied unconditionally. So just return the current value of completed.
389 */
390static long dyntick_recall_completed(struct rcu_state *rsp)
391{
392 return rsp->completed;
393}
394
395static int dyntick_save_progress_counter(struct rcu_data *rdp)
396{
397 return 0;
398}
399
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{
402 return rcu_implicit_offline_qs(rdp);
403}
404
405#endif /* #ifdef CONFIG_SMP */
406
407#endif /* #else #ifdef CONFIG_NO_HZ */
408
409#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
410
411static void record_gp_stall_check_time(struct rcu_state *rsp)
412{
413 rsp->gp_start = jiffies;
414 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
415}
416
417static void print_other_cpu_stall(struct rcu_state *rsp)
418{
419 int cpu;
420 long delta;
421 unsigned long flags;
422 struct rcu_node *rnp = rcu_get_root(rsp);
423 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
424 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
425
426 /* Only let one CPU complain about others per time interval. */
427
428 spin_lock_irqsave(&rnp->lock, flags);
429 delta = jiffies - rsp->jiffies_stall;
430 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
431 spin_unlock_irqrestore(&rnp->lock, flags);
432 return;
433 }
434 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
435 spin_unlock_irqrestore(&rnp->lock, flags);
436
437 /* OK, time to rat on our buddy... */
438
439 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
440 for (; rnp_cur < rnp_end; rnp_cur++) {
441 if (rnp_cur->qsmask == 0)
442 continue;
443 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
444 if (rnp_cur->qsmask & (1UL << cpu))
445 printk(" %d", rnp_cur->grplo + cpu);
446 }
447 printk(" (detected by %d, t=%ld jiffies)\n",
448 smp_processor_id(), (long)(jiffies - rsp->gp_start));
449 force_quiescent_state(rsp, 0); /* Kick them all. */
450}
451
452static void print_cpu_stall(struct rcu_state *rsp)
453{
454 unsigned long flags;
455 struct rcu_node *rnp = rcu_get_root(rsp);
456
457 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
458 smp_processor_id(), jiffies - rsp->gp_start);
459 dump_stack();
460 spin_lock_irqsave(&rnp->lock, flags);
461 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
462 rsp->jiffies_stall =
463 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
464 spin_unlock_irqrestore(&rnp->lock, flags);
465 set_need_resched(); /* kick ourselves to get things going. */
466}
467
468static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
469{
470 long delta;
471 struct rcu_node *rnp;
472
473 delta = jiffies - rsp->jiffies_stall;
474 rnp = rdp->mynode;
475 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
476
477 /* We haven't checked in, so go dump stack. */
478 print_cpu_stall(rsp);
479
480 } else if (rsp->gpnum != rsp->completed &&
481 delta >= RCU_STALL_RAT_DELAY) {
482
483 /* They had two time units to dump stack, so complain. */
484 print_other_cpu_stall(rsp);
485 }
486}
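
The stall checks above lean on the usual wrap-safe jiffies idiom (the same trick as the kernel's time_after_eq()): subtract as unsigned and reinterpret the difference as signed, so the comparison keeps working when the counter wraps. A self-contained illustration (user-space C; deadline_passed is an invented name):

#include <stdio.h>

/* Wrap-safe "has 'now' reached 'deadline'?", as with (long)(jiffies - x) >= 0. */
static int deadline_passed(unsigned long now, unsigned long deadline)
{
        return (long)(now - deadline) >= 0;
}

int main(void)
{
        unsigned long deadline = (unsigned long)-5;     /* 5 ticks before the counter wraps */

        printf("%d\n", deadline_passed((unsigned long)-2, deadline));   /* 1: 3 ticks late */
        printf("%d\n", deadline_passed(2, deadline));                   /* 1: 7 ticks late, across the wrap */
        printf("%d\n", deadline_passed((unsigned long)-10, deadline));  /* 0: still 5 ticks early */
        return 0;
}
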
487
488#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
489
490static void record_gp_stall_check_time(struct rcu_state *rsp)
491{
492}
493
494static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495{
496}
497
498#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
499
500/*
501 * Update CPU-local rcu_data state to record the newly noticed grace period.
502 * This is used both when we started the grace period and when we notice
503 * that someone else started the grace period.
504 */
505static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
506{
507 rdp->qs_pending = 1;
508 rdp->passed_quiesc = 0;
509 rdp->gpnum = rsp->gpnum;
510 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
511 RCU_JIFFIES_TILL_FORCE_QS;
512}
513
514/*
515 * Did someone else start a new RCU grace period since we last
516 * checked? Update local state appropriately if so. Must be called
517 * on the CPU corresponding to rdp.
518 */
519static int
520check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
521{
522 unsigned long flags;
523 int ret = 0;
524
525 local_irq_save(flags);
526 if (rdp->gpnum != rsp->gpnum) {
527 note_new_gpnum(rsp, rdp);
528 ret = 1;
529 }
530 local_irq_restore(flags);
531 return ret;
532}
533
534/*
535 * Start a new RCU grace period if warranted, re-initializing the hierarchy
536 * in preparation for detecting the next grace period. The caller must hold
537 * the root node's ->lock, which is released before return. Hard irqs must
538 * be disabled.
539 */
540static void
541rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
542 __releases(rcu_get_root(rsp)->lock)
543{
544 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
545 struct rcu_node *rnp = rcu_get_root(rsp);
546 struct rcu_node *rnp_cur;
547 struct rcu_node *rnp_end;
548
549 if (!cpu_needs_another_gp(rsp, rdp)) {
550 spin_unlock_irqrestore(&rnp->lock, flags);
551 return;
552 }
553
554 /* Advance to a new grace period and initialize state. */
555 rsp->gpnum++;
556 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
557 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
558 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
559 RCU_JIFFIES_TILL_FORCE_QS;
560 record_gp_stall_check_time(rsp);
561 dyntick_record_completed(rsp, rsp->completed - 1);
562 note_new_gpnum(rsp, rdp);
563
564 /*
565 * Because we are first, we know that all our callbacks will
566 * be covered by this upcoming grace period, even the ones
567 * that were registered arbitrarily recently.
568 */
569 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
570 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
571
572 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit;
575 spin_unlock_irqrestore(&rnp->lock, flags);
576 return;
577 }
578
579 spin_unlock(&rnp->lock); /* leave irqs disabled. */
580
581
582 /* Exclude any concurrent CPU-hotplug operations. */
583 spin_lock(&rsp->onofflock); /* irqs already disabled. */
584
585 /*
586 * Set the quiescent-state-needed bits in all the non-leaf RCU
587 * nodes for all currently online CPUs. This operation relies
588 * on the layout of the hierarchy within the rsp->node[] array.
589 * Note that other CPUs will access only the leaves of the
590 * hierarchy, which still indicate that no grace period is in
591 * progress. In addition, we have excluded CPU-hotplug operations.
592 *
593 * We therefore do not need to hold any locks. Any required
594 * memory barriers will be supplied by the locks guarding the
595 * leaf rcu_nodes in the hierarchy.
596 */
597
598 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
599 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
600 rnp_cur->qsmask = rnp_cur->qsmaskinit;
601
602 /*
603 * Now set up the leaf nodes. Here we must be careful. First,
604 * we need to hold the lock in order to exclude other CPUs, which
605 * might be contending for the leaf nodes' locks. Second, as
606 * soon as we initialize a given leaf node, its CPUs might run
607 * up the rest of the hierarchy. We must therefore acquire locks
608 * for each node that we touch during this stage. (But we still
609 * are excluding CPU-hotplug operations.)
610 *
611 * Note that the grace period cannot complete until we finish
612 * the initialization process, as there will be at least one
613 * qsmask bit set in the root node until that time, namely the
614 * one corresponding to this CPU.
615 */
616 rnp_end = &rsp->node[NUM_RCU_NODES];
617 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
618 for (; rnp_cur < rnp_end; rnp_cur++) {
619 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
622 }
623
624 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
625 spin_unlock_irqrestore(&rsp->onofflock, flags);
626}
627
628/*
629 * Advance this CPU's callbacks, but only if the current grace period
630 * has ended. This may be called only from the CPU to whom the rdp
631 * belongs.
632 */
633static void
634rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
635{
636 long completed_snap;
637 unsigned long flags;
638
639 local_irq_save(flags);
640 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
641
642 /* Did another grace period end? */
643 if (rdp->completed != completed_snap) {
644
645 /* Advance callbacks. No harm if list empty. */
646 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
647 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
648 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
649
650 /* Remember that we saw this grace-period completion. */
651 rdp->completed = completed_snap;
652 }
653 local_irq_restore(flags);
654}
655
656/*
657 * Similar to cpu_quiet(), for which it is a helper function. Allows
658 * a group of CPUs to be quieted at one go, though all the CPUs in the
659 * group must be represented by the same leaf rcu_node structure.
660 * That structure's lock must be held upon entry, and it is released
661 * before return.
662 */
663static void
664cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
665 unsigned long flags)
666 __releases(rnp->lock)
667{
668 /* Walk up the rcu_node hierarchy. */
669 for (;;) {
670 if (!(rnp->qsmask & mask)) {
671
672 /* Our bit has already been cleared, so done. */
673 spin_unlock_irqrestore(&rnp->lock, flags);
674 return;
675 }
676 rnp->qsmask &= ~mask;
677 if (rnp->qsmask != 0) {
678
679 /* Other bits still set at this level, so done. */
680 spin_unlock_irqrestore(&rnp->lock, flags);
681 return;
682 }
683 mask = rnp->grpmask;
684 if (rnp->parent == NULL) {
685
686 /* No more levels. Exit loop holding root lock. */
687
688 break;
689 }
690 spin_unlock_irqrestore(&rnp->lock, flags);
691 rnp = rnp->parent;
692 spin_lock_irqsave(&rnp->lock, flags);
693 }
694
695 /*
696 * Get here if we are the last CPU to pass through a quiescent
697 * state for this grace period. Clean up and let rcu_start_gp()
698 * start up the next grace period if one is needed. Note that
699 * we still hold rnp->lock, as required by rcu_start_gp(), which
700 * will release it.
701 */
702 rsp->completed = rsp->gpnum;
703 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
704 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
705}
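
cpu_quiet_msk() is the heart of the hierarchy: clear our bit in the leaf, and only if that empties the leaf's mask move up and clear the leaf's bit in its parent, repeating until either some mask is still non-zero (another CPU is still outstanding) or the root empties (the grace period can end). The bookkeeping alone, without the per-node locking, looks like this (user-space C; struct node, report_quiescent and the two-level tree in main are invented):

#include <stdio.h>

struct node {
        unsigned long qsmask;   /* CPUs/children still to report */
        unsigned long grpmask;  /* our bit in the parent's qsmask */
        struct node *parent;
};

/* Report that bit 'mask' of leaf 'np' has gone quiescent. Return 1 when
 * the root empties, i.e. when the "grace period" can end. */
static int report_quiescent(struct node *np, unsigned long mask)
{
        for (;;) {
                if (!(np->qsmask & mask))
                        return 0;       /* already cleared: nothing to do */
                np->qsmask &= ~mask;
                if (np->qsmask)
                        return 0;       /* siblings still outstanding */
                if (!np->parent)
                        return 1;       /* root just emptied */
                mask = np->grpmask;
                np = np->parent;
        }
}

int main(void)
{
        struct node root  = { .qsmask = 0x3 };          /* two leaves */
        struct node leaf0 = { .qsmask = 0x3, .grpmask = 0x1, .parent = &root };
        struct node leaf1 = { .qsmask = 0x1, .grpmask = 0x2, .parent = &root };

        printf("%d\n", report_quiescent(&leaf0, 0x1));  /* 0: leaf0 still has a CPU left */
        printf("%d\n", report_quiescent(&leaf1, 0x1));  /* 0: leaf1 empties, root still waits on leaf0 */
        printf("%d\n", report_quiescent(&leaf0, 0x2));  /* 1: last CPU, root empties */
        return 0;
}
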
706
707/*
708 * Record a quiescent state for the specified CPU, which must either be
709 * the current CPU or an offline CPU. The lastcomp argument is used to
710 * make sure we are still in the grace period of interest. We don't want
711 * to end the current grace period based on quiescent states detected in
712 * an earlier grace period!
713 */
714static void
715cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
716{
717 unsigned long flags;
718 unsigned long mask;
719 struct rcu_node *rnp;
720
721 rnp = rdp->mynode;
722 spin_lock_irqsave(&rnp->lock, flags);
723 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
724
725 /*
726 * Someone beat us to it for this grace period, so leave.
727 * The race with GP start is resolved by the fact that we
728 * hold the leaf rcu_node lock, so that the per-CPU bits
729 * cannot yet be initialized -- so we would simply find our
730 * CPU's bit already cleared in cpu_quiet_msk() if this race
731 * occurred.
732 */
733 rdp->passed_quiesc = 0; /* try again later! */
734 spin_unlock_irqrestore(&rnp->lock, flags);
735 return;
736 }
737 mask = rdp->grpmask;
738 if ((rnp->qsmask & mask) == 0) {
739 spin_unlock_irqrestore(&rnp->lock, flags);
740 } else {
741 rdp->qs_pending = 0;
742
743 /*
744 * This GP can't end until cpu checks in, so all of our
745 * callbacks can be processed during the next GP.
746 */
747 rdp = rsp->rda[smp_processor_id()];
748 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
749
750 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
751 }
752}
753
754/*
755 * Check to see if there is a new grace period of which this CPU
756 * is not yet aware, and if so, set up local rcu_data state for it.
757 * Otherwise, see if this CPU has just passed through its first
758 * quiescent state for this grace period, and record that fact if so.
759 */
760static void
761rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
762{
763 /* If there is now a new grace period, record and return. */
764 if (check_for_new_grace_period(rsp, rdp))
765 return;
766
767 /*
768 * Does this CPU still need to do its part for current grace period?
769 * If no, return and let the other CPUs do their part as well.
770 */
771 if (!rdp->qs_pending)
772 return;
773
774 /*
775 * Was there a quiescent state since the beginning of the grace
776 * period? If no, then exit and wait for the next call.
777 */
778 if (!rdp->passed_quiesc)
779 return;
780
781 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
782 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
783}
784
785#ifdef CONFIG_HOTPLUG_CPU
786
787/*
788 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
789 * and move all callbacks from the outgoing CPU to the current one.
790 */
791static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
792{
793 int i;
794 unsigned long flags;
795 long lastcomp;
796 unsigned long mask;
797 struct rcu_data *rdp = rsp->rda[cpu];
798 struct rcu_data *rdp_me;
799 struct rcu_node *rnp;
800
801 /* Exclude any attempts to start a new grace period. */
802 spin_lock_irqsave(&rsp->onofflock, flags);
803
804 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
805 rnp = rdp->mynode;
806 mask = rdp->grpmask; /* rnp->grplo is constant. */
807 do {
808 spin_lock(&rnp->lock); /* irqs already disabled. */
809 rnp->qsmaskinit &= ~mask;
810 if (rnp->qsmaskinit != 0) {
811 spin_unlock(&rnp->lock); /* irqs already disabled. */
812 break;
813 }
814 mask = rnp->grpmask;
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 rnp = rnp->parent;
817 } while (rnp != NULL);
818 lastcomp = rsp->completed;
819
820 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
821
822 /* Being offline is a quiescent state, so go record it. */
823 cpu_quiet(cpu, rsp, rdp, lastcomp);
824
825 /*
826 * Move callbacks from the outgoing CPU to the running CPU.
827 * Note that the outgoing CPU is now quiescent, so it is now
828 * (uncharacteristically) safe to access its rcu_data structure.
829 * Note also that we must carefully retain the order of the
830 * outgoing CPU's callbacks in order for rcu_barrier() to work
831 * correctly. Finally, note that we start all the callbacks
832 * afresh, even those that have passed through a grace period
833 * and are therefore ready to invoke. The theory is that hotplug
834 * events are rare, and that if they are frequent enough to
835 * indefinitely delay callbacks, you have far worse things to
836 * be worrying about.
837 */
838 rdp_me = rsp->rda[smp_processor_id()];
839 if (rdp->nxtlist != NULL) {
840 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
841 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
842 rdp->nxtlist = NULL;
843 for (i = 0; i < RCU_NEXT_SIZE; i++)
844 rdp->nxttail[i] = &rdp->nxtlist;
845 rdp_me->qlen += rdp->qlen;
846 rdp->qlen = 0;
847 }
848 local_irq_restore(flags);
849}
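
Stripped of the locking and the four-segment tails, the callback migration above is an ordered splice of one singly-linked list onto another, done entirely with tail pointers. A minimal sketch (user-space C; struct cblist and its helpers are invented):

#include <stdio.h>
#include <stddef.h>

struct cb {
        struct cb *next;
        int id;
};

struct cblist {
        struct cb *head;
        struct cb **tail;       /* points at the terminating NULL pointer */
};

static void cblist_init(struct cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
        c->next = NULL;
        *l->tail = c;
        l->tail = &c->next;
}

/* Move every callback from 'src' to the end of 'dst', preserving order. */
static void cblist_splice(struct cblist *dst, struct cblist *src)
{
        if (!src->head)
                return;
        *dst->tail = src->head;
        dst->tail = src->tail;
        cblist_init(src);
}

int main(void)
{
        struct cblist live, dead;
        struct cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
        struct cb *p;

        cblist_init(&live);
        cblist_init(&dead);
        cblist_enqueue(&live, &a);
        cblist_enqueue(&dead, &b);
        cblist_enqueue(&dead, &c);
        cblist_splice(&live, &dead);    /* like adopting the dead CPU's nxtlist */
        for (p = live.head; p; p = p->next)
                printf("%d ", p->id);   /* prints "1 2 3" */
        printf("\n");
        return 0;
}
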
850
851/*
852 * Remove the specified CPU from the RCU hierarchy and move any pending
853 * callbacks that it might have to the current CPU. This code assumes
854 * that at least one CPU in the system will remain running at all times.
855 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
856 */
857static void rcu_offline_cpu(int cpu)
858{
859 __rcu_offline_cpu(cpu, &rcu_state);
860 __rcu_offline_cpu(cpu, &rcu_bh_state);
861}
862
863#else /* #ifdef CONFIG_HOTPLUG_CPU */
864
865static void rcu_offline_cpu(int cpu)
866{
867}
868
869#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
870
871/*
872 * Invoke any RCU callbacks that have made it to the end of their grace
873 * period. Throttle as specified by rdp->blimit.
874 */
875static void rcu_do_batch(struct rcu_data *rdp)
876{
877 unsigned long flags;
878 struct rcu_head *next, *list, **tail;
879 int count;
880
881 /* If no callbacks are ready, just return. */
882 if (!cpu_has_callbacks_ready_to_invoke(rdp))
883 return;
884
885 /*
886 * Extract the list of ready callbacks, disabling to prevent
887 * races with call_rcu() from interrupt handlers.
888 */
889 local_irq_save(flags);
890 list = rdp->nxtlist;
891 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
892 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
893 tail = rdp->nxttail[RCU_DONE_TAIL];
894 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
895 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
896 rdp->nxttail[count] = &rdp->nxtlist;
897 local_irq_restore(flags);
898
899 /* Invoke callbacks. */
900 count = 0;
901 while (list) {
902 next = list->next;
903 prefetch(next);
904 list->func(list);
905 list = next;
906 if (++count >= rdp->blimit)
907 break;
908 }
909
910 local_irq_save(flags);
911
912 /* Update count, and requeue any remaining callbacks. */
913 rdp->qlen -= count;
914 if (list != NULL) {
915 *tail = rdp->nxtlist;
916 rdp->nxtlist = list;
917 for (count = 0; count < RCU_NEXT_SIZE; count++)
918 if (&rdp->nxtlist == rdp->nxttail[count])
919 rdp->nxttail[count] = tail;
920 else
921 break;
922 }
923
924 /* Reinstate batch limit if we have worked down the excess. */
925 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
926 rdp->blimit = blimit;
927
928 local_irq_restore(flags);
929
930 /* Re-raise the RCU softirq if there are callbacks remaining. */
931 if (cpu_has_callbacks_ready_to_invoke(rdp))
932 raise_softirq(RCU_SOFTIRQ);
933}
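
rcu_do_batch() caps the number of callbacks invoked per pass at rdp->blimit and hands the unprocessed remainder back to the list, re-raising the softirq so it is picked up later. The batching loop itself, in isolation (user-space C; do_batch and say are invented names):

#include <stdio.h>
#include <stddef.h>

struct cb {
        struct cb *next;
        void (*func)(struct cb *);
};

/* Invoke at most 'limit' callbacks; return the unprocessed remainder. */
static struct cb *do_batch(struct cb *list, int limit)
{
        struct cb *next;
        int count = 0;

        while (list) {
                next = list->next;
                list->func(list);
                list = next;
                if (++count >= limit)
                        break;
        }
        return list;    /* caller requeues this at the head of its list */
}

static void say(struct cb *c)
{
        printf("invoked %p\n", (void *)c);
}

int main(void)
{
        struct cb c3 = { NULL, say };
        struct cb c2 = { &c3, say };
        struct cb c1 = { &c2, say };
        struct cb *rest = do_batch(&c1, 2);

        printf("left over: %p (c3 is %p)\n", (void *)rest, (void *)&c3);
        return 0;
}
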
934
935/*
936 * Check to see if this CPU is in a non-context-switch quiescent state
937 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
938 * Also schedule the RCU softirq handler.
939 *
940 * This function must be called with hardirqs disabled. It is normally
941 * invoked from the scheduling-clock interrupt. If rcu_pending returns
942 * false, there is no point in invoking rcu_check_callbacks().
943 */
944void rcu_check_callbacks(int cpu, int user)
945{
946 if (user ||
947 (idle_cpu(cpu) && !in_softirq() &&
948 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
949
950 /*
951 * Get here if this CPU took its interrupt from user
952 * mode or from the idle loop, and if this is not a
953 * nested interrupt. In this case, the CPU is in
954 * a quiescent state, so count it.
955 *
956 * No memory barrier is required here because both
957 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
958 * only CPU-local variables that other CPUs neither
959 * access nor modify, at least not while the corresponding
960 * CPU is online.
961 */
962
963 rcu_qsctr_inc(cpu);
964 rcu_bh_qsctr_inc(cpu);
965
966 } else if (!in_softirq()) {
967
968 /*
969 * Get here if this CPU did not take its interrupt from
970 * softirq, in other words, if it is not interrupting
971 * a rcu_bh read-side critical section. This is an _bh
972 * critical section, so count it.
973 */
974
975 rcu_bh_qsctr_inc(cpu);
976 }
977 raise_softirq(RCU_SOFTIRQ);
978}
979
980#ifdef CONFIG_SMP
981
982/*
983 * Scan the leaf rcu_node structures, processing dyntick state for any that
984 * have not yet encountered a quiescent state, using the function specified.
985 * Returns 1 if the current grace period ends while scanning (possibly
986 * because we made it end).
987 */
988static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
989 int (*f)(struct rcu_data *))
990{
991 unsigned long bit;
992 int cpu;
993 unsigned long flags;
994 unsigned long mask;
995 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
996 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
997
998 for (; rnp_cur < rnp_end; rnp_cur++) {
999 mask = 0;
1000 spin_lock_irqsave(&rnp_cur->lock, flags);
1001 if (rsp->completed != lastcomp) {
1002 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1003 return 1;
1004 }
1005 if (rnp_cur->qsmask == 0) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 continue;
1008 }
1009 cpu = rnp_cur->grplo;
1010 bit = 1;
1011 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1012 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1013 mask |= bit;
1014 }
1015 if (mask != 0 && rsp->completed == lastcomp) {
1016
1017 /* cpu_quiet_msk() releases rnp_cur->lock. */
1018 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1019 continue;
1020 }
1021 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1022 }
1023 return 0;
1024}
1025
1026/*
1027 * Force quiescent states on reluctant CPUs, and also detect which
1028 * CPUs are in dyntick-idle mode.
1029 */
1030static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1031{
1032 unsigned long flags;
1033 long lastcomp;
1034 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1035 struct rcu_node *rnp = rcu_get_root(rsp);
1036 u8 signaled;
1037
1038 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1039 return; /* No grace period in progress, nothing to force. */
1040 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1041 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1042 return; /* Someone else is already on the job. */
1043 }
1044 if (relaxed &&
1045 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1046 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1047 goto unlock_ret; /* no emergency and done recently. */
1048 rsp->n_force_qs++;
1049 spin_lock(&rnp->lock);
1050 lastcomp = rsp->completed;
1051 signaled = rsp->signaled;
1052 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1053 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1054 RCU_JIFFIES_TILL_FORCE_QS;
1055 if (lastcomp == rsp->gpnum) {
1056 rsp->n_force_qs_ngp++;
1057 spin_unlock(&rnp->lock);
1058 goto unlock_ret; /* no GP in progress, time updated. */
1059 }
1060 spin_unlock(&rnp->lock);
1061 switch (signaled) {
1062 case RCU_GP_INIT:
1063
1064 break; /* grace period still initializing, ignore. */
1065
1066 case RCU_SAVE_DYNTICK:
1067
1068 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1069 break; /* So gcc recognizes the dead code. */
1070
1071 /* Record dyntick-idle state. */
1072 if (rcu_process_dyntick(rsp, lastcomp,
1073 dyntick_save_progress_counter))
1074 goto unlock_ret;
1075
1076 /* Update state, record completion counter. */
1077 spin_lock(&rnp->lock);
1078 if (lastcomp == rsp->completed) {
1079 rsp->signaled = RCU_FORCE_QS;
1080 dyntick_record_completed(rsp, lastcomp);
1081 }
1082 spin_unlock(&rnp->lock);
1083 break;
1084
1085 case RCU_FORCE_QS:
1086
1087 /* Check dyntick-idle state, send IPI to laggards. */
1088 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1089 rcu_implicit_dynticks_qs))
1090 goto unlock_ret;
1091
1092 /* Leave state in case more forcing is required. */
1093
1094 break;
1095 }
1096unlock_ret:
1097 spin_unlock_irqrestore(&rsp->fqslock, flags);
1098}
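
force_quiescent_state() uses spin_trylock_irqsave() so that at most one CPU performs the expensive scan; contenders simply bump a deliberately imprecise counter and return, trusting the lock holder to do the work on their behalf. The same shape in user space (pthread trylock; force_scan, n_forced and n_lost are invented):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t fqs_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long n_forced, n_lost;

static void force_scan(void)
{
        if (pthread_mutex_trylock(&fqs_lock) != 0) {
                n_lost++;       /* inexact under contention, which is fine here */
                return;         /* someone else is already on the job */
        }
        n_forced++;             /* ... do the expensive scan here ... */
        pthread_mutex_unlock(&fqs_lock);
}

int main(void)
{
        force_scan();
        force_scan();
        printf("forced=%lu lost=%lu\n", n_forced, n_lost);      /* forced=2 lost=0 uncontended */
        return 0;
}
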
1099
1100#else /* #ifdef CONFIG_SMP */
1101
1102static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103{
1104 set_need_resched();
1105}
1106
1107#endif /* #else #ifdef CONFIG_SMP */
1108
1109/*
1110 * This does the RCU processing work from softirq context for the
1111 * specified rcu_state and rcu_data structures. This may be called
1112 * only from the CPU to whom the rdp belongs.
1113 */
1114static void
1115__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1116{
1117 unsigned long flags;
1118
1119 /*
1120 * If an RCU GP has gone long enough, go check for dyntick
1121 * idle CPUs and, if needed, send resched IPIs.
1122 */
1123 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1124 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1125 force_quiescent_state(rsp, 1);
1126
1127 /*
1128 * Advance callbacks in response to end of earlier grace
1129 * period that some other CPU ended.
1130 */
1131 rcu_process_gp_end(rsp, rdp);
1132
1133 /* Update RCU state based on any recent quiescent states. */
1134 rcu_check_quiescent_state(rsp, rdp);
1135
1136 /* Does this CPU require a not-yet-started grace period? */
1137 if (cpu_needs_another_gp(rsp, rdp)) {
1138 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1139 rcu_start_gp(rsp, flags); /* releases above lock */
1140 }
1141
1142 /* If there are callbacks ready, invoke them. */
1143 rcu_do_batch(rdp);
1144}
1145
1146/*
1147 * Do softirq processing for the current CPU.
1148 */
1149static void rcu_process_callbacks(struct softirq_action *unused)
1150{
1151 /*
1152 * Memory references from any prior RCU read-side critical sections
1153 * executed by the interrupted code must be seen before any RCU
1154 * grace-period manipulations below.
1155 */
1156 smp_mb(); /* See above block comment. */
1157
1158 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1159 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1160
1161 /*
1162 * Memory references from any later RCU read-side critical sections
1163 * executed by the interrupted code must be seen after any RCU
1164 * grace-period manipulations above.
1165 */
1166 smp_mb(); /* See above block comment. */
1167}
1168
1169static void
1170__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1171 struct rcu_state *rsp)
1172{
1173 unsigned long flags;
1174 struct rcu_data *rdp;
1175
1176 head->func = func;
1177 head->next = NULL;
1178
1179 smp_mb(); /* Ensure RCU update seen before callback registry. */
1180
1181 /*
1182 * Opportunistically note grace-period endings and beginnings.
1183 * Note that we might see a beginning right after we see an
1184 * end, but never vice versa, since this CPU has to pass through
1185 * a quiescent state betweentimes.
1186 */
1187 local_irq_save(flags);
1188 rdp = rsp->rda[smp_processor_id()];
1189 rcu_process_gp_end(rsp, rdp);
1190 check_for_new_grace_period(rsp, rdp);
1191
1192 /* Add the callback to our list. */
1193 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1194 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1195
1196 /* Start a new grace period if one not already started. */
1197 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1198 unsigned long nestflag;
1199 struct rcu_node *rnp_root = rcu_get_root(rsp);
1200
1201 spin_lock_irqsave(&rnp_root->lock, nestflag);
1202 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1203 }
1204
1205 /* Force the grace period if too many callbacks or too long waiting. */
1206 if (unlikely(++rdp->qlen > qhimark)) {
1207 rdp->blimit = LONG_MAX;
1208 force_quiescent_state(rsp, 0);
1209 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1210 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1211 force_quiescent_state(rsp, 1);
1212 local_irq_restore(flags);
1213}
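
__call_rcu() always appends at the RCU_NEXT_TAIL segment, and the rest of the file advances callbacks through the four segments simply by copying tail pointers (see rcu_process_gp_end() and rcu_do_batch()). A compact sketch of that segmented-list idea (user-space C; struct seglist and its helpers are invented, and the segment names merely mirror the kernel's):

#include <stdio.h>
#include <stddef.h>

enum { DONE_TAIL, WAIT_TAIL, NEXT_READY_TAIL, NEXT_TAIL, NSEG };

struct cb { struct cb *next; int id; };

struct seglist {
        struct cb *head;
        struct cb **tail[NSEG];         /* tail[i] bounds segment i */
};

static void seglist_init(struct seglist *l)
{
        int i;

        l->head = NULL;
        for (i = 0; i < NSEG; i++)
                l->tail[i] = &l->head;
}

/* New callbacks always go into the last ("next") segment. */
static void seglist_enqueue(struct seglist *l, struct cb *c)
{
        c->next = NULL;
        *l->tail[NEXT_TAIL] = c;
        l->tail[NEXT_TAIL] = &c->next;
}

/* A grace period ended: advancing segments is just copying tail pointers. */
static void seglist_advance(struct seglist *l)
{
        l->tail[DONE_TAIL] = l->tail[WAIT_TAIL];
        l->tail[WAIT_TAIL] = l->tail[NEXT_READY_TAIL];
        l->tail[NEXT_READY_TAIL] = l->tail[NEXT_TAIL];
}

int main(void)
{
        struct seglist l;
        struct cb a = { .id = 1 }, b = { .id = 2 };
        struct cb *p;

        seglist_init(&l);
        seglist_enqueue(&l, &a);
        seglist_enqueue(&l, &b);

        printf("done empty: %d\n", &l.head == l.tail[DONE_TAIL]);       /* 1: nothing ready yet */

        seglist_advance(&l);    /* a and b move from "next" into "next-ready" */
        seglist_advance(&l);    /* ... then into "wait" ... */
        seglist_advance(&l);    /* ... and finally into "done": ready to invoke */
        printf("done empty: %d\n", &l.head == l.tail[DONE_TAIL]);       /* 0 */
        for (p = l.head; p != *l.tail[DONE_TAIL]; p = p->next)
                printf("ready: %d\n", p->id);
        return 0;
}
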
1214
1215/*
1216 * Queue an RCU callback for invocation after a grace period.
1217 */
1218void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1219{
1220 __call_rcu(head, func, &rcu_state);
1221}
1222EXPORT_SYMBOL_GPL(call_rcu);
1223
1224/*
1225 * Queue an RCU callback for invocation after a quicker grace period.
1226 */
1227void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1228{
1229 __call_rcu(head, func, &rcu_bh_state);
1230}
1231EXPORT_SYMBOL_GPL(call_rcu_bh);
1232
1233/*
1234 * Check to see if there is any immediate RCU-related work to be done
1235 * by the current CPU, for the specified type of RCU, returning 1 if so.
1236 * The checks are in order of increasing expense: checks that can be
1237 * carried out against CPU-local state are performed first. However,
1238 * we must check for CPU stalls first, else we might not get a chance.
1239 */
1240static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1241{
1242 rdp->n_rcu_pending++;
1243
1244 /* Check for CPU stalls, if enabled. */
1245 check_cpu_stall(rsp, rdp);
1246
1247 /* Is the RCU core waiting for a quiescent state from this CPU? */
1248 if (rdp->qs_pending)
1249 return 1;
1250
1251 /* Does this CPU have callbacks ready to invoke? */
1252 if (cpu_has_callbacks_ready_to_invoke(rdp))
1253 return 1;
1254
1255 /* Has RCU gone idle with this CPU needing another grace period? */
1256 if (cpu_needs_another_gp(rsp, rdp))
1257 return 1;
1258
1259 /* Has another RCU grace period completed? */
1260 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1261 return 1;
1262
1263 /* Has a new RCU grace period started? */
1264 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1265 return 1;
1266
1267 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1268 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1269 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1270 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1271 return 1;
1272
1273 /* nothing to do */
1274 return 0;
1275}
1276
1277/*
1278 * Check to see if there is any immediate RCU-related work to be done
1279 * by the current CPU, returning 1 if so. This function is part of the
1280 * RCU implementation; it is -not- an exported member of the RCU API.
1281 */
1282int rcu_pending(int cpu)
1283{
1284 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1285 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1286}
1287
1288/*
1289 * Check to see if any future RCU-related work will need to be done
1290 * by the current CPU, even if none need be done immediately, returning
1291 * 1 if so. This function is part of the RCU implementation; it is -not-
1292 * an exported member of the RCU API.
1293 */
1294int rcu_needs_cpu(int cpu)
1295{
1296 /* RCU callbacks either ready or pending? */
1297 return per_cpu(rcu_data, cpu).nxtlist ||
1298 per_cpu(rcu_bh_data, cpu).nxtlist;
1299}
1300
1301/*
1302 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1303 * approach so that we don't have to worry about how long the CPU has
1304 * been gone, or whether it ever was online previously. We do trust the
1305 * ->mynode field, as it is constant for a given struct rcu_data and
1306 * initialized during early boot.
1307 *
1308 * Note that only one online or offline event can be happening at a given
1309 * time. Note also that we can accept some slop in the rsp->completed
1310 * access due to the fact that this CPU cannot possibly have any RCU
1311 * callbacks in flight yet.
1312 */
1313static void
1314rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1315{
1316 unsigned long flags;
1317 int i;
1318 long lastcomp;
1319 unsigned long mask;
1320 struct rcu_data *rdp = rsp->rda[cpu];
1321 struct rcu_node *rnp = rcu_get_root(rsp);
1322
1323 /* Set up local state, ensuring consistent view of global state. */
1324 spin_lock_irqsave(&rnp->lock, flags);
1325 lastcomp = rsp->completed;
1326 rdp->completed = lastcomp;
1327 rdp->gpnum = lastcomp;
1328 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1329 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1330 rdp->beenonline = 1; /* We have now been online. */
1331 rdp->passed_quiesc_completed = lastcomp - 1;
1332 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1333 rdp->nxtlist = NULL;
1334 for (i = 0; i < RCU_NEXT_SIZE; i++)
1335 rdp->nxttail[i] = &rdp->nxtlist;
1336 rdp->qlen = 0;
1337 rdp->blimit = blimit;
1338#ifdef CONFIG_NO_HZ
1339 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1340#endif /* #ifdef CONFIG_NO_HZ */
1341 rdp->cpu = cpu;
1342 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1343
1344 /*
1345 * A new grace period might start here. If so, we won't be part
1346 * of it, but that is OK, as we are currently in a quiescent state.
1347 */
1348
1349 /* Exclude any attempts to start a new GP on large systems. */
1350 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1351
1352 /* Add CPU to rcu_node bitmasks. */
1353 rnp = rdp->mynode;
1354 mask = rdp->grpmask;
1355 do {
1356 /* Exclude any attempts to start a new GP on small systems. */
1357 spin_lock(&rnp->lock); /* irqs already disabled. */
1358 rnp->qsmaskinit |= mask;
1359 mask = rnp->grpmask;
1360 spin_unlock(&rnp->lock); /* irqs already disabled. */
1361 rnp = rnp->parent;
1362 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1363
1364 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1365
1366 /*
1367 * A new grace period might start here. If so, we will be part of
1368 * it, and its gpnum will be greater than ours, so we will
1369 * participate. It is also possible for the gpnum to have been
1370 * incremented before this function was called, and the bitmasks
1371 * to not be filled out until now, in which case we will also
1372 * participate due to our gpnum being behind.
1373 */
1374
1375 /* Since it is coming online, the CPU is in a quiescent state. */
1376 cpu_quiet(cpu, rsp, rdp, lastcomp);
1377 local_irq_restore(flags);
1378}
1379
1380static void __cpuinit rcu_online_cpu(int cpu)
1381{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1392}
1393
1394/*
1395 * Handle CPU online/offline notification events.
1396 */
1397static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1398 unsigned long action, void *hcpu)
1399{
1400 long cpu = (long)hcpu;
1401
1402 switch (action) {
1403 case CPU_UP_PREPARE:
1404 case CPU_UP_PREPARE_FROZEN:
1405 rcu_online_cpu(cpu);
1406 break;
1407 case CPU_DEAD:
1408 case CPU_DEAD_FROZEN:
1409 case CPU_UP_CANCELED:
1410 case CPU_UP_CANCELED_FROZEN:
1411 rcu_offline_cpu(cpu);
1412 break;
1413 default:
1414 break;
1415 }
1416 return NOTIFY_OK;
1417}
1418
1419/*
1420 * Compute the per-level fanout, either using the exact fanout specified
1421 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1422 */
1423#ifdef CONFIG_RCU_FANOUT_EXACT
1424static void __init rcu_init_levelspread(struct rcu_state *rsp)
1425{
1426 int i;
1427
1428 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1429 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1430}
1431#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1432static void __init rcu_init_levelspread(struct rcu_state *rsp)
1433{
1434 int ccur;
1435 int cprv;
1436 int i;
1437
1438 cprv = NR_CPUS;
1439 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1440 ccur = rsp->levelcnt[i];
1441 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1442 cprv = ccur;
1443 }
1444}
1445#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1446
1447/*
1448 * Helper function for rcu_init() that initializes one rcu_state structure.
1449 */
1450static void __init rcu_init_one(struct rcu_state *rsp)
1451{
1452 int cpustride = 1;
1453 int i;
1454 int j;
1455 struct rcu_node *rnp;
1456
1457 /* Initialize the level-tracking arrays. */
1458
1459 for (i = 1; i < NUM_RCU_LVLS; i++)
1460 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1461 rcu_init_levelspread(rsp);
1462
1463 /* Initialize the elements themselves, starting from the leaves. */
1464
1465 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1466 cpustride *= rsp->levelspread[i];
1467 rnp = rsp->level[i];
1468 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1469 spin_lock_init(&rnp->lock);
1470 rnp->qsmask = 0;
1471 rnp->qsmaskinit = 0;
1472 rnp->grplo = j * cpustride;
1473 rnp->grphi = (j + 1) * cpustride - 1;
1474 if (rnp->grphi >= NR_CPUS)
1475 rnp->grphi = NR_CPUS - 1;
1476 if (i == 0) {
1477 rnp->grpnum = 0;
1478 rnp->grpmask = 0;
1479 rnp->parent = NULL;
1480 } else {
1481 rnp->grpnum = j % rsp->levelspread[i - 1];
1482 rnp->grpmask = 1UL << rnp->grpnum;
1483 rnp->parent = rsp->level[i - 1] +
1484 j / rsp->levelspread[i - 1];
1485 }
1486 rnp->level = i;
1487 }
1488 }
1489}
1490
1491/*
1492 * Helper macro for __rcu_init(). To be used nowhere else!
1493 * Assigns leaf node pointers into each CPU's rcu_data structure.
1494 */
1495#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1496do { \
1497 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1498 j = 0; \
1499 for_each_possible_cpu(i) { \
1500 if (i > rnp[j].grphi) \
1501 j++; \
1502 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1503 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1504 } \
1505} while (0)
1506
1507static struct notifier_block __cpuinitdata rcu_nb = {
1508 .notifier_call = rcu_cpu_notify,
1509};
1510
1511void __init __rcu_init(void)
1512{
1513 int i; /* All used by RCU_DATA_PTR_INIT(). */
1514 int j;
1515 struct rcu_node *rnp;
1516
1517 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1518#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1519 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1520#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1521 rcu_init_one(&rcu_state);
1522 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1523 rcu_init_one(&rcu_bh_state);
1524 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1525
1526 for_each_online_cpu(i)
1527 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1528 /* Register notifier for non-boot CPUs */
1529 register_cpu_notifier(&rcu_nb);
1530 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1531}
1532
1533module_param(blimit, int, 0);
1534module_param(qhimark, int, 0);
1535module_param(qlowmark, int, 0);
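For reference, a minimal user-space sketch of the balanced branch of rcu_init_levelspread() above: working from the leaves toward the root, each level is given the ceiling of (units at the level below) divided by (nodes at this level), and rcu_init_one() then turns those spreads into the grplo/grphi ranges of the node tree. The NR_CPUS, NUM_RCU_LVLS and levelcnt[] values below are assumed for illustration and are not taken from any particular configuration.

#include <stdio.h>

#define NR_CPUS       128
#define NUM_RCU_LVLS  2

int main(void)
{
        int levelcnt[NUM_RCU_LVLS] = { 1, 2 };  /* rcu_node counts, root level first */
        int levelspread[NUM_RCU_LVLS];
        int cprv = NR_CPUS;
        int i;

        /* Same loop as the !CONFIG_RCU_FANOUT_EXACT case: ceiling division
         * of the fan-in from the level below over the nodes at this level. */
        for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
                int ccur = levelcnt[i];

                levelspread[i] = (cprv + ccur - 1) / ccur;
                cprv = ccur;
        }

        for (i = 0; i < NUM_RCU_LVLS; i++)
                printf("level %d: %d node(s), spread %d\n",
                       i, levelcnt[i], levelspread[i]);
        return 0;
}

With the assumed values this prints a spread of 2 at the root and 64 at the leaves, i.e. one root node fanning out to two leaf nodes covering 64 CPUs each.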
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for classic implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"qp\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
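For reference, the rcudata, rcudata.csv, rcugp and rcuhier files above all use the same single_open()/seq_file pattern. A minimal sketch of that pattern, reduced to one hypothetical read-only debugfs file named "example" (the names here are made up; the return-1-on-error convention simply mirrors rcuclassic_trace_init() above):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *example_dir, *example_file;

static int example_show(struct seq_file *m, void *unused)
{
        seq_puts(m, "hello from debugfs\n");    /* a real file would dump state here */
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static struct file_operations example_fops = {
        .owner   = THIS_MODULE,
        .open    = example_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init example_init(void)
{
        example_dir = debugfs_create_dir("example", NULL);
        if (!example_dir)
                goto out;
        example_file = debugfs_create_file("example", 0444, example_dir,
                                           NULL, &example_fops);
        if (!example_file)
                goto free_out;
        return 0;
free_out:
        debugfs_remove(example_dir);
out:
        return 1;
}

static void __exit example_cleanup(void)
{
        debugfs_remove(example_file);
        debugfs_remove(example_dir);
}

module_init(example_init);
module_exit(example_cleanup);
MODULE_LICENSE("GPL");

Once debugfs is mounted (typically at /sys/kernel/debug), the files created by rcuclassic_trace_init() appear under the rcu/ directory there and can simply be read with cat.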
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c08..09ac2008f77b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
400 } 400 }
401 401
402 mutex_lock(&relay_channels_mutex); 402 mutex_lock(&relay_channels_mutex);
403 for_each_online_cpu(i) 403 for_each_possible_cpu(i)
404 if (chan->buf[i]) 404 if (chan->buf[i])
405 __relay_reset(chan->buf[i], 0); 405 __relay_reset(chan->buf[i], 0);
406 mutex_unlock(&relay_channels_mutex); 406 mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
611 return chan; 611 return chan;
612 612
613free_bufs: 613free_bufs:
614 for_each_online_cpu(i) { 614 for_each_possible_cpu(i) {
615 if (!chan->buf[i]) 615 if (chan->buf[i])
616 break; 616 relay_close_buf(chan->buf[i]);
617 relay_close_buf(chan->buf[i]);
618 } 617 }
619 618
620 kref_put(&chan->kref, relay_destroy_channel); 619 kref_put(&chan->kref, relay_destroy_channel);
@@ -1318,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in,
1318 if (ret < 0) 1317 if (ret < 0)
1319 break; 1318 break;
1320 else if (!ret) { 1319 else if (!ret) {
1321 if (spliced) 1320 if (flags & SPLICE_F_NONBLOCK)
1322 break;
1323 if (flags & SPLICE_F_NONBLOCK) {
1324 ret = -EAGAIN; 1321 ret = -EAGAIN;
1325 break; 1322 break;
1326 }
1327 } 1323 }
1328 1324
1329 *ppos += ret; 1325 *ppos += ret;
diff --git a/kernel/resource.c b/kernel/resource.c
index 414d6fc9131e..e633106b12f6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -17,6 +17,7 @@
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/device.h> 19#include <linux/device.h>
20#include <linux/pfn.h>
20#include <asm/io.h> 21#include <asm/io.h>
21 22
22 23
@@ -38,10 +39,6 @@ EXPORT_SYMBOL(iomem_resource);
38 39
39static DEFINE_RWLOCK(resource_lock); 40static DEFINE_RWLOCK(resource_lock);
40 41
41#ifdef CONFIG_PROC_FS
42
43enum { MAX_IORES_LEVEL = 5 };
44
45static void *r_next(struct seq_file *m, void *v, loff_t *pos) 42static void *r_next(struct seq_file *m, void *v, loff_t *pos)
46{ 43{
47 struct resource *p = v; 44 struct resource *p = v;
@@ -53,6 +50,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
53 return p->sibling; 50 return p->sibling;
54} 51}
55 52
53#ifdef CONFIG_PROC_FS
54
55enum { MAX_IORES_LEVEL = 5 };
56
56static void *r_start(struct seq_file *m, loff_t *pos) 57static void *r_start(struct seq_file *m, loff_t *pos)
57 __acquires(resource_lock) 58 __acquires(resource_lock)
58{ 59{
@@ -522,7 +523,7 @@ static void __init __reserve_region_with_split(struct resource *root,
522{ 523{
523 struct resource *parent = root; 524 struct resource *parent = root;
524 struct resource *conflict; 525 struct resource *conflict;
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 526 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
526 527
527 if (!res) 528 if (!res)
528 return; 529 return;
@@ -549,13 +550,9 @@ static void __init __reserve_region_with_split(struct resource *root,
549 } 550 }
550 551
551 if (!res) { 552 if (!res) {
552 printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
553 conflict->name, conflict->start, conflict->end,
554 name, start, end);
555
556 /* failed, split and try again */ 553 /* failed, split and try again */
557 554
558 /* conflict coverred whole area */ 555 /* conflict covered whole area */
559 if (conflict->start <= start && conflict->end >= end) 556 if (conflict->start <= start && conflict->end >= end)
560 return; 557 return;
561 558
@@ -575,7 +572,7 @@ static void __init __reserve_region_with_split(struct resource *root,
575 572
576} 573}
577 574
578void reserve_region_with_split(struct resource *root, 575void __init reserve_region_with_split(struct resource *root,
579 resource_size_t start, resource_size_t end, 576 resource_size_t start, resource_size_t end,
580 const char *name) 577 const char *name)
581{ 578{
@@ -630,33 +627,34 @@ struct resource * __request_region(struct resource *parent,
630{ 627{
631 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
632 629
633 if (res) { 630 if (!res)
634 res->name = name; 631 return NULL;
635 res->start = start;
636 res->end = start + n - 1;
637 res->flags = IORESOURCE_BUSY;
638 632
639 write_lock(&resource_lock); 633 res->name = name;
634 res->start = start;
635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY;
640 637
641 for (;;) { 638 write_lock(&resource_lock);
642 struct resource *conflict;
643 639
644 conflict = __request_resource(parent, res); 640 for (;;) {
645 if (!conflict) 641 struct resource *conflict;
646 break;
647 if (conflict != parent) {
648 parent = conflict;
649 if (!(conflict->flags & IORESOURCE_BUSY))
650 continue;
651 }
652 642
653 /* Uhhuh, that didn't work out.. */ 643 conflict = __request_resource(parent, res);
654 kfree(res); 644 if (!conflict)
655 res = NULL;
656 break; 645 break;
646 if (conflict != parent) {
647 parent = conflict;
648 if (!(conflict->flags & IORESOURCE_BUSY))
649 continue;
657 } 650 }
658 write_unlock(&resource_lock); 651
652 /* Uhhuh, that didn't work out.. */
653 kfree(res);
654 res = NULL;
655 break;
659 } 656 }
657 write_unlock(&resource_lock);
660 return res; 658 return res;
661} 659}
662EXPORT_SYMBOL(__request_region); 660EXPORT_SYMBOL(__request_region);
@@ -831,3 +829,50 @@ static int __init reserve_setup(char *str)
831} 829}
832 830
833__setup("reserve=", reserve_setup); 831__setup("reserve=", reserve_setup);
832
833/*
834 * Check if the requested addr and size spans more than any slot in the
835 * iomem resource tree.
836 */
837int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
838{
839 struct resource *p = &iomem_resource;
840 int err = 0;
841 loff_t l;
842
843 read_lock(&resource_lock);
844 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
845 /*
846 * We can probably skip the resources without
847 * IORESOURCE_IO attribute?
848 */
849 if (p->start >= addr + size)
850 continue;
851 if (p->end < addr)
852 continue;
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue;
856 /*
857 * if a resource is "BUSY", it's not a hardware resource
858 * but a driver mapping of such a resource; we don't want
859 * to warn for those; some drivers legitimately map only
860 * partial hardware resources. (example: vesafb)
861 */
862 if (p->flags & IORESOURCE_BUSY)
863 continue;
864
865 printk(KERN_WARNING "resource map sanity check conflict: "
866 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
867 (unsigned long long)addr,
868 (unsigned long long)(addr + size - 1),
869 (unsigned long long)p->start,
870 (unsigned long long)p->end,
871 p->name);
872 err = -1;
873 break;
874 }
875 read_unlock(&resource_lock);
876
877 return err;
878}
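For reference, the containment test at the heart of the new iomem_map_sanity_check() can be modelled in user space as below. The flat table stands in for the walk over iomem_resource's children via r_next(), and the PAGE_SHIFT value, resource entry and example mapping are assumptions chosen only to exercise the "spans more than one slot" case.

#include <stdio.h>

#define PAGE_SHIFT   12
#define PFN_DOWN(x)  ((x) >> PAGE_SHIFT)

struct res {
        unsigned long long start, end;
        int busy;
        const char *name;
};

static int sanity_check(const struct res *table, int n,
                        unsigned long long addr, unsigned long size)
{
        int i;

        for (i = 0; i < n; i++) {
                const struct res *p = &table[i];

                if (p->start >= addr + size)    /* no overlap at all */
                        continue;
                if (p->end < addr)
                        continue;
                if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
                    PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
                        continue;               /* fully inside one slot: fine */
                if (p->busy)                    /* driver mapping, not hardware */
                        continue;
                printf("conflict: [%llx-%llx] vs %s [%llx-%llx]\n",
                       addr, addr + size - 1, p->name, p->start, p->end);
                return -1;
        }
        return 0;
}

int main(void)
{
        struct res table[] = {
                { 0xfe000000ULL, 0xfeffffffULL, 0, "PCI MMIO" },
        };

        /* runs past the end of the only slot, so it gets flagged */
        return sanity_check(table, 1, 0xfeff0000ULL, 0x20000) ? 1 : 0;
}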
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 6522ae5b14a2..69d9cb921ffa 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) { 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer)) 635 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL; 636 timeout->task = NULL;
638 } 637 }
diff --git a/kernel/sched.c b/kernel/sched.c
index d897a524e7d8..c731dd820d1a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
55#include <linux/cpuset.h> 55#include <linux/cpuset.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 59#include <linux/seq_file.h>
59#include <linux/sysctl.h> 60#include <linux/sysctl.h>
60#include <linux/syscalls.h> 61#include <linux/syscalls.h>
@@ -71,6 +72,7 @@
71#include <linux/debugfs.h> 72#include <linux/debugfs.h>
72#include <linux/ctype.h> 73#include <linux/ctype.h>
73#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
74 76
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
@@ -116,6 +118,12 @@
116 */ 118 */
117#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
118 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
119#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
120/* 128/*
121 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -201,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205} 212}
206 213
207static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -226,9 +233,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
226 233
227 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 234 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
228 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 235 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
229 hrtimer_start(&rt_b->rt_period_timer, 236 hrtimer_start_expires(&rt_b->rt_period_timer,
230 rt_b->rt_period_timer.expires, 237 HRTIMER_MODE_ABS);
231 HRTIMER_MODE_ABS);
232 } 238 }
233 spin_unlock(&rt_b->rt_runtime_lock); 239 spin_unlock(&rt_b->rt_runtime_lock);
234} 240}
@@ -260,6 +266,10 @@ struct task_group {
260 struct cgroup_subsys_state css; 266 struct cgroup_subsys_state css;
261#endif 267#endif
262 268
269#ifdef CONFIG_USER_SCHED
270 uid_t uid;
271#endif
272
263#ifdef CONFIG_FAIR_GROUP_SCHED 273#ifdef CONFIG_FAIR_GROUP_SCHED
264 /* schedulable entities of this group on each cpu */ 274 /* schedulable entities of this group on each cpu */
265 struct sched_entity **se; 275 struct sched_entity **se;
@@ -285,6 +295,12 @@ struct task_group {
285 295
286#ifdef CONFIG_USER_SCHED 296#ifdef CONFIG_USER_SCHED
287 297
298/* Helper function to pass uid information to create_sched_user() */
299void set_tg_uid(struct user_struct *user)
300{
301 user->tg->uid = user->uid;
302}
303
288/* 304/*
289 * Root task group. 305 * Root task group.
290 * Every UID task group (including init_task_group aka UID-0) will 306 * Every UID task group (including init_task_group aka UID-0) will
@@ -344,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
344 struct task_group *tg; 360 struct task_group *tg;
345 361
346#ifdef CONFIG_USER_SCHED 362#ifdef CONFIG_USER_SCHED
347 tg = p->user->tg; 363 rcu_read_lock();
364 tg = __task_cred(p)->user->tg;
365 rcu_read_unlock();
348#elif defined(CONFIG_CGROUP_SCHED) 366#elif defined(CONFIG_CGROUP_SCHED)
349 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 367 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
350 struct task_group, css); 368 struct task_group, css);
@@ -385,7 +403,6 @@ struct cfs_rq {
385 403
386 u64 exec_clock; 404 u64 exec_clock;
387 u64 min_vruntime; 405 u64 min_vruntime;
388 u64 pair_start;
389 406
390 struct rb_root tasks_timeline; 407 struct rb_root tasks_timeline;
391 struct rb_node *rb_leftmost; 408 struct rb_node *rb_leftmost;
@@ -397,9 +414,9 @@ struct cfs_rq {
397 * 'curr' points to currently running entity on this cfs_rq. 414 * 'curr' points to currently running entity on this cfs_rq.
398 * It is set to NULL otherwise (i.e when none are currently running). 415 * It is set to NULL otherwise (i.e when none are currently running).
399 */ 416 */
400 struct sched_entity *curr, *next; 417 struct sched_entity *curr, *next, *last;
401 418
402 unsigned long nr_spread_over; 419 unsigned int nr_spread_over;
403 420
404#ifdef CONFIG_FAIR_GROUP_SCHED 421#ifdef CONFIG_FAIR_GROUP_SCHED
405 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 422 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -586,6 +603,8 @@ struct rq {
586#ifdef CONFIG_SCHEDSTATS 603#ifdef CONFIG_SCHEDSTATS
587 /* latency stats */ 604 /* latency stats */
588 struct sched_info rq_sched_info; 605 struct sched_info rq_sched_info;
606 unsigned long long rq_cpu_time;
607 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
589 608
590 /* sys_sched_yield() stats */ 609 /* sys_sched_yield() stats */
591 unsigned int yld_exp_empty; 610 unsigned int yld_exp_empty;
@@ -703,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
703 722
704#undef SCHED_FEAT 723#undef SCHED_FEAT
705 724
706static int sched_feat_open(struct inode *inode, struct file *filp) 725static int sched_feat_show(struct seq_file *m, void *v)
707{ 726{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 727 int i;
720 728
721 for (i = 0; sched_feat_names[i]; i++) { 729 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 730 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 731 seq_puts(m, "NO_");
732 seq_printf(m, "%s ", sched_feat_names[i]);
724 } 733 }
734 seq_puts(m, "\n");
725 735
726 buf = kmalloc(len + 2, GFP_KERNEL); 736 return 0;
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 }
736
737 r += sprintf(buf + r, "\n");
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 737}
746 738
747static ssize_t 739static ssize_t
@@ -786,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 778 return cnt;
787} 779}
788 780
781static int sched_feat_open(struct inode *inode, struct file *filp)
782{
783 return single_open(filp, sched_feat_show, NULL);
784}
785
789static struct file_operations sched_feat_fops = { 786static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 787 .open = sched_feat_open,
791 .read = sched_feat_read, 788 .write = sched_feat_write,
792 .write = sched_feat_write, 789 .read = seq_read,
790 .llseek = seq_lseek,
791 .release = single_release,
793}; 792};
794 793
795static __init int sched_init_debug(void) 794static __init int sched_init_debug(void)
@@ -818,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
818unsigned int sysctl_sched_shares_ratelimit = 250000; 817unsigned int sysctl_sched_shares_ratelimit = 250000;
819 818
820/* 819/*
820 * Inject some fuzzyness into changing the per-cpu group shares
821 * this avoids remote rq-locks at the expense of fairness.
822 * default: 4
823 */
824unsigned int sysctl_sched_shares_thresh = 4;
825
826/*
821 * period over which we measure -rt task cpu usage in us. 827 * period over which we measure -rt task cpu usage in us.
822 * default: 1s 828 * default: 1s
823 */ 829 */
@@ -962,6 +968,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
962 } 968 }
963} 969}
964 970
971void task_rq_unlock_wait(struct task_struct *p)
972{
973 struct rq *rq = task_rq(p);
974
975 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
976 spin_unlock_wait(&rq->lock);
977}
978
965static void __task_rq_unlock(struct rq *rq) 979static void __task_rq_unlock(struct rq *rq)
966 __releases(rq->lock) 980 __releases(rq->lock)
967{ 981{
@@ -1063,7 +1077,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1063 struct hrtimer *timer = &rq->hrtick_timer; 1077 struct hrtimer *timer = &rq->hrtick_timer;
1064 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1078 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1065 1079
1066 timer->expires = time; 1080 hrtimer_set_expires(timer, time);
1067 1081
1068 if (rq == this_rq()) { 1082 if (rq == this_rq()) {
1069 hrtimer_restart(timer); 1083 hrtimer_restart(timer);
@@ -1124,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
1124 1138
1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1139 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1126 rq->hrtick_timer.function = hrtick; 1140 rq->hrtick_timer.function = hrtick;
1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1128} 1141}
1129#else /* CONFIG_SCHED_HRTICK */ 1142#else /* CONFIG_SCHED_HRTICK */
1130static inline void hrtick_clear(struct rq *rq) 1143static inline void hrtick_clear(struct rq *rq)
@@ -1438,9 +1451,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1438static unsigned long cpu_avg_load_per_task(int cpu) 1451static unsigned long cpu_avg_load_per_task(int cpu)
1439{ 1452{
1440 struct rq *rq = cpu_rq(cpu); 1453 struct rq *rq = cpu_rq(cpu);
1454 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1441 1455
1442 if (rq->nr_running) 1456 if (nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running; 1457 rq->avg_load_per_task = rq->load.weight / nr_running;
1458 else
1459 rq->avg_load_per_task = 0;
1444 1460
1445 return rq->avg_load_per_task; 1461 return rq->avg_load_per_task;
1446} 1462}
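For reference, the point of snapshotting nr_running above: without ACCESS_ONCE() the value can be re-read between the test and the division, and another CPU can drop it to zero in that window, so the old code could divide by zero. A user-space sketch of the same pattern, with made-up weights and ACCESS_ONCE modelled as a volatile read:

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static unsigned long nr_running;        /* imagine another thread updating this */
static unsigned long load_weight = 4096;

static unsigned long avg_load_per_task(void)
{
        unsigned long nr = ACCESS_ONCE(nr_running);     /* read exactly once */

        return nr ? load_weight / nr : 0;
}

int main(void)
{
        nr_running = 4;
        printf("avg load per task: %lu\n", avg_load_per_task());       /* 1024 */
        nr_running = 0;
        printf("avg load per task: %lu\n", avg_load_per_task());       /* 0 */
        return 0;
}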
@@ -1453,30 +1469,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1453 * Calculate and set the cpu's group shares. 1469 * Calculate and set the cpu's group shares.
1454 */ 1470 */
1455static void 1471static void
1456__update_group_shares_cpu(struct task_group *tg, int cpu, 1472update_group_shares_cpu(struct task_group *tg, int cpu,
1457 unsigned long sd_shares, unsigned long sd_rq_weight) 1473 unsigned long sd_shares, unsigned long sd_rq_weight)
1458{ 1474{
1459 int boost = 0;
1460 unsigned long shares; 1475 unsigned long shares;
1461 unsigned long rq_weight; 1476 unsigned long rq_weight;
1462 1477
1463 if (!tg->se[cpu]) 1478 if (!tg->se[cpu])
1464 return; 1479 return;
1465 1480
1466 rq_weight = tg->cfs_rq[cpu]->load.weight; 1481 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1467
1468 /*
1469 * If there are currently no tasks on the cpu pretend there is one of
1470 * average load so that when a new task gets to run here it will not
1471 * get delayed by group starvation.
1472 */
1473 if (!rq_weight) {
1474 boost = 1;
1475 rq_weight = NICE_0_LOAD;
1476 }
1477
1478 if (unlikely(rq_weight > sd_rq_weight))
1479 rq_weight = sd_rq_weight;
1480 1482
1481 /* 1483 /*
1482 * \Sum shares * rq_weight 1484 * \Sum shares * rq_weight
@@ -1484,20 +1486,20 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1484 * \Sum rq_weight 1486 * \Sum rq_weight
1485 * 1487 *
1486 */ 1488 */
1487 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1489 shares = (sd_shares * rq_weight) / sd_rq_weight;
1490 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1488 1491
1489 /* 1492 if (abs(shares - tg->se[cpu]->load.weight) >
1490 * record the actual number of shares, not the boosted amount. 1493 sysctl_sched_shares_thresh) {
1491 */ 1494 struct rq *rq = cpu_rq(cpu);
1492 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1495 unsigned long flags;
1493 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1494 1496
1495 if (shares < MIN_SHARES) 1497 spin_lock_irqsave(&rq->lock, flags);
1496 shares = MIN_SHARES; 1498 tg->cfs_rq[cpu]->shares = shares;
1497 else if (shares > MAX_SHARES)
1498 shares = MAX_SHARES;
1499 1499
1500 __set_se_shares(tg->se[cpu], shares); 1500 __set_se_shares(tg->se[cpu], shares);
1501 spin_unlock_irqrestore(&rq->lock, flags);
1502 }
1501} 1503}
1502 1504
1503/* 1505/*
@@ -1507,13 +1509,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1507 */ 1509 */
1508static int tg_shares_up(struct task_group *tg, void *data) 1510static int tg_shares_up(struct task_group *tg, void *data)
1509{ 1511{
1510 unsigned long rq_weight = 0; 1512 unsigned long weight, rq_weight = 0;
1511 unsigned long shares = 0; 1513 unsigned long shares = 0;
1512 struct sched_domain *sd = data; 1514 struct sched_domain *sd = data;
1513 int i; 1515 int i;
1514 1516
1515 for_each_cpu_mask(i, sd->span) { 1517 for_each_cpu_mask(i, sd->span) {
1516 rq_weight += tg->cfs_rq[i]->load.weight; 1518 /*
1519 * If there are currently no tasks on the cpu pretend there
1520 * is one of average load so that when a new task gets to
1521 * run here it will not get delayed by group starvation.
1522 */
1523 weight = tg->cfs_rq[i]->load.weight;
1524 if (!weight)
1525 weight = NICE_0_LOAD;
1526
1527 tg->cfs_rq[i]->rq_weight = weight;
1528 rq_weight += weight;
1517 shares += tg->cfs_rq[i]->shares; 1529 shares += tg->cfs_rq[i]->shares;
1518 } 1530 }
1519 1531
@@ -1523,17 +1535,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
1523 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1535 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1524 shares = tg->shares; 1536 shares = tg->shares;
1525 1537
1526 if (!rq_weight) 1538 for_each_cpu_mask(i, sd->span)
1527 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; 1539 update_group_shares_cpu(tg, i, shares, rq_weight);
1528
1529 for_each_cpu_mask(i, sd->span) {
1530 struct rq *rq = cpu_rq(i);
1531 unsigned long flags;
1532
1533 spin_lock_irqsave(&rq->lock, flags);
1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1535 spin_unlock_irqrestore(&rq->lock, flags);
1536 }
1537 1540
1538 return 0; 1541 return 0;
1539} 1542}
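For reference, the redistribution that tg_shares_up() and update_group_shares_cpu() perform above boils down to shares_i = tg->shares * rq_weight_i / (sum of rq_weight), clamped to [MIN_SHARES, MAX_SHARES], with idle CPUs pretending to carry one task of average load; the new sysctl_sched_shares_thresh then skips the rq-lock round trip whenever the resulting change is below the threshold. A stand-alone sketch of the arithmetic, where the constants and the per-cpu weights are assumed example values:

#include <stdio.h>

#define NICE_0_LOAD  1024UL
#define MIN_SHARES   2UL
#define MAX_SHARES   (1UL << 18)

static unsigned long clamp_shares(unsigned long s)
{
        if (s < MIN_SHARES)
                return MIN_SHARES;
        if (s > MAX_SHARES)
                return MAX_SHARES;
        return s;
}

int main(void)
{
        /* cfs_rq load.weight of one task group on three cpus; cpu2 is idle
         * and is therefore pretended to carry one task of average load. */
        unsigned long weight[3] = { 2048, 1024, 0 };
        unsigned long tg_shares = 1024;         /* the group's total shares */
        unsigned long rq_weight = 0;
        int i;

        for (i = 0; i < 3; i++) {
                if (!weight[i])
                        weight[i] = NICE_0_LOAD;
                rq_weight += weight[i];
        }

        /* shares_i = tg_shares * rq_weight_i / \Sum rq_weight */
        for (i = 0; i < 3; i++)
                printf("cpu%d: %lu shares\n", i,
                       clamp_shares(tg_shares * weight[i] / rq_weight));
        return 0;                               /* prints 512, 256, 256 */
}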
@@ -1596,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1596 1599
1597#endif 1600#endif
1598 1601
1602/*
1603 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1604 */
1605static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1606 __releases(this_rq->lock)
1607 __acquires(busiest->lock)
1608 __acquires(this_rq->lock)
1609{
1610 int ret = 0;
1611
1612 if (unlikely(!irqs_disabled())) {
1613 /* printk() doesn't work good under rq->lock */
1614 spin_unlock(&this_rq->lock);
1615 BUG_ON(1);
1616 }
1617 if (unlikely(!spin_trylock(&busiest->lock))) {
1618 if (busiest < this_rq) {
1619 spin_unlock(&this_rq->lock);
1620 spin_lock(&busiest->lock);
1621 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1622 ret = 1;
1623 } else
1624 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1625 }
1626 return ret;
1627}
1628
1629static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1630 __releases(busiest->lock)
1631{
1632 spin_unlock(&busiest->lock);
1633 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1634}
1599#endif 1635#endif
1600 1636
1601#ifdef CONFIG_FAIR_GROUP_SCHED 1637#ifdef CONFIG_FAIR_GROUP_SCHED
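For reference, double_lock_balance() above avoids AB-BA deadlock by always taking the lower-addressed runqueue lock first once the trylock fast path fails; the kernel version additionally returns 1 when it had to drop this_rq->lock (so callers can revalidate what they were doing) and uses spin_lock_nested() to keep lockdep quiet. A user-space model of just the ordering idea, using pthread mutexes and the same pointer comparison:

#include <pthread.h>
#include <stdio.h>

struct rq {
        pthread_mutex_t lock;
        int nr_running;
};

/* Caller already holds this_rq->lock; returns with both locks held. */
static void double_lock(struct rq *this_rq, struct rq *busiest)
{
        if (pthread_mutex_trylock(&busiest->lock) == 0)
                return;                 /* fast path: no ordering games needed */

        if (busiest < this_rq) {
                /* Wrong order: back off and retake both, lowest address first. */
                pthread_mutex_unlock(&this_rq->lock);
                pthread_mutex_lock(&busiest->lock);
                pthread_mutex_lock(&this_rq->lock);
        } else {
                pthread_mutex_lock(&busiest->lock);
        }
}

int main(void)
{
        struct rq a = { PTHREAD_MUTEX_INITIALIZER, 3 };
        struct rq b = { PTHREAD_MUTEX_INITIALIZER, 1 };

        pthread_mutex_lock(&a.lock);    /* "this_rq" is locked on entry */
        double_lock(&a, &b);
        printf("both runqueues locked: %d vs %d tasks\n",
               a.nr_running, b.nr_running);
        pthread_mutex_unlock(&b.lock);
        pthread_mutex_unlock(&a.lock);
        return 0;
}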
@@ -1800,7 +1836,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1800 /* 1836 /*
1801 * Buddy candidates are cache hot: 1837 * Buddy candidates are cache hot:
1802 */ 1838 */
1803 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) 1839 if (sched_feat(CACHE_HOT_BUDDY) &&
1840 (&p->se == cfs_rq_of(&p->se)->next ||
1841 &p->se == cfs_rq_of(&p->se)->last))
1804 return 1; 1842 return 1;
1805 1843
1806 if (p->sched_class != &fair_sched_class) 1844 if (p->sched_class != &fair_sched_class)
@@ -1827,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1827 1865
1828 clock_offset = old_rq->clock - new_rq->clock; 1866 clock_offset = old_rq->clock - new_rq->clock;
1829 1867
1868 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1869
1830#ifdef CONFIG_SCHEDSTATS 1870#ifdef CONFIG_SCHEDSTATS
1831 if (p->se.wait_start) 1871 if (p->se.wait_start)
1832 p->se.wait_start -= clock_offset; 1872 p->se.wait_start -= clock_offset;
@@ -1936,6 +1976,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1936 * just go back and repeat. 1976 * just go back and repeat.
1937 */ 1977 */
1938 rq = task_rq_lock(p, &flags); 1978 rq = task_rq_lock(p, &flags);
1979 trace_sched_wait_task(rq, p);
1939 running = task_running(rq, p); 1980 running = task_running(rq, p);
1940 on_rq = p->se.on_rq; 1981 on_rq = p->se.on_rq;
1941 ncsw = 0; 1982 ncsw = 0;
@@ -2235,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2235 2276
2236 smp_wmb(); 2277 smp_wmb();
2237 rq = task_rq_lock(p, &flags); 2278 rq = task_rq_lock(p, &flags);
2279 update_rq_clock(rq);
2238 old_state = p->state; 2280 old_state = p->state;
2239 if (!(old_state & state)) 2281 if (!(old_state & state))
2240 goto out; 2282 goto out;
@@ -2292,14 +2334,11 @@ out_activate:
2292 schedstat_inc(p, se.nr_wakeups_local); 2334 schedstat_inc(p, se.nr_wakeups_local);
2293 else 2335 else
2294 schedstat_inc(p, se.nr_wakeups_remote); 2336 schedstat_inc(p, se.nr_wakeups_remote);
2295 update_rq_clock(rq);
2296 activate_task(rq, p, 1); 2337 activate_task(rq, p, 1);
2297 success = 1; 2338 success = 1;
2298 2339
2299out_running: 2340out_running:
2300 trace_mark(kernel_sched_wakeup, 2341 trace_sched_wakeup(rq, p, success);
2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2302 p->pid, p->state, rq, p, rq->curr);
2303 check_preempt_curr(rq, p, sync); 2342 check_preempt_curr(rq, p, sync);
2304 2343
2305 p->state = TASK_RUNNING; 2344 p->state = TASK_RUNNING;
@@ -2432,9 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2432 p->sched_class->task_new(rq, p); 2471 p->sched_class->task_new(rq, p);
2433 inc_nr_running(rq); 2472 inc_nr_running(rq);
2434 } 2473 }
2435 trace_mark(kernel_sched_wakeup_new, 2474 trace_sched_wakeup_new(rq, p, 1);
2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2437 p->pid, p->state, rq, p, rq->curr);
2438 check_preempt_curr(rq, p, 0); 2475 check_preempt_curr(rq, p, 0);
2439#ifdef CONFIG_SMP 2476#ifdef CONFIG_SMP
2440 if (p->sched_class->task_wake_up) 2477 if (p->sched_class->task_wake_up)
@@ -2607,11 +2644,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2607 struct mm_struct *mm, *oldmm; 2644 struct mm_struct *mm, *oldmm;
2608 2645
2609 prepare_task_switch(rq, prev, next); 2646 prepare_task_switch(rq, prev, next);
2610 trace_mark(kernel_sched_schedule, 2647 trace_sched_switch(rq, prev, next);
2611 "prev_pid %d next_pid %d prev_state %ld "
2612 "## rq %p prev %p next %p",
2613 prev->pid, next->pid, prev->state,
2614 rq, prev, next);
2615 mm = next->mm; 2648 mm = next->mm;
2616 oldmm = prev->active_mm; 2649 oldmm = prev->active_mm;
2617 /* 2650 /*
@@ -2801,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2801} 2834}
2802 2835
2803/* 2836/*
2804 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2805 */
2806static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2807 __releases(this_rq->lock)
2808 __acquires(busiest->lock)
2809 __acquires(this_rq->lock)
2810{
2811 int ret = 0;
2812
2813 if (unlikely(!irqs_disabled())) {
2814 /* printk() doesn't work good under rq->lock */
2815 spin_unlock(&this_rq->lock);
2816 BUG_ON(1);
2817 }
2818 if (unlikely(!spin_trylock(&busiest->lock))) {
2819 if (busiest < this_rq) {
2820 spin_unlock(&this_rq->lock);
2821 spin_lock(&busiest->lock);
2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2823 ret = 1;
2824 } else
2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2826 }
2827 return ret;
2828}
2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
2837/*
2838 * If dest_cpu is allowed for this process, migrate the task to it. 2837 * If dest_cpu is allowed for this process, migrate the task to it.
2839 * This is accomplished by forcing the cpu_allowed mask to only 2838 * This is accomplished by forcing the cpu_allowed mask to only
2840 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2839 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -3344,7 +3343,7 @@ small_imbalance:
3344 } else 3343 } else
3345 this_load_per_task = cpu_avg_load_per_task(this_cpu); 3344 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3346 3345
3347 if (max_load - this_load + 2*busiest_load_per_task >= 3346 if (max_load - this_load + busiest_load_per_task >=
3348 busiest_load_per_task * imbn) { 3347 busiest_load_per_task * imbn) {
3349 *imbalance = busiest_load_per_task; 3348 *imbalance = busiest_load_per_task;
3350 return busiest; 3349 return busiest;
@@ -3695,7 +3694,7 @@ out_balanced:
3695static void idle_balance(int this_cpu, struct rq *this_rq) 3694static void idle_balance(int this_cpu, struct rq *this_rq)
3696{ 3695{
3697 struct sched_domain *sd; 3696 struct sched_domain *sd;
3698 int pulled_task = -1; 3697 int pulled_task = 0;
3699 unsigned long next_balance = jiffies + HZ; 3698 unsigned long next_balance = jiffies + HZ;
3700 cpumask_t tmpmask; 3699 cpumask_t tmpmask;
3701 3700
@@ -4052,23 +4051,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4052EXPORT_PER_CPU_SYMBOL(kstat); 4051EXPORT_PER_CPU_SYMBOL(kstat);
4053 4052
4054/* 4053/*
4055 * Return p->sum_exec_runtime plus any more ns on the sched_clock 4054 * Return any ns on the sched_clock that have not yet been banked in
4056 * that have not yet been banked in case the task is currently running. 4055 * @p in case that task is currently running.
4057 */ 4056 */
4058unsigned long long task_sched_runtime(struct task_struct *p) 4057unsigned long long task_delta_exec(struct task_struct *p)
4059{ 4058{
4060 unsigned long flags; 4059 unsigned long flags;
4061 u64 ns, delta_exec;
4062 struct rq *rq; 4060 struct rq *rq;
4061 u64 ns = 0;
4063 4062
4064 rq = task_rq_lock(p, &flags); 4063 rq = task_rq_lock(p, &flags);
4065 ns = p->se.sum_exec_runtime; 4064
4066 if (task_current(rq, p)) { 4065 if (task_current(rq, p)) {
4066 u64 delta_exec;
4067
4067 update_rq_clock(rq); 4068 update_rq_clock(rq);
4068 delta_exec = rq->clock - p->se.exec_start; 4069 delta_exec = rq->clock - p->se.exec_start;
4069 if ((s64)delta_exec > 0) 4070 if ((s64)delta_exec > 0)
4070 ns += delta_exec; 4071 ns = delta_exec;
4071 } 4072 }
4073
4072 task_rq_unlock(rq, &flags); 4074 task_rq_unlock(rq, &flags);
4073 4075
4074 return ns; 4076 return ns;
@@ -4085,6 +4087,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4085 cputime64_t tmp; 4087 cputime64_t tmp;
4086 4088
4087 p->utime = cputime_add(p->utime, cputime); 4089 p->utime = cputime_add(p->utime, cputime);
4090 account_group_user_time(p, cputime);
4088 4091
4089 /* Add user time to cpustat. */ 4092 /* Add user time to cpustat. */
4090 tmp = cputime_to_cputime64(cputime); 4093 tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4112,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
4109 tmp = cputime_to_cputime64(cputime); 4112 tmp = cputime_to_cputime64(cputime);
4110 4113
4111 p->utime = cputime_add(p->utime, cputime); 4114 p->utime = cputime_add(p->utime, cputime);
4115 account_group_user_time(p, cputime);
4112 p->gtime = cputime_add(p->gtime, cputime); 4116 p->gtime = cputime_add(p->gtime, cputime);
4113 4117
4114 cpustat->user = cputime64_add(cpustat->user, tmp); 4118 cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4148,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4144 } 4148 }
4145 4149
4146 p->stime = cputime_add(p->stime, cputime); 4150 p->stime = cputime_add(p->stime, cputime);
4151 account_group_system_time(p, cputime);
4147 4152
4148 /* Add system time to cpustat. */ 4153 /* Add system time to cpustat. */
4149 tmp = cputime_to_cputime64(cputime); 4154 tmp = cputime_to_cputime64(cputime);
@@ -4320,7 +4325,7 @@ void __kprobes sub_preempt_count(int val)
4320 /* 4325 /*
4321 * Underflow? 4326 * Underflow?
4322 */ 4327 */
4323 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4328 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4324 return; 4329 return;
4325 /* 4330 /*
4326 * Is the spinlock portion underflowing? 4331 * Is the spinlock portion underflowing?
@@ -4441,12 +4446,8 @@ need_resched_nonpreemptible:
4441 if (sched_feat(HRTICK)) 4446 if (sched_feat(HRTICK))
4442 hrtick_clear(rq); 4447 hrtick_clear(rq);
4443 4448
4444 /* 4449 spin_lock_irq(&rq->lock);
4445 * Do the rq-clock update outside the rq lock:
4446 */
4447 local_irq_disable();
4448 update_rq_clock(rq); 4450 update_rq_clock(rq);
4449 spin_lock(&rq->lock);
4450 clear_tsk_need_resched(prev); 4451 clear_tsk_need_resched(prev);
4451 4452
4452 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4453 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -5119,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5119 set_load_weight(p); 5120 set_load_weight(p);
5120} 5121}
5121 5122
5123/*
5124 * check the target process has a UID that matches the current process's
5125 */
5126static bool check_same_owner(struct task_struct *p)
5127{
5128 const struct cred *cred = current_cred(), *pcred;
5129 bool match;
5130
5131 rcu_read_lock();
5132 pcred = __task_cred(p);
5133 match = (cred->euid == pcred->euid ||
5134 cred->euid == pcred->uid);
5135 rcu_read_unlock();
5136 return match;
5137}
5138
5122static int __sched_setscheduler(struct task_struct *p, int policy, 5139static int __sched_setscheduler(struct task_struct *p, int policy,
5123 struct sched_param *param, bool user) 5140 struct sched_param *param, bool user)
5124{ 5141{
@@ -5178,8 +5195,7 @@ recheck:
5178 return -EPERM; 5195 return -EPERM;
5179 5196
5180 /* can't change other user's priorities */ 5197 /* can't change other user's priorities */
5181 if ((current->euid != p->euid) && 5198 if (!check_same_owner(p))
5182 (current->euid != p->uid))
5183 return -EPERM; 5199 return -EPERM;
5184 } 5200 }
5185 5201
@@ -5411,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5411 read_unlock(&tasklist_lock); 5427 read_unlock(&tasklist_lock);
5412 5428
5413 retval = -EPERM; 5429 retval = -EPERM;
5414 if ((current->euid != p->euid) && (current->euid != p->uid) && 5430 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5415 !capable(CAP_SYS_NICE))
5416 goto out_unlock; 5431 goto out_unlock;
5417 5432
5418 retval = security_task_setscheduler(p, 0, NULL); 5433 retval = security_task_setscheduler(p, 0, NULL);
@@ -5851,6 +5866,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5851 struct rq *rq = cpu_rq(cpu); 5866 struct rq *rq = cpu_rq(cpu);
5852 unsigned long flags; 5867 unsigned long flags;
5853 5868
5869 spin_lock_irqsave(&rq->lock, flags);
5870
5854 __sched_fork(idle); 5871 __sched_fork(idle);
5855 idle->se.exec_start = sched_clock(); 5872 idle->se.exec_start = sched_clock();
5856 5873
@@ -5858,7 +5875,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5858 idle->cpus_allowed = cpumask_of_cpu(cpu); 5875 idle->cpus_allowed = cpumask_of_cpu(cpu);
5859 __set_task_cpu(idle, cpu); 5876 __set_task_cpu(idle, cpu);
5860 5877
5861 spin_lock_irqsave(&rq->lock, flags);
5862 rq->curr = rq->idle = idle; 5878 rq->curr = rq->idle = idle;
5863#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5879#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5864 idle->oncpu = 1; 5880 idle->oncpu = 1;
@@ -5875,6 +5891,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5875 * The idle tasks have their own, simple scheduling class: 5891 * The idle tasks have their own, simple scheduling class:
5876 */ 5892 */
5877 idle->sched_class = &idle_sched_class; 5893 idle->sched_class = &idle_sched_class;
5894 ftrace_graph_init_task(idle);
5878} 5895}
5879 5896
5880/* 5897/*
@@ -6105,7 +6122,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6105 6122
6106/* 6123/*
6107 * Figure out where task on dead CPU should go, use force if necessary. 6124 * Figure out where task on dead CPU should go, use force if necessary.
6108 * NOTE: interrupts should be disabled by the caller
6109 */ 6125 */
6110static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6126static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6111{ 6127{
@@ -6566,7 +6582,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6566 req = list_entry(rq->migration_queue.next, 6582 req = list_entry(rq->migration_queue.next,
6567 struct migration_req, list); 6583 struct migration_req, list);
6568 list_del_init(&req->list); 6584 list_del_init(&req->list);
6585 spin_unlock_irq(&rq->lock);
6569 complete(&req->done); 6586 complete(&req->done);
6587 spin_lock_irq(&rq->lock);
6570 } 6588 }
6571 spin_unlock_irq(&rq->lock); 6589 spin_unlock_irq(&rq->lock);
6572 break; 6590 break;
@@ -6615,28 +6633,6 @@ early_initcall(migration_init);
6615 6633
6616#ifdef CONFIG_SCHED_DEBUG 6634#ifdef CONFIG_SCHED_DEBUG
6617 6635
6618static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6619{
6620 switch (lvl) {
6621 case SD_LV_NONE:
6622 return "NONE";
6623 case SD_LV_SIBLING:
6624 return "SIBLING";
6625 case SD_LV_MC:
6626 return "MC";
6627 case SD_LV_CPU:
6628 return "CPU";
6629 case SD_LV_NODE:
6630 return "NODE";
6631 case SD_LV_ALLNODES:
6632 return "ALLNODES";
6633 case SD_LV_MAX:
6634 return "MAX";
6635
6636 }
6637 return "MAX";
6638}
6639
6640static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6636static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6641 cpumask_t *groupmask) 6637 cpumask_t *groupmask)
6642{ 6638{
@@ -6656,8 +6652,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6656 return -1; 6652 return -1;
6657 } 6653 }
6658 6654
6659 printk(KERN_CONT "span %s level %s\n", 6655 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6660 str, sd_level_to_string(sd->level));
6661 6656
6662 if (!cpu_isset(cpu, sd->span)) { 6657 if (!cpu_isset(cpu, sd->span)) {
6663 printk(KERN_ERR "ERROR: domain->span does not contain " 6658 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6793,6 +6788,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6793 SD_BALANCE_EXEC | 6788 SD_BALANCE_EXEC |
6794 SD_SHARE_CPUPOWER | 6789 SD_SHARE_CPUPOWER |
6795 SD_SHARE_PKG_RESOURCES); 6790 SD_SHARE_PKG_RESOURCES);
6791 if (nr_node_ids == 1)
6792 pflags &= ~SD_SERIALIZE;
6796 } 6793 }
6797 if (~cflags & pflags) 6794 if (~cflags & pflags)
6798 return 0; 6795 return 0;
@@ -6868,15 +6865,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6868 struct sched_domain *tmp; 6865 struct sched_domain *tmp;
6869 6866
6870 /* Remove the sched domains which do not contribute to scheduling. */ 6867 /* Remove the sched domains which do not contribute to scheduling. */
6871 for (tmp = sd; tmp; tmp = tmp->parent) { 6868 for (tmp = sd; tmp; ) {
6872 struct sched_domain *parent = tmp->parent; 6869 struct sched_domain *parent = tmp->parent;
6873 if (!parent) 6870 if (!parent)
6874 break; 6871 break;
6872
6875 if (sd_parent_degenerate(tmp, parent)) { 6873 if (sd_parent_degenerate(tmp, parent)) {
6876 tmp->parent = parent->parent; 6874 tmp->parent = parent->parent;
6877 if (parent->parent) 6875 if (parent->parent)
6878 parent->parent->child = tmp; 6876 parent->parent->child = tmp;
6879 } 6877 } else
6878 tmp = tmp->parent;
6880 } 6879 }
6881 6880
6882 if (sd && sd_degenerate(sd)) { 6881 if (sd && sd_degenerate(sd)) {
@@ -7311,13 +7310,21 @@ struct allmasks {
7311}; 7310};
7312 7311
7313#if NR_CPUS > 128 7312#if NR_CPUS > 128
7314#define SCHED_CPUMASK_ALLOC 1 7313#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7315#define SCHED_CPUMASK_FREE(v) kfree(v) 7314static inline void sched_cpumask_alloc(struct allmasks **masks)
7316#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v 7315{
7316 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7317}
7318static inline void sched_cpumask_free(struct allmasks *masks)
7319{
7320 kfree(masks);
7321}
7317#else 7322#else
7318#define SCHED_CPUMASK_ALLOC 0 7323#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7319#define SCHED_CPUMASK_FREE(v) 7324static inline void sched_cpumask_alloc(struct allmasks **masks)
7320#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v 7325{ }
7326static inline void sched_cpumask_free(struct allmasks *masks)
7327{ }
7321#endif 7328#endif
7322 7329
7323#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ 7330#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
@@ -7393,9 +7400,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7393 return -ENOMEM; 7400 return -ENOMEM;
7394 } 7401 }
7395 7402
7396#if SCHED_CPUMASK_ALLOC
7397 /* get space for all scratch cpumask variables */ 7403 /* get space for all scratch cpumask variables */
7398 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); 7404 sched_cpumask_alloc(&allmasks);
7399 if (!allmasks) { 7405 if (!allmasks) {
7400 printk(KERN_WARNING "Cannot alloc cpumask array\n"); 7406 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7401 kfree(rd); 7407 kfree(rd);
@@ -7404,7 +7410,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7404#endif 7410#endif
7405 return -ENOMEM; 7411 return -ENOMEM;
7406 } 7412 }
7407#endif 7413
7408 tmpmask = (cpumask_t *)allmasks; 7414 tmpmask = (cpumask_t *)allmasks;
7409 7415
7410 7416
@@ -7658,13 +7664,14 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7658 cpu_attach_domain(sd, rd, i); 7664 cpu_attach_domain(sd, rd, i);
7659 } 7665 }
7660 7666
7661 SCHED_CPUMASK_FREE((void *)allmasks); 7667 sched_cpumask_free(allmasks);
7662 return 0; 7668 return 0;
7663 7669
7664#ifdef CONFIG_NUMA 7670#ifdef CONFIG_NUMA
7665error: 7671error:
7666 free_sched_groups(cpu_map, tmpmask); 7672 free_sched_groups(cpu_map, tmpmask);
7667 SCHED_CPUMASK_FREE((void *)allmasks); 7673 sched_cpumask_free(allmasks);
7674 kfree(rd);
7668 return -ENOMEM; 7675 return -ENOMEM;
7669#endif 7676#endif
7670} 7677}
@@ -7686,8 +7693,14 @@ static struct sched_domain_attr *dattr_cur;
7686 */ 7693 */
7687static cpumask_t fallback_doms; 7694static cpumask_t fallback_doms;
7688 7695
7689void __attribute__((weak)) arch_update_cpu_topology(void) 7696/*
7697 * arch_update_cpu_topology lets virtualized architectures update the
7698 * cpu core maps. It is supposed to return 1 if the topology changed
7699 * or 0 if it stayed the same.
7700 */
7701int __attribute__((weak)) arch_update_cpu_topology(void)
7690{ 7702{
7703 return 0;
7691} 7704}
7692 7705
7693/* 7706/*
@@ -7727,8 +7740,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7727 cpumask_t tmpmask; 7740 cpumask_t tmpmask;
7728 int i; 7741 int i;
7729 7742
7730 unregister_sched_domain_sysctl();
7731
7732 for_each_cpu_mask_nr(i, *cpu_map) 7743 for_each_cpu_mask_nr(i, *cpu_map)
7733 cpu_attach_domain(NULL, &def_root_domain, i); 7744 cpu_attach_domain(NULL, &def_root_domain, i);
7734 synchronize_sched(); 7745 synchronize_sched();
@@ -7766,13 +7777,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7766 * 7777 *
7767 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7778 * The passed in 'doms_new' should be kmalloc'd. This routine takes
7768 * ownership of it and will kfree it when done with it. If the caller 7779 * ownership of it and will kfree it when done with it. If the caller
7769 * failed the kmalloc call, then it can pass in doms_new == NULL, 7780 * failed the kmalloc call, then it can pass in doms_new == NULL &&
7770 * and partition_sched_domains() will fallback to the single partition 7781 * ndoms_new == 1, and partition_sched_domains() will fallback to
7771 * 'fallback_doms', it also forces the domains to be rebuilt. 7782 * the single partition 'fallback_doms', it also forces the domains
7783 * to be rebuilt.
7772 * 7784 *
7773 * If doms_new==NULL it will be replaced with cpu_online_map. 7785 * If doms_new == NULL it will be replaced with cpu_online_map.
7774 * ndoms_new==0 is a special case for destroying existing domains. 7786 * ndoms_new == 0 is a special case for destroying existing domains,
7775 * It will not create the default domain. 7787 * and it will not create the default domain.
7776 * 7788 *
7777 * Call with hotplug lock held 7789 * Call with hotplug lock held
7778 */ 7790 */
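A minimal caller sketch, assuming kernel context and not taken from the patch, of the documented fallback path: doms_new == NULL with ndoms_new == 1 rebuilds the single default partition, with the hotplug lock held as required.

static void rebuild_default_domain(void)
{
	get_online_cpus();	/* "Call with hotplug lock held" */
	/* doms_new == NULL && ndoms_new == 1: fall back to 'fallback_doms' */
	partition_sched_domains(1, NULL, NULL);
	put_online_cpus();
}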
@@ -7780,17 +7792,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7780 struct sched_domain_attr *dattr_new) 7792 struct sched_domain_attr *dattr_new)
7781{ 7793{
7782 int i, j, n; 7794 int i, j, n;
7795 int new_topology;
7783 7796
7784 mutex_lock(&sched_domains_mutex); 7797 mutex_lock(&sched_domains_mutex);
7785 7798
7786 /* always unregister in case we don't destroy any domains */ 7799 /* always unregister in case we don't destroy any domains */
7787 unregister_sched_domain_sysctl(); 7800 unregister_sched_domain_sysctl();
7788 7801
7802 /* Let architecture update cpu core mappings. */
7803 new_topology = arch_update_cpu_topology();
7804
7789 n = doms_new ? ndoms_new : 0; 7805 n = doms_new ? ndoms_new : 0;
7790 7806
7791 /* Destroy deleted domains */ 7807 /* Destroy deleted domains */
7792 for (i = 0; i < ndoms_cur; i++) { 7808 for (i = 0; i < ndoms_cur; i++) {
7793 for (j = 0; j < n; j++) { 7809 for (j = 0; j < n && !new_topology; j++) {
7794 if (cpus_equal(doms_cur[i], doms_new[j]) 7810 if (cpus_equal(doms_cur[i], doms_new[j])
7795 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7811 && dattrs_equal(dattr_cur, i, dattr_new, j))
7796 goto match1; 7812 goto match1;
@@ -7805,12 +7821,12 @@ match1:
7805 ndoms_cur = 0; 7821 ndoms_cur = 0;
7806 doms_new = &fallback_doms; 7822 doms_new = &fallback_doms;
7807 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7823 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7808 dattr_new = NULL; 7824 WARN_ON_ONCE(dattr_new);
7809 } 7825 }
7810 7826
7811 /* Build new domains */ 7827 /* Build new domains */
7812 for (i = 0; i < ndoms_new; i++) { 7828 for (i = 0; i < ndoms_new; i++) {
7813 for (j = 0; j < ndoms_cur; j++) { 7829 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7814 if (cpus_equal(doms_new[i], doms_cur[j]) 7830 if (cpus_equal(doms_new[i], doms_cur[j])
7815 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7831 && dattrs_equal(dattr_new, i, dattr_cur, j))
7816 goto match2; 7832 goto match2;
@@ -8465,7 +8481,7 @@ static
8465int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8481int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8466{ 8482{
8467 struct cfs_rq *cfs_rq; 8483 struct cfs_rq *cfs_rq;
8468 struct sched_entity *se, *parent_se; 8484 struct sched_entity *se;
8469 struct rq *rq; 8485 struct rq *rq;
8470 int i; 8486 int i;
8471 8487
@@ -8481,18 +8497,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8481 for_each_possible_cpu(i) { 8497 for_each_possible_cpu(i) {
8482 rq = cpu_rq(i); 8498 rq = cpu_rq(i);
8483 8499
8484 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8500 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8485 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8501 GFP_KERNEL, cpu_to_node(i));
8486 if (!cfs_rq) 8502 if (!cfs_rq)
8487 goto err; 8503 goto err;
8488 8504
8489 se = kmalloc_node(sizeof(struct sched_entity), 8505 se = kzalloc_node(sizeof(struct sched_entity),
8490 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8506 GFP_KERNEL, cpu_to_node(i));
8491 if (!se) 8507 if (!se)
8492 goto err; 8508 goto err;
8493 8509
8494 parent_se = parent ? parent->se[i] : NULL; 8510 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8495 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8496 } 8511 }
8497 8512
8498 return 1; 8513 return 1;
@@ -8553,7 +8568,7 @@ static
8553int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8568int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8554{ 8569{
8555 struct rt_rq *rt_rq; 8570 struct rt_rq *rt_rq;
8556 struct sched_rt_entity *rt_se, *parent_se; 8571 struct sched_rt_entity *rt_se;
8557 struct rq *rq; 8572 struct rq *rq;
8558 int i; 8573 int i;
8559 8574
@@ -8570,18 +8585,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8570 for_each_possible_cpu(i) { 8585 for_each_possible_cpu(i) {
8571 rq = cpu_rq(i); 8586 rq = cpu_rq(i);
8572 8587
8573 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8588 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8574 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8589 GFP_KERNEL, cpu_to_node(i));
8575 if (!rt_rq) 8590 if (!rt_rq)
8576 goto err; 8591 goto err;
8577 8592
8578 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8593 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8579 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8594 GFP_KERNEL, cpu_to_node(i));
8580 if (!rt_se) 8595 if (!rt_se)
8581 goto err; 8596 goto err;
8582 8597
8583 parent_se = parent ? parent->rt_se[i] : NULL; 8598 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8584 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8585 } 8599 }
8586 8600
8587 return 1; 8601 return 1;
@@ -9224,11 +9238,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9224 * (balbir@in.ibm.com). 9238 * (balbir@in.ibm.com).
9225 */ 9239 */
9226 9240
9227/* track cpu usage of a group of tasks */ 9241/* track cpu usage of a group of tasks and its child groups */
9228struct cpuacct { 9242struct cpuacct {
9229 struct cgroup_subsys_state css; 9243 struct cgroup_subsys_state css;
9230 /* cpuusage holds pointer to a u64-type object on every cpu */ 9244 /* cpuusage holds pointer to a u64-type object on every cpu */
9231 u64 *cpuusage; 9245 u64 *cpuusage;
9246 struct cpuacct *parent;
9232}; 9247};
9233 9248
9234struct cgroup_subsys cpuacct_subsys; 9249struct cgroup_subsys cpuacct_subsys;
@@ -9262,6 +9277,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9262 return ERR_PTR(-ENOMEM); 9277 return ERR_PTR(-ENOMEM);
9263 } 9278 }
9264 9279
9280 if (cgrp->parent)
9281 ca->parent = cgroup_ca(cgrp->parent);
9282
9265 return &ca->css; 9283 return &ca->css;
9266} 9284}
9267 9285
@@ -9275,6 +9293,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9275 kfree(ca); 9293 kfree(ca);
9276} 9294}
9277 9295
9296static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9297{
9298 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9299 u64 data;
9300
9301#ifndef CONFIG_64BIT
9302 /*
9303 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9304 */
9305 spin_lock_irq(&cpu_rq(cpu)->lock);
9306 data = *cpuusage;
9307 spin_unlock_irq(&cpu_rq(cpu)->lock);
9308#else
9309 data = *cpuusage;
9310#endif
9311
9312 return data;
9313}
9314
9315static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9316{
9317 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9318
9319#ifndef CONFIG_64BIT
9320 /*
9321 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9322 */
9323 spin_lock_irq(&cpu_rq(cpu)->lock);
9324 *cpuusage = val;
9325 spin_unlock_irq(&cpu_rq(cpu)->lock);
9326#else
9327 *cpuusage = val;
9328#endif
9329}
9330
9278/* return total cpu usage (in nanoseconds) of a group */ 9331/* return total cpu usage (in nanoseconds) of a group */
9279static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9332static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9280{ 9333{
@@ -9282,17 +9335,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9282 u64 totalcpuusage = 0; 9335 u64 totalcpuusage = 0;
9283 int i; 9336 int i;
9284 9337
9285 for_each_possible_cpu(i) { 9338 for_each_present_cpu(i)
9286 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9339 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9287
9288 /*
9289 * Take rq->lock to make 64-bit addition safe on 32-bit
9290 * platforms.
9291 */
9292 spin_lock_irq(&cpu_rq(i)->lock);
9293 totalcpuusage += *cpuusage;
9294 spin_unlock_irq(&cpu_rq(i)->lock);
9295 }
9296 9340
9297 return totalcpuusage; 9341 return totalcpuusage;
9298} 9342}
@@ -9309,23 +9353,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9309 goto out; 9353 goto out;
9310 } 9354 }
9311 9355
9312 for_each_possible_cpu(i) { 9356 for_each_present_cpu(i)
9313 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9357 cpuacct_cpuusage_write(ca, i, 0);
9314 9358
9315 spin_lock_irq(&cpu_rq(i)->lock);
9316 *cpuusage = 0;
9317 spin_unlock_irq(&cpu_rq(i)->lock);
9318 }
9319out: 9359out:
9320 return err; 9360 return err;
9321} 9361}
9322 9362
9363static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9364 struct seq_file *m)
9365{
9366 struct cpuacct *ca = cgroup_ca(cgroup);
9367 u64 percpu;
9368 int i;
9369
9370 for_each_present_cpu(i) {
9371 percpu = cpuacct_cpuusage_read(ca, i);
9372 seq_printf(m, "%llu ", (unsigned long long) percpu);
9373 }
9374 seq_printf(m, "\n");
9375 return 0;
9376}
9377
9323static struct cftype files[] = { 9378static struct cftype files[] = {
9324 { 9379 {
9325 .name = "usage", 9380 .name = "usage",
9326 .read_u64 = cpuusage_read, 9381 .read_u64 = cpuusage_read,
9327 .write_u64 = cpuusage_write, 9382 .write_u64 = cpuusage_write,
9328 }, 9383 },
9384 {
9385 .name = "usage_percpu",
9386 .read_seq_string = cpuacct_percpu_seq_read,
9387 },
9388
9329}; 9389};
9330 9390
9331static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9391static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9341,14 +9401,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9341static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9401static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9342{ 9402{
9343 struct cpuacct *ca; 9403 struct cpuacct *ca;
9404 int cpu;
9344 9405
9345 if (!cpuacct_subsys.active) 9406 if (!cpuacct_subsys.active)
9346 return; 9407 return;
9347 9408
9409 cpu = task_cpu(tsk);
9348 ca = task_ca(tsk); 9410 ca = task_ca(tsk);
9349 if (ca) {
9350 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9351 9411
9412 for (; ca; ca = ca->parent) {
9413 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9352 *cpuusage += cputime; 9414 *cpuusage += cputime;
9353 } 9415 }
9354} 9416}
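A userspace sketch, not part of the patch, for reading the new cpuacct.usage_percpu file; the cgroup mount point and group name are assumptions, but the format of one nanosecond counter per present CPU, space separated, is exactly what cpuacct_percpu_seq_read() emits.

#include <stdio.h>

int main(void)
{
	/* assumed mount point and group name */
	FILE *f = fopen("/dev/cgroup/mygroup/cpuacct.usage_percpu", "r");
	unsigned long long ns;
	int cpu = 0;

	if (!f) {
		perror("cpuacct.usage_percpu");
		return 1;
	}
	while (fscanf(f, "%llu", &ns) == 1)
		printf("cpu%d: %llu ns\n", cpu++, ns);
	fclose(f);
	return 0;
}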
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ad958c1ec708..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
53 53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg)
59{
60 struct sched_entity *se = tg->se[cpu];
61 if (!se)
62 return;
63
64#define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66#define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68
69 PN(se->exec_start);
70 PN(se->vruntime);
71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start);
74 PN(se->sleep_start);
75 PN(se->block_start);
76 PN(se->sleep_max);
77 PN(se->block_max);
78 PN(se->exec_max);
79 PN(se->slice_max);
80 PN(se->wait_max);
81 PN(se->wait_sum);
82 P(se->wait_count);
83#endif
84 P(se->load.weight);
85#undef PN
86#undef P
87}
88#endif
89
56static void 90static void
57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
58{ 92{
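For readers unfamiliar with the P()/PN() helpers above, this detached reimplementation shows the same #F stringification trick with plain printf() instead of SEQ_printf(); the sample value is made up.

#include <stdio.h>

#define P(F)	printf("  .%-30s: %lld\n", #F, (long long)(F))

int main(void)
{
	long long exec_start = 123456789LL;	/* stand-in for se->exec_start */

	P(exec_start);	/* prints the field name and its value, padded like the debug file */
	return 0;
}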
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
121 155
122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 char path[128] = ""; 157 char path[128] = "";
124 struct cgroup *cgroup = NULL;
125 struct task_group *tg = cfs_rq->tg; 158 struct task_group *tg = cfs_rq->tg;
126 159
127 if (tg) 160 cgroup_path(tg->css.cgroup, path, sizeof(path));
128 cgroup = tg->css.cgroup;
129
130 if (cgroup)
131 cgroup_path(cgroup, path, sizeof(path));
132 161
133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
163#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
164 {
165 uid_t uid = cfs_rq->tg->uid;
166 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
167 }
134#else 168#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 169 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 170#endif
137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 171 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
139 SPLIT_NS(cfs_rq->exec_clock)); 172 SPLIT_NS(cfs_rq->exec_clock));
140 173
@@ -144,7 +177,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 last = __pick_last_entity(cfs_rq); 177 last = __pick_last_entity(cfs_rq);
145 if (last) 178 if (last)
146 max_vruntime = last->vruntime; 179 max_vruntime = last->vruntime;
147 min_vruntime = rq->cfs.min_vruntime; 180 min_vruntime = cfs_rq->min_vruntime;
148 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 181 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
149 spin_unlock_irqrestore(&rq->lock, flags); 182 spin_unlock_irqrestore(&rq->lock, flags);
150 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 183 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,31 +194,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
161 SPLIT_NS(spread0)); 194 SPLIT_NS(spread0));
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 195 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 196 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS
165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171
172 P(sched_switch);
173 P(sched_count);
174 P(sched_goidle);
175 197
176 P(ttwu_count); 198 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
182#endif
183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
184 cfs_rq->nr_spread_over); 199 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED 200#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP 201#ifdef CONFIG_SMP
187 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 202 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
188#endif 203#endif
204 print_cfs_group_stats(m, cpu, cfs_rq->tg);
189#endif 205#endif
190} 206}
191 207
@@ -193,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{ 209{
194#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 210#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
195 char path[128] = ""; 211 char path[128] = "";
196 struct cgroup *cgroup = NULL;
197 struct task_group *tg = rt_rq->tg; 212 struct task_group *tg = rt_rq->tg;
198 213
199 if (tg) 214 cgroup_path(tg->css.cgroup, path, sizeof(path));
200 cgroup = tg->css.cgroup;
201
202 if (cgroup)
203 cgroup_path(cgroup, path, sizeof(path));
204 215
205 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 216 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
206#else 217#else
@@ -260,6 +271,25 @@ static void print_cpu(struct seq_file *m, int cpu)
260#undef P 271#undef P
261#undef PN 272#undef PN
262 273
274#ifdef CONFIG_SCHEDSTATS
275#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
276
277 P(yld_exp_empty);
278 P(yld_act_empty);
279 P(yld_both_empty);
280 P(yld_count);
281
282 P(sched_switch);
283 P(sched_count);
284 P(sched_goidle);
285
286 P(ttwu_count);
287 P(ttwu_local);
288
289 P(bkl_count);
290
291#undef P
292#endif
263 print_cfs_stats(m, cpu); 293 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu); 294 print_rt_stats(m, cpu);
265 295
@@ -271,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
271 u64 now = ktime_to_ns(ktime_get()); 301 u64 now = ktime_to_ns(ktime_get());
272 int cpu; 302 int cpu;
273 303
274 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", 304 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
275 init_utsname()->release, 305 init_utsname()->release,
276 (int)strcspn(init_utsname()->version, " "), 306 (int)strcspn(init_utsname()->version, " "),
277 init_utsname()->version); 307 init_utsname()->version);
@@ -319,7 +349,7 @@ static int __init init_sched_debug_procfs(void)
319{ 349{
320 struct proc_dir_entry *pe; 350 struct proc_dir_entry *pe;
321 351
322 pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); 352 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
323 if (!pe) 353 if (!pe)
324 return -ENOMEM; 354 return -ENOMEM;
325 return 0; 355 return 0;
@@ -422,10 +452,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
422#undef __P 452#undef __P
423 453
424 { 454 {
455 unsigned int this_cpu = raw_smp_processor_id();
425 u64 t0, t1; 456 u64 t0, t1;
426 457
427 t0 = sched_clock(); 458 t0 = cpu_clock(this_cpu);
428 t1 = sched_clock(); 459 t1 = cpu_clock(this_cpu);
429 SEQ_printf(m, "%-35s:%21Ld\n", 460 SEQ_printf(m, "%-35s:%21Ld\n",
430 "clock-delta", (long long)(t1-t0)); 461 "clock-delta", (long long)(t1-t0));
431 } 462 }
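The clock-delta probe now reads cpu_clock() twice on the same CPU and prints the difference; a userspace analogue, illustrative only and using clock_gettime() rather than any scheduler interface, is:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec a, b;
	long long t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &a);
	clock_gettime(CLOCK_MONOTONIC, &b);
	t0 = a.tv_sec * 1000000000LL + a.tv_nsec;
	t1 = b.tv_sec * 1000000000LL + b.tv_nsec;
	printf("clock-delta: %lld ns\n", t1 - t0);
	return 0;
}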
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 18fd17172eb6..5ad4440f0fc4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
76static const struct sched_class fair_sched_class;
77
76/************************************************************** 78/**************************************************************
77 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
78 */ 80 */
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
141 return se->parent; 143 return se->parent;
142} 144}
143 145
146/* return depth at which a sched entity is present in the hierarchy */
147static inline int depth_se(struct sched_entity *se)
148{
149 int depth = 0;
150
151 for_each_sched_entity(se)
152 depth++;
153
154 return depth;
155}
156
157static void
158find_matching_se(struct sched_entity **se, struct sched_entity **pse)
159{
160 int se_depth, pse_depth;
161
162 /*
163 * A preemption test can only be made between sibling entities that are in
164 * the same cfs_rq, i.e. that have a common parent. Walk up the hierarchy
165 * of both tasks until we find ancestors that are siblings under a common
166 * parent.
167 */
168
169 /* First walk up until both entities are at same depth */
170 se_depth = depth_se(*se);
171 pse_depth = depth_se(*pse);
172
173 while (se_depth > pse_depth) {
174 se_depth--;
175 *se = parent_entity(*se);
176 }
177
178 while (pse_depth > se_depth) {
179 pse_depth--;
180 *pse = parent_entity(*pse);
181 }
182
183 while (!is_same_group(*se, *pse)) {
184 *se = parent_entity(*se);
185 *pse = parent_entity(*pse);
186 }
187}
188
144#else /* CONFIG_FAIR_GROUP_SCHED */ 189#else /* CONFIG_FAIR_GROUP_SCHED */
145 190
146static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 191static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
191 return NULL; 236 return NULL;
192} 237}
193 238
239static inline void
240find_matching_se(struct sched_entity **se, struct sched_entity **pse)
241{
242}
243
194#endif /* CONFIG_FAIR_GROUP_SCHED */ 244#endif /* CONFIG_FAIR_GROUP_SCHED */
195 245
196 246
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
221 return se->vruntime - cfs_rq->min_vruntime; 271 return se->vruntime - cfs_rq->min_vruntime;
222} 272}
223 273
274static void update_min_vruntime(struct cfs_rq *cfs_rq)
275{
276 u64 vruntime = cfs_rq->min_vruntime;
277
278 if (cfs_rq->curr)
279 vruntime = cfs_rq->curr->vruntime;
280
281 if (cfs_rq->rb_leftmost) {
282 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
283 struct sched_entity,
284 run_node);
285
286 if (vruntime == cfs_rq->min_vruntime)
287 vruntime = se->vruntime;
288 else
289 vruntime = min_vruntime(vruntime, se->vruntime);
290 }
291
292 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
293}
294
224/* 295/*
225 * Enqueue an entity into the rb-tree: 296 * Enqueue an entity into the rb-tree:
226 */ 297 */
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
254 * Maintain a cache of leftmost tree entries (it is frequently 325 * Maintain a cache of leftmost tree entries (it is frequently
255 * used): 326 * used):
256 */ 327 */
257 if (leftmost) { 328 if (leftmost)
258 cfs_rq->rb_leftmost = &se->run_node; 329 cfs_rq->rb_leftmost = &se->run_node;
259 /*
260 * maintain cfs_rq->min_vruntime to be a monotonic increasing
261 * value tracking the leftmost vruntime in the tree.
262 */
263 cfs_rq->min_vruntime =
264 max_vruntime(cfs_rq->min_vruntime, se->vruntime);
265 }
266 330
267 rb_link_node(&se->run_node, parent, link); 331 rb_link_node(&se->run_node, parent, link);
268 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 332 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
272{ 336{
273 if (cfs_rq->rb_leftmost == &se->run_node) { 337 if (cfs_rq->rb_leftmost == &se->run_node) {
274 struct rb_node *next_node; 338 struct rb_node *next_node;
275 struct sched_entity *next;
276 339
277 next_node = rb_next(&se->run_node); 340 next_node = rb_next(&se->run_node);
278 cfs_rq->rb_leftmost = next_node; 341 cfs_rq->rb_leftmost = next_node;
279
280 if (next_node) {
281 next = rb_entry(next_node,
282 struct sched_entity, run_node);
283 cfs_rq->min_vruntime =
284 max_vruntime(cfs_rq->min_vruntime,
285 next->vruntime);
286 }
287 } 342 }
288 343
289 if (cfs_rq->next == se)
290 cfs_rq->next = NULL;
291
292 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 344 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
293} 345}
294 346
295static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
296{
297 return cfs_rq->rb_leftmost;
298}
299
300static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 347static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
301{ 348{
302 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 349 struct rb_node *left = cfs_rq->rb_leftmost;
350
351 if (!left)
352 return NULL;
353
354 return rb_entry(left, struct sched_entity, run_node);
303} 355}
304 356
305static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 357static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
306{ 358{
307 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 359 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
308 360
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 386#endif
335 387
336/* 388/*
337 * delta *= w / rw 389 * delta *= P[w / rw]
338 */ 390 */
339static inline unsigned long 391static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se) 392calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
348} 400}
349 401
350/* 402/*
351 * delta *= rw / w 403 * delta /= w
352 */ 404 */
353static inline unsigned long 405static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se) 406calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{ 407{
356 for_each_sched_entity(se) { 408 if (unlikely(se->load.weight != NICE_0_LOAD))
357 delta = calc_delta_mine(delta, 409 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360 410
361 return delta; 411 return delta;
362} 412}
@@ -386,26 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
386 * We calculate the wall-time slice from the period by taking a part 436 * We calculate the wall-time slice from the period by taking a part
387 * proportional to the weight. 437 * proportional to the weight.
388 * 438 *
389 * s = p*w/rw 439 * s = p*P[w/rw]
390 */ 440 */
391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
392{ 442{
393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); 443 unsigned long nr_running = cfs_rq->nr_running;
444
445 if (unlikely(!se->on_rq))
446 nr_running++;
447
448 return calc_delta_weight(__sched_period(nr_running), se);
394} 449}
395 450
396/* 451/*
397 * We calculate the vruntime slice of a to be inserted task 452 * We calculate the vruntime slice of a to be inserted task
398 * 453 *
399 * vs = s*rw/w = p 454 * vs = s/w
400 */ 455 */
401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 456static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
402{ 457{
403 unsigned long nr_running = cfs_rq->nr_running; 458 return calc_delta_fair(sched_slice(cfs_rq, se), se);
404
405 if (!se->on_rq)
406 nr_running++;
407
408 return __sched_period(nr_running);
409} 459}
410 460
411/* 461/*
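To make the s = p*P[w/rw] and vs = s/w shorthand concrete, here is a toy recomputation with made-up weights and a made-up 12 ms period; it ignores the fixed-point inverse-weight arithmetic that calc_delta_mine() really uses, but shows why every entity ends up with the same virtual-time slice.

#include <stdio.h>

#define NICE_0_LOAD	1024ULL

int main(void)
{
	unsigned long long period_ns = 12000000ULL;	/* assumed 12 ms scheduling period */
	unsigned long long w[2] = { 1024, 2048 };	/* nice-0 weight and a heavier, made-up weight */
	unsigned long long rw = w[0] + w[1];
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long long slice  = period_ns * w[i] / rw;		/* s  = p*w/rw */
		unsigned long long vslice = slice * NICE_0_LOAD / w[i];	/* vs = s/w (in NICE_0_LOAD units) */
		printf("w=%llu slice=%llu ns vslice=%llu ns\n", w[i], slice, vslice);
	}
	return 0;	/* both vslices come out as p*NICE_0_LOAD/rw = 4 ms */
}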
@@ -424,6 +474,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
424 schedstat_add(cfs_rq, exec_clock, delta_exec); 474 schedstat_add(cfs_rq, exec_clock, delta_exec);
425 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 475 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
426 curr->vruntime += delta_exec_weighted; 476 curr->vruntime += delta_exec_weighted;
477 update_min_vruntime(cfs_rq);
427} 478}
428 479
429static void update_curr(struct cfs_rq *cfs_rq) 480static void update_curr(struct cfs_rq *cfs_rq)
@@ -441,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
441 * overflow on 32 bits): 492 * overflow on 32 bits):
442 */ 493 */
443 delta_exec = (unsigned long)(now - curr->exec_start); 494 delta_exec = (unsigned long)(now - curr->exec_start);
495 if (!delta_exec)
496 return;
444 497
445 __update_curr(cfs_rq, curr, delta_exec); 498 __update_curr(cfs_rq, curr, delta_exec);
446 curr->exec_start = now; 499 curr->exec_start = now;
@@ -449,6 +502,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
449 struct task_struct *curtask = task_of(curr); 502 struct task_struct *curtask = task_of(curr);
450 503
451 cpuacct_charge(curtask, delta_exec); 504 cpuacct_charge(curtask, delta_exec);
505 account_group_exec_runtime(curtask, delta_exec);
452 } 506 }
453} 507}
454 508
@@ -612,13 +666,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
612static void 666static void
613place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) 667place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
614{ 668{
615 u64 vruntime; 669 u64 vruntime = cfs_rq->min_vruntime;
616
617 if (first_fair(cfs_rq)) {
618 vruntime = min_vruntime(cfs_rq->min_vruntime,
619 __pick_next_entity(cfs_rq)->vruntime);
620 } else
621 vruntime = cfs_rq->min_vruntime;
622 670
623 /* 671 /*
624 * The 'current' period is already promised to the current tasks, 672 * The 'current' period is already promised to the current tasks,
@@ -627,7 +675,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
627 * stays open at the end. 675 * stays open at the end.
628 */ 676 */
629 if (initial && sched_feat(START_DEBIT)) 677 if (initial && sched_feat(START_DEBIT))
630 vruntime += sched_vslice_add(cfs_rq, se); 678 vruntime += sched_vslice(cfs_rq, se);
631 679
632 if (!initial) { 680 if (!initial) {
633 /* sleeps up to a single latency don't count. */ 681
@@ -670,6 +718,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
670 __enqueue_entity(cfs_rq, se); 718 __enqueue_entity(cfs_rq, se);
671} 719}
672 720
721static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
722{
723 if (cfs_rq->last == se)
724 cfs_rq->last = NULL;
725
726 if (cfs_rq->next == se)
727 cfs_rq->next = NULL;
728}
729
673static void 730static void
674dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 731dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
675{ 732{
@@ -692,9 +749,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
692#endif 749#endif
693 } 750 }
694 751
752 clear_buddies(cfs_rq, se);
753
695 if (se != cfs_rq->curr) 754 if (se != cfs_rq->curr)
696 __dequeue_entity(cfs_rq, se); 755 __dequeue_entity(cfs_rq, se);
697 account_entity_dequeue(cfs_rq, se); 756 account_entity_dequeue(cfs_rq, se);
757 update_min_vruntime(cfs_rq);
698} 758}
699 759
700/* 760/*
@@ -741,29 +801,18 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
741 se->prev_sum_exec_runtime = se->sum_exec_runtime; 801 se->prev_sum_exec_runtime = se->sum_exec_runtime;
742} 802}
743 803
744static struct sched_entity * 804static int
745pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 805wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
746{
747 struct rq *rq = rq_of(cfs_rq);
748 u64 pair_slice = rq->clock - cfs_rq->pair_start;
749
750 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
751 cfs_rq->pair_start = rq->clock;
752 return se;
753 }
754
755 return cfs_rq->next;
756}
757 806
758static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 807static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
759{ 808{
760 struct sched_entity *se = NULL; 809 struct sched_entity *se = __pick_next_entity(cfs_rq);
761 810
762 if (first_fair(cfs_rq)) { 811 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
763 se = __pick_next_entity(cfs_rq); 812 return cfs_rq->next;
764 se = pick_next(cfs_rq, se); 813
765 set_next_entity(cfs_rq, se); 814 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
766 } 815 return cfs_rq->last;
767 816
768 return se; 817 return se;
769} 818}
@@ -848,11 +897,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
848 hrtick_start(rq, delta); 897 hrtick_start(rq, delta);
849 } 898 }
850} 899}
900
901/*
902 * called from enqueue/dequeue and updates the hrtick when the
903 * current task is from our class and nr_running is low enough
904 * to matter.
905 */
906static void hrtick_update(struct rq *rq)
907{
908 struct task_struct *curr = rq->curr;
909
910 if (curr->sched_class != &fair_sched_class)
911 return;
912
913 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
914 hrtick_start_fair(rq, curr);
915}
851#else /* !CONFIG_SCHED_HRTICK */ 916#else /* !CONFIG_SCHED_HRTICK */
852static inline void 917static inline void
853hrtick_start_fair(struct rq *rq, struct task_struct *p) 918hrtick_start_fair(struct rq *rq, struct task_struct *p)
854{ 919{
855} 920}
921
922static inline void hrtick_update(struct rq *rq)
923{
924}
856#endif 925#endif
857 926
858/* 927/*
@@ -873,7 +942,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
873 wakeup = 1; 942 wakeup = 1;
874 } 943 }
875 944
876 hrtick_start_fair(rq, rq->curr); 945 hrtick_update(rq);
877} 946}
878 947
879/* 948/*
@@ -895,7 +964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
895 sleep = 1; 964 sleep = 1;
896 } 965 }
897 966
898 hrtick_start_fair(rq, rq->curr); 967 hrtick_update(rq);
899} 968}
900 969
901/* 970/*
@@ -915,6 +984,8 @@ static void yield_task_fair(struct rq *rq)
915 if (unlikely(cfs_rq->nr_running == 1)) 984 if (unlikely(cfs_rq->nr_running == 1))
916 return; 985 return;
917 986
987 clear_buddies(cfs_rq, se);
988
918 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 989 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
919 update_rq_clock(rq); 990 update_rq_clock(rq);
920 /* 991 /*
@@ -1001,8 +1072,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1001 1072
1002#ifdef CONFIG_SMP 1073#ifdef CONFIG_SMP
1003 1074
1004static const struct sched_class fair_sched_class;
1005
1006#ifdef CONFIG_FAIR_GROUP_SCHED 1075#ifdef CONFIG_FAIR_GROUP_SCHED
1007/* 1076/*
1008 * effective_load() calculates the load change as seen from the root_task_group 1077 * effective_load() calculates the load change as seen from the root_task_group
@@ -1103,10 +1172,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1103 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1172 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1104 return 0; 1173 return 0;
1105 1174
1106 if (!sync && sched_feat(SYNC_WAKEUPS) && 1175 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1107 curr->se.avg_overlap < sysctl_sched_migration_cost && 1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1108 p->se.avg_overlap < sysctl_sched_migration_cost) 1177 sync = 0;
1109 sync = 1;
1110 1178
1111 /* 1179 /*
1112 * If sync wakeup then subtract the (maximum possible) 1180 * If sync wakeup then subtract the (maximum possible)
@@ -1225,33 +1293,87 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1225 * More easily preempt - nice tasks, while not making it harder for 1293 * More easily preempt - nice tasks, while not making it harder for
1226 * + nice tasks. 1294 * + nice tasks.
1227 */ 1295 */
1228 if (sched_feat(ASYM_GRAN)) 1296 if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
1229 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); 1297 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1230 1298
1231 return gran; 1299 return gran;
1232} 1300}
1233 1301
1234/* 1302/*
1303 * Should 'se' preempt 'curr'?
1304 *
1305 * |s1
1306 * |s2
1307 * |s3
1308 * g
1309 * |<--->|c
1310 *
1311 * w(c, s1) = -1
1312 * w(c, s2) = 0
1313 * w(c, s3) = 1
1314 *
1315 */
1316static int
1317wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1318{
1319 s64 gran, vdiff = curr->vruntime - se->vruntime;
1320
1321 if (vdiff <= 0)
1322 return -1;
1323
1324 gran = wakeup_gran(curr);
1325 if (vdiff > gran)
1326 return 1;
1327
1328 return 0;
1329}
1330
1331static void set_last_buddy(struct sched_entity *se)
1332{
1333 for_each_sched_entity(se)
1334 cfs_rq_of(se)->last = se;
1335}
1336
1337static void set_next_buddy(struct sched_entity *se)
1338{
1339 for_each_sched_entity(se)
1340 cfs_rq_of(se)->next = se;
1341}
1342
1343/*
1235 * Preempt the current task with a newly woken task if needed: 1344 * Preempt the current task with a newly woken task if needed:
1236 */ 1345 */
1237static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1346static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1238{ 1347{
1239 struct task_struct *curr = rq->curr; 1348 struct task_struct *curr = rq->curr;
1240 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1241 struct sched_entity *se = &curr->se, *pse = &p->se; 1349 struct sched_entity *se = &curr->se, *pse = &p->se;
1242 s64 delta_exec; 1350 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1351
1352 update_curr(cfs_rq);
1243 1353
1244 if (unlikely(rt_prio(p->prio))) { 1354 if (unlikely(rt_prio(p->prio))) {
1245 update_rq_clock(rq);
1246 update_curr(cfs_rq);
1247 resched_task(curr); 1355 resched_task(curr);
1248 return; 1356 return;
1249 } 1357 }
1250 1358
1359 if (unlikely(p->sched_class != &fair_sched_class))
1360 return;
1361
1251 if (unlikely(se == pse)) 1362 if (unlikely(se == pse))
1252 return; 1363 return;
1253 1364
1254 cfs_rq_of(pse)->next = pse; 1365 /*
1366 * Only set the backward buddy when the current task is still on the
1367 * rq. This can happen when a wakeup gets interleaved with schedule at
1368 * the ->pre_schedule() or idle_balance() point, either of which can
1369 * drop the rq lock.
1370 *
1371 * Also, during early boot the idle thread is in the fair class; for
1372 * obvious reasons it's a bad idea to schedule back to the idle thread.
1373 */
1374 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1375 set_last_buddy(se);
1376 set_next_buddy(pse);
1255 1377
1256 /* 1378 /*
1257 * We can come here with TIF_NEED_RESCHED already set from new task 1379 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1277,9 +1399,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1277 return; 1399 return;
1278 } 1400 }
1279 1401
1280 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; 1402 find_matching_se(&se, &pse);
1281 if (delta_exec > wakeup_gran(pse)) 1403
1282 resched_task(curr); 1404 while (se) {
1405 BUG_ON(!pse);
1406
1407 if (wakeup_preempt_entity(se, pse) == 1) {
1408 resched_task(curr);
1409 break;
1410 }
1411
1412 se = parent_entity(se);
1413 pse = parent_entity(pse);
1414 }
1283} 1415}
1284 1416
1285static struct task_struct *pick_next_task_fair(struct rq *rq) 1417static struct task_struct *pick_next_task_fair(struct rq *rq)
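A detached model, with no struct sched_entity and purely illustrative numbers, of the three-way decision pictured in the wakeup_preempt_entity() comment: s1 maps to -1, s2 to 0 and s3 to 1 relative to c, where g is the wakeup granularity.

#include <stdio.h>

static int preempt_decision(long long curr_vruntime, long long se_vruntime,
			    long long gran)
{
	long long vdiff = curr_vruntime - se_vruntime;

	if (vdiff <= 0)
		return -1;	/* the woken entity's vruntime is not behind curr's: no preemption */
	if (vdiff > gran)
		return 1;	/* it trails curr by more than one granularity: preempt */
	return 0;		/* within the granularity window: leave curr running */
}

int main(void)
{
	long long g = 1000000;	/* 1 ms granularity, made-up value */

	printf("%d %d %d\n",
	       preempt_decision(5000000, 6000000, g),	/* s1 -> -1 */
	       preempt_decision(5000000, 4500000, g),	/* s2 ->  0 */
	       preempt_decision(5000000, 2000000, g));	/* s3 ->  1 */
	return 0;
}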
@@ -1293,6 +1425,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1293 1425
1294 do { 1426 do {
1295 se = pick_next_entity(cfs_rq); 1427 se = pick_next_entity(cfs_rq);
1428 set_next_entity(cfs_rq, se);
1296 cfs_rq = group_cfs_rq(se); 1429 cfs_rq = group_cfs_rq(se);
1297 } while (cfs_rq); 1430 } while (cfs_rq);
1298 1431
@@ -1575,9 +1708,6 @@ static const struct sched_class fair_sched_class = {
1575 .enqueue_task = enqueue_task_fair, 1708 .enqueue_task = enqueue_task_fair,
1576 .dequeue_task = dequeue_task_fair, 1709 .dequeue_task = dequeue_task_fair,
1577 .yield_task = yield_task_fair, 1710 .yield_task = yield_task_fair,
1578#ifdef CONFIG_SMP
1579 .select_task_rq = select_task_rq_fair,
1580#endif /* CONFIG_SMP */
1581 1711
1582 .check_preempt_curr = check_preempt_wakeup, 1712 .check_preempt_curr = check_preempt_wakeup,
1583 1713
@@ -1585,6 +1715,8 @@ static const struct sched_class fair_sched_class = {
1585 .put_prev_task = put_prev_task_fair, 1715 .put_prev_task = put_prev_task_fair,
1586 1716
1587#ifdef CONFIG_SMP 1717#ifdef CONFIG_SMP
1718 .select_task_rq = select_task_rq_fair,
1719
1588 .load_balance = load_balance_fair, 1720 .load_balance = load_balance_fair,
1589 .move_one_task = move_one_task_fair, 1721 .move_one_task = move_one_task_fair,
1590#endif 1722#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 7c9e8f4a049f..da5d93b5d2c6 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -5,10 +5,11 @@ SCHED_FEAT(START_DEBIT, 1)
5SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
6SCHED_FEAT(CACHE_HOT_BUDDY, 1) 6SCHED_FEAT(CACHE_HOT_BUDDY, 1)
7SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
8SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 0)
9SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
10SCHED_FEAT(ASYM_GRAN, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0) 14SCHED_FEAT(WAKEUP_OVERLAP, 0)
15SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4ccabe2f5..8a21a2e28c13 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
105 105
106 /* dequeue is not valid, we print a debug message there: */ 106 /* dequeue is not valid, we print a debug message there: */
107 .dequeue_task = dequeue_task_idle, 107 .dequeue_task = dequeue_task_idle,
108#ifdef CONFIG_SMP
109 .select_task_rq = select_task_rq_idle,
110#endif /* CONFIG_SMP */
111 108
112 .check_preempt_curr = check_preempt_curr_idle, 109 .check_preempt_curr = check_preempt_curr_idle,
113 110
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
115 .put_prev_task = put_prev_task_idle, 112 .put_prev_task = put_prev_task_idle,
116 113
117#ifdef CONFIG_SMP 114#ifdef CONFIG_SMP
115 .select_task_rq = select_task_rq_idle,
116
118 .load_balance = load_balance_idle, 117 .load_balance = load_balance_idle,
119 .move_one_task = move_one_task_idle, 118 .move_one_task = move_one_task_idle,
120#endif 119#endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index cdf5740ab03e..51d2af3e6191 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
77} 77}
78 78
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 79#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 81
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{ 83{
@@ -526,6 +526,8 @@ static void update_curr_rt(struct rq *rq)
526 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 526 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
527 527
528 curr->se.sum_exec_runtime += delta_exec; 528 curr->se.sum_exec_runtime += delta_exec;
529 account_group_exec_runtime(curr, delta_exec);
530
529 curr->se.exec_start = rq->clock; 531 curr->se.exec_start = rq->clock;
530 cpuacct_charge(curr, delta_exec); 532 cpuacct_charge(curr, delta_exec);
531 533
@@ -535,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
535 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
536 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
537 539
538 spin_lock(&rt_rq->rt_runtime_lock);
539 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 540 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
541 spin_lock(&rt_rq->rt_runtime_lock);
540 rt_rq->rt_time += delta_exec; 542 rt_rq->rt_time += delta_exec;
541 if (sched_rt_runtime_exceeded(rt_rq)) 543 if (sched_rt_runtime_exceeded(rt_rq))
542 resched_task(curr); 544 resched_task(curr);
545 spin_unlock(&rt_rq->rt_runtime_lock);
543 } 546 }
544 spin_unlock(&rt_rq->rt_runtime_lock);
545 } 547 }
546} 548}
547 549
@@ -907,9 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
907/* Only try algorithms three times */ 909/* Only try algorithms three times */
908#define RT_MAX_TRIES 3 910#define RT_MAX_TRIES 3
909 911
910static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
911static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
912
913static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
914 913
915static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 914static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1458,7 +1457,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1458 p->rt.timeout++; 1457 p->rt.timeout++;
1459 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1458 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1460 if (p->rt.timeout > next) 1459 if (p->rt.timeout > next)
1461 p->it_sched_expires = p->se.sum_exec_runtime; 1460 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1462 } 1461 }
1463} 1462}
1464 1463
@@ -1502,9 +1501,6 @@ static const struct sched_class rt_sched_class = {
1502 .enqueue_task = enqueue_task_rt, 1501 .enqueue_task = enqueue_task_rt,
1503 .dequeue_task = dequeue_task_rt, 1502 .dequeue_task = dequeue_task_rt,
1504 .yield_task = yield_task_rt, 1503 .yield_task = yield_task_rt,
1505#ifdef CONFIG_SMP
1506 .select_task_rq = select_task_rq_rt,
1507#endif /* CONFIG_SMP */
1508 1504
1509 .check_preempt_curr = check_preempt_curr_rt, 1505 .check_preempt_curr = check_preempt_curr_rt,
1510 1506
@@ -1512,6 +1508,8 @@ static const struct sched_class rt_sched_class = {
1512 .put_prev_task = put_prev_task_rt, 1508 .put_prev_task = put_prev_task_rt,
1513 1509
1514#ifdef CONFIG_SMP 1510#ifdef CONFIG_SMP
1511 .select_task_rq = select_task_rq_rt,
1512
1515 .load_balance = load_balance_rt, 1513 .load_balance = load_balance_rt,
1516 .move_one_task = move_one_task_rt, 1514 .move_one_task = move_one_task_rt,
1517 .set_cpus_allowed = set_cpus_allowed_rt, 1515 .set_cpus_allowed = set_cpus_allowed_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..3b01098164c8 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,7 +9,7 @@
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
11 int cpu; 11 int cpu;
12 int mask_len = NR_CPUS/32 * 9; 12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL); 13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14 14
15 if (mask_str == NULL) 15 if (mask_str == NULL)
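Why the DIV_ROUND_UP() change above matters, shown with a hypothetical NR_CPUS of 16; the macro below mirrors the kernel.h definition.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 16;	/* made-up small configuration */

	printf("old: %d bytes\n", nr_cpus / 32 * 9);			/* 0: a zero-size buffer */
	printf("new: %d bytes\n", DIV_ROUND_UP(nr_cpus, 32) * 9);	/* 9: room for the cpumask string */
	return 0;
}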
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, 31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 32 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 33 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_sched_info.cpu_time, 34 rq->rq_cpu_time,
35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
36 36
37 seq_printf(seq, "\n"); 37 seq_printf(seq, "\n");
@@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file)
90 return res; 90 return res;
91} 91}
92 92
93const struct file_operations proc_schedstat_operations = { 93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open, 94 .open = schedstat_open,
95 .read = seq_read, 95 .read = seq_read,
96 .llseek = seq_lseek, 96 .llseek = seq_lseek,
97 .release = single_release, 97 .release = single_release,
98}; 98};
99 99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106
100/* 107/*
101 * Expects runqueue lock to be held for atomicity of update 108 * Expects runqueue lock to be held for atomicity of update
102 */ 109 */
@@ -116,7 +123,7 @@ static inline void
116rq_sched_info_depart(struct rq *rq, unsigned long long delta) 123rq_sched_info_depart(struct rq *rq, unsigned long long delta)
117{ 124{
118 if (rq) 125 if (rq)
119 rq->rq_sched_info.cpu_time += delta; 126 rq->rq_cpu_time += delta;
120} 127}
121 128
122static inline void 129static inline void
@@ -229,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
229 unsigned long long delta = task_rq(t)->clock - 236 unsigned long long delta = task_rq(t)->clock -
230 t->sched_info.last_arrival; 237 t->sched_info.last_arrival;
231 238
232 t->sched_info.cpu_time += delta;
233 rq_sched_info_depart(task_rq(t), delta); 239 rq_sched_info_depart(task_rq(t), delta);
234 240
235 if (t->state == TASK_RUNNING) 241 if (t->state == TASK_RUNNING)
@@ -270,3 +276,96 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
270#define sched_info_switch(t, next) do { } while (0) 276#define sched_info_switch(t, next) do { } while (0)
271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 277#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
272 278
279/*
280 * The following are functions that support scheduler-internal time accounting.
281 * These functions are generally called at the timer tick. None of this depends
282 * on CONFIG_SCHEDSTATS.
283 */
284
285/**
286 * account_group_user_time - Maintain utime for a thread group.
287 *
288 * @tsk: Pointer to task structure.
289 * @cputime: Time value by which to increment the utime field of the
290 * thread_group_cputime structure.
291 *
292 * If thread group time is being maintained, get the structure for the
293 * running CPU and update the utime field there.
294 */
295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime)
297{
298 struct signal_struct *sig;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 sig = tsk->signal;
305 if (sig->cputime.totals) {
306 struct task_cputime *times;
307
308 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
309 times->utime = cputime_add(times->utime, cputime);
310 put_cpu_no_resched();
311 }
312}
313
314/**
315 * account_group_system_time - Maintain stime for a thread group.
316 *
317 * @tsk: Pointer to task structure.
318 * @cputime: Time value by which to increment the stime field of the
319 * thread_group_cputime structure.
320 *
321 * If thread group time is being maintained, get the structure for the
322 * running CPU and update the stime field there.
323 */
324static inline void account_group_system_time(struct task_struct *tsk,
325 cputime_t cputime)
326{
327 struct signal_struct *sig;
328
329 /* tsk == current, ensure it is safe to use ->signal */
330 if (unlikely(tsk->exit_state))
331 return;
332
333 sig = tsk->signal;
334 if (sig->cputime.totals) {
335 struct task_cputime *times;
336
337 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
338 times->stime = cputime_add(times->stime, cputime);
339 put_cpu_no_resched();
340 }
341}
342
343/**
344 * account_group_exec_runtime - Maintain exec runtime for a thread group.
345 *
346 * @tsk: Pointer to task structure.
347 * @ns: Time value by which to increment the sum_exec_runtime field
348 * of the thread_group_cputime structure.
349 *
350 * If thread group time is being maintained, get the structure for the
351 * running CPU and update the sum_exec_runtime field there.
352 */
353static inline void account_group_exec_runtime(struct task_struct *tsk,
354 unsigned long long ns)
355{
356 struct signal_struct *sig;
357
358 sig = tsk->signal;
359 /* see __exit_signal()->task_rq_unlock_wait() */
360 barrier();
361 if (unlikely(!sig))
362 return;
363
364 if (sig->cputime.totals) {
365 struct task_cputime *times;
366
367 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
368 times->sum_exec_runtime += ns;
369 put_cpu_no_resched();
370 }
371}
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01d340f..8e95855ff3cf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,6 +27,7 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h>
30 31
31#include <asm/param.h> 32#include <asm/param.h>
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -40,6 +41,8 @@
40 41
41static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
42 43
44DEFINE_TRACE(sched_signal_send);
45
43static void __user *sig_handler(struct task_struct *t, int sig) 46static void __user *sig_handler(struct task_struct *t, int sig)
44{ 47{
45 return t->sighand->action[sig - 1].sa.sa_handler; 48 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -176,6 +179,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
176 return sig; 179 return sig;
177} 180}
178 181
182/*
183 * allocate a new signal queue record
184 * - this may be called without locks if and only if t == current, otherwise an
185 * appropriate lock must be held to stop the target task from exiting
186 */
179static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 187static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
180 int override_rlimit) 188 int override_rlimit)
181{ 189{
@@ -183,11 +191,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
183 struct user_struct *user; 191 struct user_struct *user;
184 192
185 /* 193 /*
186 * In order to avoid problems with "switch_user()", we want to make 194 * We won't get problems with the target's UID changing under us
187 * sure that the compiler doesn't re-load "t->user" 195 * because changing it requires RCU be used, and if t != current, the
196 * caller must be holding the RCU readlock (by way of a spinlock) and
197 * we use RCU protection here
188 */ 198 */
189 user = t->user; 199 user = get_uid(__task_cred(t)->user);
190 barrier();
191 atomic_inc(&user->sigpending); 200 atomic_inc(&user->sigpending);
192 if (override_rlimit || 201 if (override_rlimit ||
193 atomic_read(&user->sigpending) <= 202 atomic_read(&user->sigpending) <=
@@ -195,12 +204,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
195 q = kmem_cache_alloc(sigqueue_cachep, flags); 204 q = kmem_cache_alloc(sigqueue_cachep, flags);
196 if (unlikely(q == NULL)) { 205 if (unlikely(q == NULL)) {
197 atomic_dec(&user->sigpending); 206 atomic_dec(&user->sigpending);
207 free_uid(user);
198 } else { 208 } else {
199 INIT_LIST_HEAD(&q->list); 209 INIT_LIST_HEAD(&q->list);
200 q->flags = 0; 210 q->flags = 0;
201 q->user = get_uid(user); 211 q->user = user;
202 } 212 }
203 return(q); 213
214 return q;
204} 215}
205 216
206static void __sigqueue_free(struct sigqueue *q) 217static void __sigqueue_free(struct sigqueue *q)
@@ -561,10 +572,12 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
561 572
562/* 573/*
563 * Bad permissions for sending the signal 574 * Bad permissions for sending the signal
575 * - the caller must hold at least the RCU read lock
564 */ 576 */
565static int check_kill_permission(int sig, struct siginfo *info, 577static int check_kill_permission(int sig, struct siginfo *info,
566 struct task_struct *t) 578 struct task_struct *t)
567{ 579{
580 const struct cred *cred = current_cred(), *tcred;
568 struct pid *sid; 581 struct pid *sid;
569 int error; 582 int error;
570 583
@@ -578,8 +591,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
578 if (error) 591 if (error)
579 return error; 592 return error;
580 593
581 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && 594 tcred = __task_cred(t);
582 (current->uid ^ t->suid) && (current->uid ^ t->uid) && 595 if ((cred->euid ^ tcred->suid) &&
596 (cred->euid ^ tcred->uid) &&
597 (cred->uid ^ tcred->suid) &&
598 (cred->uid ^ tcred->uid) &&
583 !capable(CAP_KILL)) { 599 !capable(CAP_KILL)) {
584 switch (sig) { 600 switch (sig) {
585 case SIGCONT: 601 case SIGCONT:
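An aside, not from the patch: the credential test above uses XOR as an integer inequality check, since (a ^ b) is non-zero exactly when a != b, so the chain reads "no sender uid/euid matches the target's uid/suid, and the sender lacks CAP_KILL". A tiny self-check:

#include <assert.h>

int main(void)
{
	unsigned int euid = 1000, target_uid = 1000, target_suid = 0;

	assert(!(euid ^ target_uid));	/* equal ids: the term is 0, the && chain fails, no -EPERM */
	assert(euid ^ target_suid);	/* different ids: non-zero, keep checking the other terms */
	return 0;
}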
@@ -803,6 +819,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
803 struct sigpending *pending; 819 struct sigpending *pending;
804 struct sigqueue *q; 820 struct sigqueue *q;
805 821
822 trace_sched_signal_send(sig, t);
823
806 assert_spin_locked(&t->sighand->siglock); 824 assert_spin_locked(&t->sighand->siglock);
807 if (!prepare_signal(sig, t)) 825 if (!prepare_signal(sig, t))
808 return 0; 826 return 0;
@@ -841,7 +859,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
841 q->info.si_errno = 0; 859 q->info.si_errno = 0;
842 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
843 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_pid_vnr(current);
844 q->info.si_uid = current->uid; 862 q->info.si_uid = current_uid();
845 break; 863 break;
846 case (unsigned long) SEND_SIG_PRIV: 864 case (unsigned long) SEND_SIG_PRIV:
847 q->info.si_signo = sig; 865 q->info.si_signo = sig;
@@ -1005,6 +1023,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1005 return sighand; 1023 return sighand;
1006} 1024}
1007 1025
1026/*
1027 * send signal info to all the members of a group
1028 * - the caller must hold the RCU read lock at least
1029 */
1008int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1030int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1009{ 1031{
1010 unsigned long flags; 1032 unsigned long flags;
@@ -1026,8 +1048,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1026/* 1048/*
1027 * __kill_pgrp_info() sends a signal to a process group: this is what the tty 1049 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
1028 * control characters do (^C, ^Z etc) 1050 * control characters do (^C, ^Z etc)
1051 * - the caller must hold at least a readlock on tasklist_lock
1029 */ 1052 */
1030
1031int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) 1053int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1032{ 1054{
1033 struct task_struct *p = NULL; 1055 struct task_struct *p = NULL;
@@ -1083,6 +1105,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1083{ 1105{
1084 int ret = -EINVAL; 1106 int ret = -EINVAL;
1085 struct task_struct *p; 1107 struct task_struct *p;
1108 const struct cred *pcred;
1086 1109
1087 if (!valid_signal(sig)) 1110 if (!valid_signal(sig))
1088 return ret; 1111 return ret;
@@ -1093,9 +1116,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1093 ret = -ESRCH; 1116 ret = -ESRCH;
1094 goto out_unlock; 1117 goto out_unlock;
1095 } 1118 }
1096 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 1119 pcred = __task_cred(p);
1097 && (euid != p->suid) && (euid != p->uid) 1120 if ((info == SEND_SIG_NOINFO ||
1098 && (uid != p->suid) && (uid != p->uid)) { 1121 (!is_si_special(info) && SI_FROMUSER(info))) &&
1122 euid != pcred->suid && euid != pcred->uid &&
1123 uid != pcred->suid && uid != pcred->uid) {
1099 ret = -EPERM; 1124 ret = -EPERM;
1100 goto out_unlock; 1125 goto out_unlock;
1101 } 1126 }
@@ -1141,7 +1166,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1141 struct task_struct * p; 1166 struct task_struct * p;
1142 1167
1143 for_each_process(p) { 1168 for_each_process(p) {
1144 if (p->pid > 1 && !same_thread_group(p, current)) { 1169 if (task_pid_vnr(p) > 1 &&
1170 !same_thread_group(p, current)) {
1145 int err = group_send_sig_info(sig, info, p); 1171 int err = group_send_sig_info(sig, info, p);
1146 ++count; 1172 ++count;
1147 if (err != -EPERM) 1173 if (err != -EPERM)
@@ -1338,6 +1364,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1338 struct siginfo info; 1364 struct siginfo info;
1339 unsigned long flags; 1365 unsigned long flags;
1340 struct sighand_struct *psig; 1366 struct sighand_struct *psig;
1367 struct task_cputime cputime;
1341 int ret = sig; 1368 int ret = sig;
1342 1369
1343 BUG_ON(sig == -1); 1370 BUG_ON(sig == -1);
@@ -1364,14 +1391,12 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1364 */ 1391 */
1365 rcu_read_lock(); 1392 rcu_read_lock();
1366 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1393 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1394 info.si_uid = __task_cred(tsk)->uid;
1367 rcu_read_unlock(); 1395 rcu_read_unlock();
1368 1396
1369 info.si_uid = tsk->uid; 1397 thread_group_cputime(tsk, &cputime);
1370 1398 info.si_utime = cputime_to_jiffies(cputime.utime);
1371 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1399 info.si_stime = cputime_to_jiffies(cputime.stime);
1372 tsk->signal->utime));
1373 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1374 tsk->signal->stime));
1375 1400
1376 info.si_status = tsk->exit_code & 0x7f; 1401 info.si_status = tsk->exit_code & 0x7f;
1377 if (tsk->exit_code & 0x80) 1402 if (tsk->exit_code & 0x80)
@@ -1436,10 +1461,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1436 */ 1461 */
1437 rcu_read_lock(); 1462 rcu_read_lock();
1438 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1463 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1464 info.si_uid = __task_cred(tsk)->uid;
1439 rcu_read_unlock(); 1465 rcu_read_unlock();
1440 1466
1441 info.si_uid = tsk->uid;
1442
1443 info.si_utime = cputime_to_clock_t(tsk->utime); 1467 info.si_utime = cputime_to_clock_t(tsk->utime);
1444 info.si_stime = cputime_to_clock_t(tsk->stime); 1468 info.si_stime = cputime_to_clock_t(tsk->stime);
1445 1469
@@ -1594,7 +1618,7 @@ void ptrace_notify(int exit_code)
1594 info.si_signo = SIGTRAP; 1618 info.si_signo = SIGTRAP;
1595 info.si_code = exit_code; 1619 info.si_code = exit_code;
1596 info.si_pid = task_pid_vnr(current); 1620 info.si_pid = task_pid_vnr(current);
1597 info.si_uid = current->uid; 1621 info.si_uid = current_uid();
1598 1622
1599 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1600 spin_lock_irq(&current->sighand->siglock); 1624 spin_lock_irq(&current->sighand->siglock);
@@ -1706,7 +1730,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1706 info->si_errno = 0; 1730 info->si_errno = 0;
1707 info->si_code = SI_USER; 1731 info->si_code = SI_USER;
1708 info->si_pid = task_pid_vnr(current->parent); 1732 info->si_pid = task_pid_vnr(current->parent);
1709 info->si_uid = current->parent->uid; 1733 info->si_uid = task_uid(current->parent);
1710 } 1734 }
1711 1735
1712 /* If the (new) signal is now blocked, requeue it. */ 1736 /* If the (new) signal is now blocked, requeue it. */
@@ -2207,7 +2231,7 @@ sys_kill(pid_t pid, int sig)
2207 info.si_errno = 0; 2231 info.si_errno = 0;
2208 info.si_code = SI_USER; 2232 info.si_code = SI_USER;
2209 info.si_pid = task_tgid_vnr(current); 2233 info.si_pid = task_tgid_vnr(current);
2210 info.si_uid = current->uid; 2234 info.si_uid = current_uid();
2211 2235
2212 return kill_something_info(sig, &info, pid); 2236 return kill_something_info(sig, &info, pid);
2213} 2237}
@@ -2224,7 +2248,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2224 info.si_errno = 0; 2248 info.si_errno = 0;
2225 info.si_code = SI_TKILL; 2249 info.si_code = SI_TKILL;
2226 info.si_pid = task_tgid_vnr(current); 2250 info.si_pid = task_tgid_vnr(current);
2227 info.si_uid = current->uid; 2251 info.si_uid = current_uid();
2228 2252
2229 rcu_read_lock(); 2253 rcu_read_lock();
2230 p = find_task_by_vpid(pid); 2254 p = find_task_by_vpid(pid);
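Note on the kernel/signal.c hunks above: they convert the permission and siginfo paths from direct current->uid / current->euid / t->uid field accesses to the credentials API (current_cred(), current_uid(), task_uid(), __task_cred()), and document that the target's credentials may only be read under the RCU read lock. The fragment below is a minimal sketch of that access pattern, not the kernel's function; may_signal_sketch() is an invented name, and it assumes the caller already holds rcu_read_lock() as the added comment in check_kill_permission() requires. The XOR chain in the real code is equivalent to the equality tests spelled out here.

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/sched.h>

/* Sketch only: read both credential sets through the cred API; the
 * target's creds returned by __task_cred() are RCU-protected. */
static int may_signal_sketch(struct task_struct *t)
{
	const struct cred *cred = current_cred();
	const struct cred *tcred = __task_cred(t);

	return cred->euid == tcred->suid || cred->euid == tcred->uid ||
	       cred->uid  == tcred->suid || cred->uid  == tcred->uid ||
	       capable(CAP_KILL);
}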
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a8553777..75c8dde58c55 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
51{ 51{
52 /* Wait for response */ 52 /* Wait for response */
53 do { 53 do {
54 /*
55 * We need to see the flags store in the IPI handler
56 */
57 smp_mb();
58 if (!(data->flags & CSD_FLAG_WAIT)) 54 if (!(data->flags & CSD_FLAG_WAIT))
59 break; 55 break;
60 cpu_relax(); 56 cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
76 list_add_tail(&data->list, &dst->list); 72 list_add_tail(&data->list, &dst->list);
77 spin_unlock_irqrestore(&dst->lock, flags); 73 spin_unlock_irqrestore(&dst->lock, flags);
78 74
75 /*
76 * Make the list addition visible before sending the ipi.
77 */
78 smp_mb();
79
79 if (ipi) 80 if (ipi)
80 arch_send_call_function_single_ipi(cpu); 81 arch_send_call_function_single_ipi(cpu);
81 82
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
157 * Need to see other stores to list head for checking whether 158 * Need to see other stores to list head for checking whether
158 * list is empty without holding q->lock 159 * list is empty without holding q->lock
159 */ 160 */
160 smp_mb(); 161 smp_read_barrier_depends();
161 while (!list_empty(&q->list)) { 162 while (!list_empty(&q->list)) {
162 unsigned int data_flags; 163 unsigned int data_flags;
163 164
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
191 /* 192 /*
192 * See comment on outer loop 193 * See comment on outer loop
193 */ 194 */
194 smp_mb(); 195 smp_read_barrier_depends();
195 } 196 }
196} 197}
197 198
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
370 list_add_tail_rcu(&data->csd.list, &call_function_queue); 371 list_add_tail_rcu(&data->csd.list, &call_function_queue);
371 spin_unlock_irqrestore(&call_function_lock, flags); 372 spin_unlock_irqrestore(&call_function_lock, flags);
372 373
374 /*
375 * Make the list addition visible before sending the ipi.
376 */
377 smp_mb();
378
373 /* Send a message to all CPUs in the map */ 379 /* Send a message to all CPUs in the map */
374 arch_send_call_function_ipi(mask); 380 arch_send_call_function_ipi(mask);
375 381
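The kernel/smp.c hunks move the full barrier out of the busy-wait loop and place it between publishing the call_single_data on the target CPU's queue and raising the IPI, pairing it with smp_read_barrier_depends() on the handler side. Below is a sketch of that producer-side ordering; demo_queue and demo_publish_then_ipi() are illustrative stand-ins for the file-local types in kernel/smp.c.

#include <linux/list.h>
#include <linux/smp.h>
#include <linux/spinlock.h>

/* Hypothetical per-CPU queue, standing in for kernel/smp.c's private type. */
struct demo_queue {
	spinlock_t lock;
	struct list_head list;
};

static void demo_publish_then_ipi(struct demo_queue *dst,
				  struct list_head *item, int cpu)
{
	unsigned long flags;

	spin_lock_irqsave(&dst->lock, flags);
	list_add_tail(item, &dst->list);
	spin_unlock_irqrestore(&dst->lock, flags);

	/*
	 * Same ordering the patch enforces: the list store must be visible
	 * before the IPI lands, otherwise the remote handler could observe
	 * an empty queue and return without doing the work.
	 */
	smp_mb();

	arch_send_call_function_single_ipi(cpu);
}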
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f266a6b9..466e75ce271a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
9 */ 11 */
10 12
11#include <linux/module.h> 13#include <linux/module.h>
@@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
46EXPORT_SYMBOL(irq_stat); 48EXPORT_SYMBOL(irq_stat);
47#endif 49#endif
48 50
49static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; 51static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
50 52
51static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
52 54
@@ -100,20 +102,6 @@ void local_bh_disable(void)
100 102
101EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
102 104
103void __local_bh_enable(void)
104{
105 WARN_ON_ONCE(in_irq());
106
107 /*
108 * softirqs should never be enabled by __local_bh_enable(),
109 * it always nests inside local_bh_enable() sections:
110 */
111 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
112
113 sub_preempt_count(SOFTIRQ_OFFSET);
114}
115EXPORT_SYMBOL_GPL(__local_bh_enable);
116
117/* 105/*
118 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
119 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -205,7 +193,18 @@ restart:
205 193
206 do { 194 do {
207 if (pending & 1) { 195 if (pending & 1) {
196 int prev_count = preempt_count();
197
208 h->action(h); 198 h->action(h);
199
200 if (unlikely(prev_count != preempt_count())) {
201 printk(KERN_ERR "huh, entered softirq %td %p"
202 "with preempt_count %08x,"
203 " exited with %08x?\n", h - softirq_vec,
204 h->action, prev_count, preempt_count());
205 preempt_count() = prev_count;
206 }
207
209 rcu_bh_qsctr_inc(cpu); 208 rcu_bh_qsctr_inc(cpu);
210 } 209 }
211 h++; 210 h++;
@@ -254,16 +253,14 @@ asmlinkage void do_softirq(void)
254 */ 253 */
255void irq_enter(void) 254void irq_enter(void)
256{ 255{
257#ifdef CONFIG_NO_HZ
258 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
259 if (idle_cpu(cpu) && !in_interrupt()) 257
260 tick_nohz_stop_idle(cpu); 258 rcu_irq_enter();
261#endif 259 if (idle_cpu(cpu) && !in_interrupt()) {
262 __irq_enter(); 260 __irq_enter();
263#ifdef CONFIG_NO_HZ 261 tick_check_idle(cpu);
264 if (idle_cpu(cpu)) 262 } else
265 tick_nohz_update_jiffies(); 263 __irq_enter();
266#endif
267} 264}
268 265
269#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 266#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -285,9 +282,9 @@ void irq_exit(void)
285 282
286#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
287 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
288 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
289 tick_nohz_stop_sched_tick(0);
290 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
291#endif 288#endif
292 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
293} 290}
@@ -463,17 +460,144 @@ void tasklet_kill(struct tasklet_struct *t)
463 460
464EXPORT_SYMBOL(tasklet_kill); 461EXPORT_SYMBOL(tasklet_kill);
465 462
463DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
464EXPORT_PER_CPU_SYMBOL(softirq_work_list);
465
466static void __local_trigger(struct call_single_data *cp, int softirq)
467{
468 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
469
470 list_add_tail(&cp->list, head);
471
472 /* Trigger the softirq only if the list was previously empty. */
473 if (head->next == &cp->list)
474 raise_softirq_irqoff(softirq);
475}
476
477#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
478static void remote_softirq_receive(void *data)
479{
480 struct call_single_data *cp = data;
481 unsigned long flags;
482 int softirq;
483
484 softirq = cp->priv;
485
486 local_irq_save(flags);
487 __local_trigger(cp, softirq);
488 local_irq_restore(flags);
489}
490
491static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
492{
493 if (cpu_online(cpu)) {
494 cp->func = remote_softirq_receive;
495 cp->info = cp;
496 cp->flags = 0;
497 cp->priv = softirq;
498
499 __smp_call_function_single(cpu, cp);
500 return 0;
501 }
502 return 1;
503}
504#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
505static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
506{
507 return 1;
508}
509#endif
510
511/**
512 * __send_remote_softirq - try to schedule softirq work on a remote cpu
513 * @cp: private SMP call function data area
514 * @cpu: the remote cpu
515 * @this_cpu: the currently executing cpu
516 * @softirq: the softirq for the work
517 *
518 * Attempt to schedule softirq work on a remote cpu. If this cannot be
519 * done, the work is instead queued up on the local cpu.
520 *
521 * Interrupts must be disabled.
522 */
523void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
524{
525 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
526 __local_trigger(cp, softirq);
527}
528EXPORT_SYMBOL(__send_remote_softirq);
529
530/**
531 * send_remote_softirq - try to schedule softirq work on a remote cpu
532 * @cp: private SMP call function data area
533 * @cpu: the remote cpu
534 * @softirq: the softirq for the work
535 *
536 * Like __send_remote_softirq except that disabling interrupts and
537 * computing the current cpu is done for the caller.
538 */
539void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
540{
541 unsigned long flags;
542 int this_cpu;
543
544 local_irq_save(flags);
545 this_cpu = smp_processor_id();
546 __send_remote_softirq(cp, cpu, this_cpu, softirq);
547 local_irq_restore(flags);
548}
549EXPORT_SYMBOL(send_remote_softirq);
550
551static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
552 unsigned long action, void *hcpu)
553{
554 /*
555 * If a CPU goes away, splice its entries to the current CPU
556 * and trigger a run of the softirq
557 */
558 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
559 int cpu = (unsigned long) hcpu;
560 int i;
561
562 local_irq_disable();
563 for (i = 0; i < NR_SOFTIRQS; i++) {
564 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
565 struct list_head *local_head;
566
567 if (list_empty(head))
568 continue;
569
570 local_head = &__get_cpu_var(softirq_work_list[i]);
571 list_splice_init(head, local_head);
572 raise_softirq_irqoff(i);
573 }
574 local_irq_enable();
575 }
576
577 return NOTIFY_OK;
578}
579
580static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
581 .notifier_call = remote_softirq_cpu_notify,
582};
583
466void __init softirq_init(void) 584void __init softirq_init(void)
467{ 585{
468 int cpu; 586 int cpu;
469 587
470 for_each_possible_cpu(cpu) { 588 for_each_possible_cpu(cpu) {
589 int i;
590
471 per_cpu(tasklet_vec, cpu).tail = 591 per_cpu(tasklet_vec, cpu).tail =
472 &per_cpu(tasklet_vec, cpu).head; 592 &per_cpu(tasklet_vec, cpu).head;
473 per_cpu(tasklet_hi_vec, cpu).tail = 593 per_cpu(tasklet_hi_vec, cpu).tail =
474 &per_cpu(tasklet_hi_vec, cpu).head; 594 &per_cpu(tasklet_hi_vec, cpu).head;
595 for (i = 0; i < NR_SOFTIRQS; i++)
596 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
475 } 597 }
476 598
599 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
600
477 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 601 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
478 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 602 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
479} 603}
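The new remote-softirq code above lets a caller hand a call_single_data to another CPU and have the chosen softirq raised there, falling back to the local CPU when the target is offline, is the local CPU, or when the generic SMP helpers are not built in. A hedged usage sketch follows: demo_completion and demo_complete_on() are invented for illustration, while send_remote_softirq() and BLOCK_SOFTIRQ are the interfaces the patch actually exports.

#include <linux/interrupt.h>
#include <linux/smp.h>

/*
 * Illustrative completion object: the embedded call_single_data must stay
 * valid until the softirq handler on the target CPU has consumed it.
 */
struct demo_completion {
	struct call_single_data csd;
	int result;
};

static void demo_complete_on(struct demo_completion *c, int target_cpu)
{
	/*
	 * Queue c->csd on target_cpu's per-softirq work list and raise
	 * BLOCK_SOFTIRQ there; if that is not possible the work is simply
	 * triggered on the local CPU instead.
	 */
	send_remote_softirq(&c->csd, target_cpu, BLOCK_SOFTIRQ);
}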
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index cb838ee93a82..1ab790c67b17 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
@@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
188 if ((long)(now - t->last_switch_timestamp) < 188 if ((long)(now - t->last_switch_timestamp) <
189 sysctl_hung_task_timeout_secs) 189 sysctl_hung_task_timeout_secs)
190 return; 190 return;
191 if (sysctl_hung_task_warnings < 0) 191 if (!sysctl_hung_task_warnings)
192 return; 192 return;
193 sysctl_hung_task_warnings--; 193 sysctl_hung_task_warnings--;
194 194
@@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
226 * If the system crashed already then all bets are off, 226 * If the system crashed already then all bets are off,
227 * do not report extra hung tasks: 227 * do not report extra hung tasks:
228 */ 228 */
229 if ((tainted & TAINT_DIE) || did_panic) 229 if (test_taint(TAINT_DIE) || did_panic)
230 return; 230 return;
231 231
232 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
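On the softlockup.c warning-budget change: sysctl_hung_task_warnings is declared as an unsigned long in the context a few lines above, so the old test for a negative value could never fire and the warning limit never actually took effect; checking for zero before decrementing makes the budget real. A small stand-alone sketch of the fixed logic, with illustrative names:

/* warnings_left mirrors sysctl_hung_task_warnings for this sketch only. */
static unsigned long warnings_left = 10;

static int demo_may_warn(void)
{
	if (!warnings_left)	/* budget used up: stay quiet */
		return 0;
	warnings_left--;
	return 1;
}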
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
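The stacktrace.c addition relies on weak linkage: the generic save_stack_trace_tsk() is only a fallback, so an architecture that provides its own strong definition overrides it at link time, while everything else gets a once-per-boot warning instead of a link failure. A minimal sketch of that pattern with a hypothetical hook name:

#include <linux/kernel.h>

/*
 * Weak default: an architecture that defines demo_arch_hook() itself wins
 * at link time; otherwise this stub runs and warns exactly once per boot.
 * The hook name is invented here; only the pattern matches the patch.
 */
__weak void demo_arch_hook(void)
{
	WARN_ONCE(1, "demo_arch_hook() not implemented on this architecture\n");
}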
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index af3c7cea258b..24e8ceacc388 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -37,9 +37,13 @@ struct stop_machine_data {
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
38static unsigned int num_threads; 38static unsigned int num_threads;
39static atomic_t thread_ack; 39static atomic_t thread_ack;
40static struct completion finished;
41static DEFINE_MUTEX(lock); 40static DEFINE_MUTEX(lock);
42 41
42static struct workqueue_struct *stop_machine_wq;
43static struct stop_machine_data active, idle;
44static const cpumask_t *active_cpus;
45static void *stop_machine_work;
46
43static void set_state(enum stopmachine_state newstate) 47static void set_state(enum stopmachine_state newstate)
44{ 48{
45 /* Reset ack counter. */ 49 /* Reset ack counter. */
@@ -51,21 +55,26 @@ static void set_state(enum stopmachine_state newstate)
51/* Last one to ack a state moves to the next state. */ 55/* Last one to ack a state moves to the next state. */
52static void ack_state(void) 56static void ack_state(void)
53{ 57{
54 if (atomic_dec_and_test(&thread_ack)) { 58 if (atomic_dec_and_test(&thread_ack))
55 /* If we're the last one to ack the EXIT, we're finished. */ 59 set_state(state + 1);
56 if (state == STOPMACHINE_EXIT)
57 complete(&finished);
58 else
59 set_state(state + 1);
60 }
61} 60}
62 61
63/* This is the actual thread which stops the CPU. It exits by itself rather 62/* This is the actual function which stops the CPU. It runs
64 * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ 63 * in the context of a dedicated stopmachine workqueue. */
65static int stop_cpu(struct stop_machine_data *smdata) 64static void stop_cpu(struct work_struct *unused)
66{ 65{
67 enum stopmachine_state curstate = STOPMACHINE_NONE; 66 enum stopmachine_state curstate = STOPMACHINE_NONE;
68 67 struct stop_machine_data *smdata = &idle;
68 int cpu = smp_processor_id();
69 int err;
70
71 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map))
73 smdata = &active;
74 } else {
75 if (cpu_isset(cpu, *active_cpus))
76 smdata = &active;
77 }
69 /* Simple state machine */ 78 /* Simple state machine */
70 do { 79 do {
71 /* Chill out and ensure we re-read stopmachine_state. */ 80 /* Chill out and ensure we re-read stopmachine_state. */
@@ -78,9 +87,11 @@ static int stop_cpu(struct stop_machine_data *smdata)
78 hard_irq_disable(); 87 hard_irq_disable();
79 break; 88 break;
80 case STOPMACHINE_RUN: 89 case STOPMACHINE_RUN:
81 /* |= allows error detection if functions on 90 /* On multiple CPUs only a single error code
82 * multiple CPUs. */ 91 * is needed to tell that something failed. */
83 smdata->fnret |= smdata->fn(smdata->data); 92 err = smdata->fn(smdata->data);
93 if (err)
94 smdata->fnret = err;
84 break; 95 break;
85 default: 96 default:
86 break; 97 break;
@@ -90,7 +101,6 @@ static int stop_cpu(struct stop_machine_data *smdata)
90 } while (curstate != STOPMACHINE_EXIT); 101 } while (curstate != STOPMACHINE_EXIT);
91 102
92 local_irq_enable(); 103 local_irq_enable();
93 do_exit(0);
94} 104}
95 105
96/* Callback for CPUs which aren't supposed to do anything. */ 106/* Callback for CPUs which aren't supposed to do anything. */
@@ -101,78 +111,35 @@ static int chill(void *unused)
101 111
102int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
103{ 113{
104 int i, err; 114 struct work_struct *sm_work;
105 struct stop_machine_data active, idle; 115 int i, ret;
106 struct task_struct **threads;
107 116
117 /* Set up initial state. */
118 mutex_lock(&lock);
119 num_threads = num_online_cpus();
120 active_cpus = cpus;
108 active.fn = fn; 121 active.fn = fn;
109 active.data = data; 122 active.data = data;
110 active.fnret = 0; 123 active.fnret = 0;
111 idle.fn = chill; 124 idle.fn = chill;
112 idle.data = NULL; 125 idle.data = NULL;
113 126
114 /* This could be too big for stack on large machines. */
115 threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
116 if (!threads)
117 return -ENOMEM;
118
119 /* Set up initial state. */
120 mutex_lock(&lock);
121 init_completion(&finished);
122 num_threads = num_online_cpus();
123 set_state(STOPMACHINE_PREPARE); 127 set_state(STOPMACHINE_PREPARE);
124 128
125 for_each_online_cpu(i) { 129 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
126 struct stop_machine_data *smdata = &idle;
127 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
128
129 if (!cpus) {
130 if (i == first_cpu(cpu_online_map))
131 smdata = &active;
132 } else {
133 if (cpu_isset(i, *cpus))
134 smdata = &active;
135 }
136
137 threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
138 i);
139 if (IS_ERR(threads[i])) {
140 err = PTR_ERR(threads[i]);
141 threads[i] = NULL;
142 goto kill_threads;
143 }
144
145 /* Place it onto correct cpu. */
146 kthread_bind(threads[i], i);
147
148 /* Make it highest prio. */
149 if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
150 BUG();
151 }
152
153 /* We've created all the threads. Wake them all: hold this CPU so one
154 * doesn't hit this CPU until we're ready. */ 130 * doesn't hit this CPU until we're ready. */
155 get_cpu(); 131 get_cpu();
156 for_each_online_cpu(i) 132 for_each_online_cpu(i) {
157 wake_up_process(threads[i]); 133 sm_work = percpu_ptr(stop_machine_work, i);
158 134 INIT_WORK(sm_work, stop_cpu);
135 queue_work_on(i, stop_machine_wq, sm_work);
136 }
159 /* This will release the thread on our CPU. */ 137 /* This will release the thread on our CPU. */
160 put_cpu(); 138 put_cpu();
161 wait_for_completion(&finished); 139 flush_workqueue(stop_machine_wq);
162 mutex_unlock(&lock); 140 ret = active.fnret;
163
164 kfree(threads);
165
166 return active.fnret;
167
168kill_threads:
169 for_each_online_cpu(i)
170 if (threads[i])
171 kthread_stop(threads[i]);
172 mutex_unlock(&lock); 141 mutex_unlock(&lock);
173 142 return ret;
174 kfree(threads);
175 return err;
176} 143}
177 144
178int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
@@ -187,3 +154,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
187 return ret; 154 return ret;
188} 155}
189EXPORT_SYMBOL_GPL(stop_machine); 156EXPORT_SYMBOL_GPL(stop_machine);
157
158static int __init stop_machine_init(void)
159{
160 stop_machine_wq = create_rt_workqueue("kstop");
161 stop_machine_work = alloc_percpu(struct work_struct);
162 return 0;
163}
164core_initcall(stop_machine_init);
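stop_machine() keeps its per-CPU state machine, but it no longer creates, binds and boosts a SCHED_FIFO kthread for every online CPU on each call: stop_machine_init() allocates an RT workqueue and a per-cpu work_struct once at boot, and __stop_machine() queues stop_cpu() on every CPU and then flushes the workqueue instead of waiting on a completion. The sketch below shows that dispatch shape under the assumption that the workqueue and per-cpu area were set up as in the patch's stop_machine_init(); names are illustrative and error handling is omitted.

#include <linux/percpu.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;  /* e.g. create_rt_workqueue("kstop") */
static struct work_struct *demo_work;     /* alloc_percpu(struct work_struct) */

static void demo_run_on_all_cpus(work_func_t fn)
{
	int cpu;

	/* Hold this CPU so the locally queued work cannot start early. */
	get_cpu();
	for_each_online_cpu(cpu) {
		struct work_struct *w = percpu_ptr(demo_work, cpu);

		INIT_WORK(w, fn);
		queue_work_on(cpu, demo_wq, w);
	}
	put_cpu();
	flush_workqueue(demo_wq);	/* returns once every CPU has run fn */
}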
diff --git a/kernel/sys.c b/kernel/sys.c
index 234d9454294e..d356d79e84ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -112,12 +112,17 @@ EXPORT_SYMBOL(cad_pid);
112 112
113void (*pm_power_off_prepare)(void); 113void (*pm_power_off_prepare)(void);
114 114
115/*
116 * set the priority of a task
117 * - the caller must hold the RCU read lock
118 */
115static int set_one_prio(struct task_struct *p, int niceval, int error) 119static int set_one_prio(struct task_struct *p, int niceval, int error)
116{ 120{
121 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
117 int no_nice; 122 int no_nice;
118 123
119 if (p->uid != current->euid && 124 if (pcred->uid != cred->euid &&
120 p->euid != current->euid && !capable(CAP_SYS_NICE)) { 125 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
121 error = -EPERM; 126 error = -EPERM;
122 goto out; 127 goto out;
123 } 128 }
@@ -141,6 +146,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
141{ 146{
142 struct task_struct *g, *p; 147 struct task_struct *g, *p;
143 struct user_struct *user; 148 struct user_struct *user;
149 const struct cred *cred = current_cred();
144 int error = -EINVAL; 150 int error = -EINVAL;
145 struct pid *pgrp; 151 struct pid *pgrp;
146 152
@@ -174,18 +180,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 180 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 181 break;
176 case PRIO_USER: 182 case PRIO_USER:
177 user = current->user; 183 user = (struct user_struct *) cred->user;
178 if (!who) 184 if (!who)
179 who = current->uid; 185 who = cred->uid;
180 else 186 else if ((who != cred->uid) &&
181 if ((who != current->uid) && !(user = find_user(who))) 187 !(user = find_user(who)))
182 goto out_unlock; /* No processes for this user */ 188 goto out_unlock; /* No processes for this user */
183 189
184 do_each_thread(g, p) 190 do_each_thread(g, p)
185 if (p->uid == who) 191 if (__task_cred(p)->uid == who)
186 error = set_one_prio(p, niceval, error); 192 error = set_one_prio(p, niceval, error);
187 while_each_thread(g, p); 193 while_each_thread(g, p);
188 if (who != current->uid) 194 if (who != cred->uid)
189 free_uid(user); /* For find_user() */ 195 free_uid(user); /* For find_user() */
190 break; 196 break;
191 } 197 }
@@ -205,6 +211,7 @@ asmlinkage long sys_getpriority(int which, int who)
205{ 211{
206 struct task_struct *g, *p; 212 struct task_struct *g, *p;
207 struct user_struct *user; 213 struct user_struct *user;
214 const struct cred *cred = current_cred();
208 long niceval, retval = -ESRCH; 215 long niceval, retval = -ESRCH;
209 struct pid *pgrp; 216 struct pid *pgrp;
210 217
@@ -236,21 +243,21 @@ asmlinkage long sys_getpriority(int which, int who)
236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 243 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 244 break;
238 case PRIO_USER: 245 case PRIO_USER:
239 user = current->user; 246 user = (struct user_struct *) cred->user;
240 if (!who) 247 if (!who)
241 who = current->uid; 248 who = cred->uid;
242 else 249 else if ((who != cred->uid) &&
243 if ((who != current->uid) && !(user = find_user(who))) 250 !(user = find_user(who)))
244 goto out_unlock; /* No processes for this user */ 251 goto out_unlock; /* No processes for this user */
245 252
246 do_each_thread(g, p) 253 do_each_thread(g, p)
247 if (p->uid == who) { 254 if (__task_cred(p)->uid == who) {
248 niceval = 20 - task_nice(p); 255 niceval = 20 - task_nice(p);
249 if (niceval > retval) 256 if (niceval > retval)
250 retval = niceval; 257 retval = niceval;
251 } 258 }
252 while_each_thread(g, p); 259 while_each_thread(g, p);
253 if (who != current->uid) 260 if (who != cred->uid)
254 free_uid(user); /* for find_user() */ 261 free_uid(user); /* for find_user() */
255 break; 262 break;
256 } 263 }
@@ -472,46 +479,48 @@ void ctrl_alt_del(void)
472 */ 479 */
473asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 480asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
474{ 481{
475 int old_rgid = current->gid; 482 const struct cred *old;
476 int old_egid = current->egid; 483 struct cred *new;
477 int new_rgid = old_rgid;
478 int new_egid = old_egid;
479 int retval; 484 int retval;
480 485
486 new = prepare_creds();
487 if (!new)
488 return -ENOMEM;
489 old = current_cred();
490
481 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); 491 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
482 if (retval) 492 if (retval)
483 return retval; 493 goto error;
484 494
495 retval = -EPERM;
485 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
486 if ((old_rgid == rgid) || 497 if (old->gid == rgid ||
487 (current->egid==rgid) || 498 old->egid == rgid ||
488 capable(CAP_SETGID)) 499 capable(CAP_SETGID))
489 new_rgid = rgid; 500 new->gid = rgid;
490 else 501 else
491 return -EPERM; 502 goto error;
492 } 503 }
493 if (egid != (gid_t) -1) { 504 if (egid != (gid_t) -1) {
494 if ((old_rgid == egid) || 505 if (old->gid == egid ||
495 (current->egid == egid) || 506 old->egid == egid ||
496 (current->sgid == egid) || 507 old->sgid == egid ||
497 capable(CAP_SETGID)) 508 capable(CAP_SETGID))
498 new_egid = egid; 509 new->egid = egid;
499 else 510 else
500 return -EPERM; 511 goto error;
501 }
502 if (new_egid != old_egid) {
503 set_dumpable(current->mm, suid_dumpable);
504 smp_wmb();
505 } 512 }
513
506 if (rgid != (gid_t) -1 || 514 if (rgid != (gid_t) -1 ||
507 (egid != (gid_t) -1 && egid != old_rgid)) 515 (egid != (gid_t) -1 && egid != old->gid))
508 current->sgid = new_egid; 516 new->sgid = new->egid;
509 current->fsgid = new_egid; 517 new->fsgid = new->egid;
510 current->egid = new_egid; 518
511 current->gid = new_rgid; 519 return commit_creds(new);
512 key_fsgid_changed(current); 520
513 proc_id_connector(current, PROC_EVENT_GID); 521error:
514 return 0; 522 abort_creds(new);
523 return retval;
515} 524}
516 525
517/* 526/*
@@ -521,56 +530,54 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
521 */ 530 */
522asmlinkage long sys_setgid(gid_t gid) 531asmlinkage long sys_setgid(gid_t gid)
523{ 532{
524 int old_egid = current->egid; 533 const struct cred *old;
534 struct cred *new;
525 int retval; 535 int retval;
526 536
537 new = prepare_creds();
538 if (!new)
539 return -ENOMEM;
540 old = current_cred();
541
527 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); 542 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
528 if (retval) 543 if (retval)
529 return retval; 544 goto error;
530 545
531 if (capable(CAP_SETGID)) { 546 retval = -EPERM;
532 if (old_egid != gid) { 547 if (capable(CAP_SETGID))
533 set_dumpable(current->mm, suid_dumpable); 548 new->gid = new->egid = new->sgid = new->fsgid = gid;
534 smp_wmb(); 549 else if (gid == old->gid || gid == old->sgid)
535 } 550 new->egid = new->fsgid = gid;
536 current->gid = current->egid = current->sgid = current->fsgid = gid;
537 } else if ((gid == current->gid) || (gid == current->sgid)) {
538 if (old_egid != gid) {
539 set_dumpable(current->mm, suid_dumpable);
540 smp_wmb();
541 }
542 current->egid = current->fsgid = gid;
543 }
544 else 551 else
545 return -EPERM; 552 goto error;
546 553
547 key_fsgid_changed(current); 554 return commit_creds(new);
548 proc_id_connector(current, PROC_EVENT_GID); 555
549 return 0; 556error:
557 abort_creds(new);
558 return retval;
550} 559}
551 560
552static int set_user(uid_t new_ruid, int dumpclear) 561/*
562 * change the user struct in a credentials set to match the new UID
563 */
564static int set_user(struct cred *new)
553{ 565{
554 struct user_struct *new_user; 566 struct user_struct *new_user;
555 567
556 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); 568 new_user = alloc_uid(current_user_ns(), new->uid);
557 if (!new_user) 569 if (!new_user)
558 return -EAGAIN; 570 return -EAGAIN;
559 571
560 if (atomic_read(&new_user->processes) >= 572 if (atomic_read(&new_user->processes) >=
561 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 573 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
562 new_user != current->nsproxy->user_ns->root_user) { 574 new_user != INIT_USER) {
563 free_uid(new_user); 575 free_uid(new_user);
564 return -EAGAIN; 576 return -EAGAIN;
565 } 577 }
566 578
567 switch_uid(new_user); 579 free_uid(new->user);
568 580 new->user = new_user;
569 if (dumpclear) {
570 set_dumpable(current->mm, suid_dumpable);
571 smp_wmb();
572 }
573 current->uid = new_ruid;
574 return 0; 581 return 0;
575} 582}
576 583
@@ -591,54 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
591 */ 598 */
592asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 599asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
593{ 600{
594 int old_ruid, old_euid, old_suid, new_ruid, new_euid; 601 const struct cred *old;
602 struct cred *new;
595 int retval; 603 int retval;
596 604
605 new = prepare_creds();
606 if (!new)
607 return -ENOMEM;
608 old = current_cred();
609
597 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); 610 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
598 if (retval) 611 if (retval)
599 return retval; 612 goto error;
600
601 new_ruid = old_ruid = current->uid;
602 new_euid = old_euid = current->euid;
603 old_suid = current->suid;
604 613
614 retval = -EPERM;
605 if (ruid != (uid_t) -1) { 615 if (ruid != (uid_t) -1) {
606 new_ruid = ruid; 616 new->uid = ruid;
607 if ((old_ruid != ruid) && 617 if (old->uid != ruid &&
608 (current->euid != ruid) && 618 old->euid != ruid &&
609 !capable(CAP_SETUID)) 619 !capable(CAP_SETUID))
610 return -EPERM; 620 goto error;
611 } 621 }
612 622
613 if (euid != (uid_t) -1) { 623 if (euid != (uid_t) -1) {
614 new_euid = euid; 624 new->euid = euid;
615 if ((old_ruid != euid) && 625 if (old->uid != euid &&
616 (current->euid != euid) && 626 old->euid != euid &&
617 (current->suid != euid) && 627 old->suid != euid &&
618 !capable(CAP_SETUID)) 628 !capable(CAP_SETUID))
619 return -EPERM; 629 goto error;
620 } 630 }
621 631
622 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) 632 retval = -EAGAIN;
623 return -EAGAIN; 633 if (new->uid != old->uid && set_user(new) < 0)
634 goto error;
624 635
625 if (new_euid != old_euid) {
626 set_dumpable(current->mm, suid_dumpable);
627 smp_wmb();
628 }
629 current->fsuid = current->euid = new_euid;
630 if (ruid != (uid_t) -1 || 636 if (ruid != (uid_t) -1 ||
631 (euid != (uid_t) -1 && euid != old_ruid)) 637 (euid != (uid_t) -1 && euid != old->uid))
632 current->suid = current->euid; 638 new->suid = new->euid;
633 current->fsuid = current->euid; 639 new->fsuid = new->euid;
634 640
635 key_fsuid_changed(current); 641 retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
636 proc_id_connector(current, PROC_EVENT_UID); 642 if (retval < 0)
637 643 goto error;
638 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
639}
640 644
645 return commit_creds(new);
641 646
647error:
648 abort_creds(new);
649 return retval;
650}
642 651
643/* 652/*
644 * setuid() is implemented like SysV with SAVED_IDS 653 * setuid() is implemented like SysV with SAVED_IDS
@@ -653,36 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 */ 662 */
654asmlinkage long sys_setuid(uid_t uid) 663asmlinkage long sys_setuid(uid_t uid)
655{ 664{
656 int old_euid = current->euid; 665 const struct cred *old;
657 int old_ruid, old_suid, new_suid; 666 struct cred *new;
658 int retval; 667 int retval;
659 668
669 new = prepare_creds();
670 if (!new)
671 return -ENOMEM;
672 old = current_cred();
673
660 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 674 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
661 if (retval) 675 if (retval)
662 return retval; 676 goto error;
663 677
664 old_ruid = current->uid; 678 retval = -EPERM;
665 old_suid = current->suid;
666 new_suid = old_suid;
667
668 if (capable(CAP_SETUID)) { 679 if (capable(CAP_SETUID)) {
669 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) 680 new->suid = new->uid = uid;
670 return -EAGAIN; 681 if (uid != old->uid && set_user(new) < 0) {
671 new_suid = uid; 682 retval = -EAGAIN;
672 } else if ((uid != current->uid) && (uid != new_suid)) 683 goto error;
673 return -EPERM; 684 }
674 685 } else if (uid != old->uid && uid != new->suid) {
675 if (old_euid != uid) { 686 goto error;
676 set_dumpable(current->mm, suid_dumpable);
677 smp_wmb();
678 } 687 }
679 current->fsuid = current->euid = uid;
680 current->suid = new_suid;
681 688
682 key_fsuid_changed(current); 689 new->fsuid = new->euid = uid;
683 proc_id_connector(current, PROC_EVENT_UID); 690
691 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
692 if (retval < 0)
693 goto error;
694
695 return commit_creds(new);
684 696
685 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); 697error:
698 abort_creds(new);
699 return retval;
686} 700}
687 701
688 702
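Every setuid-family call above now follows the same credentials lifecycle: prepare_creds() takes a private copy, the copy is edited and checked, and either commit_creds() publishes it or abort_creds() throws it away; the security_task_fix_setuid() hook operates on the new copy in place of the old post-hoc security_task_post_setuid(). A condensed sketch of that shape follows; demo_change_fsuid() is hypothetical and stands in for the real syscalls, which add their own capability and uid checks between prepare and commit.

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/security.h>

static int demo_change_fsuid(uid_t uid)
{
	struct cred *new;
	const struct cred *old;
	int ret;

	new = prepare_creds();		/* private, modifiable copy */
	if (!new)
		return -ENOMEM;
	old = current_cred();

	new->fsuid = uid;		/* the real calls validate uid here */

	ret = security_task_fix_setuid(new, old, LSM_SETID_FS);
	if (ret < 0) {
		abort_creds(new);	/* nothing was published; just drop it */
		return ret;
	}
	return commit_creds(new);	/* atomically installs the new creds */
}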
@@ -692,54 +706,63 @@ asmlinkage long sys_setuid(uid_t uid)
692 */ 706 */
693asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 707asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
694{ 708{
695 int old_ruid = current->uid; 709 const struct cred *old;
696 int old_euid = current->euid; 710 struct cred *new;
697 int old_suid = current->suid;
698 int retval; 711 int retval;
699 712
713 new = prepare_creds();
714 if (!new)
715 return -ENOMEM;
716
700 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); 717 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
701 if (retval) 718 if (retval)
702 return retval; 719 goto error;
720 old = current_cred();
703 721
722 retval = -EPERM;
704 if (!capable(CAP_SETUID)) { 723 if (!capable(CAP_SETUID)) {
705 if ((ruid != (uid_t) -1) && (ruid != current->uid) && 724 if (ruid != (uid_t) -1 && ruid != old->uid &&
706 (ruid != current->euid) && (ruid != current->suid)) 725 ruid != old->euid && ruid != old->suid)
707 return -EPERM; 726 goto error;
708 if ((euid != (uid_t) -1) && (euid != current->uid) && 727 if (euid != (uid_t) -1 && euid != old->uid &&
709 (euid != current->euid) && (euid != current->suid)) 728 euid != old->euid && euid != old->suid)
710 return -EPERM; 729 goto error;
711 if ((suid != (uid_t) -1) && (suid != current->uid) && 730 if (suid != (uid_t) -1 && suid != old->uid &&
712 (suid != current->euid) && (suid != current->suid)) 731 suid != old->euid && suid != old->suid)
713 return -EPERM; 732 goto error;
714 } 733 }
734
735 retval = -EAGAIN;
715 if (ruid != (uid_t) -1) { 736 if (ruid != (uid_t) -1) {
716 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) 737 new->uid = ruid;
717 return -EAGAIN; 738 if (ruid != old->uid && set_user(new) < 0)
739 goto error;
718 } 740 }
719 if (euid != (uid_t) -1) { 741 if (euid != (uid_t) -1)
720 if (euid != current->euid) { 742 new->euid = euid;
721 set_dumpable(current->mm, suid_dumpable);
722 smp_wmb();
723 }
724 current->euid = euid;
725 }
726 current->fsuid = current->euid;
727 if (suid != (uid_t) -1) 743 if (suid != (uid_t) -1)
728 current->suid = suid; 744 new->suid = suid;
745 new->fsuid = new->euid;
746
747 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
748 if (retval < 0)
749 goto error;
729 750
730 key_fsuid_changed(current); 751 return commit_creds(new);
731 proc_id_connector(current, PROC_EVENT_UID);
732 752
733 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); 753error:
754 abort_creds(new);
755 return retval;
734} 756}
735 757
736asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 758asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
737{ 759{
760 const struct cred *cred = current_cred();
738 int retval; 761 int retval;
739 762
740 if (!(retval = put_user(current->uid, ruid)) && 763 if (!(retval = put_user(cred->uid, ruid)) &&
741 !(retval = put_user(current->euid, euid))) 764 !(retval = put_user(cred->euid, euid)))
742 retval = put_user(current->suid, suid); 765 retval = put_user(cred->suid, suid);
743 766
744 return retval; 767 return retval;
745} 768}
@@ -749,48 +772,55 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
749 */ 772 */
750asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 773asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
751{ 774{
775 const struct cred *old;
776 struct cred *new;
752 int retval; 777 int retval;
753 778
779 new = prepare_creds();
780 if (!new)
781 return -ENOMEM;
782 old = current_cred();
783
754 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); 784 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
755 if (retval) 785 if (retval)
756 return retval; 786 goto error;
757 787
788 retval = -EPERM;
758 if (!capable(CAP_SETGID)) { 789 if (!capable(CAP_SETGID)) {
759 if ((rgid != (gid_t) -1) && (rgid != current->gid) && 790 if (rgid != (gid_t) -1 && rgid != old->gid &&
760 (rgid != current->egid) && (rgid != current->sgid)) 791 rgid != old->egid && rgid != old->sgid)
761 return -EPERM; 792 goto error;
762 if ((egid != (gid_t) -1) && (egid != current->gid) && 793 if (egid != (gid_t) -1 && egid != old->gid &&
763 (egid != current->egid) && (egid != current->sgid)) 794 egid != old->egid && egid != old->sgid)
764 return -EPERM; 795 goto error;
765 if ((sgid != (gid_t) -1) && (sgid != current->gid) && 796 if (sgid != (gid_t) -1 && sgid != old->gid &&
766 (sgid != current->egid) && (sgid != current->sgid)) 797 sgid != old->egid && sgid != old->sgid)
767 return -EPERM; 798 goto error;
768 } 799 }
769 if (egid != (gid_t) -1) { 800
770 if (egid != current->egid) {
771 set_dumpable(current->mm, suid_dumpable);
772 smp_wmb();
773 }
774 current->egid = egid;
775 }
776 current->fsgid = current->egid;
777 if (rgid != (gid_t) -1) 801 if (rgid != (gid_t) -1)
778 current->gid = rgid; 802 new->gid = rgid;
803 if (egid != (gid_t) -1)
804 new->egid = egid;
779 if (sgid != (gid_t) -1) 805 if (sgid != (gid_t) -1)
780 current->sgid = sgid; 806 new->sgid = sgid;
807 new->fsgid = new->egid;
781 808
782 key_fsgid_changed(current); 809 return commit_creds(new);
783 proc_id_connector(current, PROC_EVENT_GID); 810
784 return 0; 811error:
812 abort_creds(new);
813 return retval;
785} 814}
786 815
787asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 816asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
788{ 817{
818 const struct cred *cred = current_cred();
789 int retval; 819 int retval;
790 820
791 if (!(retval = put_user(current->gid, rgid)) && 821 if (!(retval = put_user(cred->gid, rgid)) &&
792 !(retval = put_user(current->egid, egid))) 822 !(retval = put_user(cred->egid, egid)))
793 retval = put_user(current->sgid, sgid); 823 retval = put_user(cred->sgid, sgid);
794 824
795 return retval; 825 return retval;
796} 826}
@@ -804,27 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
804 */ 834 */
805asmlinkage long sys_setfsuid(uid_t uid) 835asmlinkage long sys_setfsuid(uid_t uid)
806{ 836{
807 int old_fsuid; 837 const struct cred *old;
838 struct cred *new;
839 uid_t old_fsuid;
840
841 new = prepare_creds();
842 if (!new)
843 return current_fsuid();
844 old = current_cred();
845 old_fsuid = old->fsuid;
808 846
809 old_fsuid = current->fsuid; 847 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
810 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) 848 goto error;
811 return old_fsuid;
812 849
813 if (uid == current->uid || uid == current->euid || 850 if (uid == old->uid || uid == old->euid ||
814 uid == current->suid || uid == current->fsuid || 851 uid == old->suid || uid == old->fsuid ||
815 capable(CAP_SETUID)) { 852 capable(CAP_SETUID)) {
816 if (uid != old_fsuid) { 853 if (uid != old_fsuid) {
817 set_dumpable(current->mm, suid_dumpable); 854 new->fsuid = uid;
818 smp_wmb(); 855 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
856 goto change_okay;
819 } 857 }
820 current->fsuid = uid;
821 } 858 }
822 859
823 key_fsuid_changed(current); 860error:
824 proc_id_connector(current, PROC_EVENT_UID); 861 abort_creds(new);
825 862 return old_fsuid;
826 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
827 863
864change_okay:
865 commit_creds(new);
828 return old_fsuid; 866 return old_fsuid;
829} 867}
830 868
@@ -833,58 +871,59 @@ asmlinkage long sys_setfsuid(uid_t uid)
833 */ 871 */
834asmlinkage long sys_setfsgid(gid_t gid) 872asmlinkage long sys_setfsgid(gid_t gid)
835{ 873{
836 int old_fsgid; 874 const struct cred *old;
875 struct cred *new;
876 gid_t old_fsgid;
877
878 new = prepare_creds();
879 if (!new)
880 return current_fsgid();
881 old = current_cred();
882 old_fsgid = old->fsgid;
837 883
838 old_fsgid = current->fsgid;
839 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) 884 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
840 return old_fsgid; 885 goto error;
841 886
842 if (gid == current->gid || gid == current->egid || 887 if (gid == old->gid || gid == old->egid ||
843 gid == current->sgid || gid == current->fsgid || 888 gid == old->sgid || gid == old->fsgid ||
844 capable(CAP_SETGID)) { 889 capable(CAP_SETGID)) {
845 if (gid != old_fsgid) { 890 if (gid != old_fsgid) {
846 set_dumpable(current->mm, suid_dumpable); 891 new->fsgid = gid;
847 smp_wmb(); 892 goto change_okay;
848 } 893 }
849 current->fsgid = gid;
850 key_fsgid_changed(current);
851 proc_id_connector(current, PROC_EVENT_GID);
852 } 894 }
895
896error:
897 abort_creds(new);
898 return old_fsgid;
899
900change_okay:
901 commit_creds(new);
853 return old_fsgid; 902 return old_fsgid;
854} 903}
855 904
905void do_sys_times(struct tms *tms)
906{
907 struct task_cputime cputime;
908 cputime_t cutime, cstime;
909
910 thread_group_cputime(current, &cputime);
911 spin_lock_irq(&current->sighand->siglock);
912 cutime = current->signal->cutime;
913 cstime = current->signal->cstime;
914 spin_unlock_irq(&current->sighand->siglock);
915 tms->tms_utime = cputime_to_clock_t(cputime.utime);
916 tms->tms_stime = cputime_to_clock_t(cputime.stime);
917 tms->tms_cutime = cputime_to_clock_t(cutime);
918 tms->tms_cstime = cputime_to_clock_t(cstime);
919}
920
856asmlinkage long sys_times(struct tms __user * tbuf) 921asmlinkage long sys_times(struct tms __user * tbuf)
857{ 922{
858 /*
859 * In the SMP world we might just be unlucky and have one of
860 * the times increment as we use it. Since the value is an
861 * atomically safe type this is just fine. Conceptually its
862 * as if the syscall took an instant longer to occur.
863 */
864 if (tbuf) { 923 if (tbuf) {
865 struct tms tmp; 924 struct tms tmp;
866 struct task_struct *tsk = current; 925
867 struct task_struct *t; 926 do_sys_times(&tmp);
868 cputime_t utime, stime, cutime, cstime;
869
870 spin_lock_irq(&tsk->sighand->siglock);
871 utime = tsk->signal->utime;
872 stime = tsk->signal->stime;
873 t = tsk;
874 do {
875 utime = cputime_add(utime, t->utime);
876 stime = cputime_add(stime, t->stime);
877 t = next_thread(t);
878 } while (t != tsk);
879
880 cutime = tsk->signal->cutime;
881 cstime = tsk->signal->cstime;
882 spin_unlock_irq(&tsk->sighand->siglock);
883
884 tmp.tms_utime = cputime_to_clock_t(utime);
885 tmp.tms_stime = cputime_to_clock_t(stime);
886 tmp.tms_cutime = cputime_to_clock_t(cutime);
887 tmp.tms_cstime = cputime_to_clock_t(cstime);
888 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 927 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
889 return -EFAULT; 928 return -EFAULT;
890 } 929 }
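do_sys_times() and the RUSAGE_SELF path above switch from walking every thread under siglock to a single thread_group_cputime() call that returns the group totals in a struct task_cputime. A small sketch of reading those totals; demo_group_utime() is an invented helper, not a kernel interface.

#include <linux/sched.h>

static clock_t demo_group_utime(struct task_struct *p)
{
	struct task_cputime cputime;

	/* Fills in the group-wide utime, stime and sum_exec_runtime. */
	thread_group_cputime(p, &cputime);
	return cputime_to_clock_t(cputime.utime);
}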
@@ -1128,7 +1167,7 @@ EXPORT_SYMBOL(groups_free);
1128 1167
1129/* export the group_info to a user-space array */ 1168/* export the group_info to a user-space array */
1130static int groups_to_user(gid_t __user *grouplist, 1169static int groups_to_user(gid_t __user *grouplist,
1131 struct group_info *group_info) 1170 const struct group_info *group_info)
1132{ 1171{
1133 int i; 1172 int i;
1134 unsigned int count = group_info->ngroups; 1173 unsigned int count = group_info->ngroups;
@@ -1196,7 +1235,7 @@ static void groups_sort(struct group_info *group_info)
1196} 1235}
1197 1236
1198/* a simple bsearch */ 1237/* a simple bsearch */
1199int groups_search(struct group_info *group_info, gid_t grp) 1238int groups_search(const struct group_info *group_info, gid_t grp)
1200{ 1239{
1201 unsigned int left, right; 1240 unsigned int left, right;
1202 1241
@@ -1218,51 +1257,74 @@ int groups_search(struct group_info *group_info, gid_t grp)
1218 return 0; 1257 return 0;
1219} 1258}
1220 1259
1221/* validate and set current->group_info */ 1260/**
1222int set_current_groups(struct group_info *group_info) 1261 * set_groups - Change a group subscription in a set of credentials
1262 * @new: The newly prepared set of credentials to alter
1263 * @group_info: The group list to install
1264 *
1265 * Validate a group subscription and, if valid, insert it into a set
1266 * of credentials.
1267 */
1268int set_groups(struct cred *new, struct group_info *group_info)
1223{ 1269{
1224 int retval; 1270 int retval;
1225 struct group_info *old_info;
1226 1271
1227 retval = security_task_setgroups(group_info); 1272 retval = security_task_setgroups(group_info);
1228 if (retval) 1273 if (retval)
1229 return retval; 1274 return retval;
1230 1275
1276 put_group_info(new->group_info);
1231 groups_sort(group_info); 1277 groups_sort(group_info);
1232 get_group_info(group_info); 1278 get_group_info(group_info);
1279 new->group_info = group_info;
1280 return 0;
1281}
1233 1282
1234 task_lock(current); 1283EXPORT_SYMBOL(set_groups);
1235 old_info = current->group_info;
1236 current->group_info = group_info;
1237 task_unlock(current);
1238 1284
1239 put_group_info(old_info); 1285/**
1286 * set_current_groups - Change current's group subscription
1287 * @group_info: The group list to impose
1288 *
1289 * Validate a group subscription and, if valid, impose it upon current's task
1290 * security record.
1291 */
1292int set_current_groups(struct group_info *group_info)
1293{
1294 struct cred *new;
1295 int ret;
1240 1296
1241 return 0; 1297 new = prepare_creds();
1298 if (!new)
1299 return -ENOMEM;
1300
1301 ret = set_groups(new, group_info);
1302 if (ret < 0) {
1303 abort_creds(new);
1304 return ret;
1305 }
1306
1307 return commit_creds(new);
1242} 1308}
1243 1309
1244EXPORT_SYMBOL(set_current_groups); 1310EXPORT_SYMBOL(set_current_groups);
1245 1311
1246asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1312asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1247{ 1313{
1248 int i = 0; 1314 const struct cred *cred = current_cred();
1249 1315 int i;
1250 /*
1251 * SMP: Nobody else can change our grouplist. Thus we are
1252 * safe.
1253 */
1254 1316
1255 if (gidsetsize < 0) 1317 if (gidsetsize < 0)
1256 return -EINVAL; 1318 return -EINVAL;
1257 1319
1258 /* no need to grab task_lock here; it cannot change */ 1320 /* no need to grab task_lock here; it cannot change */
1259 i = current->group_info->ngroups; 1321 i = cred->group_info->ngroups;
1260 if (gidsetsize) { 1322 if (gidsetsize) {
1261 if (i > gidsetsize) { 1323 if (i > gidsetsize) {
1262 i = -EINVAL; 1324 i = -EINVAL;
1263 goto out; 1325 goto out;
1264 } 1326 }
1265 if (groups_to_user(grouplist, current->group_info)) { 1327 if (groups_to_user(grouplist, cred->group_info)) {
1266 i = -EFAULT; 1328 i = -EFAULT;
1267 goto out; 1329 goto out;
1268 } 1330 }
@@ -1306,9 +1368,11 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1306 */ 1368 */
1307int in_group_p(gid_t grp) 1369int in_group_p(gid_t grp)
1308{ 1370{
1371 const struct cred *cred = current_cred();
1309 int retval = 1; 1372 int retval = 1;
1310 if (grp != current->fsgid) 1373
1311 retval = groups_search(current->group_info, grp); 1374 if (grp != cred->fsgid)
1375 retval = groups_search(cred->group_info, grp);
1312 return retval; 1376 return retval;
1313} 1377}
1314 1378
@@ -1316,9 +1380,11 @@ EXPORT_SYMBOL(in_group_p);
1316 1380
1317int in_egroup_p(gid_t grp) 1381int in_egroup_p(gid_t grp)
1318{ 1382{
1383 const struct cred *cred = current_cred();
1319 int retval = 1; 1384 int retval = 1;
1320 if (grp != current->egid) 1385
1321 retval = groups_search(current->group_info, grp); 1386 if (grp != cred->egid)
1387 retval = groups_search(cred->group_info, grp);
1322 return retval; 1388 return retval;
1323} 1389}
1324 1390
@@ -1349,8 +1415,10 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1349 down_write(&uts_sem); 1415 down_write(&uts_sem);
1350 errno = -EFAULT; 1416 errno = -EFAULT;
1351 if (!copy_from_user(tmp, name, len)) { 1417 if (!copy_from_user(tmp, name, len)) {
1352 memcpy(utsname()->nodename, tmp, len); 1418 struct new_utsname *u = utsname();
1353 utsname()->nodename[len] = 0; 1419
1420 memcpy(u->nodename, tmp, len);
1421 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1354 errno = 0; 1422 errno = 0;
1355 } 1423 }
1356 up_write(&uts_sem); 1424 up_write(&uts_sem);
@@ -1362,15 +1430,17 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1362asmlinkage long sys_gethostname(char __user *name, int len) 1430asmlinkage long sys_gethostname(char __user *name, int len)
1363{ 1431{
1364 int i, errno; 1432 int i, errno;
1433 struct new_utsname *u;
1365 1434
1366 if (len < 0) 1435 if (len < 0)
1367 return -EINVAL; 1436 return -EINVAL;
1368 down_read(&uts_sem); 1437 down_read(&uts_sem);
1369 i = 1 + strlen(utsname()->nodename); 1438 u = utsname();
1439 i = 1 + strlen(u->nodename);
1370 if (i > len) 1440 if (i > len)
1371 i = len; 1441 i = len;
1372 errno = 0; 1442 errno = 0;
1373 if (copy_to_user(name, utsname()->nodename, i)) 1443 if (copy_to_user(name, u->nodename, i))
1374 errno = -EFAULT; 1444 errno = -EFAULT;
1375 up_read(&uts_sem); 1445 up_read(&uts_sem);
1376 return errno; 1446 return errno;
@@ -1395,8 +1465,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1395 down_write(&uts_sem); 1465 down_write(&uts_sem);
1396 errno = -EFAULT; 1466 errno = -EFAULT;
1397 if (!copy_from_user(tmp, name, len)) { 1467 if (!copy_from_user(tmp, name, len)) {
1398 memcpy(utsname()->domainname, tmp, len); 1468 struct new_utsname *u = utsname();
1399 utsname()->domainname[len] = 0; 1469
1470 memcpy(u->domainname, tmp, len);
1471 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1400 errno = 0; 1472 errno = 0;
1401 } 1473 }
1402 up_write(&uts_sem); 1474 up_write(&uts_sem);
@@ -1443,21 +1515,28 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1443asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1515asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1444{ 1516{
1445 struct rlimit new_rlim, *old_rlim; 1517 struct rlimit new_rlim, *old_rlim;
1446 unsigned long it_prof_secs;
1447 int retval; 1518 int retval;
1448 1519
1449 if (resource >= RLIM_NLIMITS) 1520 if (resource >= RLIM_NLIMITS)
1450 return -EINVAL; 1521 return -EINVAL;
1451 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1522 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1452 return -EFAULT; 1523 return -EFAULT;
1453 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1454 return -EINVAL;
1455 old_rlim = current->signal->rlim + resource; 1524 old_rlim = current->signal->rlim + resource;
1456 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1525 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1457 !capable(CAP_SYS_RESOURCE)) 1526 !capable(CAP_SYS_RESOURCE))
1458 return -EPERM; 1527 return -EPERM;
1459 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) 1528
1460 return -EPERM; 1529 if (resource == RLIMIT_NOFILE) {
1530 if (new_rlim.rlim_max == RLIM_INFINITY)
1531 new_rlim.rlim_max = sysctl_nr_open;
1532 if (new_rlim.rlim_cur == RLIM_INFINITY)
1533 new_rlim.rlim_cur = sysctl_nr_open;
1534 if (new_rlim.rlim_max > sysctl_nr_open)
1535 return -EPERM;
1536 }
1537
1538 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1539 return -EINVAL;
1461 1540
1462 retval = security_task_setrlimit(resource, &new_rlim); 1541 retval = security_task_setrlimit(resource, &new_rlim);
1463 if (retval) 1542 if (retval)
@@ -1489,18 +1568,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1489 if (new_rlim.rlim_cur == RLIM_INFINITY) 1568 if (new_rlim.rlim_cur == RLIM_INFINITY)
1490 goto out; 1569 goto out;
1491 1570
1492 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); 1571 update_rlimit_cpu(new_rlim.rlim_cur);
1493 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1494 unsigned long rlim_cur = new_rlim.rlim_cur;
1495 cputime_t cputime;
1496
1497 cputime = secs_to_cputime(rlim_cur);
1498 read_lock(&tasklist_lock);
1499 spin_lock_irq(&current->sighand->siglock);
1500 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1501 spin_unlock_irq(&current->sighand->siglock);
1502 read_unlock(&tasklist_lock);
1503 }
1504out: 1572out:
1505 return 0; 1573 return 0;
1506} 1574}
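
Two points in the sys_setrlimit() hunks above: an RLIM_INFINITY request for RLIMIT_NOFILE is now mapped to the fs.nr_open sysctl before the cur <= max check, and the open-coded RLIMIT_CPU timer update has been folded into update_rlimit_cpu(), which presumably wraps the siglock plus set_process_cpu_timer() sequence on the posix-cpu-timers side. A hedged userspace sketch of the first point; it assumes the caller holds CAP_SYS_RESOURCE so the nr_open ceiling is the only limit in play:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        /* "Unlimited" is clamped to fs.nr_open instead of failing the
         * nr_open ceiling check with -EPERM. */
        struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };

        if (setrlimit(RLIMIT_NOFILE, &rl))
                perror("setrlimit(RLIMIT_NOFILE)");

        if (!getrlimit(RLIMIT_NOFILE, &rl))
                printf("nofile: cur=%lu max=%lu\n",
                       (unsigned long)rl.rlim_cur, (unsigned long)rl.rlim_max);
        return 0;
}
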
@@ -1538,11 +1606,8 @@ out:
1538 * 1606 *
1539 */ 1607 */
1540 1608
1541static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, 1609static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1542 cputime_t *utimep, cputime_t *stimep)
1543{ 1610{
1544 *utimep = cputime_add(*utimep, t->utime);
1545 *stimep = cputime_add(*stimep, t->stime);
1546 r->ru_nvcsw += t->nvcsw; 1611 r->ru_nvcsw += t->nvcsw;
1547 r->ru_nivcsw += t->nivcsw; 1612 r->ru_nivcsw += t->nivcsw;
1548 r->ru_minflt += t->min_flt; 1613 r->ru_minflt += t->min_flt;
@@ -1556,12 +1621,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1556 struct task_struct *t; 1621 struct task_struct *t;
1557 unsigned long flags; 1622 unsigned long flags;
1558 cputime_t utime, stime; 1623 cputime_t utime, stime;
1624 struct task_cputime cputime;
1559 1625
1560 memset((char *) r, 0, sizeof *r); 1626 memset((char *) r, 0, sizeof *r);
1561 utime = stime = cputime_zero; 1627 utime = stime = cputime_zero;
1562 1628
1563 if (who == RUSAGE_THREAD) { 1629 if (who == RUSAGE_THREAD) {
1564 accumulate_thread_rusage(p, r, &utime, &stime); 1630 accumulate_thread_rusage(p, r);
1565 goto out; 1631 goto out;
1566 } 1632 }
1567 1633
@@ -1584,8 +1650,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1584 break; 1650 break;
1585 1651
1586 case RUSAGE_SELF: 1652 case RUSAGE_SELF:
1587 utime = cputime_add(utime, p->signal->utime); 1653 thread_group_cputime(p, &cputime);
1588 stime = cputime_add(stime, p->signal->stime); 1654 utime = cputime_add(utime, cputime.utime);
1655 stime = cputime_add(stime, cputime.stime);
1589 r->ru_nvcsw += p->signal->nvcsw; 1656 r->ru_nvcsw += p->signal->nvcsw;
1590 r->ru_nivcsw += p->signal->nivcsw; 1657 r->ru_nivcsw += p->signal->nivcsw;
1591 r->ru_minflt += p->signal->min_flt; 1658 r->ru_minflt += p->signal->min_flt;
@@ -1594,7 +1661,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1594 r->ru_oublock += p->signal->oublock; 1661 r->ru_oublock += p->signal->oublock;
1595 t = p; 1662 t = p;
1596 do { 1663 do {
1597 accumulate_thread_rusage(t, r, &utime, &stime); 1664 accumulate_thread_rusage(t, r);
1598 t = next_thread(t); 1665 t = next_thread(t);
1599 } while (t != p); 1666 } while (t != p);
1600 break; 1667 break;
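
The k_getrusage() hunks above stop accumulating utime/stime thread by thread and instead take one thread_group_cputime() snapshot for RUSAGE_SELF. A simplified, compile-only sketch of the structure being filled; the field names follow this series, the types are stand-ins rather than the kernel definitions:

typedef unsigned long cputime_t_sketch;          /* stands in for cputime_t */

struct task_cputime_sketch {
        cputime_t_sketch   utime;                /* user time of the group   */
        cputime_t_sketch   stime;                /* system time of the group */
        unsigned long long sum_exec_runtime;     /* scheduler runtime in ns  */
};
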
@@ -1633,50 +1700,56 @@ asmlinkage long sys_umask(int mask)
1633asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1700asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1634 unsigned long arg4, unsigned long arg5) 1701 unsigned long arg4, unsigned long arg5)
1635{ 1702{
1636 long error = 0; 1703 struct task_struct *me = current;
1704 unsigned char comm[sizeof(me->comm)];
1705 long error;
1637 1706
1638 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) 1707 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1708 if (error != -ENOSYS)
1639 return error; 1709 return error;
1640 1710
1711 error = 0;
1641 switch (option) { 1712 switch (option) {
1642 case PR_SET_PDEATHSIG: 1713 case PR_SET_PDEATHSIG:
1643 if (!valid_signal(arg2)) { 1714 if (!valid_signal(arg2)) {
1644 error = -EINVAL; 1715 error = -EINVAL;
1645 break; 1716 break;
1646 } 1717 }
1647 current->pdeath_signal = arg2; 1718 me->pdeath_signal = arg2;
1719 error = 0;
1648 break; 1720 break;
1649 case PR_GET_PDEATHSIG: 1721 case PR_GET_PDEATHSIG:
1650 error = put_user(current->pdeath_signal, (int __user *)arg2); 1722 error = put_user(me->pdeath_signal, (int __user *)arg2);
1651 break; 1723 break;
1652 case PR_GET_DUMPABLE: 1724 case PR_GET_DUMPABLE:
1653 error = get_dumpable(current->mm); 1725 error = get_dumpable(me->mm);
1654 break; 1726 break;
1655 case PR_SET_DUMPABLE: 1727 case PR_SET_DUMPABLE:
1656 if (arg2 < 0 || arg2 > 1) { 1728 if (arg2 < 0 || arg2 > 1) {
1657 error = -EINVAL; 1729 error = -EINVAL;
1658 break; 1730 break;
1659 } 1731 }
1660 set_dumpable(current->mm, arg2); 1732 set_dumpable(me->mm, arg2);
1733 error = 0;
1661 break; 1734 break;
1662 1735
1663 case PR_SET_UNALIGN: 1736 case PR_SET_UNALIGN:
1664 error = SET_UNALIGN_CTL(current, arg2); 1737 error = SET_UNALIGN_CTL(me, arg2);
1665 break; 1738 break;
1666 case PR_GET_UNALIGN: 1739 case PR_GET_UNALIGN:
1667 error = GET_UNALIGN_CTL(current, arg2); 1740 error = GET_UNALIGN_CTL(me, arg2);
1668 break; 1741 break;
1669 case PR_SET_FPEMU: 1742 case PR_SET_FPEMU:
1670 error = SET_FPEMU_CTL(current, arg2); 1743 error = SET_FPEMU_CTL(me, arg2);
1671 break; 1744 break;
1672 case PR_GET_FPEMU: 1745 case PR_GET_FPEMU:
1673 error = GET_FPEMU_CTL(current, arg2); 1746 error = GET_FPEMU_CTL(me, arg2);
1674 break; 1747 break;
1675 case PR_SET_FPEXC: 1748 case PR_SET_FPEXC:
1676 error = SET_FPEXC_CTL(current, arg2); 1749 error = SET_FPEXC_CTL(me, arg2);
1677 break; 1750 break;
1678 case PR_GET_FPEXC: 1751 case PR_GET_FPEXC:
1679 error = GET_FPEXC_CTL(current, arg2); 1752 error = GET_FPEXC_CTL(me, arg2);
1680 break; 1753 break;
1681 case PR_GET_TIMING: 1754 case PR_GET_TIMING:
1682 error = PR_TIMING_STATISTICAL; 1755 error = PR_TIMING_STATISTICAL;
@@ -1684,33 +1757,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1684 case PR_SET_TIMING: 1757 case PR_SET_TIMING:
1685 if (arg2 != PR_TIMING_STATISTICAL) 1758 if (arg2 != PR_TIMING_STATISTICAL)
1686 error = -EINVAL; 1759 error = -EINVAL;
1760 else
1761 error = 0;
1687 break; 1762 break;
1688 1763
1689 case PR_SET_NAME: { 1764 case PR_SET_NAME:
1690 struct task_struct *me = current; 1765 comm[sizeof(me->comm)-1] = 0;
1691 unsigned char ncomm[sizeof(me->comm)]; 1766 if (strncpy_from_user(comm, (char __user *)arg2,
1692 1767 sizeof(me->comm) - 1) < 0)
1693 ncomm[sizeof(me->comm)-1] = 0;
1694 if (strncpy_from_user(ncomm, (char __user *)arg2,
1695 sizeof(me->comm)-1) < 0)
1696 return -EFAULT; 1768 return -EFAULT;
1697 set_task_comm(me, ncomm); 1769 set_task_comm(me, comm);
1698 return 0; 1770 return 0;
1699 } 1771 case PR_GET_NAME:
1700 case PR_GET_NAME: { 1772 get_task_comm(comm, me);
1701 struct task_struct *me = current; 1773 if (copy_to_user((char __user *)arg2, comm,
1702 unsigned char tcomm[sizeof(me->comm)]; 1774 sizeof(comm)))
1703
1704 get_task_comm(tcomm, me);
1705 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
1706 return -EFAULT; 1775 return -EFAULT;
1707 return 0; 1776 return 0;
1708 }
1709 case PR_GET_ENDIAN: 1777 case PR_GET_ENDIAN:
1710 error = GET_ENDIAN(current, arg2); 1778 error = GET_ENDIAN(me, arg2);
1711 break; 1779 break;
1712 case PR_SET_ENDIAN: 1780 case PR_SET_ENDIAN:
1713 error = SET_ENDIAN(current, arg2); 1781 error = SET_ENDIAN(me, arg2);
1714 break; 1782 break;
1715 1783
1716 case PR_GET_SECCOMP: 1784 case PR_GET_SECCOMP:
@@ -1725,6 +1793,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1725 case PR_SET_TSC: 1793 case PR_SET_TSC:
1726 error = SET_TSC_CTL(arg2); 1794 error = SET_TSC_CTL(arg2);
1727 break; 1795 break;
1796 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns;
1798 break;
1799 case PR_SET_TIMERSLACK:
1800 if (arg2 <= 0)
1801 current->timer_slack_ns =
1802 current->default_timer_slack_ns;
1803 else
1804 current->timer_slack_ns = arg2;
1805 error = 0;
1806 break;
1728 default: 1807 default:
1729 error = -EINVAL; 1808 error = -EINVAL;
1730 break; 1809 break;
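
PR_GET_TIMERSLACK and PR_SET_TIMERSLACK are new with this series; a slack of 0 restores the task's default_timer_slack_ns. A userspace sketch, guarding the prctl numbers (29/30 here are assumed constants, in case the installed headers predate them):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_TIMERSLACK
#define PR_SET_TIMERSLACK 29    /* assumed constants from this series */
#define PR_GET_TIMERSLACK 30
#endif

int main(void)
{
        /* Let the kernel coalesce this task's timers within 50us. */
        if (prctl(PR_SET_TIMERSLACK, 50000UL, 0UL, 0UL, 0UL))
                perror("PR_SET_TIMERSLACK");

        printf("timer slack: %d ns\n", prctl(PR_GET_TIMERSLACK, 0UL, 0UL, 0UL, 0UL));
        return 0;
}
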
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 503d8d4eb80a..e14a23281707 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 31cond_syscall(sys_bind);
32cond_syscall(sys_listen); 32cond_syscall(sys_listen);
33cond_syscall(sys_accept); 33cond_syscall(sys_accept);
34cond_syscall(sys_paccept); 34cond_syscall(sys_accept4);
35cond_syscall(sys_connect); 35cond_syscall(sys_connect);
36cond_syscall(sys_getsockname); 36cond_syscall(sys_getsockname);
37cond_syscall(sys_getpeername); 37cond_syscall(sys_getpeername);
@@ -126,6 +126,11 @@ cond_syscall(sys_vm86);
126cond_syscall(compat_sys_ipc); 126cond_syscall(compat_sys_ipc);
127cond_syscall(compat_sys_sysctl); 127cond_syscall(compat_sys_sysctl);
128cond_syscall(sys_flock); 128cond_syscall(sys_flock);
129cond_syscall(sys_io_setup);
130cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents);
129 134
130/* arch-specific weak syscall entries */ 135/* arch-specific weak syscall entries */
131cond_syscall(sys_pciconfig_read); 136cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cfc5295f1e82..ff6d45c7626f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -121,6 +121,10 @@ extern int sg_big_buff;
121#include <asm/system.h> 121#include <asm/system.h>
122#endif 122#endif
123 123
124#ifdef CONFIG_SPARC64
125extern int sysctl_tsb_ratio;
126#endif
127
124#ifdef __hppa__ 128#ifdef __hppa__
125extern int pwrsw_enabled; 129extern int pwrsw_enabled;
126extern int unaligned_enabled; 130extern int unaligned_enabled;
@@ -149,7 +153,7 @@ extern int max_lock_depth;
149#ifdef CONFIG_PROC_SYSCTL 153#ifdef CONFIG_PROC_SYSCTL
150static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 154static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
151 void __user *buffer, size_t *lenp, loff_t *ppos); 155 void __user *buffer, size_t *lenp, loff_t *ppos);
152static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 156static int proc_taint(struct ctl_table *table, int write, struct file *filp,
153 void __user *buffer, size_t *lenp, loff_t *ppos); 157 void __user *buffer, size_t *lenp, loff_t *ppos);
154#endif 158#endif
155 159
@@ -176,6 +180,9 @@ extern struct ctl_table random_table[];
176#ifdef CONFIG_INOTIFY_USER 180#ifdef CONFIG_INOTIFY_USER
177extern struct ctl_table inotify_table[]; 181extern struct ctl_table inotify_table[];
178#endif 182#endif
183#ifdef CONFIG_EPOLL
184extern struct ctl_table epoll_table[];
185#endif
179 186
180#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 187#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
181int sysctl_legacy_va_layout; 188int sysctl_legacy_va_layout;
@@ -276,6 +283,16 @@ static struct ctl_table kern_table[] = {
276 }, 283 },
277 { 284 {
278 .ctl_name = CTL_UNNUMBERED, 285 .ctl_name = CTL_UNNUMBERED,
286 .procname = "sched_shares_thresh",
287 .data = &sysctl_sched_shares_thresh,
288 .maxlen = sizeof(unsigned int),
289 .mode = 0644,
290 .proc_handler = &proc_dointvec_minmax,
291 .strategy = &sysctl_intvec,
292 .extra1 = &zero,
293 },
294 {
295 .ctl_name = CTL_UNNUMBERED,
279 .procname = "sched_child_runs_first", 296 .procname = "sched_child_runs_first",
280 .data = &sysctl_sched_child_runs_first, 297 .data = &sysctl_sched_child_runs_first,
281 .maxlen = sizeof(unsigned int), 298 .maxlen = sizeof(unsigned int),
@@ -379,10 +396,9 @@ static struct ctl_table kern_table[] = {
379#ifdef CONFIG_PROC_SYSCTL 396#ifdef CONFIG_PROC_SYSCTL
380 { 397 {
381 .procname = "tainted", 398 .procname = "tainted",
382 .data = &tainted, 399 .maxlen = sizeof(long),
383 .maxlen = sizeof(int),
384 .mode = 0644, 400 .mode = 0644,
385 .proc_handler = &proc_dointvec_taint, 401 .proc_handler = &proc_taint,
386 }, 402 },
387#endif 403#endif
388#ifdef CONFIG_LATENCYTOP 404#ifdef CONFIG_LATENCYTOP
@@ -439,6 +455,16 @@ static struct ctl_table kern_table[] = {
439 .proc_handler = &proc_dointvec, 455 .proc_handler = &proc_dointvec,
440 }, 456 },
441#endif 457#endif
458#ifdef CONFIG_SPARC64
459 {
460 .ctl_name = CTL_UNNUMBERED,
461 .procname = "tsb-ratio",
462 .data = &sysctl_tsb_ratio,
463 .maxlen = sizeof (int),
464 .mode = 0644,
465 .proc_handler = &proc_dointvec,
466 },
467#endif
442#ifdef __hppa__ 468#ifdef __hppa__
443 { 469 {
444 .ctl_name = KERN_HPPA_PWRSW, 470 .ctl_name = KERN_HPPA_PWRSW,
@@ -465,7 +491,7 @@ static struct ctl_table kern_table[] = {
465 .mode = 0644, 491 .mode = 0644,
466 .proc_handler = &proc_dointvec, 492 .proc_handler = &proc_dointvec,
467 }, 493 },
468#ifdef CONFIG_FTRACE 494#ifdef CONFIG_FUNCTION_TRACER
469 { 495 {
470 .ctl_name = CTL_UNNUMBERED, 496 .ctl_name = CTL_UNNUMBERED,
471 .procname = "ftrace_enabled", 497 .procname = "ftrace_enabled",
@@ -475,6 +501,26 @@ static struct ctl_table kern_table[] = {
475 .proc_handler = &ftrace_enable_sysctl, 501 .proc_handler = &ftrace_enable_sysctl,
476 }, 502 },
477#endif 503#endif
504#ifdef CONFIG_STACK_TRACER
505 {
506 .ctl_name = CTL_UNNUMBERED,
507 .procname = "stack_tracer_enabled",
508 .data = &stack_tracer_enabled,
509 .maxlen = sizeof(int),
510 .mode = 0644,
511 .proc_handler = &stack_trace_sysctl,
512 },
513#endif
514#ifdef CONFIG_TRACING
515 {
516 .ctl_name = CTL_UNNUMBERED,
517 .procname = "ftrace_dump_on_oops",
518 .data = &ftrace_dump_on_oops,
519 .maxlen = sizeof(int),
520 .mode = 0644,
521 .proc_handler = &proc_dointvec,
522 },
523#endif
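
The kern_table entries added above (sched_shares_thresh, tsb-ratio, stack_tracer_enabled, ftrace_dump_on_oops, scan_unevictable_pages) are small integer sysctls, most behind proc_dointvec-style handlers, so each can be toggled with an ordinary write under /proc/sys. A sketch for one of them; the path assumes the usual /proc/sys/kernel/ mapping for kern_table:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/ftrace_dump_on_oops", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fputs("1\n", f);        /* dump the ftrace buffer when an oops hits */
        fclose(f);
        return 0;
}
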
478#ifdef CONFIG_MODULES 524#ifdef CONFIG_MODULES
479 { 525 {
480 .ctl_name = KERN_MODPROBE, 526 .ctl_name = KERN_MODPROBE,
@@ -834,6 +880,16 @@ static struct ctl_table kern_table[] = {
834 .proc_handler = &proc_dointvec, 880 .proc_handler = &proc_dointvec,
835 }, 881 },
836#endif 882#endif
883#ifdef CONFIG_UNEVICTABLE_LRU
884 {
885 .ctl_name = CTL_UNNUMBERED,
886 .procname = "scan_unevictable_pages",
887 .data = &scan_unevictable_pages,
888 .maxlen = sizeof(scan_unevictable_pages),
889 .mode = 0644,
890 .proc_handler = &scan_unevictable_handler,
891 },
892#endif
837/* 893/*
838 * NOTE: do not add new entries to this table unless you have read 894 * NOTE: do not add new entries to this table unless you have read
839 * Documentation/sysctl/ctl_unnumbered.txt 895 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1282,6 +1338,7 @@ static struct ctl_table fs_table[] = {
1282 .extra2 = &two, 1338 .extra2 = &two,
1283 }, 1339 },
1284#endif 1340#endif
1341#ifdef CONFIG_AIO
1285 { 1342 {
1286 .procname = "aio-nr", 1343 .procname = "aio-nr",
1287 .data = &aio_nr, 1344 .data = &aio_nr,
@@ -1296,6 +1353,7 @@ static struct ctl_table fs_table[] = {
1296 .mode = 0644, 1353 .mode = 0644,
1297 .proc_handler = &proc_doulongvec_minmax, 1354 .proc_handler = &proc_doulongvec_minmax,
1298 }, 1355 },
1356#endif /* CONFIG_AIO */
1299#ifdef CONFIG_INOTIFY_USER 1357#ifdef CONFIG_INOTIFY_USER
1300 { 1358 {
1301 .ctl_name = FS_INOTIFY, 1359 .ctl_name = FS_INOTIFY,
@@ -1304,6 +1362,13 @@ static struct ctl_table fs_table[] = {
1304 .child = inotify_table, 1362 .child = inotify_table,
1305 }, 1363 },
1306#endif 1364#endif
1365#ifdef CONFIG_EPOLL
1366 {
1367 .procname = "epoll",
1368 .mode = 0555,
1369 .child = epoll_table,
1370 },
1371#endif
1307#endif 1372#endif
1308 { 1373 {
1309 .ctl_name = KERN_SETUID_DUMPABLE, 1374 .ctl_name = KERN_SETUID_DUMPABLE,
@@ -1501,7 +1566,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1501/* Perform the actual read/write of a sysctl table entry. */ 1566/* Perform the actual read/write of a sysctl table entry. */
1502static int do_sysctl_strategy(struct ctl_table_root *root, 1567static int do_sysctl_strategy(struct ctl_table_root *root,
1503 struct ctl_table *table, 1568 struct ctl_table *table,
1504 int __user *name, int nlen,
1505 void __user *oldval, size_t __user *oldlenp, 1569 void __user *oldval, size_t __user *oldlenp,
1506 void __user *newval, size_t newlen) 1570 void __user *newval, size_t newlen)
1507{ 1571{
@@ -1515,8 +1579,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1515 return -EPERM; 1579 return -EPERM;
1516 1580
1517 if (table->strategy) { 1581 if (table->strategy) {
1518 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1582 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1519 newval, newlen);
1520 if (rc < 0) 1583 if (rc < 0)
1521 return rc; 1584 return rc;
1522 if (rc > 0) 1585 if (rc > 0)
@@ -1526,8 +1589,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1526 /* If there is no strategy routine, or if the strategy returns 1589 /* If there is no strategy routine, or if the strategy returns
1527 * zero, proceed with automatic r/w */ 1590 * zero, proceed with automatic r/w */
1528 if (table->data && table->maxlen) { 1591 if (table->data && table->maxlen) {
1529 rc = sysctl_data(table, name, nlen, oldval, oldlenp, 1592 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1530 newval, newlen);
1531 if (rc < 0) 1593 if (rc < 0)
1532 return rc; 1594 return rc;
1533 } 1595 }
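
The do_sysctl_strategy() hunks above drop the unused binary-sysctl name/nlen pair from the strategy call chain; the sysctl_data(), sysctl_string(), sysctl_intvec(), sysctl_jiffies() and sysctl_ms_jiffies() prototypes further down change to match. A compile-only sketch of the trimmed shape, with __user and struct ctl_table stubbed out since they live in kernel headers:

#define __user                   /* kernel address-space annotation, a no-op here */
#include <stddef.h>

struct ctl_table;                /* opaque here; defined in <linux/sysctl.h> */

/* Strategy routines now receive only the table and the old/new buffers. */
typedef int (*sysctl_strategy_fn)(struct ctl_table *table,
                                  void __user *oldval, size_t __user *oldlenp,
                                  void __user *newval, size_t newlen);
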
@@ -1559,7 +1621,7 @@ repeat:
1559 table = table->child; 1621 table = table->child;
1560 goto repeat; 1622 goto repeat;
1561 } 1623 }
1562 error = do_sysctl_strategy(root, table, name, nlen, 1624 error = do_sysctl_strategy(root, table,
1563 oldval, oldlenp, 1625 oldval, oldlenp,
1564 newval, newlen); 1626 newval, newlen);
1565 return error; 1627 return error;
@@ -1623,7 +1685,7 @@ out:
1623 1685
1624static int test_perm(int mode, int op) 1686static int test_perm(int mode, int op)
1625{ 1687{
1626 if (!current->euid) 1688 if (!current_euid())
1627 mode >>= 6; 1689 mode >>= 6;
1628 else if (in_egroup_p(0)) 1690 else if (in_egroup_p(0))
1629 mode >>= 3; 1691 mode >>= 3;
@@ -2228,49 +2290,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2228 NULL,NULL); 2290 NULL,NULL);
2229} 2291}
2230 2292
2231#define OP_SET 0
2232#define OP_AND 1
2233#define OP_OR 2
2234
2235static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2236 int *valp,
2237 int write, void *data)
2238{
2239 int op = *(int *)data;
2240 if (write) {
2241 int val = *negp ? -*lvalp : *lvalp;
2242 switch(op) {
2243 case OP_SET: *valp = val; break;
2244 case OP_AND: *valp &= val; break;
2245 case OP_OR: *valp |= val; break;
2246 }
2247 } else {
2248 int val = *valp;
2249 if (val < 0) {
2250 *negp = -1;
2251 *lvalp = (unsigned long)-val;
2252 } else {
2253 *negp = 0;
2254 *lvalp = (unsigned long)val;
2255 }
2256 }
2257 return 0;
2258}
2259
2260/* 2293/*
2261 * Taint values can only be increased 2294 * Taint values can only be increased
2295 * This means we can safely use a temporary.
2262 */ 2296 */
2263static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 2297static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2264 void __user *buffer, size_t *lenp, loff_t *ppos) 2298 void __user *buffer, size_t *lenp, loff_t *ppos)
2265{ 2299{
2266 int op; 2300 struct ctl_table t;
2301 unsigned long tmptaint = get_taint();
2302 int err;
2267 2303
2268 if (write && !capable(CAP_SYS_ADMIN)) 2304 if (write && !capable(CAP_SYS_ADMIN))
2269 return -EPERM; 2305 return -EPERM;
2270 2306
2271 op = OP_OR; 2307 t = *table;
2272 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2308 t.data = &tmptaint;
2273 do_proc_dointvec_bset_conv,&op); 2309 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
2310 if (err < 0)
2311 return err;
2312
2313 if (write) {
2314 /*
2315 * Poor man's atomic or. Not worth adding a primitive
2316 * to everyone's atomic.h for this
2317 */
2318 int i;
2319 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2320 if ((tmptaint >> i) & 1)
2321 add_taint(i);
2322 }
2323 }
2324
2325 return err;
2274} 2326}
2275 2327
2276struct do_proc_dointvec_minmax_conv_param { 2328struct do_proc_dointvec_minmax_conv_param {
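
The proc_taint() rewrite above parses the written value through proc_doulongvec_minmax() into a temporary and then replays every set bit through add_taint(), since taint flags may only ever be added. A standalone sketch of that bit-walk, with printf() standing in for add_taint():

#include <stdio.h>

int main(void)
{
        unsigned long tmptaint = 0x5;   /* example: the caller wrote bits 0 and 2 */
        int i;

        /* "Poor man's atomic or": add each requested bit individually. */
        for (i = 0; i < 8 * (int)sizeof(tmptaint) && (tmptaint >> i); i++)
                if ((tmptaint >> i) & 1)
                        printf("add_taint(%d)\n", i);
        return 0;
}
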
@@ -2718,7 +2770,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2718 */ 2770 */
2719 2771
2720/* The generic sysctl data routine (used if no strategy routine supplied) */ 2772/* The generic sysctl data routine (used if no strategy routine supplied) */
2721int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2773int sysctl_data(struct ctl_table *table,
2722 void __user *oldval, size_t __user *oldlenp, 2774 void __user *oldval, size_t __user *oldlenp,
2723 void __user *newval, size_t newlen) 2775 void __user *newval, size_t newlen)
2724{ 2776{
@@ -2752,7 +2804,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2752} 2804}
2753 2805
2754/* The generic string strategy routine: */ 2806/* The generic string strategy routine: */
2755int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2807int sysctl_string(struct ctl_table *table,
2756 void __user *oldval, size_t __user *oldlenp, 2808 void __user *oldval, size_t __user *oldlenp,
2757 void __user *newval, size_t newlen) 2809 void __user *newval, size_t newlen)
2758{ 2810{
@@ -2798,7 +2850,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2798 * are between the minimum and maximum values given in the arrays 2850 * are between the minimum and maximum values given in the arrays
2799 * table->extra1 and table->extra2, respectively. 2851 * table->extra1 and table->extra2, respectively.
2800 */ 2852 */
2801int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2853int sysctl_intvec(struct ctl_table *table,
2802 void __user *oldval, size_t __user *oldlenp, 2854 void __user *oldval, size_t __user *oldlenp,
2803 void __user *newval, size_t newlen) 2855 void __user *newval, size_t newlen)
2804{ 2856{
@@ -2834,7 +2886,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2834} 2886}
2835 2887
2836/* Strategy function to convert jiffies to seconds */ 2888/* Strategy function to convert jiffies to seconds */
2837int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2889int sysctl_jiffies(struct ctl_table *table,
2838 void __user *oldval, size_t __user *oldlenp, 2890 void __user *oldval, size_t __user *oldlenp,
2839 void __user *newval, size_t newlen) 2891 void __user *newval, size_t newlen)
2840{ 2892{
@@ -2868,7 +2920,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2868} 2920}
2869 2921
2870/* Strategy function to convert jiffies to seconds */ 2922/* Strategy function to convert jiffies to seconds */
2871int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2923int sysctl_ms_jiffies(struct ctl_table *table,
2872 void __user *oldval, size_t __user *oldlenp, 2924 void __user *oldval, size_t __user *oldlenp,
2873 void __user *newval, size_t newlen) 2925 void __user *newval, size_t newlen)
2874{ 2926{
@@ -2923,35 +2975,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2923 return error; 2975 return error;
2924} 2976}
2925 2977
2926int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2978int sysctl_data(struct ctl_table *table,
2927 void __user *oldval, size_t __user *oldlenp, 2979 void __user *oldval, size_t __user *oldlenp,
2928 void __user *newval, size_t newlen) 2980 void __user *newval, size_t newlen)
2929{ 2981{
2930 return -ENOSYS; 2982 return -ENOSYS;
2931} 2983}
2932 2984
2933int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2985int sysctl_string(struct ctl_table *table,
2934 void __user *oldval, size_t __user *oldlenp, 2986 void __user *oldval, size_t __user *oldlenp,
2935 void __user *newval, size_t newlen) 2987 void __user *newval, size_t newlen)
2936{ 2988{
2937 return -ENOSYS; 2989 return -ENOSYS;
2938} 2990}
2939 2991
2940int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2992int sysctl_intvec(struct ctl_table *table,
2941 void __user *oldval, size_t __user *oldlenp, 2993 void __user *oldval, size_t __user *oldlenp,
2942 void __user *newval, size_t newlen) 2994 void __user *newval, size_t newlen)
2943{ 2995{
2944 return -ENOSYS; 2996 return -ENOSYS;
2945} 2997}
2946 2998
2947int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2999int sysctl_jiffies(struct ctl_table *table,
2948 void __user *oldval, size_t __user *oldlenp, 3000 void __user *oldval, size_t __user *oldlenp,
2949 void __user *newval, size_t newlen) 3001 void __user *newval, size_t newlen)
2950{ 3002{
2951 return -ENOSYS; 3003 return -ENOSYS;
2952} 3004}
2953 3005
2954int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 3006int sysctl_ms_jiffies(struct ctl_table *table,
2955 void __user *oldval, size_t __user *oldlenp, 3007 void __user *oldval, size_t __user *oldlenp,
2956 void __user *newval, size_t newlen) 3008 void __user *newval, size_t newlen)
2957{ 3009{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23ab8fb..fafeb48f27c0 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
730}; 730};
731 731
732static const struct trans_ctl_table trans_fs_xfs_table[] = { 732static const struct trans_ctl_table trans_fs_xfs_table[] = {
733 { XFS_RESTRICT_CHOWN, "restrict_chown" },
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" }, 733 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" }, 734 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" }, 735 { XFS_PANIC_MASK, "panic_mask" },
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fac..d63a4336fad6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
669#endif 669#endif
670 670
671EXPORT_SYMBOL(jiffies); 671EXPORT_SYMBOL(jiffies);
672
673/*
674 * Add two timespec values and do a safety check for overflow.
675 * It's assumed that both values are valid (>= 0)
676 */
677struct timespec timespec_add_safe(const struct timespec lhs,
678 const struct timespec rhs)
679{
680 struct timespec res;
681
682 set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
683 lhs.tv_nsec + rhs.tv_nsec);
684
685 if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
686 res.tv_sec = TIME_T_MAX;
687
688 return res;
689}
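
timespec_add_safe() above normalizes the nanosecond carry and saturates the seconds at TIME_T_MAX rather than letting the sum wrap. A userspace restatement of that behaviour, with LONG_MAX standing in for TIME_T_MAX, plus a worked example where the nanoseconds carry into the seconds:

#include <limits.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Userspace re-statement of the helper: normalize, then clamp on overflow. */
static struct timespec timespec_add_safe_sketch(struct timespec lhs,
                                                struct timespec rhs)
{
        struct timespec res;

        res.tv_sec  = lhs.tv_sec + rhs.tv_sec;
        res.tv_nsec = lhs.tv_nsec + rhs.tv_nsec;
        if (res.tv_nsec >= NSEC_PER_SEC) {
                res.tv_sec++;
                res.tv_nsec -= NSEC_PER_SEC;
        }
        if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
                res.tv_sec = LONG_MAX;          /* stand-in for TIME_T_MAX */
        return res;
}

int main(void)
{
        struct timespec a = { 5, 900000000 }, b = { 1, 300000000 };
        struct timespec s = timespec_add_safe_sketch(a, b);

        printf("%ld.%09ld\n", (long)s.tv_sec, s.tv_nsec);   /* prints 7.200000000 */
        return 0;
}
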
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8d53106a0a92..95ed42951e0a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config TICK_ONESHOT 4config TICK_ONESHOT
5 bool 5 bool
6 default n
7 6
8config NO_HZ 7config NO_HZ
9 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 093d4acf993b..9ed2eec97526 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)
325 unsigned long flags; 325 unsigned long flags;
326 int ret; 326 int ret;
327 327
328 /* save mult_orig on registration */
329 c->mult_orig = c->mult;
330
328 spin_lock_irqsave(&clocksource_lock, flags); 331 spin_lock_irqsave(&clocksource_lock, flags);
329 ret = clocksource_enqueue(c); 332 ret = clocksource_enqueue(c);
330 if (!ret) 333 if (!ret)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fdb8875..1ca99557e929 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
64 .shift = JIFFIES_SHIFT, 65 .shift = JIFFIES_SHIFT,
65}; 66};
66 67
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1ad46f3df6e7..f5f793d92415 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,13 +10,13 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15#include <linux/jiffies.h> 14#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/math64.h> 17#include <linux/math64.h>
19#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/workqueue.h>
20#include <asm/timex.h> 20#include <asm/timex.h>
21 21
22/* 22/*
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
131{ 131{
132 enum hrtimer_restart res = HRTIMER_NORESTART; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
133 133
134 write_seqlock_irq(&xtime_lock); 134 write_seqlock(&xtime_lock);
135 135
136 switch (time_state) { 136 switch (time_state) {
137 case TIME_OK: 137 case TIME_OK:
@@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
142 time_state = TIME_OOP; 142 time_state = TIME_OOP;
143 printk(KERN_NOTICE "Clock: " 143 printk(KERN_NOTICE "Clock: "
144 "inserting leap second 23:59:60 UTC\n"); 144 "inserting leap second 23:59:60 UTC\n");
145 leap_timer.expires = ktime_add_ns(leap_timer.expires, 145 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART; 146 res = HRTIMER_RESTART;
148 break; 147 break;
149 case TIME_DEL: 148 case TIME_DEL:
@@ -165,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
165 } 164 }
166 update_vsyscall(&xtime, clock); 165 update_vsyscall(&xtime, clock);
167 166
168 write_sequnlock_irq(&xtime_lock); 167 write_sequnlock(&xtime_lock);
169 168
170 return res; 169 return res;
171} 170}
@@ -218,11 +217,11 @@ void second_overflow(void)
218/* Disable the cmos update - used by virtualization and embedded */ 217/* Disable the cmos update - used by virtualization and embedded */
219int no_sync_cmos_clock __read_mostly; 218int no_sync_cmos_clock __read_mostly;
220 219
221static void sync_cmos_clock(unsigned long dummy); 220static void sync_cmos_clock(struct work_struct *work);
222 221
223static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 222static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
224 223
225static void sync_cmos_clock(unsigned long dummy) 224static void sync_cmos_clock(struct work_struct *work)
226{ 225{
227 struct timespec now, next; 226 struct timespec now, next;
228 int fail = 1; 227 int fail = 1;
@@ -258,13 +257,13 @@ static void sync_cmos_clock(unsigned long dummy)
258 next.tv_sec++; 257 next.tv_sec++;
259 next.tv_nsec -= NSEC_PER_SEC; 258 next.tv_nsec -= NSEC_PER_SEC;
260 } 259 }
261 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); 260 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
262} 261}
263 262
264static void notify_cmos_timer(void) 263static void notify_cmos_timer(void)
265{ 264{
266 if (!no_sync_cmos_clock) 265 if (!no_sync_cmos_clock)
267 mod_timer(&sync_cmos_timer, jiffies + 1); 266 schedule_delayed_work(&sync_cmos_work, 0);
268} 267}
269 268
270#else 269#else
@@ -277,38 +276,50 @@ static inline void notify_cmos_timer(void) { }
277int do_adjtimex(struct timex *txc) 276int do_adjtimex(struct timex *txc)
278{ 277{
279 struct timespec ts; 278 struct timespec ts;
280 long save_adjust, sec;
281 int result; 279 int result;
282 280
283 /* In order to modify anything, you gotta be super-user! */ 281 /* Validate the data before disabling interrupts */
284 if (txc->modes && !capable(CAP_SYS_TIME)) 282 if (txc->modes & ADJ_ADJTIME) {
285 return -EPERM;
286
287 /* Now we validate the data before disabling interrupts */
288
289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
290 /* singleshot must not be used with any other mode bits */ 283 /* singleshot must not be used with any other mode bits */
291 if (txc->modes & ~ADJ_OFFSET_SS_READ) 284 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
292 return -EINVAL; 285 return -EINVAL;
286 if (!(txc->modes & ADJ_OFFSET_READONLY) &&
287 !capable(CAP_SYS_TIME))
288 return -EPERM;
289 } else {
290 /* In order to modify anything, you gotta be super-user! */
291 if (txc->modes && !capable(CAP_SYS_TIME))
292 return -EPERM;
293
294 /* if the quartz is off by more than 10% something is VERY wrong! */
295 if (txc->modes & ADJ_TICK &&
296 (txc->tick < 900000/USER_HZ ||
297 txc->tick > 1100000/USER_HZ))
298 return -EINVAL;
299
300 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
301 hrtimer_cancel(&leap_timer);
293 } 302 }
294 303
295 /* if the quartz is off by more than 10% something is VERY wrong ! */
296 if (txc->modes & ADJ_TICK)
297 if (txc->tick < 900000/USER_HZ ||
298 txc->tick > 1100000/USER_HZ)
299 return -EINVAL;
300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts); 304 getnstimeofday(&ts);
304 305
305 write_seqlock_irq(&xtime_lock); 306 write_seqlock_irq(&xtime_lock);
306 307
307 /* Save for later - semantics of adjtime is to return old value */
308 save_adjust = time_adjust;
309
310 /* If there are input parameters, then process them */ 308 /* If there are input parameters, then process them */
309 if (txc->modes & ADJ_ADJTIME) {
310 long save_adjust = time_adjust;
311
312 if (!(txc->modes & ADJ_OFFSET_READONLY)) {
313 /* adjtime() is independent from ntp_adjtime() */
314 time_adjust = txc->offset;
315 ntp_update_frequency();
316 }
317 txc->offset = save_adjust;
318 goto adj_done;
319 }
311 if (txc->modes) { 320 if (txc->modes) {
321 long sec;
322
312 if (txc->modes & ADJ_STATUS) { 323 if (txc->modes & ADJ_STATUS) {
313 if ((time_status & STA_PLL) && 324 if ((time_status & STA_PLL) &&
314 !(txc->status & STA_PLL)) { 325 !(txc->status & STA_PLL)) {
@@ -375,13 +386,8 @@ int do_adjtimex(struct timex *txc)
375 if (txc->modes & ADJ_TAI && txc->constant > 0) 386 if (txc->modes & ADJ_TAI && txc->constant > 0)
376 time_tai = txc->constant; 387 time_tai = txc->constant;
377 388
378 if (txc->modes & ADJ_OFFSET) { 389 if (txc->modes & ADJ_OFFSET)
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT) 390 ntp_update_offset(txc->offset);
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
384 }
385 if (txc->modes & ADJ_TICK) 391 if (txc->modes & ADJ_TICK)
386 tick_usec = txc->tick; 392 tick_usec = txc->tick;
387 393
@@ -389,22 +395,18 @@ int do_adjtimex(struct timex *txc)
389 ntp_update_frequency(); 395 ntp_update_frequency();
390 } 396 }
391 397
398 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
399 NTP_SCALE_SHIFT);
400 if (!(time_status & STA_NANO))
401 txc->offset /= NSEC_PER_USEC;
402
403adj_done:
392 result = time_state; /* mostly `TIME_OK' */ 404 result = time_state; /* mostly `TIME_OK' */
393 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 405 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
394 result = TIME_ERROR; 406 result = TIME_ERROR;
395 407
396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 408 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
397 (txc->modes == ADJ_OFFSET_SS_READ)) 409 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
398 txc->offset = save_adjust;
399 else {
400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
401 NTP_SCALE_SHIFT);
402 if (!(time_status & STA_NANO))
403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
408 txc->maxerror = time_maxerror; 410 txc->maxerror = time_maxerror;
409 txc->esterror = time_esterror; 411 txc->esterror = time_esterror;
410 txc->status = time_status; 412 txc->status = time_status;
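
The do_adjtimex() rework above separates adjtime()-style requests (ADJ_ADJTIME with ADJ_OFFSET_SINGLESHOT) from ntp_adjtime() processing and allows read-only queries without CAP_SYS_TIME. A userspace sketch that reads the pending singleshot offset without changing it; ADJ_OFFSET_SS_READ is assumed to be provided by the installed <sys/timex.h>:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        /* Read-only singleshot query; unprivileged after this change. */
        struct timex tx = { .modes = ADJ_OFFSET_SS_READ };

        if (adjtimex(&tx) < 0) {
                perror("adjtimex");
                return 1;
        }
        printf("remaining adjtime() offset: %ld us\n", tx.offset);
        return 0;
}
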
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index cb01cd8f919b..f98a1b7b16e9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -384,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
384} 384}
385 385
386/* 386/*
387 * Called from irq_enter() when idle was interrupted to reenable the
388 * per cpu device.
389 */
390void tick_check_oneshot_broadcast(int cpu)
391{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
396 }
397}
398
399/*
387 * Handle oneshot mode broadcasting 400 * Handle oneshot mode broadcasting
388 */ 401 */
389static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 469248782c23..b1c05bf75ee0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void);
36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void); 38extern int tick_broadcast_oneshot_active(void);
39extern void tick_check_oneshot_broadcast(int cpu);
39# else /* BROADCAST */ 40# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 41static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 42{
@@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
45static inline void tick_broadcast_switch_to_oneshot(void) { } 46static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 47static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 48static inline int tick_broadcast_oneshot_active(void) { return 0; }
49static inline void tick_check_oneshot_broadcast(int cpu) { }
48# endif /* !BROADCAST */ 50# endif /* !BROADCAST */
49 51
50#else /* !ONESHOT */ 52#else /* !ONESHOT */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a4d219398167..8f3fc2582d38 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
156} 156}
157 157
158void tick_nohz_stop_idle(int cpu) 158static void tick_nohz_stop_idle(int cpu)
159{ 159{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
161 161
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
247 if (need_resched()) 247 if (need_resched())
248 goto end; 248 goto end;
249 249
250 if (unlikely(local_softirq_pending())) { 250 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
251 static int ratelimit; 251 static int ratelimit;
252 252
253 if (ratelimit < 10) { 253 if (ratelimit < 10) {
@@ -270,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle)
270 next_jiffies = get_next_timer_interrupt(last_jiffies); 270 next_jiffies = get_next_timer_interrupt(last_jiffies);
271 delta_jiffies = next_jiffies - last_jiffies; 271 delta_jiffies = next_jiffies - last_jiffies;
272 272
273 if (rcu_needs_cpu(cpu)) 273 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
274 delta_jiffies = 1; 274 delta_jiffies = 1;
275 /* 275 /*
276 * Do not stop the tick, if we are only one off 276 * Do not stop the tick, if we are only one off
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
282 /* Schedule the tick, if we are at least one jiffie off */ 282 /* Schedule the tick, if we are at least one jiffie off */
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 /*
286 * calculate the expiry time for the next timer wheel
287 * timer
288 */
289 expires = ktime_add_ns(last_update, tick_period.tv64 *
290 delta_jiffies);
291
292 /*
293 * If this cpu is the one which updates jiffies, then
294 * give up the assignment and let it be taken by the
295 * cpu which runs the tick timer next, which might be
296 * this cpu as well. If we don't drop this here the
297 * jiffies might be stale and do_timer() never
298 * invoked.
299 */
300 if (cpu == tick_do_timer_cpu)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302
285 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 304 cpu_set(cpu, nohz_cpu_mask);
305
306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
308 goto out;
309
287 /* 310 /*
288 * nohz_stop_sched_tick can be called several times before 311 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 312 * the nohz_restart_sched_tick is called. This happens when
@@ -300,23 +323,12 @@ void tick_nohz_stop_sched_tick(int inidle)
300 goto out; 323 goto out;
301 } 324 }
302 325
303 ts->idle_tick = ts->sched_timer.expires; 326 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
304 ts->tick_stopped = 1; 327 ts->tick_stopped = 1;
305 ts->idle_jiffies = last_jiffies; 328 ts->idle_jiffies = last_jiffies;
306 rcu_enter_nohz(); 329 rcu_enter_nohz();
307 } 330 }
308 331
309 /*
310 * If this cpu is the one which updates jiffies, then
311 * give up the assignment and let it be taken by the
312 * cpu which runs the tick timer next, which might be
313 * this cpu as well. If we don't drop this here the
314 * jiffies might be stale and do_timer() never
315 * invoked.
316 */
317 if (cpu == tick_do_timer_cpu)
318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
319
320 ts->idle_sleeps++; 332 ts->idle_sleeps++;
321 333
322 /* 334 /*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
332 goto out; 344 goto out;
333 } 345 }
334 346
335 /* 347 /* Mark expiries */
336 * calculate the expiry time for the next timer wheel
337 * timer
338 */
339 expires = ktime_add_ns(last_update, tick_period.tv64 *
340 delta_jiffies);
341 ts->idle_expires = expires; 348 ts->idle_expires = expires;
342 349
343 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@ -377,6 +384,32 @@ ktime_t tick_nohz_get_sleep_length(void)
377 return ts->sleep_length; 384 return ts->sleep_length;
378} 385}
379 386
387static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
388{
389 hrtimer_cancel(&ts->sched_timer);
390 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
391
392 while (1) {
393 /* Forward the time to expire in the future */
394 hrtimer_forward(&ts->sched_timer, now, tick_period);
395
396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
397 hrtimer_start_expires(&ts->sched_timer,
398 HRTIMER_MODE_ABS);
399 /* Check, if the timer was already in the past */
400 if (hrtimer_active(&ts->sched_timer))
401 break;
402 } else {
403 if (!tick_program_event(
404 hrtimer_get_expires(&ts->sched_timer), 0))
405 break;
406 }
407 /* Update jiffies and reread time */
408 tick_do_update_jiffies64(now);
409 now = ktime_get();
410 }
411}
412
380/** 413/**
381 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 414 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
382 * 415 *
@@ -430,35 +463,16 @@ void tick_nohz_restart_sched_tick(void)
430 */ 463 */
431 ts->tick_stopped = 0; 464 ts->tick_stopped = 0;
432 ts->idle_exittime = now; 465 ts->idle_exittime = now;
433 hrtimer_cancel(&ts->sched_timer);
434 ts->sched_timer.expires = ts->idle_tick;
435 466
436 while (1) { 467 tick_nohz_restart(ts, now);
437 /* Forward the time to expire in the future */
438 hrtimer_forward(&ts->sched_timer, now, tick_period);
439 468
440 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
441 hrtimer_start(&ts->sched_timer,
442 ts->sched_timer.expires,
443 HRTIMER_MODE_ABS);
444 /* Check, if the timer was already in the past */
445 if (hrtimer_active(&ts->sched_timer))
446 break;
447 } else {
448 if (!tick_program_event(ts->sched_timer.expires, 0))
449 break;
450 }
451 /* Update jiffies and reread time */
452 tick_do_update_jiffies64(now);
453 now = ktime_get();
454 }
455 local_irq_enable(); 469 local_irq_enable();
456} 470}
457 471
458static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 472static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
459{ 473{
460 hrtimer_forward(&ts->sched_timer, now, tick_period); 474 hrtimer_forward(&ts->sched_timer, now, tick_period);
461 return tick_program_event(ts->sched_timer.expires, 0); 475 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
462} 476}
463 477
464/* 478/*
@@ -503,10 +517,6 @@ static void tick_nohz_handler(struct clock_event_device *dev)
503 update_process_times(user_mode(regs)); 517 update_process_times(user_mode(regs));
504 profile_tick(CPU_PROFILING); 518 profile_tick(CPU_PROFILING);
505 519
506 /* Do not restart, when we are in the idle loop */
507 if (ts->tick_stopped)
508 return;
509
510 while (tick_nohz_reprogram(ts, now)) { 520 while (tick_nohz_reprogram(ts, now)) {
511 now = ktime_get(); 521 now = ktime_get();
512 tick_do_update_jiffies64(now); 522 tick_do_update_jiffies64(now);
@@ -541,7 +551,7 @@ static void tick_nohz_switch_to_nohz(void)
541 next = tick_init_jiffy_update(); 551 next = tick_init_jiffy_update();
542 552
543 for (;;) { 553 for (;;) {
544 ts->sched_timer.expires = next; 554 hrtimer_set_expires(&ts->sched_timer, next);
545 if (!tick_program_event(next, 0)) 555 if (!tick_program_event(next, 0))
546 break; 556 break;
547 next = ktime_add(next, tick_period); 557 next = ktime_add(next, tick_period);
@@ -552,6 +562,41 @@ static void tick_nohz_switch_to_nohz(void)
552 smp_processor_id()); 562 smp_processor_id());
553} 563}
554 564
565/*
566 * When NOHZ is enabled and the tick is stopped, we need to kick the
567 * tick timer from irq_enter() so that the jiffies update is kept
568 * alive during long running softirqs. That's ugly as hell, but
569 * correctness is key even if we need to fix the offending softirq in
570 * the first place.
571 *
572 * Note, this is different to tick_nohz_restart. We just kick the
573 * timer and do not touch the other magic bits which need to be done
574 * when idle is left.
575 */
576static void tick_nohz_kick_tick(int cpu)
577{
578#if 0
579 /* Switch back to 2.6.27 behaviour */
580
581 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
582 ktime_t delta, now;
583
584 if (!ts->tick_stopped)
585 return;
586
587 /*
588 * Do not touch the tick device, when the next expiry is either
589 * already reached or less/equal than the tick period.
590 */
591 now = ktime_get();
592 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
593 if (delta.tv64 <= tick_period.tv64)
594 return;
595
596 tick_nohz_restart(ts, now);
597#endif
598}
599
555#else 600#else
556 601
557static inline void tick_nohz_switch_to_nohz(void) { } 602static inline void tick_nohz_switch_to_nohz(void) { }
@@ -559,6 +604,19 @@ static inline void tick_nohz_switch_to_nohz(void) { }
559#endif /* NO_HZ */ 604#endif /* NO_HZ */
560 605
561/* 606/*
607 * Called from irq_enter to notify about the possible interruption of idle()
608 */
609void tick_check_idle(int cpu)
610{
611 tick_check_oneshot_broadcast(cpu);
612#ifdef CONFIG_NO_HZ
613 tick_nohz_stop_idle(cpu);
614 tick_nohz_update_jiffies();
615 tick_nohz_kick_tick(cpu);
616#endif
617}
618
619/*
562 * High resolution timer specific code 620 * High resolution timer specific code
563 */ 621 */
564#ifdef CONFIG_HIGH_RES_TIMERS 622#ifdef CONFIG_HIGH_RES_TIMERS
@@ -611,10 +669,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
611 profile_tick(CPU_PROFILING); 669 profile_tick(CPU_PROFILING);
612 } 670 }
613 671
614 /* Do not restart, when we are in the idle loop */
615 if (ts->tick_stopped)
616 return HRTIMER_NORESTART;
617
618 hrtimer_forward(timer, now, tick_period); 672 hrtimer_forward(timer, now, tick_period);
619 673
620 return HRTIMER_RESTART; 674 return HRTIMER_RESTART;
@@ -634,19 +688,17 @@ void tick_setup_sched_timer(void)
634 */ 688 */
635 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 689 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
636 ts->sched_timer.function = tick_sched_timer; 690 ts->sched_timer.function = tick_sched_timer;
637 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
638 691
639 /* Get the next period (per cpu) */ 692 /* Get the next period (per cpu) */
640 ts->sched_timer.expires = tick_init_jiffy_update(); 693 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
641 offset = ktime_to_ns(tick_period) >> 1; 694 offset = ktime_to_ns(tick_period) >> 1;
642 do_div(offset, num_possible_cpus()); 695 do_div(offset, num_possible_cpus());
643 offset *= smp_processor_id(); 696 offset *= smp_processor_id();
644 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); 697 hrtimer_add_expires_ns(&ts->sched_timer, offset);
645 698
646 for (;;) { 699 for (;;) {
647 hrtimer_forward(&ts->sched_timer, now, tick_period); 700 hrtimer_forward(&ts->sched_timer, now, tick_period);
648 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, 701 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
649 HRTIMER_MODE_ABS);
650 /* Check, if the timer was already in the past */ 702 /* Check, if the timer was already in the past */
651 if (hrtimer_active(&ts->sched_timer)) 703 if (hrtimer_active(&ts->sched_timer))
652 break; 704 break;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e91c29f961c9..fa05e88aa76f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,27 +58,26 @@ struct clocksource *clock;
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
60/** 60/**
61 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook 61 * clocksource_forward_now - update clock to the current time
62 * 62 *
63 * private function, must hold xtime_lock lock when being 63 * Forward the current clock to update its state since the last call to
64 * called. Returns the number of nanoseconds since the 64 * update_wall_time(). This is useful before significant clock changes,
65 * last call to update_wall_time() (adjusted by NTP scaling) 65 * as it avoids having to deal with this time offset explicitly.
66 */ 66 */
67static inline s64 __get_nsec_offset(void) 67static void clocksource_forward_now(void)
68{ 68{
69 cycle_t cycle_now, cycle_delta; 69 cycle_t cycle_now, cycle_delta;
70 s64 ns_offset; 70 s64 nsec;
71 71
72 /* read clocksource: */
73 cycle_now = clocksource_read(clock); 72 cycle_now = clocksource_read(clock);
74
75 /* calculate the delta since the last update_wall_time: */
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 73 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
74 clock->cycle_last = cycle_now;
77 75
78 /* convert to nanoseconds: */ 76 nsec = cyc2ns(clock, cycle_delta);
79 ns_offset = cyc2ns(clock, cycle_delta); 77 timespec_add_ns(&xtime, nsec);
80 78
81 return ns_offset; 79 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
80 clock->raw_time.tv_nsec += nsec;
82} 81}
83 82
84/** 83/**
@@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void)
89 */ 88 */
90void getnstimeofday(struct timespec *ts) 89void getnstimeofday(struct timespec *ts)
91{ 90{
91 cycle_t cycle_now, cycle_delta;
92 unsigned long seq; 92 unsigned long seq;
93 s64 nsecs; 93 s64 nsecs;
94 94
@@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts)
96 seq = read_seqbegin(&xtime_lock); 96 seq = read_seqbegin(&xtime_lock);
97 97
98 *ts = xtime; 98 *ts = xtime;
99 nsecs = __get_nsec_offset(); 99
100 /* read clocksource: */
101 cycle_now = clocksource_read(clock);
102
103 /* calculate the delta since the last update_wall_time: */
104 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
105
106 /* convert to nanoseconds: */
107 nsecs = cyc2ns(clock, cycle_delta);
100 108
101 } while (read_seqretry(&xtime_lock, seq)); 109 } while (read_seqretry(&xtime_lock, seq));
102 110
@@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday);
129 */ 137 */
130int do_settimeofday(struct timespec *tv) 138int do_settimeofday(struct timespec *tv)
131{ 139{
140 struct timespec ts_delta;
132 unsigned long flags; 141 unsigned long flags;
133 time_t wtm_sec, sec = tv->tv_sec;
134 long wtm_nsec, nsec = tv->tv_nsec;
135 142
136 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 143 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
137 return -EINVAL; 144 return -EINVAL;
138 145
139 write_seqlock_irqsave(&xtime_lock, flags); 146 write_seqlock_irqsave(&xtime_lock, flags);
140 147
141 nsec -= __get_nsec_offset(); 148 clocksource_forward_now();
142 149
143 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); 150 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
144 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); 151 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
152 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
153
154 xtime = *tv;
145 155
146 set_normalized_timespec(&xtime, sec, nsec);
147 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
148 update_xtime_cache(0); 156 update_xtime_cache(0);
149 157
150 clock->error = 0; 158 clock->error = 0;
@@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday);
170static void change_clocksource(void) 178static void change_clocksource(void)
171{ 179{
172 struct clocksource *new; 180 struct clocksource *new;
173 cycle_t now;
174 u64 nsec;
175 181
176 new = clocksource_get_next(); 182 new = clocksource_get_next();
177 183
178 if (clock == new) 184 if (clock == new)
179 return; 185 return;
180 186
181 new->cycle_last = 0; 187 clocksource_forward_now();
182 now = clocksource_read(new);
183 nsec = __get_nsec_offset();
184 timespec_add_ns(&xtime, nsec);
185 188
186 clock = new; 189 new->raw_time = clock->raw_time;
187 clock->cycle_last = now;
188 190
191 clock = new;
192 clock->cycle_last = 0;
193 clock->cycle_last = clocksource_read(new);
189 clock->error = 0; 194 clock->error = 0;
190 clock->xtime_nsec = 0; 195 clock->xtime_nsec = 0;
191 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 196 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -200,11 +205,44 @@ static void change_clocksource(void)
200 */ 205 */
201} 206}
202#else 207#else
208static inline void clocksource_forward_now(void) { }
203static inline void change_clocksource(void) { } 209static inline void change_clocksource(void) { }
204static inline s64 __get_nsec_offset(void) { return 0; }
205#endif 210#endif
206 211
207/** 212/**
213 * getrawmonotonic - Returns the raw monotonic time in a timespec
214 * @ts: pointer to the timespec to be set
215 *
216 * Returns the raw monotonic time (completely un-modified by ntp)
217 */
218void getrawmonotonic(struct timespec *ts)
219{
220 unsigned long seq;
221 s64 nsecs;
222 cycle_t cycle_now, cycle_delta;
223
224 do {
225 seq = read_seqbegin(&xtime_lock);
226
227 /* read clocksource: */
228 cycle_now = clocksource_read(clock);
229
230 /* calculate the delta since the last update_wall_time: */
231 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
232
233 /* convert to nanoseconds: */
234 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
235
236 *ts = clock->raw_time;
237
238 } while (read_seqretry(&xtime_lock, seq));
239
240 timespec_add_ns(ts, nsecs);
241}
242EXPORT_SYMBOL(getrawmonotonic);
243
244
245/**
208 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 246 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
209 */ 247 */
210int timekeeping_valid_for_hres(void) 248int timekeeping_valid_for_hres(void)
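
getrawmonotonic() above exposes the clocksource time advanced with the un-NTP-adjusted mult_orig, tracked in clock->raw_time. The sketch below assumes it backs the CLOCK_MONOTONIC_RAW id that appears in the same kernel series, reachable from userspace via clock_gettime():

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4   /* assumed clock id if the headers predate it */
#endif

int main(void)
{
        struct timespec ts;

        if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts)) {
                perror("clock_gettime");
                return 1;
        }
        printf("raw monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
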
@@ -265,8 +303,6 @@ void __init timekeeping_init(void)
265static int timekeeping_suspended; 303static int timekeeping_suspended;
266/* time in seconds when suspend began */ 304/* time in seconds when suspend began */
267static unsigned long timekeeping_suspend_time; 305static unsigned long timekeeping_suspend_time;
268/* xtime offset when we went into suspend */
269static s64 timekeeping_suspend_nsecs;
270 306
271/** 307/**
272 * timekeeping_resume - Resumes the generic timekeeping subsystem. 308 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev)
292 wall_to_monotonic.tv_sec -= sleep_length; 328 wall_to_monotonic.tv_sec -= sleep_length;
293 total_sleep_time += sleep_length; 329 total_sleep_time += sleep_length;
294 } 330 }
295 /* Make sure that we have the correct xtime reference */
296 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
297 update_xtime_cache(0); 331 update_xtime_cache(0);
298 /* re-base the last cycle value */ 332 /* re-base the last cycle value */
299 clock->cycle_last = 0; 333 clock->cycle_last = 0;
@@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
319 timekeeping_suspend_time = read_persistent_clock(); 353 timekeeping_suspend_time = read_persistent_clock();
320 354
321 write_seqlock_irqsave(&xtime_lock, flags); 355 write_seqlock_irqsave(&xtime_lock, flags);
322 /* Get the current xtime offset */ 356 clocksource_forward_now();
323 timekeeping_suspend_nsecs = __get_nsec_offset();
324 timekeeping_suspended = 1; 357 timekeeping_suspended = 1;
325 write_sequnlock_irqrestore(&xtime_lock, flags); 358 write_sequnlock_irqrestore(&xtime_lock, flags);
326 359
@@ -454,23 +487,29 @@ void update_wall_time(void)
454#else 487#else
455 offset = clock->cycle_interval; 488 offset = clock->cycle_interval;
456#endif 489#endif
457 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; 490 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
458 491
459 /* normally this loop will run just once, however in the 492 /* normally this loop will run just once, however in the
460 * case of lost or late ticks, it will accumulate correctly. 493 * case of lost or late ticks, it will accumulate correctly.
461 */ 494 */
462 while (offset >= clock->cycle_interval) { 495 while (offset >= clock->cycle_interval) {
463 /* accumulate one interval */ 496 /* accumulate one interval */
464 clock->xtime_nsec += clock->xtime_interval;
465 clock->cycle_last += clock->cycle_interval;
466 offset -= clock->cycle_interval; 497 offset -= clock->cycle_interval;
498 clock->cycle_last += clock->cycle_interval;
467 499
500 clock->xtime_nsec += clock->xtime_interval;
468 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 501 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
469 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 502 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
470 xtime.tv_sec++; 503 xtime.tv_sec++;
471 second_overflow(); 504 second_overflow();
472 } 505 }
473 506
507 clock->raw_time.tv_nsec += clock->raw_interval;
508 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
509 clock->raw_time.tv_nsec -= NSEC_PER_SEC;
510 clock->raw_time.tv_sec++;
511 }
512
474 /* accumulate error between NTP and clock interval */ 513 /* accumulate error between NTP and clock interval */
475 clock->error += tick_length; 514 clock->error += tick_length;
476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 515 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@ -479,9 +518,34 @@ void update_wall_time(void)
479 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
480 clocksource_adjust(offset); 519 clocksource_adjust(offset);
481 520
482 /* store full nanoseconds into xtime */ 521 /*
483 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 522 * Since in the loop above, we accumulate any amount of time
523 * in xtime_nsec over a second into xtime.tv_sec, it's possible for
524 * xtime_nsec to be fairly small after the loop. Further, if we're
525 * slightly speeding the clocksource up in clocksource_adjust(),
526 * it's possible the required corrective factor to xtime_nsec could
527 * cause it to underflow.
528 *
529 * Now, we cannot simply roll the accumulated second back, since
530 * the NTP subsystem has been notified via second_overflow. So
531 * instead we push xtime_nsec forward by the amount we underflowed,
532 * and add that amount into the error.
533 *
534 * We'll correct this error next time through this function, when
535 * xtime_nsec is not as small.
536 */
537 if (unlikely((s64)clock->xtime_nsec < 0)) {
538 s64 neg = -(s64)clock->xtime_nsec;
539 clock->xtime_nsec = 0;
540 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
541 }
542
543 /* store full nanoseconds into xtime after rounding it up and
544 * add the remainder to the error difference.
545 */
546 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
484 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 547 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
548 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
485 549
486 update_xtime_cache(cyc2ns(clock, offset)); 550 update_xtime_cache(cyc2ns(clock, offset));
487 551
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20fd0001..a999b92a1277 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym)
47} 47}
48 48
49static void 49static void
50print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) 50print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
51 int idx, u64 now)
51{ 52{
52#ifdef CONFIG_TIMER_STATS 53#ifdef CONFIG_TIMER_STATS
53 char tmp[TASK_COMM_LEN + 1]; 54 char tmp[TASK_COMM_LEN + 1];
54#endif 55#endif
55 SEQ_printf(m, " #%d: ", idx); 56 SEQ_printf(m, " #%d: ", idx);
56 print_name_offset(m, timer); 57 print_name_offset(m, taddr);
57 SEQ_printf(m, ", "); 58 SEQ_printf(m, ", ");
58 print_name_offset(m, timer->function); 59 print_name_offset(m, timer->function);
59 SEQ_printf(m, ", S:%02lx", timer->state); 60 SEQ_printf(m, ", S:%02lx", timer->state);
@@ -65,9 +66,11 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 66 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 67#endif
67 SEQ_printf(m, "\n"); 68 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", 69 SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 70 (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
70 (long long)(ktime_to_ns(timer->expires) - now)); 71 (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
72 (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
73 (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
71} 74}
72 75
73static void 76static void
@@ -99,7 +102,7 @@ next_one:
99 tmp = *timer; 102 tmp = *timer;
100 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
101 104
102 print_timer(m, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
103 next++; 106 next++;
104 goto next_one; 107 goto next_one;
105 } 108 }
@@ -109,6 +112,7 @@ next_one:
109static void 112static void
110print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 114{
115 SEQ_printf(m, " .base: %p\n", base);
112 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
113 base->index); 117 base->index);
114 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
@@ -183,12 +187,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
183 187
184#ifdef CONFIG_GENERIC_CLOCKEVENTS 188#ifdef CONFIG_GENERIC_CLOCKEVENTS
185static void 189static void
186print_tickdevice(struct seq_file *m, struct tick_device *td) 190print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
187{ 191{
188 struct clock_event_device *dev = td->evtdev; 192 struct clock_event_device *dev = td->evtdev;
189 193
190 SEQ_printf(m, "\n"); 194 SEQ_printf(m, "\n");
191 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 195 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
196 if (cpu < 0)
197 SEQ_printf(m, "Broadcast device\n");
198 else
199 SEQ_printf(m, "Per CPU device: %d\n", cpu);
192 200
193 SEQ_printf(m, "Clock Event Device: "); 201 SEQ_printf(m, "Clock Event Device: ");
194 if (!dev) { 202 if (!dev) {
@@ -222,7 +230,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
222 int cpu; 230 int cpu;
223 231
224#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
225 print_tickdevice(m, tick_get_broadcast_device()); 233 print_tickdevice(m, tick_get_broadcast_device(), -1);
226 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
227 tick_get_broadcast_mask()->bits[0]); 235 tick_get_broadcast_mask()->bits[0]);
228#ifdef CONFIG_TICK_ONESHOT 236#ifdef CONFIG_TICK_ONESHOT
@@ -232,7 +240,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232 SEQ_printf(m, "\n"); 240 SEQ_printf(m, "\n");
233#endif 241#endif
234 for_each_online_cpu(cpu) 242 for_each_online_cpu(cpu)
235 print_tickdevice(m, tick_get_device(cpu)); 243 print_tickdevice(m, tick_get_device(cpu), cpu);
236 SEQ_printf(m, "\n"); 244 SEQ_printf(m, "\n");
237} 245}
238#else 246#else
@@ -244,7 +252,7 @@ static int timer_list_show(struct seq_file *m, void *v)
244 u64 now = ktime_to_ns(ktime_get()); 252 u64 now = ktime_to_ns(ktime_get());
245 int cpu; 253 int cpu;
246 254
247 SEQ_printf(m, "Timer List Version: v0.3\n"); 255 SEQ_printf(m, "Timer List Version: v0.4\n");
248 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
249 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
250 258
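The new print_timer() line above reports both the soft and the hard expiry of a range hrtimer. A minimal sketch of how the two accessors relate; the helper name is an assumption:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    /* Illustrative only: slack between the earliest (soft) and latest
     * (hard) time the timer may fire, in nanoseconds. */
    static s64 hrtimer_slack_ns(struct hrtimer *timer)
    {
            return ktime_to_ns(hrtimer_get_expires(timer)) -
                   ktime_to_ns(hrtimer_get_softexpires(timer));
    }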
diff --git a/kernel/timer.c b/kernel/timer.c
index 03bc7f1f1593..566257d1dc10 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
113} 113}
114 114
115/** 115static unsigned long round_jiffies_common(unsigned long j, int cpu,
116 * __round_jiffies - function to round jiffies to a full second 116 bool force_up)
117 * @j: the time in (absolute) jiffies that should be rounded
118 * @cpu: the processor number on which the timeout will happen
119 *
120 * __round_jiffies() rounds an absolute time in the future (in jiffies)
121 * up or down to (approximately) full seconds. This is useful for timers
122 * for which the exact time they fire does not matter too much, as long as
123 * they fire approximately every X seconds.
124 *
125 * By rounding these timers to whole seconds, all such timers will fire
126 * at the same time, rather than at various times spread out. The goal
127 * of this is to have the CPU wake up less, which saves power.
128 *
129 * The exact rounding is skewed for each processor to avoid all
130 * processors firing at the exact same time, which could lead
131 * to lock contention or spurious cache line bouncing.
132 *
133 * The return value is the rounded version of the @j parameter.
134 */
135unsigned long __round_jiffies(unsigned long j, int cpu)
136{ 117{
137 int rem; 118 int rem;
138 unsigned long original = j; 119 unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
154 * due to delays of the timer irq, long irq off times etc etc) then 135 * due to delays of the timer irq, long irq off times etc etc) then
155 * we should round down to the whole second, not up. Use 1/4th second 136 * we should round down to the whole second, not up. Use 1/4th second
156 * as cutoff for this rounding as an extreme upper bound for this. 137 * as cutoff for this rounding as an extreme upper bound for this.
138 * But never round down if @force_up is set.
157 */ 139 */
158 if (rem < HZ/4) /* round down */ 140 if (rem < HZ/4 && !force_up) /* round down */
159 j = j - rem; 141 j = j - rem;
160 else /* round up */ 142 else /* round up */
161 j = j - rem + HZ; 143 j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
167 return original; 149 return original;
168 return j; 150 return j;
169} 151}
152
153/**
154 * __round_jiffies - function to round jiffies to a full second
155 * @j: the time in (absolute) jiffies that should be rounded
156 * @cpu: the processor number on which the timeout will happen
157 *
158 * __round_jiffies() rounds an absolute time in the future (in jiffies)
159 * up or down to (approximately) full seconds. This is useful for timers
160 * for which the exact time they fire does not matter too much, as long as
161 * they fire approximately every X seconds.
162 *
163 * By rounding these timers to whole seconds, all such timers will fire
164 * at the same time, rather than at various times spread out. The goal
165 * of this is to have the CPU wake up less, which saves power.
166 *
167 * The exact rounding is skewed for each processor to avoid all
168 * processors firing at the exact same time, which could lead
169 * to lock contention or spurious cache line bouncing.
170 *
171 * The return value is the rounded version of the @j parameter.
172 */
173unsigned long __round_jiffies(unsigned long j, int cpu)
174{
175 return round_jiffies_common(j, cpu, false);
176}
170EXPORT_SYMBOL_GPL(__round_jiffies); 177EXPORT_SYMBOL_GPL(__round_jiffies);
171 178
172/** 179/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
191 */ 198 */
192unsigned long __round_jiffies_relative(unsigned long j, int cpu) 199unsigned long __round_jiffies_relative(unsigned long j, int cpu)
193{ 200{
194 /* 201 unsigned long j0 = jiffies;
195 * In theory the following code can skip a jiffy in case jiffies 202
196 * increments right between the addition and the later subtraction. 203 /* Use j0 because jiffies might change while we run */
197 * However since the entire point of this function is to use approximate 204 return round_jiffies_common(j + j0, cpu, false) - j0;
198 * timeouts, it's entirely ok to not handle that.
199 */
200 return __round_jiffies(j + jiffies, cpu) - jiffies;
201} 205}
202EXPORT_SYMBOL_GPL(__round_jiffies_relative); 206EXPORT_SYMBOL_GPL(__round_jiffies_relative);
203 207
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
218 */ 222 */
219unsigned long round_jiffies(unsigned long j) 223unsigned long round_jiffies(unsigned long j)
220{ 224{
221 return __round_jiffies(j, raw_smp_processor_id()); 225 return round_jiffies_common(j, raw_smp_processor_id(), false);
222} 226}
223EXPORT_SYMBOL_GPL(round_jiffies); 227EXPORT_SYMBOL_GPL(round_jiffies);
224 228
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
243} 247}
244EXPORT_SYMBOL_GPL(round_jiffies_relative); 248EXPORT_SYMBOL_GPL(round_jiffies_relative);
245 249
250/**
251 * __round_jiffies_up - function to round jiffies up to a full second
252 * @j: the time in (absolute) jiffies that should be rounded
253 * @cpu: the processor number on which the timeout will happen
254 *
255 * This is the same as __round_jiffies() except that it will never
256 * round down. This is useful for timeouts for which the exact time
257 * of firing does not matter too much, as long as they don't fire too
258 * early.
259 */
260unsigned long __round_jiffies_up(unsigned long j, int cpu)
261{
262 return round_jiffies_common(j, cpu, true);
263}
264EXPORT_SYMBOL_GPL(__round_jiffies_up);
265
266/**
267 * __round_jiffies_up_relative - function to round jiffies up to a full second
268 * @j: the time in (relative) jiffies that should be rounded
269 * @cpu: the processor number on which the timeout will happen
270 *
271 * This is the same as __round_jiffies_relative() except that it will never
272 * round down. This is useful for timeouts for which the exact time
273 * of firing does not matter too much, as long as they don't fire too
274 * early.
275 */
276unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
277{
278 unsigned long j0 = jiffies;
279
280 /* Use j0 because jiffies might change while we run */
281 return round_jiffies_common(j + j0, cpu, true) - j0;
282}
283EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
284
285/**
286 * round_jiffies_up - function to round jiffies up to a full second
287 * @j: the time in (absolute) jiffies that should be rounded
288 *
289 * This is the same as round_jiffies() except that it will never
290 * round down. This is useful for timeouts for which the exact time
291 * of firing does not matter too much, as long as they don't fire too
292 * early.
293 */
294unsigned long round_jiffies_up(unsigned long j)
295{
296 return round_jiffies_common(j, raw_smp_processor_id(), true);
297}
298EXPORT_SYMBOL_GPL(round_jiffies_up);
299
300/**
301 * round_jiffies_up_relative - function to round jiffies up to a full second
302 * @j: the time in (relative) jiffies that should be rounded
303 *
304 * This is the same as round_jiffies_relative() except that it will never
305 * round down. This is useful for timeouts for which the exact time
306 * of firing does not matter too much, as long as they don't fire too
307 * early.
308 */
309unsigned long round_jiffies_up_relative(unsigned long j)
310{
311 return __round_jiffies_up_relative(j, raw_smp_processor_id());
312}
313EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
314
246 315
247static inline void set_running_timer(struct tvec_base *base, 316static inline void set_running_timer(struct tvec_base *base,
248 struct timer_list *timer) 317 struct timer_list *timer)
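To make the difference between the plain and the _up variants concrete, a worked example with assumed HZ = 1000 and the per-CPU skew ignored:

    /*
     * j is 100 ticks past a full second, so rem = 100 < HZ/4 = 250:
     *
     *   round_jiffies(j)    -> j - 100   (rounds down; may fire early)
     *   round_jiffies_up(j) -> j + 900   (always rounds up; never early)
     *
     * The _up variants are therefore the ones to use for timeouts such
     * as error recovery, where firing before the requested time would
     * be wrong.
     */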
@@ -978,6 +1047,7 @@ void update_process_times(int user_tick)
978 run_local_timers(); 1047 run_local_timers();
979 if (rcu_pending(cpu)) 1048 if (rcu_pending(cpu))
980 rcu_check_callbacks(cpu, user_tick); 1049 rcu_check_callbacks(cpu, user_tick);
1050 printk_tick();
981 scheduler_tick(); 1051 scheduler_tick();
982 run_posix_cpu_timers(p); 1052 run_posix_cpu_timers(p);
983} 1053}
@@ -1122,25 +1192,25 @@ asmlinkage long sys_getppid(void)
1122asmlinkage long sys_getuid(void) 1192asmlinkage long sys_getuid(void)
1123{ 1193{
1124 /* Only we change this so SMP safe */ 1194 /* Only we change this so SMP safe */
1125 return current->uid; 1195 return current_uid();
1126} 1196}
1127 1197
1128asmlinkage long sys_geteuid(void) 1198asmlinkage long sys_geteuid(void)
1129{ 1199{
1130 /* Only we change this so SMP safe */ 1200 /* Only we change this so SMP safe */
1131 return current->euid; 1201 return current_euid();
1132} 1202}
1133 1203
1134asmlinkage long sys_getgid(void) 1204asmlinkage long sys_getgid(void)
1135{ 1205{
1136 /* Only we change this so SMP safe */ 1206 /* Only we change this so SMP safe */
1137 return current->gid; 1207 return current_gid();
1138} 1208}
1139 1209
1140asmlinkage long sys_getegid(void) 1210asmlinkage long sys_getegid(void)
1141{ 1211{
1142 /* Only we change this so SMP safe */ 1212 /* Only we change this so SMP safe */
1143 return current->egid; 1213 return current_egid();
1144} 1214}
1145 1215
1146#endif 1216#endif
@@ -1435,9 +1505,11 @@ static void __cpuinit migrate_timers(int cpu)
1435 BUG_ON(cpu_online(cpu)); 1505 BUG_ON(cpu_online(cpu));
1436 old_base = per_cpu(tvec_bases, cpu); 1506 old_base = per_cpu(tvec_bases, cpu);
1437 new_base = get_cpu_var(tvec_bases); 1507 new_base = get_cpu_var(tvec_bases);
1438 1508 /*
1439 local_irq_disable(); 1509 * The caller is globally serialized and nobody else
1440 spin_lock(&new_base->lock); 1510 * takes two locks at once, deadlock is not possible.
1511 */
1512 spin_lock_irq(&new_base->lock);
1441 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1513 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1442 1514
1443 BUG_ON(old_base->running_timer); 1515 BUG_ON(old_base->running_timer);
@@ -1452,8 +1524,7 @@ static void __cpuinit migrate_timers(int cpu)
1452 } 1524 }
1453 1525
1454 spin_unlock(&old_base->lock); 1526 spin_unlock(&old_base->lock);
1455 spin_unlock(&new_base->lock); 1527 spin_unlock_irq(&new_base->lock);
1456 local_irq_enable();
1457 put_cpu_var(tvec_bases); 1528 put_cpu_var(tvec_bases);
1458} 1529}
1459#endif /* CONFIG_HOTPLUG_CPU */ 1530#endif /* CONFIG_HOTPLUG_CPU */
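The migrate_timers() change above replaces local_irq_disable() plus spin_lock() with spin_lock_irq() and keeps the nested annotation on the second base lock. The generic shape of that pattern, as a sketch with assumed names:

    #include <linux/spinlock.h>

    /* Sketch only: move entries from src to dst while holding both base
     * locks. The nesting annotation tells lockdep that taking a second
     * lock of the same class is intentional; the caller must guarantee
     * no other path takes the two locks in the opposite order. */
    static void move_bases(struct tvec_base *dst, struct tvec_base *src)
    {
            spin_lock_irq(&dst->lock);
            spin_lock_nested(&src->lock, SINGLE_DEPTH_NESTING);

            /* ... migrate pending timers ... */

            spin_unlock(&src->lock);
            spin_unlock_irq(&dst->lock);
    }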
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 263e9e6bbd60..e2a4ff6fc3a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,23 +1,56 @@
1# 1#
2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: 2# Architectures that offer an FUNCTION_TRACER implementation should
3# select HAVE_FUNCTION_TRACER:
3# 4#
4config HAVE_FTRACE 5
6config USER_STACKTRACE_SUPPORT
7 bool
8
9config NOP_TRACER
10 bool
11
12config HAVE_FUNCTION_TRACER
13 bool
14
15config HAVE_FUNCTION_GRAPH_TRACER
16 bool
17
18config HAVE_FUNCTION_TRACE_MCOUNT_TEST
5 bool 19 bool
20 help
21 This gets selected when the arch tests the function_trace_stop
22 variable at the mcount call site. Otherwise, this variable
23 is tested by the called function.
6 24
7config HAVE_DYNAMIC_FTRACE 25config HAVE_DYNAMIC_FTRACE
8 bool 26 bool
9 27
28config HAVE_FTRACE_MCOUNT_RECORD
29 bool
30
31config HAVE_HW_BRANCH_TRACER
32 bool
33
10config TRACER_MAX_TRACE 34config TRACER_MAX_TRACE
11 bool 35 bool
12 36
37config RING_BUFFER
38 bool
39
13config TRACING 40config TRACING
14 bool 41 bool
15 select DEBUG_FS 42 select DEBUG_FS
16 select STACKTRACE 43 select RING_BUFFER
44 select STACKTRACE if STACKTRACE_SUPPORT
45 select TRACEPOINTS
46 select NOP_TRACER
47
48menu "Tracers"
17 49
18config FTRACE 50config FUNCTION_TRACER
19 bool "Kernel Function Tracer" 51 bool "Kernel Function Tracer"
20 depends on HAVE_FTRACE 52 depends on HAVE_FUNCTION_TRACER
53 depends on DEBUG_KERNEL
21 select FRAME_POINTER 54 select FRAME_POINTER
22 select TRACING 55 select TRACING
23 select CONTEXT_SWITCH_TRACER 56 select CONTEXT_SWITCH_TRACER
@@ -30,12 +63,26 @@ config FTRACE
30 (the bootup default), then the overhead of the instructions is very 63 (the bootup default), then the overhead of the instructions is very
31 small and not measurable even in micro-benchmarks. 64 small and not measurable even in micro-benchmarks.
32 65
66config FUNCTION_GRAPH_TRACER
67 bool "Kernel Function Graph Tracer"
68 depends on HAVE_FUNCTION_GRAPH_TRACER
69 depends on FUNCTION_TRACER
70 default y
71 help
72 Enable the kernel to trace a function at both its return
73 and its entry.
74	  Its first purpose is to trace the duration of functions and
75	  draw a call graph for each thread with some information such as
76 the return value.
77 This is done by setting the current return address on the current
78 task structure into a stack of calls.
79
33config IRQSOFF_TRACER 80config IRQSOFF_TRACER
34 bool "Interrupts-off Latency Tracer" 81 bool "Interrupts-off Latency Tracer"
35 default n 82 default n
36 depends on TRACE_IRQFLAGS_SUPPORT 83 depends on TRACE_IRQFLAGS_SUPPORT
37 depends on GENERIC_TIME 84 depends on GENERIC_TIME
38 depends on HAVE_FTRACE 85 depends on DEBUG_KERNEL
39 select TRACE_IRQFLAGS 86 select TRACE_IRQFLAGS
40 select TRACING 87 select TRACING
41 select TRACER_MAX_TRACE 88 select TRACER_MAX_TRACE
@@ -58,7 +105,7 @@ config PREEMPT_TRACER
58 default n 105 default n
59 depends on GENERIC_TIME 106 depends on GENERIC_TIME
60 depends on PREEMPT 107 depends on PREEMPT
61 depends on HAVE_FTRACE 108 depends on DEBUG_KERNEL
62 select TRACING 109 select TRACING
63 select TRACER_MAX_TRACE 110 select TRACER_MAX_TRACE
64 help 111 help
@@ -85,7 +132,7 @@ config SYSPROF_TRACER
85 132
86config SCHED_TRACER 133config SCHED_TRACER
87 bool "Scheduling Latency Tracer" 134 bool "Scheduling Latency Tracer"
88 depends on HAVE_FTRACE 135 depends on DEBUG_KERNEL
89 select TRACING 136 select TRACING
90 select CONTEXT_SWITCH_TRACER 137 select CONTEXT_SWITCH_TRACER
91 select TRACER_MAX_TRACE 138 select TRACER_MAX_TRACE
@@ -95,17 +142,133 @@ config SCHED_TRACER
95 142
96config CONTEXT_SWITCH_TRACER 143config CONTEXT_SWITCH_TRACER
97 bool "Trace process context switches" 144 bool "Trace process context switches"
98 depends on HAVE_FTRACE 145 depends on DEBUG_KERNEL
99 select TRACING 146 select TRACING
100 select MARKERS 147 select MARKERS
101 help 148 help
102 This tracer gets called from the context switch and records 149 This tracer gets called from the context switch and records
103 all switching of tasks. 150 all switching of tasks.
104 151
152config BOOT_TRACER
153 bool "Trace boot initcalls"
154 depends on DEBUG_KERNEL
155 select TRACING
156 select CONTEXT_SWITCH_TRACER
157 help
158 This tracer helps developers to optimize boot times: it records
159 the timings of the initcalls and traces key events and the identity
160 of tasks that can cause boot delays, such as context-switches.
161
162 Its aim is to be parsed by the /scripts/bootgraph.pl tool to
163 produce pretty graphics about boot inefficiencies, giving a visual
164 representation of the delays during initcalls - but the raw
165 /debug/tracing/trace text output is readable too.
166
167 ( Note that tracing self tests can't be enabled if this tracer is
168 selected, because the self-tests are an initcall as well and that
169 would invalidate the boot trace. )
170
171config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING
175 help
176	  This tracer profiles all the likely and unlikely macros
177 in the kernel. It will display the results in:
178
179 /debugfs/tracing/profile_annotated_branch
180
181 Note: this will add a significant overhead, only turn this
182 on if you need to profile the system's use of these macros.
183
184 Say N if unsure.
185
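The branches being profiled are the ordinary likely()/unlikely() hints; a minimal sketch of code whose hit/miss counts would show up in the file named above (function and parameter names are assumptions):

    #include <linux/compiler.h>
    #include <linux/errno.h>

    static int process_buffer(void *buf)
    {
            /*
             * With TRACE_BRANCH_PROFILING enabled, how often this
             * prediction was correct is recorded and reported in
             * /debugfs/tracing/profile_annotated_branch.
             */
            if (unlikely(!buf))
                    return -EINVAL;

            /* ... normal fast path ... */
            return 0;
    }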
186config PROFILE_ALL_BRANCHES
187 bool "Profile all if conditionals"
188 depends on TRACE_BRANCH_PROFILING
189 help
190 This tracer profiles all branch conditions. Every if ()
191	  taken in the kernel is recorded, whether it hit or missed.
192 The results will be displayed in:
193
194 /debugfs/tracing/profile_branch
195
196 This configuration, when enabled, will impose a great overhead
197 on the system. This should only be enabled when the system
198	  is to be analyzed.
199
200 Say N if unsure.
201
202config TRACING_BRANCHES
203 bool
204 help
205 Selected by tracers that will trace the likely and unlikely
206 conditions. This prevents the tracers themselves from being
207 profiled. Profiling the tracing infrastructure can only happen
208	  when the likely and unlikely annotations are not being traced.
209
210config BRANCH_TRACER
211 bool "Trace likely/unlikely instances"
212 depends on TRACE_BRANCH_PROFILING
213 select TRACING_BRANCHES
214 help
215 This traces the events of likely and unlikely condition
216 calls in the kernel. The difference between this and the
217 "Trace likely/unlikely profiler" is that this is not a
218 histogram of the callers, but actually places the calling
219 events into a running trace buffer to see when and where the
220 events happened, as well as their results.
221
222 Say N if unsure.
223
224config POWER_TRACER
225 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86
228 select TRACING
229 help
230	  This tracer helps developers to analyze and optimize the kernel's
231 power management decisions, specifically the C-state and P-state
232 behavior.
233
234
235config STACK_TRACER
236 bool "Trace max stack"
237 depends on HAVE_FUNCTION_TRACER
238 depends on DEBUG_KERNEL
239 select FUNCTION_TRACER
240 select STACKTRACE
241 help
242 This special tracer records the maximum stack footprint of the
243 kernel and displays it in debugfs/tracing/stack_trace.
244
245 This tracer works by hooking into every function call that the
246 kernel executes, and keeping a maximum stack depth value and
247 stack-trace saved. If this is configured with DYNAMIC_FTRACE
248 then it will not have any overhead while the stack tracer
249 is disabled.
250
251 To enable the stack tracer on bootup, pass in 'stacktrace'
252 on the kernel command line.
253
254 The stack tracer can also be enabled or disabled via the
255 sysctl kernel.stack_tracer_enabled
256
257 Say N if unsure.
258
259config HW_BRANCH_TRACER
260 depends on HAVE_HW_BRANCH_TRACER
261 bool "Trace hw branches"
262 select TRACING
263 help
264 This tracer records all branches on the system in a circular
265 buffer giving access to the last N branches for each cpu.
266
105config DYNAMIC_FTRACE 267config DYNAMIC_FTRACE
106 bool "enable/disable ftrace tracepoints dynamically" 268 bool "enable/disable ftrace tracepoints dynamically"
107 depends on FTRACE 269 depends on FUNCTION_TRACER
108 depends on HAVE_DYNAMIC_FTRACE 270 depends on HAVE_DYNAMIC_FTRACE
271 depends on DEBUG_KERNEL
109 default y 272 default y
110 help 273 help
111 This option will modify all the calls to ftrace dynamically 274 This option will modify all the calls to ftrace dynamically
@@ -113,7 +276,7 @@ config DYNAMIC_FTRACE
113 with a No-Op instruction) as they are called. A table is 276 with a No-Op instruction) as they are called. A table is
114 created to dynamically enable them again. 277 created to dynamically enable them again.
115 278
116 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise 279 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
117 has native performance as long as no tracing is active. 280 has native performance as long as no tracing is active.
118 281
119 The changes to the code are done by a kernel thread that 282 The changes to the code are done by a kernel thread that
@@ -121,15 +284,22 @@ config DYNAMIC_FTRACE
121 were made. If so, it runs stop_machine (stops all CPUS) 284 were made. If so, it runs stop_machine (stops all CPUS)
122 and modifies the code to jump over the call to ftrace. 285 and modifies the code to jump over the call to ftrace.
123 286
287config FTRACE_MCOUNT_RECORD
288 def_bool y
289 depends on DYNAMIC_FTRACE
290 depends on HAVE_FTRACE_MCOUNT_RECORD
291
124config FTRACE_SELFTEST 292config FTRACE_SELFTEST
125 bool 293 bool
126 294
127config FTRACE_STARTUP_TEST 295config FTRACE_STARTUP_TEST
128 bool "Perform a startup test on ftrace" 296 bool "Perform a startup test on ftrace"
129 depends on TRACING 297 depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
130 select FTRACE_SELFTEST 298 select FTRACE_SELFTEST
131 help 299 help
132 This option performs a series of startup tests on ftrace. On bootup 300 This option performs a series of startup tests on ftrace. On bootup
133 a series of tests are made to verify that the tracer is 301 a series of tests are made to verify that the tracer is
134 functioning properly. It will do tests on all the configured 302 functioning properly. It will do tests on all the configured
135 tracers of ftrace. 303 tracers of ftrace.
304
305endmenu
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 71d17de17288..349d5a93653f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,7 +1,7 @@
1 1
2# Do not instrument the tracer itself: 2# Do not instrument the tracer itself:
3 3
4ifdef CONFIG_FTRACE 4ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7 7
@@ -10,15 +10,28 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13obj-$(CONFIG_FTRACE) += libftrace.o 13# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif
17
18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
14 20
15obj-$(CONFIG_TRACING) += trace.o 21obj-$(CONFIG_TRACING) += trace.o
16obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 22obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
17obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o 23obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
18obj-$(CONFIG_FTRACE) += trace_functions.o 24obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
19obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 25obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 26obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
21obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o 27obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
28obj-$(CONFIG_NOP_TRACER) += trace_nop.o
29obj-$(CONFIG_STACK_TRACER) += trace_stack.o
22obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 30obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o
23 36
24libftrace-y := ftrace.o 37libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6e3af31b403..2f32969c09df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -25,17 +25,35 @@
25#include <linux/ftrace.h> 25#include <linux/ftrace.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/ctype.h> 27#include <linux/ctype.h>
28#include <linux/hash.h>
29#include <linux/list.h> 28#include <linux/list.h>
30 29
31#include <asm/ftrace.h> 30#include <asm/ftrace.h>
32 31
33#include "trace.h" 32#include "trace.h"
34 33
34#define FTRACE_WARN_ON(cond) \
35 do { \
36 if (WARN_ON(cond)) \
37 ftrace_kill(); \
38 } while (0)
39
40#define FTRACE_WARN_ON_ONCE(cond) \
41 do { \
42 if (WARN_ON_ONCE(cond)) \
43 ftrace_kill(); \
44 } while (0)
45
35/* ftrace_enabled is a method to turn ftrace on or off */ 46/* ftrace_enabled is a method to turn ftrace on or off */
36int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
37static int last_ftrace_enabled; 48static int last_ftrace_enabled;
38 49
50/* set when tracing only a pid */
51struct pid *ftrace_pid_trace;
52static struct pid * const ftrace_swapper_pid = &init_struct_pid;
53
54/* Quick disabling of function tracer. */
55int function_trace_stop;
56
39/* 57/*
40 * ftrace_disabled is set when an anomaly is discovered. 58 * ftrace_disabled is set when an anomaly is discovered.
41 * ftrace_disabled is much stronger than ftrace_enabled. 59 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -44,6 +62,7 @@ static int ftrace_disabled __read_mostly;
44 62
45static DEFINE_SPINLOCK(ftrace_lock); 63static DEFINE_SPINLOCK(ftrace_lock);
46static DEFINE_MUTEX(ftrace_sysctl_lock); 64static DEFINE_MUTEX(ftrace_sysctl_lock);
65static DEFINE_MUTEX(ftrace_start_lock);
47 66
48static struct ftrace_ops ftrace_list_end __read_mostly = 67static struct ftrace_ops ftrace_list_end __read_mostly =
49{ 68{
@@ -52,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
52 71
53static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 72static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
54ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 73ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
74ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
75ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
55 76
56static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 77static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
57{ 78{
@@ -68,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
68 }; 89 };
69} 90}
70 91
92static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
93{
94 if (!test_tsk_trace_trace(current))
95 return;
96
97 ftrace_pid_function(ip, parent_ip);
98}
99
100static void set_ftrace_pid_function(ftrace_func_t func)
101{
102 /* do not set ftrace_pid_function to itself! */
103 if (func != ftrace_pid_func)
104 ftrace_pid_function = func;
105}
106
71/** 107/**
72 * clear_ftrace_function - reset the ftrace function 108 * clear_ftrace_function - reset the ftrace function
73 * 109 *
@@ -77,11 +113,27 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
77void clear_ftrace_function(void) 113void clear_ftrace_function(void)
78{ 114{
79 ftrace_trace_function = ftrace_stub; 115 ftrace_trace_function = ftrace_stub;
116 __ftrace_trace_function = ftrace_stub;
117 ftrace_pid_function = ftrace_stub;
118}
119
120#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
121/*
122 * For those archs that do not test ftrace_trace_stop in their
123 * mcount call site, we need to do it from C.
124 */
125static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
126{
127 if (function_trace_stop)
128 return;
129
130 __ftrace_trace_function(ip, parent_ip);
80} 131}
132#endif
81 133
82static int __register_ftrace_function(struct ftrace_ops *ops) 134static int __register_ftrace_function(struct ftrace_ops *ops)
83{ 135{
84 /* Should never be called by interrupts */ 136 /* should not be called from interrupt context */
85 spin_lock(&ftrace_lock); 137 spin_lock(&ftrace_lock);
86 138
87 ops->next = ftrace_list; 139 ops->next = ftrace_list;
@@ -95,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
95 ftrace_list = ops; 147 ftrace_list = ops;
96 148
97 if (ftrace_enabled) { 149 if (ftrace_enabled) {
150 ftrace_func_t func;
151
152 if (ops->next == &ftrace_list_end)
153 func = ops->func;
154 else
155 func = ftrace_list_func;
156
157 if (ftrace_pid_trace) {
158 set_ftrace_pid_function(func);
159 func = ftrace_pid_func;
160 }
161
98 /* 162 /*
99 * For one func, simply call it directly. 163 * For one func, simply call it directly.
100 * For more than one func, call the chain. 164 * For more than one func, call the chain.
101 */ 165 */
102 if (ops->next == &ftrace_list_end) 166#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
103 ftrace_trace_function = ops->func; 167 ftrace_trace_function = func;
104 else 168#else
105 ftrace_trace_function = ftrace_list_func; 169 __ftrace_trace_function = func;
170 ftrace_trace_function = ftrace_test_stop_func;
171#endif
106 } 172 }
107 173
108 spin_unlock(&ftrace_lock); 174 spin_unlock(&ftrace_lock);
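The single-function versus chained dispatch above is what a caller of register_ftrace_function() ends up behind; a minimal sketch of such a caller, with the ops and callback names being assumptions (the callback signature, traced ip plus parent ip, matches this tree):

    #include <linux/ftrace.h>
    #include <linux/init.h>

    /* Runs on every traced function entry via mcount; keep it minimal. */
    static void my_trace_call(unsigned long ip, unsigned long parent_ip)
    {
            /* e.g. count calls, sample stacks, ... */
    }

    static struct ftrace_ops my_trace_ops = {
            .func = my_trace_call,
    };

    static int __init my_trace_init(void)
    {
            return register_ftrace_function(&my_trace_ops);
    }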
@@ -115,6 +181,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
115 struct ftrace_ops **p; 181 struct ftrace_ops **p;
116 int ret = 0; 182 int ret = 0;
117 183
184 /* should not be called from interrupt context */
118 spin_lock(&ftrace_lock); 185 spin_lock(&ftrace_lock);
119 186
120 /* 187 /*
@@ -140,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
140 207
141 if (ftrace_enabled) { 208 if (ftrace_enabled) {
142 /* If we only have one func left, then call that directly */ 209 /* If we only have one func left, then call that directly */
143 if (ftrace_list == &ftrace_list_end || 210 if (ftrace_list->next == &ftrace_list_end) {
144 ftrace_list->next == &ftrace_list_end) 211 ftrace_func_t func = ftrace_list->func;
145 ftrace_trace_function = ftrace_list->func; 212
213 if (ftrace_pid_trace) {
214 set_ftrace_pid_function(func);
215 func = ftrace_pid_func;
216 }
217#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
218 ftrace_trace_function = func;
219#else
220 __ftrace_trace_function = func;
221#endif
222 }
146 } 223 }
147 224
148 out: 225 out:
@@ -151,9 +228,48 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
151 return ret; 228 return ret;
152} 229}
153 230
231static void ftrace_update_pid_func(void)
232{
233 ftrace_func_t func;
234
235 /* should not be called from interrupt context */
236 spin_lock(&ftrace_lock);
237
238 if (ftrace_trace_function == ftrace_stub)
239 goto out;
240
241 func = ftrace_trace_function;
242
243 if (ftrace_pid_trace) {
244 set_ftrace_pid_function(func);
245 func = ftrace_pid_func;
246 } else {
247 if (func == ftrace_pid_func)
248 func = ftrace_pid_function;
249 }
250
251#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
252 ftrace_trace_function = func;
253#else
254 __ftrace_trace_function = func;
255#endif
256
257 out:
258 spin_unlock(&ftrace_lock);
259}
260
154#ifdef CONFIG_DYNAMIC_FTRACE 261#ifdef CONFIG_DYNAMIC_FTRACE
262#ifndef CONFIG_FTRACE_MCOUNT_RECORD
263# error Dynamic ftrace depends on MCOUNT_RECORD
264#endif
155 265
156static struct task_struct *ftraced_task; 266/*
267 * Since MCOUNT_ADDR may point to mcount itself, we do not want
268 * to get it confused by reading a reference in the code as we
269 * are parsing on objcopy output of text. Use a variable for
270 * it instead.
271 */
272static unsigned long mcount_addr = MCOUNT_ADDR;
157 273
158enum { 274enum {
159 FTRACE_ENABLE_CALLS = (1 << 0), 275 FTRACE_ENABLE_CALLS = (1 << 0),
@@ -161,18 +277,14 @@ enum {
161 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 277 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
162 FTRACE_ENABLE_MCOUNT = (1 << 3), 278 FTRACE_ENABLE_MCOUNT = (1 << 3),
163 FTRACE_DISABLE_MCOUNT = (1 << 4), 279 FTRACE_DISABLE_MCOUNT = (1 << 4),
280 FTRACE_START_FUNC_RET = (1 << 5),
281 FTRACE_STOP_FUNC_RET = (1 << 6),
164}; 282};
165 283
166static int ftrace_filtered; 284static int ftrace_filtered;
167static int tracing_on;
168static int frozen_record_count;
169
170static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
171 285
172static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); 286static LIST_HEAD(ftrace_new_addrs);
173 287
174static DEFINE_SPINLOCK(ftrace_shutdown_lock);
175static DEFINE_MUTEX(ftraced_lock);
176static DEFINE_MUTEX(ftrace_regex_lock); 288static DEFINE_MUTEX(ftrace_regex_lock);
177 289
178struct ftrace_page { 290struct ftrace_page {
@@ -190,16 +302,13 @@ struct ftrace_page {
190static struct ftrace_page *ftrace_pages_start; 302static struct ftrace_page *ftrace_pages_start;
191static struct ftrace_page *ftrace_pages; 303static struct ftrace_page *ftrace_pages;
192 304
193static int ftraced_trigger;
194static int ftraced_suspend;
195static int ftraced_stop;
196
197static int ftrace_record_suspend;
198
199static struct dyn_ftrace *ftrace_free_records; 305static struct dyn_ftrace *ftrace_free_records;
200 306
201 307
202#ifdef CONFIG_KPROBES 308#ifdef CONFIG_KPROBES
309
310static int frozen_record_count;
311
203static inline void freeze_record(struct dyn_ftrace *rec) 312static inline void freeze_record(struct dyn_ftrace *rec)
204{ 313{
205 if (!(rec->flags & FTRACE_FL_FROZEN)) { 314 if (!(rec->flags & FTRACE_FL_FROZEN)) {
@@ -226,79 +335,36 @@ static inline int record_frozen(struct dyn_ftrace *rec)
226# define record_frozen(rec) ({ 0; }) 335# define record_frozen(rec) ({ 0; })
227#endif /* CONFIG_KPROBES */ 336#endif /* CONFIG_KPROBES */
228 337
229int skip_trace(unsigned long ip) 338static void ftrace_free_rec(struct dyn_ftrace *rec)
230{ 339{
231 unsigned long fl; 340 rec->ip = (unsigned long)ftrace_free_records;
232 struct dyn_ftrace *rec; 341 ftrace_free_records = rec;
233 struct hlist_node *t; 342 rec->flags |= FTRACE_FL_FREE;
234 struct hlist_head *head;
235
236 if (frozen_record_count == 0)
237 return 0;
238
239 head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
240 hlist_for_each_entry_rcu(rec, t, head, node) {
241 if (rec->ip == ip) {
242 if (record_frozen(rec)) {
243 if (rec->flags & FTRACE_FL_FAILED)
244 return 1;
245
246 if (!(rec->flags & FTRACE_FL_CONVERTED))
247 return 1;
248
249 if (!tracing_on || !ftrace_enabled)
250 return 1;
251
252 if (ftrace_filtered) {
253 fl = rec->flags & (FTRACE_FL_FILTER |
254 FTRACE_FL_NOTRACE);
255 if (!fl || (fl & FTRACE_FL_NOTRACE))
256 return 1;
257 }
258 }
259 break;
260 }
261 }
262
263 return 0;
264} 343}
265 344
266static inline int 345void ftrace_release(void *start, unsigned long size)
267ftrace_ip_in_hash(unsigned long ip, unsigned long key)
268{ 346{
269 struct dyn_ftrace *p; 347 struct dyn_ftrace *rec;
270 struct hlist_node *t; 348 struct ftrace_page *pg;
271 int found = 0; 349 unsigned long s = (unsigned long)start;
272 350 unsigned long e = s + size;
273 hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { 351 int i;
274 if (p->ip == ip) {
275 found = 1;
276 break;
277 }
278 }
279
280 return found;
281}
282 352
283static inline void 353 if (ftrace_disabled || !start)
284ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) 354 return;
285{
286 hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
287}
288 355
289/* called from kstop_machine */ 356 /* should not be called from interrupt context */
290static inline void ftrace_del_hash(struct dyn_ftrace *node) 357 spin_lock(&ftrace_lock);
291{
292 hlist_del(&node->node);
293}
294 358
295static void ftrace_free_rec(struct dyn_ftrace *rec) 359 for (pg = ftrace_pages_start; pg; pg = pg->next) {
296{ 360 for (i = 0; i < pg->index; i++) {
297 /* no locking, only called from kstop_machine */ 361 rec = &pg->records[i];
298 362
299 rec->ip = (unsigned long)ftrace_free_records; 363 if ((rec->ip >= s) && (rec->ip < e))
300 ftrace_free_records = rec; 364 ftrace_free_rec(rec);
301 rec->flags |= FTRACE_FL_FREE; 365 }
366 }
367 spin_unlock(&ftrace_lock);
302} 368}
303 369
304static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 370static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -310,10 +376,8 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
310 rec = ftrace_free_records; 376 rec = ftrace_free_records;
311 377
312 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { 378 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
313 WARN_ON_ONCE(1); 379 FTRACE_WARN_ON_ONCE(1);
314 ftrace_free_records = NULL; 380 ftrace_free_records = NULL;
315 ftrace_disabled = 1;
316 ftrace_enabled = 0;
317 return NULL; 381 return NULL;
318 } 382 }
319 383
@@ -323,182 +387,163 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
323 } 387 }
324 388
325 if (ftrace_pages->index == ENTRIES_PER_PAGE) { 389 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
326 if (!ftrace_pages->next) 390 if (!ftrace_pages->next) {
327 return NULL; 391 /* allocate another page */
392 ftrace_pages->next =
393 (void *)get_zeroed_page(GFP_KERNEL);
394 if (!ftrace_pages->next)
395 return NULL;
396 }
328 ftrace_pages = ftrace_pages->next; 397 ftrace_pages = ftrace_pages->next;
329 } 398 }
330 399
331 return &ftrace_pages->records[ftrace_pages->index++]; 400 return &ftrace_pages->records[ftrace_pages->index++];
332} 401}
333 402
334static void 403static struct dyn_ftrace *
335ftrace_record_ip(unsigned long ip) 404ftrace_record_ip(unsigned long ip)
336{ 405{
337 struct dyn_ftrace *node; 406 struct dyn_ftrace *rec;
338 unsigned long flags;
339 unsigned long key;
340 int resched;
341 int atomic;
342 int cpu;
343
344 if (!ftrace_enabled || ftrace_disabled)
345 return;
346
347 resched = need_resched();
348 preempt_disable_notrace();
349
350 /*
351 * We simply need to protect against recursion.
352 * Use the the raw version of smp_processor_id and not
353 * __get_cpu_var which can call debug hooks that can
354 * cause a recursive crash here.
355 */
356 cpu = raw_smp_processor_id();
357 per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
358 if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
359 goto out;
360
361 if (unlikely(ftrace_record_suspend))
362 goto out;
363
364 key = hash_long(ip, FTRACE_HASHBITS);
365
366 WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
367
368 if (ftrace_ip_in_hash(ip, key))
369 goto out;
370
371 atomic = irqs_disabled();
372
373 spin_lock_irqsave(&ftrace_shutdown_lock, flags);
374 407
375 /* This ip may have hit the hash before the lock */ 408 if (ftrace_disabled)
376 if (ftrace_ip_in_hash(ip, key)) 409 return NULL;
377 goto out_unlock;
378 410
379 node = ftrace_alloc_dyn_node(ip); 411 rec = ftrace_alloc_dyn_node(ip);
380 if (!node) 412 if (!rec)
381 goto out_unlock; 413 return NULL;
382 414
383 node->ip = ip; 415 rec->ip = ip;
384 416
385 ftrace_add_hash(node, key); 417 list_add(&rec->list, &ftrace_new_addrs);
386 418
387 ftraced_trigger = 1; 419 return rec;
420}
388 421
389 out_unlock: 422static void print_ip_ins(const char *fmt, unsigned char *p)
390 spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); 423{
391 out: 424 int i;
392 per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
393 425
394 /* prevent recursion with scheduler */ 426 printk(KERN_CONT "%s", fmt);
395 if (resched) 427
396 preempt_enable_no_resched_notrace(); 428 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
397 else 429 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
398 preempt_enable_notrace(); 430}
431
432static void ftrace_bug(int failed, unsigned long ip)
433{
434 switch (failed) {
435 case -EFAULT:
436 FTRACE_WARN_ON_ONCE(1);
437 pr_info("ftrace faulted on modifying ");
438 print_ip_sym(ip);
439 break;
440 case -EINVAL:
441 FTRACE_WARN_ON_ONCE(1);
442 pr_info("ftrace failed to modify ");
443 print_ip_sym(ip);
444 print_ip_ins(" actual: ", (unsigned char *)ip);
445 printk(KERN_CONT "\n");
446 break;
447 case -EPERM:
448 FTRACE_WARN_ON_ONCE(1);
449 pr_info("ftrace faulted on writing ");
450 print_ip_sym(ip);
451 break;
452 default:
453 FTRACE_WARN_ON_ONCE(1);
454 pr_info("ftrace faulted on unknown error ");
455 print_ip_sym(ip);
456 }
399} 457}
400 458
401#define FTRACE_ADDR ((long)(ftrace_caller))
402 459
403static int 460static int
404__ftrace_replace_code(struct dyn_ftrace *rec, 461__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
405 unsigned char *old, unsigned char *new, int enable)
406{ 462{
407 unsigned long ip, fl; 463 unsigned long ip, fl;
464 unsigned long ftrace_addr;
465
466 ftrace_addr = (unsigned long)ftrace_caller;
408 467
409 ip = rec->ip; 468 ip = rec->ip;
410 469
411 if (ftrace_filtered && enable) { 470 /*
471 * If this record is not to be traced and
472 * it is not enabled then do nothing.
473 *
474 * If this record is not to be traced and
475	 * it is enabled then disable it.
476 *
477 */
478 if (rec->flags & FTRACE_FL_NOTRACE) {
479 if (rec->flags & FTRACE_FL_ENABLED)
480 rec->flags &= ~FTRACE_FL_ENABLED;
481 else
482 return 0;
483
484 } else if (ftrace_filtered && enable) {
412 /* 485 /*
413 * If filtering is on: 486 * Filtering is on:
414 *
415 * If this record is set to be filtered and
416 * is enabled then do nothing.
417 *
418 * If this record is set to be filtered and
419 * it is not enabled, enable it.
420 *
421 * If this record is not set to be filtered
422 * and it is not enabled do nothing.
423 *
424 * If this record is set not to trace then
425 * do nothing.
426 *
427 * If this record is set not to trace and
428 * it is enabled then disable it.
429 *
430 * If this record is not set to be filtered and
431 * it is enabled, disable it.
432 */ 487 */
433 488
434 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | 489 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
435 FTRACE_FL_ENABLED);
436 490
437 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || 491 /* Record is filtered and enabled, do nothing */
438 (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || 492 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
439 !fl || (fl == FTRACE_FL_NOTRACE))
440 return 0; 493 return 0;
441 494
442 /* 495 /* Record is not filtered and is not enabled do nothing */
443 * If it is enabled disable it, 496 if (!fl)
444 * otherwise enable it! 497 return 0;
445 */ 498
446 if (fl & FTRACE_FL_ENABLED) { 499 /* Record is not filtered but enabled, disable it */
447 /* swap new and old */ 500 if (fl == FTRACE_FL_ENABLED)
448 new = old;
449 old = ftrace_call_replace(ip, FTRACE_ADDR);
450 rec->flags &= ~FTRACE_FL_ENABLED; 501 rec->flags &= ~FTRACE_FL_ENABLED;
451 } else { 502 else
452 new = ftrace_call_replace(ip, FTRACE_ADDR); 503 /* Otherwise record is filtered but not enabled, enable it */
453 rec->flags |= FTRACE_FL_ENABLED; 504 rec->flags |= FTRACE_FL_ENABLED;
454 }
455 } else { 505 } else {
506 /* Disable or not filtered */
456 507
457 if (enable) { 508 if (enable) {
458 /* 509 /* if record is enabled, do nothing */
459 * If this record is set not to trace and is
460 * not enabled, do nothing.
461 */
462 fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
463 if (fl == FTRACE_FL_NOTRACE)
464 return 0;
465
466 new = ftrace_call_replace(ip, FTRACE_ADDR);
467 } else
468 old = ftrace_call_replace(ip, FTRACE_ADDR);
469
470 if (enable) {
471 if (rec->flags & FTRACE_FL_ENABLED) 510 if (rec->flags & FTRACE_FL_ENABLED)
472 return 0; 511 return 0;
512
473 rec->flags |= FTRACE_FL_ENABLED; 513 rec->flags |= FTRACE_FL_ENABLED;
514
474 } else { 515 } else {
516
517 /* if record is not enabled do nothing */
475 if (!(rec->flags & FTRACE_FL_ENABLED)) 518 if (!(rec->flags & FTRACE_FL_ENABLED))
476 return 0; 519 return 0;
520
477 rec->flags &= ~FTRACE_FL_ENABLED; 521 rec->flags &= ~FTRACE_FL_ENABLED;
478 } 522 }
479 } 523 }
480 524
481 return ftrace_modify_code(ip, old, new); 525 if (rec->flags & FTRACE_FL_ENABLED)
526 return ftrace_make_call(rec, ftrace_addr);
527 else
528 return ftrace_make_nop(NULL, rec, ftrace_addr);
482} 529}
483 530
484static void ftrace_replace_code(int enable) 531static void ftrace_replace_code(int enable)
485{ 532{
486 int i, failed; 533 int i, failed;
487 unsigned char *new = NULL, *old = NULL;
488 struct dyn_ftrace *rec; 534 struct dyn_ftrace *rec;
489 struct ftrace_page *pg; 535 struct ftrace_page *pg;
490 536
491 if (enable)
492 old = ftrace_nop_replace();
493 else
494 new = ftrace_nop_replace();
495
496 for (pg = ftrace_pages_start; pg; pg = pg->next) { 537 for (pg = ftrace_pages_start; pg; pg = pg->next) {
497 for (i = 0; i < pg->index; i++) { 538 for (i = 0; i < pg->index; i++) {
498 rec = &pg->records[i]; 539 rec = &pg->records[i];
499 540
500 /* don't modify code that has already faulted */ 541 /*
501 if (rec->flags & FTRACE_FL_FAILED) 542 * Skip over free records and records that have
543 * failed.
544 */
545 if (rec->flags & FTRACE_FL_FREE ||
546 rec->flags & FTRACE_FL_FAILED)
502 continue; 547 continue;
503 548
504 /* ignore updates to this record's mcount site */ 549 /* ignore updates to this record's mcount site */
@@ -509,78 +554,52 @@ static void ftrace_replace_code(int enable)
509 unfreeze_record(rec); 554 unfreeze_record(rec);
510 } 555 }
511 556
512 failed = __ftrace_replace_code(rec, old, new, enable); 557 failed = __ftrace_replace_code(rec, enable);
513 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 558 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
514 rec->flags |= FTRACE_FL_FAILED; 559 rec->flags |= FTRACE_FL_FAILED;
515 if ((system_state == SYSTEM_BOOTING) || 560 if ((system_state == SYSTEM_BOOTING) ||
516 !core_kernel_text(rec->ip)) { 561 !core_kernel_text(rec->ip)) {
517 ftrace_del_hash(rec);
518 ftrace_free_rec(rec); 562 ftrace_free_rec(rec);
519 } 563 } else
564 ftrace_bug(failed, rec->ip);
520 } 565 }
521 } 566 }
522 } 567 }
523} 568}
524 569
525static void ftrace_shutdown_replenish(void)
526{
527 if (ftrace_pages->next)
528 return;
529
530 /* allocate another page */
531 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
532}
533
534static int 570static int
535ftrace_code_disable(struct dyn_ftrace *rec) 571ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
536{ 572{
537 unsigned long ip; 573 unsigned long ip;
538 unsigned char *nop, *call; 574 int ret;
539 int failed;
540 575
541 ip = rec->ip; 576 ip = rec->ip;
542 577
543 nop = ftrace_nop_replace(); 578 ret = ftrace_make_nop(mod, rec, mcount_addr);
544 call = ftrace_call_replace(ip, MCOUNT_ADDR); 579 if (ret) {
545 580 ftrace_bug(ret, ip);
546 failed = ftrace_modify_code(ip, call, nop);
547 if (failed) {
548 rec->flags |= FTRACE_FL_FAILED; 581 rec->flags |= FTRACE_FL_FAILED;
549 return 0; 582 return 0;
550 } 583 }
551 return 1; 584 return 1;
552} 585}
553 586
554static int __ftrace_update_code(void *ignore);
555
556static int __ftrace_modify_code(void *data) 587static int __ftrace_modify_code(void *data)
557{ 588{
558 unsigned long addr;
559 int *command = data; 589 int *command = data;
560 590
561 if (*command & FTRACE_ENABLE_CALLS) { 591 if (*command & FTRACE_ENABLE_CALLS)
562 /*
563 * Update any recorded ips now that we have the
564 * machine stopped
565 */
566 __ftrace_update_code(NULL);
567 ftrace_replace_code(1); 592 ftrace_replace_code(1);
568 tracing_on = 1; 593 else if (*command & FTRACE_DISABLE_CALLS)
569 } else if (*command & FTRACE_DISABLE_CALLS) {
570 ftrace_replace_code(0); 594 ftrace_replace_code(0);
571 tracing_on = 0;
572 }
573 595
574 if (*command & FTRACE_UPDATE_TRACE_FUNC) 596 if (*command & FTRACE_UPDATE_TRACE_FUNC)
575 ftrace_update_ftrace_func(ftrace_trace_function); 597 ftrace_update_ftrace_func(ftrace_trace_function);
576 598
577 if (*command & FTRACE_ENABLE_MCOUNT) { 599 if (*command & FTRACE_START_FUNC_RET)
578 addr = (unsigned long)ftrace_record_ip; 600 ftrace_enable_ftrace_graph_caller();
579 ftrace_mcount_set(&addr); 601 else if (*command & FTRACE_STOP_FUNC_RET)
580 } else if (*command & FTRACE_DISABLE_MCOUNT) { 602 ftrace_disable_ftrace_graph_caller();
581 addr = (unsigned long)ftrace_stub;
582 ftrace_mcount_set(&addr);
583 }
584 603
585 return 0; 604 return 0;
586} 605}
@@ -590,62 +609,44 @@ static void ftrace_run_update_code(int command)
590 stop_machine(__ftrace_modify_code, &command, NULL); 609 stop_machine(__ftrace_modify_code, &command, NULL);
591} 610}
592 611
593void ftrace_disable_daemon(void)
594{
595 /* Stop the daemon from calling kstop_machine */
596 mutex_lock(&ftraced_lock);
597 ftraced_stop = 1;
598 mutex_unlock(&ftraced_lock);
599
600 ftrace_force_update();
601}
602
603void ftrace_enable_daemon(void)
604{
605 mutex_lock(&ftraced_lock);
606 ftraced_stop = 0;
607 mutex_unlock(&ftraced_lock);
608
609 ftrace_force_update();
610}
611
612static ftrace_func_t saved_ftrace_func; 612static ftrace_func_t saved_ftrace_func;
613static int ftrace_start_up;
613 614
614static void ftrace_startup(void) 615static void ftrace_startup_enable(int command)
615{ 616{
616 int command = 0;
617
618 if (unlikely(ftrace_disabled))
619 return;
620
621 mutex_lock(&ftraced_lock);
622 ftraced_suspend++;
623 if (ftraced_suspend == 1)
624 command |= FTRACE_ENABLE_CALLS;
625
626 if (saved_ftrace_func != ftrace_trace_function) { 617 if (saved_ftrace_func != ftrace_trace_function) {
627 saved_ftrace_func = ftrace_trace_function; 618 saved_ftrace_func = ftrace_trace_function;
628 command |= FTRACE_UPDATE_TRACE_FUNC; 619 command |= FTRACE_UPDATE_TRACE_FUNC;
629 } 620 }
630 621
631 if (!command || !ftrace_enabled) 622 if (!command || !ftrace_enabled)
632 goto out; 623 return;
633 624
634 ftrace_run_update_code(command); 625 ftrace_run_update_code(command);
635 out:
636 mutex_unlock(&ftraced_lock);
637} 626}
638 627
639static void ftrace_shutdown(void) 628static void ftrace_startup(int command)
640{ 629{
641 int command = 0; 630 if (unlikely(ftrace_disabled))
631 return;
632
633 mutex_lock(&ftrace_start_lock);
634 ftrace_start_up++;
635 command |= FTRACE_ENABLE_CALLS;
642 636
637 ftrace_startup_enable(command);
638
639 mutex_unlock(&ftrace_start_lock);
640}
641
642static void ftrace_shutdown(int command)
643{
643 if (unlikely(ftrace_disabled)) 644 if (unlikely(ftrace_disabled))
644 return; 645 return;
645 646
646 mutex_lock(&ftraced_lock); 647 mutex_lock(&ftrace_start_lock);
647 ftraced_suspend--; 648 ftrace_start_up--;
648 if (!ftraced_suspend) 649 if (!ftrace_start_up)
649 command |= FTRACE_DISABLE_CALLS; 650 command |= FTRACE_DISABLE_CALLS;
650 651
651 if (saved_ftrace_func != ftrace_trace_function) { 652 if (saved_ftrace_func != ftrace_trace_function) {
@@ -658,7 +659,7 @@ static void ftrace_shutdown(void)
658 659
659 ftrace_run_update_code(command); 660 ftrace_run_update_code(command);
660 out: 661 out:
661 mutex_unlock(&ftraced_lock); 662 mutex_unlock(&ftrace_start_lock);
662} 663}
663 664
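The new ftrace_start_up counter turns ftrace_startup()/ftrace_shutdown() into a reference count on users of the call sites: every startup (re)issues FTRACE_ENABLE_CALLS, but FTRACE_DISABLE_CALLS is only sent once the count drops back to zero. A minimal sketch of that refcount-gated teardown pattern in isolation (trace_users, do_enable and do_disable are illustrative names, not part of this patch):

#include <pthread.h>

static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
static int trace_users;                 /* plays the role of ftrace_start_up */

static void do_enable(void)  { /* stands in for FTRACE_ENABLE_CALLS  */ }
static void do_disable(void) { /* stands in for FTRACE_DISABLE_CALLS */ }

void trace_startup(void)
{
	pthread_mutex_lock(&trace_lock);
	trace_users++;
	do_enable();                    /* every new user re-arms the call sites */
	pthread_mutex_unlock(&trace_lock);
}

void trace_shutdown(void)
{
	pthread_mutex_lock(&trace_lock);
	if (--trace_users == 0)         /* only the last user tears them down */
		do_disable();
	pthread_mutex_unlock(&trace_lock);
}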
664static void ftrace_startup_sysctl(void) 665static void ftrace_startup_sysctl(void)
@@ -668,15 +669,15 @@ static void ftrace_startup_sysctl(void)
668 if (unlikely(ftrace_disabled)) 669 if (unlikely(ftrace_disabled))
669 return; 670 return;
670 671
671 mutex_lock(&ftraced_lock); 672 mutex_lock(&ftrace_start_lock);
672 /* Force update next time */ 673 /* Force update next time */
673 saved_ftrace_func = NULL; 674 saved_ftrace_func = NULL;
674 /* ftraced_suspend is true if we want ftrace running */ 675 /* ftrace_start_up is true if we want ftrace running */
675 if (ftraced_suspend) 676 if (ftrace_start_up)
676 command |= FTRACE_ENABLE_CALLS; 677 command |= FTRACE_ENABLE_CALLS;
677 678
678 ftrace_run_update_code(command); 679 ftrace_run_update_code(command);
679 mutex_unlock(&ftraced_lock); 680 mutex_unlock(&ftrace_start_lock);
680} 681}
681 682
682static void ftrace_shutdown_sysctl(void) 683static void ftrace_shutdown_sysctl(void)
@@ -686,153 +687,51 @@ static void ftrace_shutdown_sysctl(void)
686 if (unlikely(ftrace_disabled)) 687 if (unlikely(ftrace_disabled))
687 return; 688 return;
688 689
689 mutex_lock(&ftraced_lock); 690 mutex_lock(&ftrace_start_lock);
690 /* ftraced_suspend is true if ftrace is running */ 691 /* ftrace_start_up is true if ftrace is running */
691 if (ftraced_suspend) 692 if (ftrace_start_up)
692 command |= FTRACE_DISABLE_CALLS; 693 command |= FTRACE_DISABLE_CALLS;
693 694
694 ftrace_run_update_code(command); 695 ftrace_run_update_code(command);
695 mutex_unlock(&ftraced_lock); 696 mutex_unlock(&ftrace_start_lock);
696} 697}
697 698
698static cycle_t ftrace_update_time; 699static cycle_t ftrace_update_time;
699static unsigned long ftrace_update_cnt; 700static unsigned long ftrace_update_cnt;
700unsigned long ftrace_update_tot_cnt; 701unsigned long ftrace_update_tot_cnt;
701 702
702static int __ftrace_update_code(void *ignore) 703static int ftrace_update_code(struct module *mod)
703{ 704{
704 int i, save_ftrace_enabled; 705 struct dyn_ftrace *p, *t;
705 cycle_t start, stop; 706 cycle_t start, stop;
706 struct dyn_ftrace *p;
707 struct hlist_node *t, *n;
708 struct hlist_head *head, temp_list;
709
710 /* Don't be recording funcs now */
711 ftrace_record_suspend++;
712 save_ftrace_enabled = ftrace_enabled;
713 ftrace_enabled = 0;
714 707
715 start = ftrace_now(raw_smp_processor_id()); 708 start = ftrace_now(raw_smp_processor_id());
716 ftrace_update_cnt = 0; 709 ftrace_update_cnt = 0;
717 710
718 /* No locks needed, the machine is stopped! */ 711 list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
719 for (i = 0; i < FTRACE_HASHSIZE; i++) {
720 INIT_HLIST_HEAD(&temp_list);
721 head = &ftrace_hash[i];
722 712
723 /* all CPUS are stopped, we are safe to modify code */ 713 /* If something went wrong, bail without enabling anything */
724 hlist_for_each_entry_safe(p, t, n, head, node) { 714 if (unlikely(ftrace_disabled))
725 /* Skip over failed records which have not been 715 return -1;
726 * freed. */
727 if (p->flags & FTRACE_FL_FAILED)
728 continue;
729
730 /* Unconverted records are always at the head of the
731 * hash bucket. Once we encounter a converted record,
732 * simply skip over to the next bucket. Saves ftraced
733 * some processor cycles (ftrace does its bid for
734 * global warming :-p ). */
735 if (p->flags & (FTRACE_FL_CONVERTED))
736 break;
737
738 /* Ignore updates to this record's mcount site.
739 * Reintroduce this record at the head of this
740 * bucket to attempt to "convert" it again if
741 * the kprobe on it is unregistered before the
742 * next run. */
743 if (get_kprobe((void *)p->ip)) {
744 ftrace_del_hash(p);
745 INIT_HLIST_NODE(&p->node);
746 hlist_add_head(&p->node, &temp_list);
747 freeze_record(p);
748 continue;
749 } else {
750 unfreeze_record(p);
751 }
752 716
753 /* convert record (i.e, patch mcount-call with NOP) */ 717 list_del_init(&p->list);
754 if (ftrace_code_disable(p)) {
755 p->flags |= FTRACE_FL_CONVERTED;
756 ftrace_update_cnt++;
757 } else {
758 if ((system_state == SYSTEM_BOOTING) ||
759 !core_kernel_text(p->ip)) {
760 ftrace_del_hash(p);
761 ftrace_free_rec(p);
762 }
763 }
764 }
765 718
766 hlist_for_each_entry_safe(p, t, n, &temp_list, node) { 719 /* convert record (i.e, patch mcount-call with NOP) */
767 hlist_del(&p->node); 720 if (ftrace_code_disable(mod, p)) {
768 INIT_HLIST_NODE(&p->node); 721 p->flags |= FTRACE_FL_CONVERTED;
769 hlist_add_head(&p->node, head); 722 ftrace_update_cnt++;
770 } 723 } else
724 ftrace_free_rec(p);
771 } 725 }
772 726
773 stop = ftrace_now(raw_smp_processor_id()); 727 stop = ftrace_now(raw_smp_processor_id());
774 ftrace_update_time = stop - start; 728 ftrace_update_time = stop - start;
775 ftrace_update_tot_cnt += ftrace_update_cnt; 729 ftrace_update_tot_cnt += ftrace_update_cnt;
776 ftraced_trigger = 0;
777
778 ftrace_enabled = save_ftrace_enabled;
779 ftrace_record_suspend--;
780 730
781 return 0; 731 return 0;
782} 732}
783 733
784static int ftrace_update_code(void) 734static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
785{
786 if (unlikely(ftrace_disabled) ||
787 !ftrace_enabled || !ftraced_trigger)
788 return 0;
789
790 stop_machine(__ftrace_update_code, NULL, NULL);
791
792 return 1;
793}
794
795static int ftraced(void *ignore)
796{
797 unsigned long usecs;
798
799 while (!kthread_should_stop()) {
800
801 set_current_state(TASK_INTERRUPTIBLE);
802
803 /* check once a second */
804 schedule_timeout(HZ);
805
806 if (unlikely(ftrace_disabled))
807 continue;
808
809 mutex_lock(&ftrace_sysctl_lock);
810 mutex_lock(&ftraced_lock);
811 if (!ftraced_suspend && !ftraced_stop &&
812 ftrace_update_code()) {
813 usecs = nsecs_to_usecs(ftrace_update_time);
814 if (ftrace_update_tot_cnt > 100000) {
815 ftrace_update_tot_cnt = 0;
816 pr_info("hm, dftrace overflow: %lu change%s"
817 " (%lu total) in %lu usec%s\n",
818 ftrace_update_cnt,
819 ftrace_update_cnt != 1 ? "s" : "",
820 ftrace_update_tot_cnt,
821 usecs, usecs != 1 ? "s" : "");
822 ftrace_disabled = 1;
823 WARN_ON_ONCE(1);
824 }
825 }
826 mutex_unlock(&ftraced_lock);
827 mutex_unlock(&ftrace_sysctl_lock);
828
829 ftrace_shutdown_replenish();
830 }
831 __set_current_state(TASK_RUNNING);
832 return 0;
833}
834
835static int __init ftrace_dyn_table_alloc(void)
836{ 735{
837 struct ftrace_page *pg; 736 struct ftrace_page *pg;
838 int cnt; 737 int cnt;
@@ -859,7 +758,9 @@ static int __init ftrace_dyn_table_alloc(void)
859 758
860 pg = ftrace_pages = ftrace_pages_start; 759 pg = ftrace_pages = ftrace_pages_start;
861 760
862 cnt = NR_TO_INIT / ENTRIES_PER_PAGE; 761 cnt = num_to_init / ENTRIES_PER_PAGE;
762 pr_info("ftrace: allocating %ld entries in %d pages\n",
763 num_to_init, cnt + 1);
863 764
864 for (i = 0; i < cnt; i++) { 765 for (i = 0; i < cnt; i++) {
865 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 766 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -884,7 +785,6 @@ enum {
884#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 785#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
885 786
886struct ftrace_iterator { 787struct ftrace_iterator {
887 loff_t pos;
888 struct ftrace_page *pg; 788 struct ftrace_page *pg;
889 unsigned idx; 789 unsigned idx;
890 unsigned flags; 790 unsigned flags;
@@ -901,21 +801,26 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
901 801
902 (*pos)++; 802 (*pos)++;
903 803
804 /* should not be called from interrupt context */
805 spin_lock(&ftrace_lock);
904 retry: 806 retry:
905 if (iter->idx >= iter->pg->index) { 807 if (iter->idx >= iter->pg->index) {
906 if (iter->pg->next) { 808 if (iter->pg->next) {
907 iter->pg = iter->pg->next; 809 iter->pg = iter->pg->next;
908 iter->idx = 0; 810 iter->idx = 0;
909 goto retry; 811 goto retry;
812 } else {
813 iter->idx = -1;
910 } 814 }
911 } else { 815 } else {
912 rec = &iter->pg->records[iter->idx++]; 816 rec = &iter->pg->records[iter->idx++];
913 if ((!(iter->flags & FTRACE_ITER_FAILURES) && 817 if ((rec->flags & FTRACE_FL_FREE) ||
818
819 (!(iter->flags & FTRACE_ITER_FAILURES) &&
914 (rec->flags & FTRACE_FL_FAILED)) || 820 (rec->flags & FTRACE_FL_FAILED)) ||
915 821
916 ((iter->flags & FTRACE_ITER_FAILURES) && 822 ((iter->flags & FTRACE_ITER_FAILURES) &&
917 (!(rec->flags & FTRACE_FL_FAILED) || 823 !(rec->flags & FTRACE_FL_FAILED)) ||
918 (rec->flags & FTRACE_FL_FREE))) ||
919 824
920 ((iter->flags & FTRACE_ITER_FILTER) && 825 ((iter->flags & FTRACE_ITER_FILTER) &&
921 !(rec->flags & FTRACE_FL_FILTER)) || 826 !(rec->flags & FTRACE_FL_FILTER)) ||
@@ -926,8 +831,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
926 goto retry; 831 goto retry;
927 } 832 }
928 } 833 }
929 834 spin_unlock(&ftrace_lock);
930 iter->pos = *pos;
931 835
932 return rec; 836 return rec;
933} 837}
@@ -936,16 +840,16 @@ static void *t_start(struct seq_file *m, loff_t *pos)
936{ 840{
937 struct ftrace_iterator *iter = m->private; 841 struct ftrace_iterator *iter = m->private;
938 void *p = NULL; 842 void *p = NULL;
939 loff_t l = -1;
940 843
941 if (*pos != iter->pos) { 844 if (*pos > 0) {
942 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) 845 if (iter->idx < 0)
943 ; 846 return p;
944 } else { 847 (*pos)--;
945 l = *pos; 848 iter->idx--;
946 p = t_next(m, p, &l);
947 } 849 }
948 850
851 p = t_next(m, p, pos);
852
949 return p; 853 return p;
950} 854}
951 855
@@ -989,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
989 return -ENOMEM; 893 return -ENOMEM;
990 894
991 iter->pg = ftrace_pages_start; 895 iter->pg = ftrace_pages_start;
992 iter->pos = -1;
993 896
994 ret = seq_open(file, &show_ftrace_seq_ops); 897 ret = seq_open(file, &show_ftrace_seq_ops);
995 if (!ret) { 898 if (!ret) {
@@ -1039,8 +942,8 @@ static void ftrace_filter_reset(int enable)
1039 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 942 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1040 unsigned i; 943 unsigned i;
1041 944
1042 /* keep kstop machine from running */ 945 /* should not be called from interrupt context */
1043 preempt_disable(); 946 spin_lock(&ftrace_lock);
1044 if (enable) 947 if (enable)
1045 ftrace_filtered = 0; 948 ftrace_filtered = 0;
1046 pg = ftrace_pages_start; 949 pg = ftrace_pages_start;
@@ -1053,7 +956,7 @@ static void ftrace_filter_reset(int enable)
1053 } 956 }
1054 pg = pg->next; 957 pg = pg->next;
1055 } 958 }
1056 preempt_enable(); 959 spin_unlock(&ftrace_lock);
1057} 960}
1058 961
1059static int 962static int
@@ -1076,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1076 979
1077 if (file->f_mode & FMODE_READ) { 980 if (file->f_mode & FMODE_READ) {
1078 iter->pg = ftrace_pages_start; 981 iter->pg = ftrace_pages_start;
1079 iter->pos = -1;
1080 iter->flags = enable ? FTRACE_ITER_FILTER : 982 iter->flags = enable ? FTRACE_ITER_FILTER :
1081 FTRACE_ITER_NOTRACE; 983 FTRACE_ITER_NOTRACE;
1082 984
@@ -1145,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable)
1145 int type = MATCH_FULL; 1047 int type = MATCH_FULL;
1146 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1048 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1147 unsigned i, match = 0, search_len = 0; 1049 unsigned i, match = 0, search_len = 0;
1050 int not = 0;
1051
1052 if (buff[0] == '!') {
1053 not = 1;
1054 buff++;
1055 len--;
1056 }
1148 1057
1149 for (i = 0; i < len; i++) { 1058 for (i = 0; i < len; i++) {
1150 if (buff[i] == '*') { 1059 if (buff[i] == '*') {
@@ -1165,8 +1074,8 @@ ftrace_match(unsigned char *buff, int len, int enable)
1165 } 1074 }
1166 } 1075 }
1167 1076
1168 /* keep kstop machine from running */ 1077 /* should not be called from interrupt context */
1169 preempt_disable(); 1078 spin_lock(&ftrace_lock);
1170 if (enable) 1079 if (enable)
1171 ftrace_filtered = 1; 1080 ftrace_filtered = 1;
1172 pg = ftrace_pages_start; 1081 pg = ftrace_pages_start;
@@ -1198,12 +1107,16 @@ ftrace_match(unsigned char *buff, int len, int enable)
1198 matched = 1; 1107 matched = 1;
1199 break; 1108 break;
1200 } 1109 }
1201 if (matched) 1110 if (matched) {
1202 rec->flags |= flag; 1111 if (not)
1112 rec->flags &= ~flag;
1113 else
1114 rec->flags |= flag;
1115 }
1203 } 1116 }
1204 pg = pg->next; 1117 pg = pg->next;
1205 } 1118 }
1206 preempt_enable(); 1119 spin_unlock(&ftrace_lock);
1207} 1120}
1208 1121
1209static ssize_t 1122static ssize_t
@@ -1366,10 +1279,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1366 } 1279 }
1367 1280
1368 mutex_lock(&ftrace_sysctl_lock); 1281 mutex_lock(&ftrace_sysctl_lock);
1369 mutex_lock(&ftraced_lock); 1282 mutex_lock(&ftrace_start_lock);
1370 if (iter->filtered && ftraced_suspend && ftrace_enabled) 1283 if (ftrace_start_up && ftrace_enabled)
1371 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1284 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1372 mutex_unlock(&ftraced_lock); 1285 mutex_unlock(&ftrace_start_lock);
1373 mutex_unlock(&ftrace_sysctl_lock); 1286 mutex_unlock(&ftrace_sysctl_lock);
1374 1287
1375 kfree(iter); 1288 kfree(iter);
@@ -1389,55 +1302,6 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
1389 return ftrace_regex_release(inode, file, 0); 1302 return ftrace_regex_release(inode, file, 0);
1390} 1303}
1391 1304
1392static ssize_t
1393ftraced_read(struct file *filp, char __user *ubuf,
1394 size_t cnt, loff_t *ppos)
1395{
1396 /* don't worry about races */
1397 char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
1398 int r = strlen(buf);
1399
1400 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1401}
1402
1403static ssize_t
1404ftraced_write(struct file *filp, const char __user *ubuf,
1405 size_t cnt, loff_t *ppos)
1406{
1407 char buf[64];
1408 long val;
1409 int ret;
1410
1411 if (cnt >= sizeof(buf))
1412 return -EINVAL;
1413
1414 if (copy_from_user(&buf, ubuf, cnt))
1415 return -EFAULT;
1416
1417 if (strncmp(buf, "enable", 6) == 0)
1418 val = 1;
1419 else if (strncmp(buf, "disable", 7) == 0)
1420 val = 0;
1421 else {
1422 buf[cnt] = 0;
1423
1424 ret = strict_strtoul(buf, 10, &val);
1425 if (ret < 0)
1426 return ret;
1427
1428 val = !!val;
1429 }
1430
1431 if (val)
1432 ftrace_enable_daemon();
1433 else
1434 ftrace_disable_daemon();
1435
1436 filp->f_pos += cnt;
1437
1438 return cnt;
1439}
1440
1441static struct file_operations ftrace_avail_fops = { 1305static struct file_operations ftrace_avail_fops = {
1442 .open = ftrace_avail_open, 1306 .open = ftrace_avail_open,
1443 .read = seq_read, 1307 .read = seq_read,
@@ -1468,60 +1332,233 @@ static struct file_operations ftrace_notrace_fops = {
1468 .release = ftrace_notrace_release, 1332 .release = ftrace_notrace_release,
1469}; 1333};
1470 1334
1471static struct file_operations ftraced_fops = { 1335#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1472 .open = tracing_open_generic, 1336
1473 .read = ftraced_read, 1337static DEFINE_MUTEX(graph_lock);
1474 .write = ftraced_write, 1338
1339int ftrace_graph_count;
1340unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1341
1342static void *
1343g_next(struct seq_file *m, void *v, loff_t *pos)
1344{
1345 unsigned long *array = m->private;
1346 int index = *pos;
1347
1348 (*pos)++;
1349
1350 if (index >= ftrace_graph_count)
1351 return NULL;
1352
1353 return &array[index];
1354}
1355
1356static void *g_start(struct seq_file *m, loff_t *pos)
1357{
1358 void *p = NULL;
1359
1360 mutex_lock(&graph_lock);
1361
1362 p = g_next(m, p, pos);
1363
1364 return p;
1365}
1366
1367static void g_stop(struct seq_file *m, void *p)
1368{
1369 mutex_unlock(&graph_lock);
1370}
1371
1372static int g_show(struct seq_file *m, void *v)
1373{
1374 unsigned long *ptr = v;
1375 char str[KSYM_SYMBOL_LEN];
1376
1377 if (!ptr)
1378 return 0;
1379
1380 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1381
1382 seq_printf(m, "%s\n", str);
1383
1384 return 0;
1385}
1386
1387static struct seq_operations ftrace_graph_seq_ops = {
1388 .start = g_start,
1389 .next = g_next,
1390 .stop = g_stop,
1391 .show = g_show,
1475}; 1392};
1476 1393
1477/** 1394static int
1478 * ftrace_force_update - force an update to all recording ftrace functions 1395ftrace_graph_open(struct inode *inode, struct file *file)
1479 */
1480int ftrace_force_update(void)
1481{ 1396{
1482 int ret = 0; 1397 int ret = 0;
1483 1398
1484 if (unlikely(ftrace_disabled)) 1399 if (unlikely(ftrace_disabled))
1485 return -ENODEV; 1400 return -ENODEV;
1486 1401
1487 mutex_lock(&ftrace_sysctl_lock); 1402 mutex_lock(&graph_lock);
1488 mutex_lock(&ftraced_lock); 1403 if ((file->f_mode & FMODE_WRITE) &&
1489 1404 !(file->f_flags & O_APPEND)) {
1490 /* 1405 ftrace_graph_count = 0;
1491 * If ftraced_trigger is not set, then there is nothing 1406 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1492 * to update. 1407 }
1493 */
1494 if (ftraced_trigger && !ftrace_update_code())
1495 ret = -EBUSY;
1496 1408
1497 mutex_unlock(&ftraced_lock); 1409 if (file->f_mode & FMODE_READ) {
1498 mutex_unlock(&ftrace_sysctl_lock); 1410 ret = seq_open(file, &ftrace_graph_seq_ops);
1411 if (!ret) {
1412 struct seq_file *m = file->private_data;
1413 m->private = ftrace_graph_funcs;
1414 }
1415 } else
1416 file->private_data = ftrace_graph_funcs;
1417 mutex_unlock(&graph_lock);
1499 1418
1500 return ret; 1419 return ret;
1501} 1420}
1502 1421
1503static void ftrace_force_shutdown(void) 1422static ssize_t
1423ftrace_graph_read(struct file *file, char __user *ubuf,
1424 size_t cnt, loff_t *ppos)
1504{ 1425{
1505 struct task_struct *task; 1426 if (file->f_mode & FMODE_READ)
1506 int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; 1427 return seq_read(file, ubuf, cnt, ppos);
1428 else
1429 return -EPERM;
1430}
1507 1431
1508 mutex_lock(&ftraced_lock); 1432static int
1509 task = ftraced_task; 1433ftrace_set_func(unsigned long *array, int idx, char *buffer)
1510 ftraced_task = NULL; 1434{
1511 ftraced_suspend = -1; 1435 char str[KSYM_SYMBOL_LEN];
1512 ftrace_run_update_code(command); 1436 struct dyn_ftrace *rec;
1513 mutex_unlock(&ftraced_lock); 1437 struct ftrace_page *pg;
1438 int found = 0;
1439 int i, j;
1440
1441 if (ftrace_disabled)
1442 return -ENODEV;
1443
1444 /* should not be called from interrupt context */
1445 spin_lock(&ftrace_lock);
1446
1447 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1448 for (i = 0; i < pg->index; i++) {
1449 rec = &pg->records[i];
1450
1451 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1452 continue;
1453
1454 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1455 if (strcmp(str, buffer) == 0) {
1456 found = 1;
1457 for (j = 0; j < idx; j++)
1458 if (array[j] == rec->ip) {
1459 found = 0;
1460 break;
1461 }
1462 if (found)
1463 array[idx] = rec->ip;
1464 break;
1465 }
1466 }
1467 }
1468 spin_unlock(&ftrace_lock);
1514 1469
1515 if (task) 1470 return found ? 0 : -EINVAL;
1516 kthread_stop(task);
1517} 1471}
1518 1472
1519static __init int ftrace_init_debugfs(void) 1473static ssize_t
1474ftrace_graph_write(struct file *file, const char __user *ubuf,
1475 size_t cnt, loff_t *ppos)
1520{ 1476{
1521 struct dentry *d_tracer; 1477 unsigned char buffer[FTRACE_BUFF_MAX+1];
1522 struct dentry *entry; 1478 unsigned long *array;
1479 size_t read = 0;
1480 ssize_t ret;
1481 int index = 0;
1482 char ch;
1523 1483
1524 d_tracer = tracing_init_dentry(); 1484 if (!cnt || cnt < 0)
1485 return 0;
1486
1487 mutex_lock(&graph_lock);
1488
1489 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
1490 ret = -EBUSY;
1491 goto out;
1492 }
1493
1494 if (file->f_mode & FMODE_READ) {
1495 struct seq_file *m = file->private_data;
1496 array = m->private;
1497 } else
1498 array = file->private_data;
1499
1500 ret = get_user(ch, ubuf++);
1501 if (ret)
1502 goto out;
1503 read++;
1504 cnt--;
1505
1506 /* skip white space */
1507 while (cnt && isspace(ch)) {
1508 ret = get_user(ch, ubuf++);
1509 if (ret)
1510 goto out;
1511 read++;
1512 cnt--;
1513 }
1514
1515 if (isspace(ch)) {
1516 *ppos += read;
1517 ret = read;
1518 goto out;
1519 }
1520
1521 while (cnt && !isspace(ch)) {
1522 if (index < FTRACE_BUFF_MAX)
1523 buffer[index++] = ch;
1524 else {
1525 ret = -EINVAL;
1526 goto out;
1527 }
1528 ret = get_user(ch, ubuf++);
1529 if (ret)
1530 goto out;
1531 read++;
1532 cnt--;
1533 }
1534 buffer[index] = 0;
1535
1536 /* we allow only one at a time */
1537 ret = ftrace_set_func(array, ftrace_graph_count, buffer);
1538 if (ret)
1539 goto out;
1540
1541 ftrace_graph_count++;
1542
1543 file->f_pos += read;
1544
1545 ret = read;
1546 out:
1547 mutex_unlock(&graph_lock);
1548
1549 return ret;
1550}
1551
1552static const struct file_operations ftrace_graph_fops = {
1553 .open = ftrace_graph_open,
1554 .read = ftrace_graph_read,
1555 .write = ftrace_graph_write,
1556};
1557#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1558
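The new set_graph_function file accepts one symbol per write() and clears its list when opened for writing without O_APPEND. A user-space sketch of filling the filter (the debugfs mount point /sys/kernel/debug and the example symbol do_IRQ are assumptions, not taken from the patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Opening for write without O_APPEND resets ftrace_graph_count (see
	 * ftrace_graph_open above); O_APPEND keeps previously added symbols. */
	int fd = open("/sys/kernel/debug/tracing/set_graph_function", O_WRONLY);
	if (fd < 0) {
		perror("set_graph_function");
		return 1;
	}
	/* One symbol per write(); the kernel side stops at FTRACE_GRAPH_MAX_FUNCS. */
	const char *sym = "do_IRQ\n";
	if (write(fd, sym, strlen(sym)) < 0)
		perror("write");
	close(fd);
	return 0;
}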
1559static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
1560{
1561 struct dentry *entry;
1525 1562
1526 entry = debugfs_create_file("available_filter_functions", 0444, 1563 entry = debugfs_create_file("available_filter_functions", 0444,
1527 d_tracer, NULL, &ftrace_avail_fops); 1564 d_tracer, NULL, &ftrace_avail_fops);
@@ -1546,97 +1583,295 @@ static __init int ftrace_init_debugfs(void)
1546 pr_warning("Could not create debugfs " 1583 pr_warning("Could not create debugfs "
1547 "'set_ftrace_notrace' entry\n"); 1584 "'set_ftrace_notrace' entry\n");
1548 1585
1549 entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, 1586#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1550 NULL, &ftraced_fops); 1587 entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
1588 NULL,
1589 &ftrace_graph_fops);
1551 if (!entry) 1590 if (!entry)
1552 pr_warning("Could not create debugfs " 1591 pr_warning("Could not create debugfs "
1553 "'ftraced_enabled' entry\n"); 1592 "'set_graph_function' entry\n");
1593#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1594
1554 return 0; 1595 return 0;
1555} 1596}
1556 1597
1557fs_initcall(ftrace_init_debugfs); 1598static int ftrace_convert_nops(struct module *mod,
1558 1599 unsigned long *start,
1559static int __init ftrace_dynamic_init(void) 1600 unsigned long *end)
1560{ 1601{
1561 struct task_struct *p; 1602 unsigned long *p;
1562 unsigned long addr; 1603 unsigned long addr;
1604 unsigned long flags;
1605
1606 mutex_lock(&ftrace_start_lock);
1607 p = start;
1608 while (p < end) {
1609 addr = ftrace_call_adjust(*p++);
1610 /*
1611 * Some architecture linkers will pad between
1612 * the different mcount_loc sections of different
1613 * object files to satisfy alignments.
1614 * Skip any NULL pointers.
1615 */
1616 if (!addr)
1617 continue;
1618 ftrace_record_ip(addr);
1619 }
1620
1621 /* disable interrupts to prevent kstop machine */
1622 local_irq_save(flags);
1623 ftrace_update_code(mod);
1624 local_irq_restore(flags);
1625 mutex_unlock(&ftrace_start_lock);
1626
1627 return 0;
1628}
1629
1630void ftrace_init_module(struct module *mod,
1631 unsigned long *start, unsigned long *end)
1632{
1633 if (ftrace_disabled || start == end)
1634 return;
1635 ftrace_convert_nops(mod, start, end);
1636}
1637
1638extern unsigned long __start_mcount_loc[];
1639extern unsigned long __stop_mcount_loc[];
1640
1641void __init ftrace_init(void)
1642{
1643 unsigned long count, addr, flags;
1563 int ret; 1644 int ret;
1564 1645
1565 addr = (unsigned long)ftrace_record_ip; 1646 /* Keep the ftrace pointer to the stub */
1647 addr = (unsigned long)ftrace_stub;
1566 1648
1567 stop_machine(ftrace_dyn_arch_init, &addr, NULL); 1649 local_irq_save(flags);
1650 ftrace_dyn_arch_init(&addr);
1651 local_irq_restore(flags);
1568 1652
1569 /* ftrace_dyn_arch_init places the return code in addr */ 1653 /* ftrace_dyn_arch_init places the return code in addr */
1570 if (addr) { 1654 if (addr)
1571 ret = (int)addr;
1572 goto failed; 1655 goto failed;
1573 }
1574 1656
1575 ret = ftrace_dyn_table_alloc(); 1657 count = __stop_mcount_loc - __start_mcount_loc;
1576 if (ret)
1577 goto failed;
1578 1658
1579 p = kthread_run(ftraced, NULL, "ftraced"); 1659 ret = ftrace_dyn_table_alloc(count);
1580 if (IS_ERR(p)) { 1660 if (ret)
1581 ret = -1;
1582 goto failed; 1661 goto failed;
1583 }
1584 1662
1585 last_ftrace_enabled = ftrace_enabled = 1; 1663 last_ftrace_enabled = ftrace_enabled = 1;
1586 ftraced_task = p;
1587 1664
1588 return 0; 1665 ret = ftrace_convert_nops(NULL,
1666 __start_mcount_loc,
1667 __stop_mcount_loc);
1589 1668
1669 return;
1590 failed: 1670 failed:
1591 ftrace_disabled = 1; 1671 ftrace_disabled = 1;
1592 return ret;
1593} 1672}
1594 1673
1595core_initcall(ftrace_dynamic_init);
1596#else 1674#else
1597# define ftrace_startup() do { } while (0) 1675
1598# define ftrace_shutdown() do { } while (0) 1676static int __init ftrace_nodyn_init(void)
1677{
1678 ftrace_enabled = 1;
1679 return 0;
1680}
1681device_initcall(ftrace_nodyn_init);
1682
1683static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
1684static inline void ftrace_startup_enable(int command) { }
1685/* Keep as macros so we do not need to define the commands */
1686# define ftrace_startup(command) do { } while (0)
1687# define ftrace_shutdown(command) do { } while (0)
1599# define ftrace_startup_sysctl() do { } while (0) 1688# define ftrace_startup_sysctl() do { } while (0)
1600# define ftrace_shutdown_sysctl() do { } while (0) 1689# define ftrace_shutdown_sysctl() do { } while (0)
1601# define ftrace_force_shutdown() do { } while (0)
1602#endif /* CONFIG_DYNAMIC_FTRACE */ 1690#endif /* CONFIG_DYNAMIC_FTRACE */
1603 1691
1604/** 1692static ssize_t
1605 * ftrace_kill_atomic - kill ftrace from critical sections 1693ftrace_pid_read(struct file *file, char __user *ubuf,
1606 * 1694 size_t cnt, loff_t *ppos)
1607 * This function should be used by panic code. It stops ftrace
1608 * but in a not so nice way. If you need to simply kill ftrace
1609 * from a non-atomic section, use ftrace_kill.
1610 */
1611void ftrace_kill_atomic(void)
1612{ 1695{
1613 ftrace_disabled = 1; 1696 char buf[64];
1614 ftrace_enabled = 0; 1697 int r;
1615#ifdef CONFIG_DYNAMIC_FTRACE 1698
1616 ftraced_suspend = -1; 1699 if (ftrace_pid_trace == ftrace_swapper_pid)
1617#endif 1700 r = sprintf(buf, "swapper tasks\n");
1618 clear_ftrace_function(); 1701 else if (ftrace_pid_trace)
1702 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
1703 else
1704 r = sprintf(buf, "no pid\n");
1705
1706 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1707}
1708
1709static void clear_ftrace_swapper(void)
1710{
1711 struct task_struct *p;
1712 int cpu;
1713
1714 get_online_cpus();
1715 for_each_online_cpu(cpu) {
1716 p = idle_task(cpu);
1717 clear_tsk_trace_trace(p);
1718 }
1719 put_online_cpus();
1720}
1721
1722static void set_ftrace_swapper(void)
1723{
1724 struct task_struct *p;
1725 int cpu;
1726
1727 get_online_cpus();
1728 for_each_online_cpu(cpu) {
1729 p = idle_task(cpu);
1730 set_tsk_trace_trace(p);
1731 }
1732 put_online_cpus();
1733}
1734
1735static void clear_ftrace_pid(struct pid *pid)
1736{
1737 struct task_struct *p;
1738
1739 do_each_pid_task(pid, PIDTYPE_PID, p) {
1740 clear_tsk_trace_trace(p);
1741 } while_each_pid_task(pid, PIDTYPE_PID, p);
1742 put_pid(pid);
1743}
1744
1745static void set_ftrace_pid(struct pid *pid)
1746{
1747 struct task_struct *p;
1748
1749 do_each_pid_task(pid, PIDTYPE_PID, p) {
1750 set_tsk_trace_trace(p);
1751 } while_each_pid_task(pid, PIDTYPE_PID, p);
1752}
1753
1754static void clear_ftrace_pid_task(struct pid **pid)
1755{
1756 if (*pid == ftrace_swapper_pid)
1757 clear_ftrace_swapper();
1758 else
1759 clear_ftrace_pid(*pid);
1760
1761 *pid = NULL;
1762}
1763
1764static void set_ftrace_pid_task(struct pid *pid)
1765{
1766 if (pid == ftrace_swapper_pid)
1767 set_ftrace_swapper();
1768 else
1769 set_ftrace_pid(pid);
1770}
1771
1772static ssize_t
1773ftrace_pid_write(struct file *filp, const char __user *ubuf,
1774 size_t cnt, loff_t *ppos)
1775{
1776 struct pid *pid;
1777 char buf[64];
1778 long val;
1779 int ret;
1780
1781 if (cnt >= sizeof(buf))
1782 return -EINVAL;
1783
1784 if (copy_from_user(&buf, ubuf, cnt))
1785 return -EFAULT;
1786
1787 buf[cnt] = 0;
1788
1789 ret = strict_strtol(buf, 10, &val);
1790 if (ret < 0)
1791 return ret;
1792
1793 mutex_lock(&ftrace_start_lock);
1794 if (val < 0) {
1795 /* disable pid tracing */
1796 if (!ftrace_pid_trace)
1797 goto out;
1798
1799 clear_ftrace_pid_task(&ftrace_pid_trace);
1800
1801 } else {
1802 /* swapper task is special */
1803 if (!val) {
1804 pid = ftrace_swapper_pid;
1805 if (pid == ftrace_pid_trace)
1806 goto out;
1807 } else {
1808 pid = find_get_pid(val);
1809
1810 if (pid == ftrace_pid_trace) {
1811 put_pid(pid);
1812 goto out;
1813 }
1814 }
1815
1816 if (ftrace_pid_trace)
1817 clear_ftrace_pid_task(&ftrace_pid_trace);
1818
1819 if (!pid)
1820 goto out;
1821
1822 ftrace_pid_trace = pid;
1823
1824 set_ftrace_pid_task(ftrace_pid_trace);
1825 }
1826
1827 /* update the function call */
1828 ftrace_update_pid_func();
1829 ftrace_startup_enable(0);
1830
1831 out:
1832 mutex_unlock(&ftrace_start_lock);
1833
1834 return cnt;
1619} 1835}
1620 1836
1837static struct file_operations ftrace_pid_fops = {
1838 .read = ftrace_pid_read,
1839 .write = ftrace_pid_write,
1840};
1841
1842static __init int ftrace_init_debugfs(void)
1843{
1844 struct dentry *d_tracer;
1845 struct dentry *entry;
1846
1847 d_tracer = tracing_init_dentry();
1848 if (!d_tracer)
1849 return 0;
1850
1851 ftrace_init_dyn_debugfs(d_tracer);
1852
1853 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
1854 NULL, &ftrace_pid_fops);
1855 if (!entry)
1856 pr_warning("Could not create debugfs "
1857 "'set_ftrace_pid' entry\n");
1858 return 0;
1859}
1860
1861fs_initcall(ftrace_init_debugfs);
1862
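ftrace_pid_write() gives set_ftrace_pid three cases: a negative value clears the PID filter, 0 selects the per-CPU idle (swapper) tasks, and a positive value restricts tracing to that PID. A small user-space sketch (the mount point /sys/kernel/debug is an assumption and write_ftrace_pid is an illustrative helper name):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a decimal value into set_ftrace_pid: <0 clears, 0 = swapper, >0 = PID. */
static int write_ftrace_pid(long val)
{
	char buf[32];
	int n, fd;

	fd = open("/sys/kernel/debug/tracing/set_ftrace_pid", O_WRONLY);
	if (fd < 0)
		return -1;
	n = snprintf(buf, sizeof(buf), "%ld\n", val);
	n = (write(fd, buf, n) == n) ? 0 : -1;
	close(fd);
	return n;
}

int main(void)
{
	if (write_ftrace_pid(1234) < 0)   /* trace only PID 1234 (example value) */
		perror("set_ftrace_pid");
	return 0;
}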
1621/** 1863/**
1622 * ftrace_kill - totally shutdown ftrace 1864 * ftrace_kill - kill ftrace
1623 * 1865 *
1624 * This is a safety measure. If something was detected that seems 1866 * This function should be used by panic code. It stops ftrace
1625 * wrong, calling this function will keep ftrace from doing 1867 * but in a not so nice way. If you need to simply kill ftrace
1626 * any more modifications, and updates. 1868 * from a non-atomic section, use ftrace_kill.
1627 * used when something went wrong.
1628 */ 1869 */
1629void ftrace_kill(void) 1870void ftrace_kill(void)
1630{ 1871{
1631 mutex_lock(&ftrace_sysctl_lock);
1632 ftrace_disabled = 1; 1872 ftrace_disabled = 1;
1633 ftrace_enabled = 0; 1873 ftrace_enabled = 0;
1634
1635 clear_ftrace_function(); 1874 clear_ftrace_function();
1636 mutex_unlock(&ftrace_sysctl_lock);
1637
1638 /* Try to totally disable ftrace */
1639 ftrace_force_shutdown();
1640} 1875}
1641 1876
1642/** 1877/**
@@ -1658,10 +1893,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
1658 return -1; 1893 return -1;
1659 1894
1660 mutex_lock(&ftrace_sysctl_lock); 1895 mutex_lock(&ftrace_sysctl_lock);
1896
1661 ret = __register_ftrace_function(ops); 1897 ret = __register_ftrace_function(ops);
1662 ftrace_startup(); 1898 ftrace_startup(0);
1663 mutex_unlock(&ftrace_sysctl_lock);
1664 1899
1900 mutex_unlock(&ftrace_sysctl_lock);
1665 return ret; 1901 return ret;
1666} 1902}
1667 1903
@@ -1677,7 +1913,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1677 1913
1678 mutex_lock(&ftrace_sysctl_lock); 1914 mutex_lock(&ftrace_sysctl_lock);
1679 ret = __unregister_ftrace_function(ops); 1915 ret = __unregister_ftrace_function(ops);
1680 ftrace_shutdown(); 1916 ftrace_shutdown(0);
1681 mutex_unlock(&ftrace_sysctl_lock); 1917 mutex_unlock(&ftrace_sysctl_lock);
1682 1918
1683 return ret; 1919 return ret;
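register_ftrace_function() now goes through ftrace_startup(0), so attaching a callback also arms the call sites. A minimal in-kernel sketch of a caller (the ip/parent_ip callback signature and the .func member of struct ftrace_ops come from <linux/ftrace.h> of this era, not from the hunk above, so treat them as assumptions):

#include <linux/ftrace.h>
#include <linux/module.h>

/* Called on every traced function entry: ip is the callee, parent_ip the caller. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* keep it cheap: this runs on every mcount hit */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init my_init(void)
{
	return register_ftrace_function(&my_ops);   /* implies ftrace_startup(0) */
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);        /* implies ftrace_shutdown(0) */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");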
@@ -1725,3 +1961,154 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1725 mutex_unlock(&ftrace_sysctl_lock); 1961 mutex_unlock(&ftrace_sysctl_lock);
1726 return ret; 1962 return ret;
1727} 1963}
1964
1965#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1966
1967static atomic_t ftrace_graph_active;
1968
1969int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1970{
1971 return 0;
1972}
1973
1974/* The callbacks that hook a function */
1975trace_func_graph_ret_t ftrace_graph_return =
1976 (trace_func_graph_ret_t)ftrace_stub;
1977trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
1978
1979/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
1980static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
1981{
1982 int i;
1983 int ret = 0;
1984 unsigned long flags;
1985 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
1986 struct task_struct *g, *t;
1987
1988 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
1989 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
1990 * sizeof(struct ftrace_ret_stack),
1991 GFP_KERNEL);
1992 if (!ret_stack_list[i]) {
1993 start = 0;
1994 end = i;
1995 ret = -ENOMEM;
1996 goto free;
1997 }
1998 }
1999
2000 read_lock_irqsave(&tasklist_lock, flags);
2001 do_each_thread(g, t) {
2002 if (start == end) {
2003 ret = -EAGAIN;
2004 goto unlock;
2005 }
2006
2007 if (t->ret_stack == NULL) {
2008 t->curr_ret_stack = -1;
2009 /* Make sure IRQs see the -1 first: */
2010 barrier();
2011 t->ret_stack = ret_stack_list[start++];
2012 atomic_set(&t->tracing_graph_pause, 0);
2013 atomic_set(&t->trace_overrun, 0);
2014 }
2015 } while_each_thread(g, t);
2016
2017unlock:
2018 read_unlock_irqrestore(&tasklist_lock, flags);
2019free:
2020 for (i = start; i < end; i++)
2021 kfree(ret_stack_list[i]);
2022 return ret;
2023}
2024
2025/* Allocate a return stack for each task */
2026static int start_graph_tracing(void)
2027{
2028 struct ftrace_ret_stack **ret_stack_list;
2029 int ret;
2030
2031 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2032 sizeof(struct ftrace_ret_stack *),
2033 GFP_KERNEL);
2034
2035 if (!ret_stack_list)
2036 return -ENOMEM;
2037
2038 do {
2039 ret = alloc_retstack_tasklist(ret_stack_list);
2040 } while (ret == -EAGAIN);
2041
2042 kfree(ret_stack_list);
2043 return ret;
2044}
2045
2046int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2047 trace_func_graph_ent_t entryfunc)
2048{
2049 int ret = 0;
2050
2051 mutex_lock(&ftrace_sysctl_lock);
2052
2053 atomic_inc(&ftrace_graph_active);
2054 ret = start_graph_tracing();
2055 if (ret) {
2056 atomic_dec(&ftrace_graph_active);
2057 goto out;
2058 }
2059
2060 ftrace_graph_return = retfunc;
2061 ftrace_graph_entry = entryfunc;
2062
2063 ftrace_startup(FTRACE_START_FUNC_RET);
2064
2065out:
2066 mutex_unlock(&ftrace_sysctl_lock);
2067 return ret;
2068}
2069
2070void unregister_ftrace_graph(void)
2071{
2072 mutex_lock(&ftrace_sysctl_lock);
2073
2074 atomic_dec(&ftrace_graph_active);
2075 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2076 ftrace_graph_entry = ftrace_graph_entry_stub;
2077 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2078
2079 mutex_unlock(&ftrace_sysctl_lock);
2080}
2081
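register_ftrace_graph() installs both hooks and then calls ftrace_startup(FTRACE_START_FUNC_RET) so the arch code switches the call sites to the graph caller. A sketch of a user of this API; only the entry-hook signature is shown in this patch, so the struct ftrace_graph_ret type and the void return-hook signature are assumptions based on the cast to trace_func_graph_ret_t above:

#include <linux/ftrace.h>
#include <linux/module.h>

/* Entry hook: the default stub returns 0; a nonzero return is assumed to mean
 * "trace this call". */
static int my_graph_entry(struct ftrace_graph_ent *trace)
{
	return 1;
}

/* Return hook, invoked when the traced function returns (signature assumed). */
static void my_graph_return(struct ftrace_graph_ret *trace)
{
}

static int __init my_graph_init(void)
{
	/* Allocates per-task return stacks, then enables the graph caller. */
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}

static void __exit my_graph_exit(void)
{
	unregister_ftrace_graph();
}

module_init(my_graph_init);
module_exit(my_graph_exit);
MODULE_LICENSE("GPL");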
2082/* Allocate a return stack for newly created task */
2083void ftrace_graph_init_task(struct task_struct *t)
2084{
2085 if (atomic_read(&ftrace_graph_active)) {
2086 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2087 * sizeof(struct ftrace_ret_stack),
2088 GFP_KERNEL);
2089 if (!t->ret_stack)
2090 return;
2091 t->curr_ret_stack = -1;
2092 atomic_set(&t->tracing_graph_pause, 0);
2093 atomic_set(&t->trace_overrun, 0);
2094 } else
2095 t->ret_stack = NULL;
2096}
2097
2098void ftrace_graph_exit_task(struct task_struct *t)
2099{
2100 struct ftrace_ret_stack *ret_stack = t->ret_stack;
2101
2102 t->ret_stack = NULL;
2103 /* NULL must become visible to IRQs before we free it: */
2104 barrier();
2105
2106 kfree(ret_stack);
2107}
2108
2109void ftrace_graph_stop(void)
2110{
2111 ftrace_stop();
2112}
2113#endif
2114
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 000000000000..1d601a7c4587
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,2517 @@
1/*
2 * Generic ring buffer
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/spinlock.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/percpu.h>
12#include <linux/mutex.h>
13#include <linux/sched.h> /* used for sched_clock() (for now) */
14#include <linux/init.h>
15#include <linux/hash.h>
16#include <linux/list.h>
17#include <linux/fs.h>
18
19#include "trace.h"
20
21/*
22 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to.
 25 * Turning this switch on makes it OK to write to the
26 * ring buffer, if the ring buffer is enabled itself.
27 *
 28 * There are three layers that must be on in order to write
29 * to the ring buffer.
30 *
31 * 1) This global flag must be set.
32 * 2) The ring buffer must be enabled for recording.
33 * 3) The per cpu buffer must be enabled for recording.
34 *
35 * In case of an anomaly, this global flag has a bit set that
 36 * will permanently disable all ring buffers.
37 */
38
39/*
40 * Global flag to disable all recording to ring buffers
41 * This has two bits: ON, DISABLED
42 *
43 * ON DISABLED
44 * ---- ----------
45 * 0 0 : ring buffers are off
46 * 1 0 : ring buffers are on
47 * X 1 : ring buffers are permanently disabled
48 */
49
50enum {
51 RB_BUFFERS_ON_BIT = 0,
52 RB_BUFFERS_DISABLED_BIT = 1,
53};
54
55enum {
56 RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58};
59
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
61
62/**
63 * tracing_on - enable all tracing buffers
64 *
65 * This function enables all tracing buffers that may have been
66 * disabled with tracing_off.
67 */
68void tracing_on(void)
69{
70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
71}
72EXPORT_SYMBOL_GPL(tracing_on);
73
74/**
75 * tracing_off - turn off all tracing buffers
76 *
77 * This function stops all tracing buffers from recording data.
78 * It does not disable any overhead the tracers themselves may
79 * be causing. This function simply causes all recording to
80 * the ring buffers to fail.
81 */
82void tracing_off(void)
83{
84 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
85}
86EXPORT_SYMBOL_GPL(tracing_off);
87
88/**
89 * tracing_off_permanent - permanently disable ring buffers
90 *
91 * This function, once called, will disable all ring buffers
 92 * permanently.
93 */
94void tracing_off_permanent(void)
95{
96 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
97}
98
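The two bits give the three states in the table above; once RB_BUFFERS_DISABLED is set it is never cleared, so it overrides RB_BUFFERS_ON. A one-line helper expressing that truth table (rb_recording_allowed is an illustrative name, not part of the patch):

/* DISABLED wins over ON, permanently; otherwise recording follows the ON bit. */
static inline int rb_recording_allowed(long flags)
{
	if (flags & RB_BUFFERS_DISABLED)
		return 0;
	return !!(flags & RB_BUFFERS_ON);
}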
99#include "trace.h"
100
101/* Up this if you want to test the TIME_EXTENTS and normalization */
102#define DEBUG_SHIFT 0
103
104/* FIXME!!! */
105u64 ring_buffer_time_stamp(int cpu)
106{
107 u64 time;
108
109 preempt_disable_notrace();
110 /* shift to debug/test normalization and TIME_EXTENTS */
111 time = sched_clock() << DEBUG_SHIFT;
112 preempt_enable_no_resched_notrace();
113
114 return time;
115}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
117
118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
119{
120 /* Just stupid testing the normalize function and deltas */
121 *ts >>= DEBUG_SHIFT;
122}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
124
125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
126#define RB_ALIGNMENT_SHIFT 2
127#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
128#define RB_MAX_SMALL_DATA 28
129
130enum {
131 RB_LEN_TIME_EXTEND = 8,
132 RB_LEN_TIME_STAMP = 16,
133};
134
135/* inline for ring buffer fast paths */
136static inline unsigned
137rb_event_length(struct ring_buffer_event *event)
138{
139 unsigned length;
140
141 switch (event->type) {
142 case RINGBUF_TYPE_PADDING:
143 /* undefined */
144 return -1;
145
146 case RINGBUF_TYPE_TIME_EXTEND:
147 return RB_LEN_TIME_EXTEND;
148
149 case RINGBUF_TYPE_TIME_STAMP:
150 return RB_LEN_TIME_STAMP;
151
152 case RINGBUF_TYPE_DATA:
153 if (event->len)
154 length = event->len << RB_ALIGNMENT_SHIFT;
155 else
156 length = event->array[0];
157 return length + RB_EVNT_HDR_SIZE;
158 default:
159 BUG();
160 }
161 /* not hit */
162 return 0;
163}
164
165/**
166 * ring_buffer_event_length - return the length of the event
167 * @event: the event to get the length of
168 */
169unsigned ring_buffer_event_length(struct ring_buffer_event *event)
170{
171 return rb_event_length(event);
172}
173EXPORT_SYMBOL_GPL(ring_buffer_event_length);
174
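rb_event_length() undoes the compact encoding used for data events: a non-zero len field counts the payload in RB_ALIGNMENT (4-byte) units, while larger payloads (above RB_MAX_SMALL_DATA, 28 bytes) set len to 0 and keep the size in array[0]. A worked decode of the small case (the 5-unit value is just an illustration):

/* For a RINGBUF_TYPE_DATA event with event->len == 5:
 *   payload = 5 << RB_ALIGNMENT_SHIFT = 20 bytes
 *   rb_event_length() = 20 + RB_EVNT_HDR_SIZE
 * With event->len == 0 the payload size is read from event->array[0] instead.
 */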
175/* inline for ring buffer fast paths */
176static inline void *
177rb_event_data(struct ring_buffer_event *event)
178{
179 BUG_ON(event->type != RINGBUF_TYPE_DATA);
180 /* If length is in len field, then array[0] has the data */
181 if (event->len)
182 return (void *)&event->array[0];
183 /* Otherwise length is in array[0] and array[1] has the data */
184 return (void *)&event->array[1];
185}
186
187/**
188 * ring_buffer_event_data - return the data of the event
189 * @event: the event to get the data from
190 */
191void *ring_buffer_event_data(struct ring_buffer_event *event)
192{
193 return rb_event_data(event);
194}
195EXPORT_SYMBOL_GPL(ring_buffer_event_data);
196
197#define for_each_buffer_cpu(buffer, cpu) \
198 for_each_cpu_mask(cpu, buffer->cpumask)
199
200#define TS_SHIFT 27
201#define TS_MASK ((1ULL << TS_SHIFT) - 1)
202#define TS_DELTA_TEST (~TS_MASK)
203
204struct buffer_data_page {
205 u64 time_stamp; /* page time stamp */
206	local_t		 commit;	/* write committed index */
207 unsigned char data[]; /* data of buffer page */
208};
209
210struct buffer_page {
211 local_t write; /* index for next write */
212 unsigned read; /* index for next read */
213 struct list_head list; /* list of free pages */
214 struct buffer_data_page *page; /* Actual data page */
215};
216
217static void rb_init_page(struct buffer_data_page *bpage)
218{
219 local_set(&bpage->commit, 0);
220}
221
222/*
223 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
224 * this issue out.
225 */
226static inline void free_buffer_page(struct buffer_page *bpage)
227{
228 if (bpage->page)
229 free_page((unsigned long)bpage->page);
230 kfree(bpage);
231}
232
233/*
234 * We need to fit the time_stamp delta into 27 bits.
235 */
236static inline int test_time_stamp(u64 delta)
237{
238 if (delta & TS_DELTA_TEST)
239 return 1;
240 return 0;
241}
242
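TS_SHIFT = 27 means a per-event delta can only span 2^27 time units; with sched_clock()-based stamps (nanoseconds) that is roughly 134 ms, which is what the RINGBUF_TYPE_TIME_EXTEND event type exists for. The boundary worked out (the nanosecond assumption comes from sched_clock(), not from this file):

/* TS_MASK = (1 << 27) - 1 = 134217727
 * test_time_stamp(134217727ULL) == 0   delta still fits in 27 bits
 * test_time_stamp(134217728ULL) == 1   bit 27 set, delta needs special handling
 * 2^27 ns is about 0.134 s.
 */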
243#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
244
245/*
246 * head_page == tail_page && head == tail then buffer is empty.
247 */
248struct ring_buffer_per_cpu {
249 int cpu;
250 struct ring_buffer *buffer;
251 spinlock_t reader_lock; /* serialize readers */
252 raw_spinlock_t lock;
253 struct lock_class_key lock_key;
254 struct list_head pages;
255 struct buffer_page *head_page; /* read from head */
256 struct buffer_page *tail_page; /* write to tail */
257	struct buffer_page	*commit_page;	/* committed pages */
258 struct buffer_page *reader_page;
259 unsigned long overrun;
260 unsigned long entries;
261 u64 write_stamp;
262 u64 read_stamp;
263 atomic_t record_disabled;
264};
265
266struct ring_buffer {
267 unsigned pages;
268 unsigned flags;
269 int cpus;
270 cpumask_t cpumask;
271 atomic_t record_disabled;
272
273 struct mutex mutex;
274
275 struct ring_buffer_per_cpu **buffers;
276};
277
278struct ring_buffer_iter {
279 struct ring_buffer_per_cpu *cpu_buffer;
280 unsigned long head;
281 struct buffer_page *head_page;
282 u64 read_stamp;
283};
284
285/* buffer may be either ring_buffer or ring_buffer_per_cpu */
286#define RB_WARN_ON(buffer, cond) \
287 ({ \
288 int _____ret = unlikely(cond); \
289 if (_____ret) { \
290 atomic_inc(&buffer->record_disabled); \
291 WARN_ON(1); \
292 } \
293 _____ret; \
294 })
295
296/**
297 * check_pages - integrity check of buffer pages
298 * @cpu_buffer: CPU buffer with pages to test
299 *
300 * As a safety measure we check to make sure the data pages have not
301 * been corrupted.
302 */
303static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
304{
305 struct list_head *head = &cpu_buffer->pages;
306 struct buffer_page *bpage, *tmp;
307
308 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
309 return -1;
310 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
311 return -1;
312
313 list_for_each_entry_safe(bpage, tmp, head, list) {
314 if (RB_WARN_ON(cpu_buffer,
315 bpage->list.next->prev != &bpage->list))
316 return -1;
317 if (RB_WARN_ON(cpu_buffer,
318 bpage->list.prev->next != &bpage->list))
319 return -1;
320 }
321
322 return 0;
323}
324
325static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
326 unsigned nr_pages)
327{
328 struct list_head *head = &cpu_buffer->pages;
329 struct buffer_page *bpage, *tmp;
330 unsigned long addr;
331 LIST_HEAD(pages);
332 unsigned i;
333
334 for (i = 0; i < nr_pages; i++) {
335 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
336 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
337 if (!bpage)
338 goto free_pages;
339 list_add(&bpage->list, &pages);
340
341 addr = __get_free_page(GFP_KERNEL);
342 if (!addr)
343 goto free_pages;
344 bpage->page = (void *)addr;
345 rb_init_page(bpage->page);
346 }
347
348 list_splice(&pages, head);
349
350 rb_check_pages(cpu_buffer);
351
352 return 0;
353
354 free_pages:
355 list_for_each_entry_safe(bpage, tmp, &pages, list) {
356 list_del_init(&bpage->list);
357 free_buffer_page(bpage);
358 }
359 return -ENOMEM;
360}
361
362static struct ring_buffer_per_cpu *
363rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
364{
365 struct ring_buffer_per_cpu *cpu_buffer;
366 struct buffer_page *bpage;
367 unsigned long addr;
368 int ret;
369
370 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
371 GFP_KERNEL, cpu_to_node(cpu));
372 if (!cpu_buffer)
373 return NULL;
374
375 cpu_buffer->cpu = cpu;
376 cpu_buffer->buffer = buffer;
377 spin_lock_init(&cpu_buffer->reader_lock);
378 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
379 INIT_LIST_HEAD(&cpu_buffer->pages);
380
381 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
382 GFP_KERNEL, cpu_to_node(cpu));
383 if (!bpage)
384 goto fail_free_buffer;
385
386 cpu_buffer->reader_page = bpage;
387 addr = __get_free_page(GFP_KERNEL);
388 if (!addr)
389 goto fail_free_reader;
390 bpage->page = (void *)addr;
391 rb_init_page(bpage->page);
392
393 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
394
395 ret = rb_allocate_pages(cpu_buffer, buffer->pages);
396 if (ret < 0)
397 goto fail_free_reader;
398
399 cpu_buffer->head_page
400 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
401 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
402
403 return cpu_buffer;
404
405 fail_free_reader:
406 free_buffer_page(cpu_buffer->reader_page);
407
408 fail_free_buffer:
409 kfree(cpu_buffer);
410 return NULL;
411}
412
413static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
414{
415 struct list_head *head = &cpu_buffer->pages;
416 struct buffer_page *bpage, *tmp;
417
418 list_del_init(&cpu_buffer->reader_page->list);
419 free_buffer_page(cpu_buffer->reader_page);
420
421 list_for_each_entry_safe(bpage, tmp, head, list) {
422 list_del_init(&bpage->list);
423 free_buffer_page(bpage);
424 }
425 kfree(cpu_buffer);
426}
427
428/*
429 * Causes compile errors if the struct buffer_page gets bigger
430 * than the struct page.
431 */
432extern int ring_buffer_page_too_big(void);
433
434/**
435 * ring_buffer_alloc - allocate a new ring_buffer
436 * @size: the size in bytes per cpu that is needed.
437 * @flags: attributes to set for the ring buffer.
438 *
439 * Currently the only flag that is available is the RB_FL_OVERWRITE
440 * flag. This flag means that the buffer will overwrite old data
441 * when the buffer wraps. If this flag is not set, the buffer will
442 * drop data when the tail hits the head.
443 */
444struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
445{
446 struct ring_buffer *buffer;
447 int bsize;
448 int cpu;
449
450 /* Paranoid! Optimizes out when all is well */
451 if (sizeof(struct buffer_page) > sizeof(struct page))
452 ring_buffer_page_too_big();
453
454
455 /* keep it in its own cache line */
456 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
457 GFP_KERNEL);
458 if (!buffer)
459 return NULL;
460
461 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
462 buffer->flags = flags;
463
464 /* need at least two pages */
465 if (buffer->pages == 1)
466 buffer->pages++;
467
468 buffer->cpumask = cpu_possible_map;
469 buffer->cpus = nr_cpu_ids;
470
471 bsize = sizeof(void *) * nr_cpu_ids;
472 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
473 GFP_KERNEL);
474 if (!buffer->buffers)
475 goto fail_free_buffer;
476
477 for_each_buffer_cpu(buffer, cpu) {
478 buffer->buffers[cpu] =
479 rb_allocate_cpu_buffer(buffer, cpu);
480 if (!buffer->buffers[cpu])
481 goto fail_free_buffers;
482 }
483
484 mutex_init(&buffer->mutex);
485
486 return buffer;
487
488 fail_free_buffers:
489 for_each_buffer_cpu(buffer, cpu) {
490 if (buffer->buffers[cpu])
491 rb_free_cpu_buffer(buffer->buffers[cpu]);
492 }
493 kfree(buffer->buffers);
494
495 fail_free_buffer:
496 kfree(buffer);
497 return NULL;
498}
499EXPORT_SYMBOL_GPL(ring_buffer_alloc);
500
501/**
502 * ring_buffer_free - free a ring buffer.
503 * @buffer: the buffer to free.
504 */
505void
506ring_buffer_free(struct ring_buffer *buffer)
507{
508 int cpu;
509
510 for_each_buffer_cpu(buffer, cpu)
511 rb_free_cpu_buffer(buffer->buffers[cpu]);
512
513 kfree(buffer);
514}
515EXPORT_SYMBOL_GPL(ring_buffer_free);
516
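ring_buffer_alloc() and ring_buffer_free() are the whole lifecycle for a simple user: the requested size is rounded up to BUF_PAGE_SIZE chunks per CPU and at least two pages are always kept. A minimal in-kernel sketch (the 64 KB figure and the my_* names are arbitrary examples):

#include <linux/errno.h>
#include <linux/ring_buffer.h>

static struct ring_buffer *my_buffer;

static int my_buffer_init(void)
{
	/* 64 KB per CPU; overwrite the oldest data when the buffer wraps */
	my_buffer = ring_buffer_alloc(64 * 1024, RB_FL_OVERWRITE);
	if (!my_buffer)
		return -ENOMEM;
	return 0;
}

static void my_buffer_exit(void)
{
	ring_buffer_free(my_buffer);
	my_buffer = NULL;
}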
517static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
518
519static void
520rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
521{
522 struct buffer_page *bpage;
523 struct list_head *p;
524 unsigned i;
525
526 atomic_inc(&cpu_buffer->record_disabled);
527 synchronize_sched();
528
529 for (i = 0; i < nr_pages; i++) {
530 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
531 return;
532 p = cpu_buffer->pages.next;
533 bpage = list_entry(p, struct buffer_page, list);
534 list_del_init(&bpage->list);
535 free_buffer_page(bpage);
536 }
537 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
538 return;
539
540 rb_reset_cpu(cpu_buffer);
541
542 rb_check_pages(cpu_buffer);
543
544 atomic_dec(&cpu_buffer->record_disabled);
545
546}
547
548static void
549rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
550 struct list_head *pages, unsigned nr_pages)
551{
552 struct buffer_page *bpage;
553 struct list_head *p;
554 unsigned i;
555
556 atomic_inc(&cpu_buffer->record_disabled);
557 synchronize_sched();
558
559 for (i = 0; i < nr_pages; i++) {
560 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
561 return;
562 p = pages->next;
563 bpage = list_entry(p, struct buffer_page, list);
564 list_del_init(&bpage->list);
565 list_add_tail(&bpage->list, &cpu_buffer->pages);
566 }
567 rb_reset_cpu(cpu_buffer);
568
569 rb_check_pages(cpu_buffer);
570
571 atomic_dec(&cpu_buffer->record_disabled);
572}
573
574/**
575 * ring_buffer_resize - resize the ring buffer
576 * @buffer: the buffer to resize.
577 * @size: the new size.
578 *
579 * The tracer is responsible for making sure that the buffer is
580 * not being used while changing the size.
581 * Note: We may be able to change the above requirement by using
582 * RCU synchronizations.
583 *
584 * Minimum size is 2 * BUF_PAGE_SIZE.
585 *
586 * Returns -1 on failure.
587 */
588int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
589{
590 struct ring_buffer_per_cpu *cpu_buffer;
591 unsigned nr_pages, rm_pages, new_pages;
592 struct buffer_page *bpage, *tmp;
593 unsigned long buffer_size;
594 unsigned long addr;
595 LIST_HEAD(pages);
596 int i, cpu;
597
598 /*
599 * Always succeed at resizing a non-existent buffer:
600 */
601 if (!buffer)
602 return size;
603
604 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
605 size *= BUF_PAGE_SIZE;
606 buffer_size = buffer->pages * BUF_PAGE_SIZE;
607
608 /* we need a minimum of two pages */
609 if (size < BUF_PAGE_SIZE * 2)
610 size = BUF_PAGE_SIZE * 2;
611
612 if (size == buffer_size)
613 return size;
614
615 mutex_lock(&buffer->mutex);
616
617 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
618
619 if (size < buffer_size) {
620
621 /* easy case, just free pages */
622 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
623 mutex_unlock(&buffer->mutex);
624 return -1;
625 }
626
627 rm_pages = buffer->pages - nr_pages;
628
629 for_each_buffer_cpu(buffer, cpu) {
630 cpu_buffer = buffer->buffers[cpu];
631 rb_remove_pages(cpu_buffer, rm_pages);
632 }
633 goto out;
634 }
635
636 /*
637 * This is a bit more difficult. We only want to add pages
638 * when we can allocate enough for all CPUs. We do this
639 * by allocating all the pages and storing them on a local
640 * link list. If we succeed in our allocation, then we
641 * add these pages to the cpu_buffers. Otherwise we just free
642 * them all and return -ENOMEM;
643 */
644 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
645 mutex_unlock(&buffer->mutex);
646 return -1;
647 }
648
649 new_pages = nr_pages - buffer->pages;
650
651 for_each_buffer_cpu(buffer, cpu) {
652 for (i = 0; i < new_pages; i++) {
653 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
654 cache_line_size()),
655 GFP_KERNEL, cpu_to_node(cpu));
656 if (!bpage)
657 goto free_pages;
658 list_add(&bpage->list, &pages);
659 addr = __get_free_page(GFP_KERNEL);
660 if (!addr)
661 goto free_pages;
662 bpage->page = (void *)addr;
663 rb_init_page(bpage->page);
664 }
665 }
666
667 for_each_buffer_cpu(buffer, cpu) {
668 cpu_buffer = buffer->buffers[cpu];
669 rb_insert_pages(cpu_buffer, &pages, new_pages);
670 }
671
672 if (RB_WARN_ON(buffer, !list_empty(&pages))) {
673 mutex_unlock(&buffer->mutex);
674 return -1;
675 }
676
677 out:
678 buffer->pages = nr_pages;
679 mutex_unlock(&buffer->mutex);
680
681 return size;
682
683 free_pages:
684 list_for_each_entry_safe(bpage, tmp, &pages, list) {
685 list_del_init(&bpage->list);
686 free_buffer_page(bpage);
687 }
688 mutex_unlock(&buffer->mutex);
689 return -ENOMEM;
690}
691EXPORT_SYMBOL_GPL(ring_buffer_resize);
692
693static inline int rb_null_event(struct ring_buffer_event *event)
694{
695 return event->type == RINGBUF_TYPE_PADDING;
696}
697
698static inline void *
699__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
700{
701 return bpage->data + index;
702}
703
704static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
705{
706 return bpage->page->data + index;
707}
708
709static inline struct ring_buffer_event *
710rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
711{
712 return __rb_page_index(cpu_buffer->reader_page,
713 cpu_buffer->reader_page->read);
714}
715
716static inline struct ring_buffer_event *
717rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
718{
719 return __rb_page_index(cpu_buffer->head_page,
720 cpu_buffer->head_page->read);
721}
722
723static inline struct ring_buffer_event *
724rb_iter_head_event(struct ring_buffer_iter *iter)
725{
726 return __rb_page_index(iter->head_page, iter->head);
727}
728
729static inline unsigned rb_page_write(struct buffer_page *bpage)
730{
731 return local_read(&bpage->write);
732}
733
734static inline unsigned rb_page_commit(struct buffer_page *bpage)
735{
736 return local_read(&bpage->page->commit);
737}
738
739/* Size is determined by what has been committed */
740static inline unsigned rb_page_size(struct buffer_page *bpage)
741{
742 return rb_page_commit(bpage);
743}
744
745static inline unsigned
746rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
747{
748 return rb_page_commit(cpu_buffer->commit_page);
749}
750
751static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
752{
753 return rb_page_commit(cpu_buffer->head_page);
754}
755
756/*
757 * When the tail hits the head and the buffer is in overwrite mode,
758 * the head jumps to the next page and all content on the previous
759 * page is discarded. But before doing so, we update the overrun
760 * variable of the buffer.
761 */
762static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
763{
764 struct ring_buffer_event *event;
765 unsigned long head;
766
767 for (head = 0; head < rb_head_size(cpu_buffer);
768 head += rb_event_length(event)) {
769
770 event = __rb_page_index(cpu_buffer->head_page, head);
771 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
772 return;
773 /* Only count data entries */
774 if (event->type != RINGBUF_TYPE_DATA)
775 continue;
776 cpu_buffer->overrun++;
777 cpu_buffer->entries--;
778 }
779}
780
781static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
782 struct buffer_page **bpage)
783{
784 struct list_head *p = (*bpage)->list.next;
785
786 if (p == &cpu_buffer->pages)
787 p = p->next;
788
789 *bpage = list_entry(p, struct buffer_page, list);
790}
791
792static inline unsigned
793rb_event_index(struct ring_buffer_event *event)
794{
795 unsigned long addr = (unsigned long)event;
796
797 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
798}
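
/*
 * Editor's note, not part of the original file: the subtraction above
 * works because BUF_PAGE_SIZE is assumed (per its definition earlier in
 * this file) to be PAGE_SIZE minus the buffer_data_page header, so
 * (addr & ~PAGE_MASK) is the event's offset within the raw page and
 * removing the header size yields an index into data[], the inverse of
 * what __rb_page_index() computes.
 */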
799
800static inline int
801rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
802 struct ring_buffer_event *event)
803{
804 unsigned long addr = (unsigned long)event;
805 unsigned long index;
806
807 index = rb_event_index(event);
808 addr &= PAGE_MASK;
809
810 return cpu_buffer->commit_page->page == (void *)addr &&
811 rb_commit_index(cpu_buffer) == index;
812}
813
814static inline void
815rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
816 struct ring_buffer_event *event)
817{
818 unsigned long addr = (unsigned long)event;
819 unsigned long index;
820
821 index = rb_event_index(event);
822 addr &= PAGE_MASK;
823
824 while (cpu_buffer->commit_page->page != (void *)addr) {
825 if (RB_WARN_ON(cpu_buffer,
826 cpu_buffer->commit_page == cpu_buffer->tail_page))
827 return;
828 cpu_buffer->commit_page->page->commit =
829 cpu_buffer->commit_page->write;
830 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
831 cpu_buffer->write_stamp =
832 cpu_buffer->commit_page->page->time_stamp;
833 }
834
835 /* Now set the commit to the event's index */
836 local_set(&cpu_buffer->commit_page->page->commit, index);
837}
838
839static inline void
840rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
841{
842 /*
843 * We only race with interrupts and NMIs on this CPU.
844 * If we own the commit event, then we can commit
845 * all others that interrupted us, since the interruptions
846 * are in stack format (they finish before they come
847 * back to us). This allows us to do a simple loop to
848 * assign the commit to the tail.
849 */
850 again:
851 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
852 cpu_buffer->commit_page->page->commit =
853 cpu_buffer->commit_page->write;
854 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
855 cpu_buffer->write_stamp =
856 cpu_buffer->commit_page->page->time_stamp;
857 /* add barrier to keep gcc from optimizing too much */
858 barrier();
859 }
860 while (rb_commit_index(cpu_buffer) !=
861 rb_page_write(cpu_buffer->commit_page)) {
862 cpu_buffer->commit_page->page->commit =
863 cpu_buffer->commit_page->write;
864 barrier();
865 }
866
867 /* again, keep gcc from optimizing */
868 barrier();
869
870 /*
871 * If an interrupt came in just after the first while loop
872 * and pushed the tail page forward, we will be left with
873 * a dangling commit that will never go forward.
874 */
875 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
876 goto again;
877}
878
879static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
880{
881 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
882 cpu_buffer->reader_page->read = 0;
883}
884
885static inline void rb_inc_iter(struct ring_buffer_iter *iter)
886{
887 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
888
889 /*
890 * The iterator could be on the reader page (it starts there).
891 * But the head could have moved, since the reader was
892 * found. Check for this case and assign the iterator
893 * to the head page instead of next.
894 */
895 if (iter->head_page == cpu_buffer->reader_page)
896 iter->head_page = cpu_buffer->head_page;
897 else
898 rb_inc_page(cpu_buffer, &iter->head_page);
899
900 iter->read_stamp = iter->head_page->page->time_stamp;
901 iter->head = 0;
902}
903
904/**
905 * ring_buffer_update_event - update event type and data
906 * @event: the event to update
907 * @type: the type of event
908 * @length: the size of the event field in the ring buffer
909 *
910 * Update the type and data fields of the event. The length
911 * is the actual size that is written to the ring buffer,
912 * and with this, we can determine what to place into the
913 * data field.
914 */
915static inline void
916rb_update_event(struct ring_buffer_event *event,
917 unsigned type, unsigned length)
918{
919 event->type = type;
920
921 switch (type) {
922
923 case RINGBUF_TYPE_PADDING:
924 break;
925
926 case RINGBUF_TYPE_TIME_EXTEND:
927 event->len =
928 (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
929 >> RB_ALIGNMENT_SHIFT;
930 break;
931
932 case RINGBUF_TYPE_TIME_STAMP:
933 event->len =
934 (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
935 >> RB_ALIGNMENT_SHIFT;
936 break;
937
938 case RINGBUF_TYPE_DATA:
939 length -= RB_EVNT_HDR_SIZE;
940 if (length > RB_MAX_SMALL_DATA) {
941 event->len = 0;
942 event->array[0] = length;
943 } else
944 event->len =
945 (length + (RB_ALIGNMENT-1))
946 >> RB_ALIGNMENT_SHIFT;
947 break;
948 default:
949 BUG();
950 }
951}
952
953static inline unsigned rb_calculate_event_length(unsigned length)
954{
955 struct ring_buffer_event event; /* Used only for sizeof array */
956
957	/* zero length can cause confusion */
958 if (!length)
959 length = 1;
960
961 if (length > RB_MAX_SMALL_DATA)
962 length += sizeof(event.array[0]);
963
964 length += RB_EVNT_HDR_SIZE;
965 length = ALIGN(length, RB_ALIGNMENT);
966
967 return length;
968}
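
/*
 * Editor's worked example, not part of the original file, using the
 * values the macros above are assumed to have (RB_EVNT_HDR_SIZE == 4,
 * RB_ALIGNMENT == 4, RB_MAX_SMALL_DATA == 28): a 10-byte payload fits
 * the small encoding and reserves ALIGN(10 + 4, 4) == 16 bytes, while a
 * 100-byte payload stores its length in array[0] and reserves
 * ALIGN(100 + 4 + 4, 4) == 108 bytes of the page.
 */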
969
970static struct ring_buffer_event *
971__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
972 unsigned type, unsigned long length, u64 *ts)
973{
974 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
975 unsigned long tail, write;
976 struct ring_buffer *buffer = cpu_buffer->buffer;
977 struct ring_buffer_event *event;
978 unsigned long flags;
979
980 commit_page = cpu_buffer->commit_page;
981 /* we just need to protect against interrupts */
982 barrier();
983 tail_page = cpu_buffer->tail_page;
984 write = local_add_return(length, &tail_page->write);
985 tail = write - length;
986
987	/* See if we shot past the end of this buffer page */
988 if (write > BUF_PAGE_SIZE) {
989 struct buffer_page *next_page = tail_page;
990
991 local_irq_save(flags);
992 __raw_spin_lock(&cpu_buffer->lock);
993
994 rb_inc_page(cpu_buffer, &next_page);
995
996 head_page = cpu_buffer->head_page;
997 reader_page = cpu_buffer->reader_page;
998
999 /* we grabbed the lock before incrementing */
1000 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1001 goto out_unlock;
1002
1003 /*
1004 * If for some reason, we had an interrupt storm that made
1005 * it all the way around the buffer, bail, and warn
1006 * about it.
1007 */
1008 if (unlikely(next_page == commit_page)) {
1009 WARN_ON_ONCE(1);
1010 goto out_unlock;
1011 }
1012
1013 if (next_page == head_page) {
1014 if (!(buffer->flags & RB_FL_OVERWRITE)) {
1015 /* reset write */
1016 if (tail <= BUF_PAGE_SIZE)
1017 local_set(&tail_page->write, tail);
1018 goto out_unlock;
1019 }
1020
1021 /* tail_page has not moved yet? */
1022 if (tail_page == cpu_buffer->tail_page) {
1023 /* count overflows */
1024 rb_update_overflow(cpu_buffer);
1025
1026 rb_inc_page(cpu_buffer, &head_page);
1027 cpu_buffer->head_page = head_page;
1028 cpu_buffer->head_page->read = 0;
1029 }
1030 }
1031
1032 /*
1033 * If the tail page is still the same as what we think
1034 * it is, then it is up to us to update the tail
1035 * pointer.
1036 */
1037 if (tail_page == cpu_buffer->tail_page) {
1038 local_set(&next_page->write, 0);
1039 local_set(&next_page->page->commit, 0);
1040 cpu_buffer->tail_page = next_page;
1041
1042 /* reread the time stamp */
1043 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1044 cpu_buffer->tail_page->page->time_stamp = *ts;
1045 }
1046
1047 /*
1048 * The actual tail page has moved forward.
1049 */
1050 if (tail < BUF_PAGE_SIZE) {
1051 /* Mark the rest of the page with padding */
1052 event = __rb_page_index(tail_page, tail);
1053 event->type = RINGBUF_TYPE_PADDING;
1054 }
1055
1056 if (tail <= BUF_PAGE_SIZE)
1057 /* Set the write back to the previous setting */
1058 local_set(&tail_page->write, tail);
1059
1060 /*
1061 * If this was a commit entry that failed,
1062 * increment that too
1063 */
1064 if (tail_page == cpu_buffer->commit_page &&
1065 tail == rb_commit_index(cpu_buffer)) {
1066 rb_set_commit_to_write(cpu_buffer);
1067 }
1068
1069 __raw_spin_unlock(&cpu_buffer->lock);
1070 local_irq_restore(flags);
1071
1072 /* fail and let the caller try again */
1073 return ERR_PTR(-EAGAIN);
1074 }
1075
1076 /* We reserved something on the buffer */
1077
1078 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1079 return NULL;
1080
1081 event = __rb_page_index(tail_page, tail);
1082 rb_update_event(event, type, length);
1083
1084 /*
1085 * If this is a commit and the tail is zero, then update
1086 * this page's time stamp.
1087 */
1088 if (!tail && rb_is_commit(cpu_buffer, event))
1089 cpu_buffer->commit_page->page->time_stamp = *ts;
1090
1091 return event;
1092
1093 out_unlock:
1094 __raw_spin_unlock(&cpu_buffer->lock);
1095 local_irq_restore(flags);
1096 return NULL;
1097}
1098
1099static int
1100rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1101 u64 *ts, u64 *delta)
1102{
1103 struct ring_buffer_event *event;
1104 static int once;
1105 int ret;
1106
1107 if (unlikely(*delta > (1ULL << 59) && !once++)) {
1108 printk(KERN_WARNING "Delta way too big! %llu"
1109 " ts=%llu write stamp = %llu\n",
1110 (unsigned long long)*delta,
1111 (unsigned long long)*ts,
1112 (unsigned long long)cpu_buffer->write_stamp);
1113 WARN_ON(1);
1114 }
1115
1116 /*
1117	 * The delta is too big; we need to add a
1118 * new timestamp.
1119 */
1120 event = __rb_reserve_next(cpu_buffer,
1121 RINGBUF_TYPE_TIME_EXTEND,
1122 RB_LEN_TIME_EXTEND,
1123 ts);
1124 if (!event)
1125 return -EBUSY;
1126
1127 if (PTR_ERR(event) == -EAGAIN)
1128 return -EAGAIN;
1129
1130	/* Only a committed time event can update the write stamp */
1131 if (rb_is_commit(cpu_buffer, event)) {
1132 /*
1133 * If this is the first on the page, then we need to
1134 * update the page itself, and just put in a zero.
1135 */
1136 if (rb_event_index(event)) {
1137 event->time_delta = *delta & TS_MASK;
1138 event->array[0] = *delta >> TS_SHIFT;
1139 } else {
1140 cpu_buffer->commit_page->page->time_stamp = *ts;
1141 event->time_delta = 0;
1142 event->array[0] = 0;
1143 }
1144 cpu_buffer->write_stamp = *ts;
1145 /* let the caller know this was the commit */
1146 ret = 1;
1147 } else {
1148 /* Darn, this is just wasted space */
1149 event->time_delta = 0;
1150 event->array[0] = 0;
1151 ret = 0;
1152 }
1153
1154 *delta = 0;
1155
1156 return ret;
1157}
1158
1159static struct ring_buffer_event *
1160rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1161 unsigned type, unsigned long length)
1162{
1163 struct ring_buffer_event *event;
1164 u64 ts, delta;
1165 int commit = 0;
1166 int nr_loops = 0;
1167
1168 again:
1169 /*
1170 * We allow for interrupts to reenter here and do a trace.
1171 * If one does, it will cause this original code to loop
1172 * back here. Even with heavy interrupts happening, this
1173 * should only happen a few times in a row. If this happens
1174 * 1000 times in a row, there must be either an interrupt
1175 * storm or we have something buggy.
1176 * Bail!
1177 */
1178 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1179 return NULL;
1180
1181 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1182
1183 /*
1184 * Only the first commit can update the timestamp.
1185 * Yes there is a race here. If an interrupt comes in
1186 * just after the conditional and it traces too, then it
1187 * will also check the deltas. More than one timestamp may
1188 * also be made. But only the entry that did the actual
1189 * commit will be something other than zero.
1190 */
1191 if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
1192 rb_page_write(cpu_buffer->tail_page) ==
1193 rb_commit_index(cpu_buffer)) {
1194
1195 delta = ts - cpu_buffer->write_stamp;
1196
1197 /* make sure this delta is calculated here */
1198 barrier();
1199
1200 /* Did the write stamp get updated already? */
1201 if (unlikely(ts < cpu_buffer->write_stamp))
1202 delta = 0;
1203
1204 if (test_time_stamp(delta)) {
1205
1206 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1207
1208 if (commit == -EBUSY)
1209 return NULL;
1210
1211 if (commit == -EAGAIN)
1212 goto again;
1213
1214 RB_WARN_ON(cpu_buffer, commit < 0);
1215 }
1216 } else
1217 /* Non commits have zero deltas */
1218 delta = 0;
1219
1220 event = __rb_reserve_next(cpu_buffer, type, length, &ts);
1221 if (PTR_ERR(event) == -EAGAIN)
1222 goto again;
1223
1224 if (!event) {
1225 if (unlikely(commit))
1226 /*
1227			 * Ouch! We needed a timestamp and it was committed. But
1228 * we didn't get our event reserved.
1229 */
1230 rb_set_commit_to_write(cpu_buffer);
1231 return NULL;
1232 }
1233
1234 /*
1235	 * If the timestamp was committed, make the commit our entry
1236 * now so that we will update it when needed.
1237 */
1238 if (commit)
1239 rb_set_commit_event(cpu_buffer, event);
1240 else if (!rb_is_commit(cpu_buffer, event))
1241 delta = 0;
1242
1243 event->time_delta = delta;
1244
1245 return event;
1246}
1247
1248static DEFINE_PER_CPU(int, rb_need_resched);
1249
1250/**
1251 * ring_buffer_lock_reserve - reserve a part of the buffer
1252 * @buffer: the ring buffer to reserve from
1253 * @length: the length of the data to reserve (excluding event header)
1254 * @flags: a pointer to save the interrupt flags
1255 *
1256 * Returns a reserved event on the ring buffer to copy directly into.
1257 * The user of this interface will need to get the body to write into
1258 * and can use the ring_buffer_event_data() interface.
1259 *
1260 * The length is the length of the data needed, not the event length
1261 * which also includes the event header.
1262 *
1263 * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
1264 * If NULL is returned, then nothing has been allocated or locked.
1265 */
1266struct ring_buffer_event *
1267ring_buffer_lock_reserve(struct ring_buffer *buffer,
1268 unsigned long length,
1269 unsigned long *flags)
1270{
1271 struct ring_buffer_per_cpu *cpu_buffer;
1272 struct ring_buffer_event *event;
1273 int cpu, resched;
1274
1275 if (ring_buffer_flags != RB_BUFFERS_ON)
1276 return NULL;
1277
1278 if (atomic_read(&buffer->record_disabled))
1279 return NULL;
1280
1281 /* If we are tracing schedule, we don't want to recurse */
1282 resched = ftrace_preempt_disable();
1283
1284 cpu = raw_smp_processor_id();
1285
1286 if (!cpu_isset(cpu, buffer->cpumask))
1287 goto out;
1288
1289 cpu_buffer = buffer->buffers[cpu];
1290
1291 if (atomic_read(&cpu_buffer->record_disabled))
1292 goto out;
1293
1294 length = rb_calculate_event_length(length);
1295 if (length > BUF_PAGE_SIZE)
1296 goto out;
1297
1298 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
1299 if (!event)
1300 goto out;
1301
1302 /*
1303 * Need to store resched state on this cpu.
1304 * Only the first needs to.
1305 */
1306
1307 if (preempt_count() == 1)
1308 per_cpu(rb_need_resched, cpu) = resched;
1309
1310 return event;
1311
1312 out:
1313 ftrace_preempt_enable(resched);
1314 return NULL;
1315}
1316EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1317
1318static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1319 struct ring_buffer_event *event)
1320{
1321 cpu_buffer->entries++;
1322
1323 /* Only process further if we own the commit */
1324 if (!rb_is_commit(cpu_buffer, event))
1325 return;
1326
1327 cpu_buffer->write_stamp += event->time_delta;
1328
1329 rb_set_commit_to_write(cpu_buffer);
1330}
1331
1332/**
1333 * ring_buffer_unlock_commit - commit a reserved event
1334 * @buffer: The buffer to commit to
1335 * @event: The event pointer to commit.
1336 * @flags: the interrupt flags received from ring_buffer_lock_reserve.
1337 *
1338 * This commits the data to the ring buffer, and releases any locks held.
1339 *
1340 * Must be paired with ring_buffer_lock_reserve.
1341 */
1342int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1343 struct ring_buffer_event *event,
1344 unsigned long flags)
1345{
1346 struct ring_buffer_per_cpu *cpu_buffer;
1347 int cpu = raw_smp_processor_id();
1348
1349 cpu_buffer = buffer->buffers[cpu];
1350
1351 rb_commit(cpu_buffer, event);
1352
1353 /*
1354	 * Only the outermost preempt count needs to restore preemption.
1355 */
1356 if (preempt_count() == 1)
1357 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1358 else
1359 preempt_enable_no_resched_notrace();
1360
1361 return 0;
1362}
1363EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
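
/*
 * Editor's sketch, not part of the original file: how a writer is
 * expected to pair ring_buffer_lock_reserve() with
 * ring_buffer_unlock_commit(), following the kerneldoc above.  The
 * struct rb_sample type and write_sample() helper are hypothetical.
 */
struct rb_sample {
	u32	id;
	u64	value;
};

static int write_sample(struct ring_buffer *buffer, u32 id, u64 value)
{
	struct ring_buffer_event *event;
	struct rb_sample *entry;
	unsigned long flags;

	/* The length covers the payload only; the event header is added. */
	event = ring_buffer_lock_reserve(buffer, sizeof(*entry), &flags);
	if (!event)
		return -EBUSY;

	entry = ring_buffer_event_data(event);
	entry->id = id;
	entry->value = value;

	/* The commit also restores the preemption state taken above. */
	return ring_buffer_unlock_commit(buffer, event, flags);
}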
1364
1365/**
1366 * ring_buffer_write - write data to the buffer without reserving
1367 * @buffer: The ring buffer to write to.
1368 * @length: The length of the data being written (excluding the event header)
1369 * @data: The data to write to the buffer.
1370 *
1371 * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
1372 * one function. If you already have the data to write to the buffer, it
1373 * may be easier to simply call this function.
1374 *
1375 * Note, like ring_buffer_lock_reserve, the length is the length of the data
1376 * and not the length of the event which would hold the header.
1377 */
1378int ring_buffer_write(struct ring_buffer *buffer,
1379 unsigned long length,
1380 void *data)
1381{
1382 struct ring_buffer_per_cpu *cpu_buffer;
1383 struct ring_buffer_event *event;
1384 unsigned long event_length;
1385 void *body;
1386 int ret = -EBUSY;
1387 int cpu, resched;
1388
1389 if (ring_buffer_flags != RB_BUFFERS_ON)
1390 return -EBUSY;
1391
1392 if (atomic_read(&buffer->record_disabled))
1393 return -EBUSY;
1394
1395 resched = ftrace_preempt_disable();
1396
1397 cpu = raw_smp_processor_id();
1398
1399 if (!cpu_isset(cpu, buffer->cpumask))
1400 goto out;
1401
1402 cpu_buffer = buffer->buffers[cpu];
1403
1404 if (atomic_read(&cpu_buffer->record_disabled))
1405 goto out;
1406
1407 event_length = rb_calculate_event_length(length);
1408 event = rb_reserve_next_event(cpu_buffer,
1409 RINGBUF_TYPE_DATA, event_length);
1410 if (!event)
1411 goto out;
1412
1413 body = rb_event_data(event);
1414
1415 memcpy(body, data, length);
1416
1417 rb_commit(cpu_buffer, event);
1418
1419 ret = 0;
1420 out:
1421 ftrace_preempt_enable(resched);
1422
1423 return ret;
1424}
1425EXPORT_SYMBOL_GPL(ring_buffer_write);
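
/*
 * Editor's sketch, not part of the original file: when the payload is
 * already sitting in a local buffer, ring_buffer_write() replaces the
 * reserve/copy/commit sequence shown above.  log_string() is a
 * hypothetical helper.
 */
static int log_string(struct ring_buffer *buffer, const char *str)
{
	/* As with the reserve path, length excludes the event header. */
	return ring_buffer_write(buffer, strlen(str) + 1, (void *)str);
}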
1426
1427static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1428{
1429 struct buffer_page *reader = cpu_buffer->reader_page;
1430 struct buffer_page *head = cpu_buffer->head_page;
1431 struct buffer_page *commit = cpu_buffer->commit_page;
1432
1433 return reader->read == rb_page_commit(reader) &&
1434 (commit == reader ||
1435 (commit == head &&
1436 head->read == rb_page_commit(commit)));
1437}
1438
1439/**
1440 * ring_buffer_record_disable - stop all writes into the buffer
1441 * @buffer: The ring buffer to stop writes to.
1442 *
1443 * This prevents all writes to the buffer. Any attempt to write
1444 * to the buffer after this will fail and return NULL.
1445 *
1446 * The caller should call synchronize_sched() after this.
1447 */
1448void ring_buffer_record_disable(struct ring_buffer *buffer)
1449{
1450 atomic_inc(&buffer->record_disabled);
1451}
1452EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
1453
1454/**
1455 * ring_buffer_record_enable - enable writes to the buffer
1456 * @buffer: The ring buffer to enable writes
1457 *
1458 * Note, multiple disables will need the same number of enables
1459 * to truly enable the writing (much like preempt_disable).
1460 */
1461void ring_buffer_record_enable(struct ring_buffer *buffer)
1462{
1463 atomic_dec(&buffer->record_disabled);
1464}
1465EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
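
/*
 * Editor's sketch, not part of the original file: quiescing writers
 * before a bulk read or reset, following the synchronize_sched() advice
 * in the kerneldoc above.  with_recording_stopped() is hypothetical.
 */
static void with_recording_stopped(struct ring_buffer *buffer)
{
	ring_buffer_record_disable(buffer);
	synchronize_sched();		/* wait for in-flight writers */

	/* ... read, reset or resize the buffer here ... */

	ring_buffer_record_enable(buffer);
}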
1466
1467/**
1468 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
1469 * @buffer: The ring buffer to stop writes to.
1470 * @cpu: The CPU buffer to stop
1471 *
1472 * This prevents all writes to the buffer. Any attempt to write
1473 * to the buffer after this will fail and return NULL.
1474 *
1475 * The caller should call synchronize_sched() after this.
1476 */
1477void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1478{
1479 struct ring_buffer_per_cpu *cpu_buffer;
1480
1481 if (!cpu_isset(cpu, buffer->cpumask))
1482 return;
1483
1484 cpu_buffer = buffer->buffers[cpu];
1485 atomic_inc(&cpu_buffer->record_disabled);
1486}
1487EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
1488
1489/**
1490 * ring_buffer_record_enable_cpu - enable writes to the buffer
1491 * @buffer: The ring buffer to enable writes
1492 * @cpu: The CPU to enable.
1493 *
1494 * Note, multiple disables will need the same number of enables
1495 * to truly enable the writing (much like preempt_disable).
1496 */
1497void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1498{
1499 struct ring_buffer_per_cpu *cpu_buffer;
1500
1501 if (!cpu_isset(cpu, buffer->cpumask))
1502 return;
1503
1504 cpu_buffer = buffer->buffers[cpu];
1505 atomic_dec(&cpu_buffer->record_disabled);
1506}
1507EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1508
1509/**
1510 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
1511 * @buffer: The ring buffer
1512 * @cpu: The per CPU buffer to get the entries from.
1513 */
1514unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1515{
1516 struct ring_buffer_per_cpu *cpu_buffer;
1517
1518 if (!cpu_isset(cpu, buffer->cpumask))
1519 return 0;
1520
1521 cpu_buffer = buffer->buffers[cpu];
1522 return cpu_buffer->entries;
1523}
1524EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1525
1526/**
1527 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
1528 * @buffer: The ring buffer
1529 * @cpu: The per CPU buffer to get the number of overruns from
1530 */
1531unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1532{
1533 struct ring_buffer_per_cpu *cpu_buffer;
1534
1535 if (!cpu_isset(cpu, buffer->cpumask))
1536 return 0;
1537
1538 cpu_buffer = buffer->buffers[cpu];
1539 return cpu_buffer->overrun;
1540}
1541EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1542
1543/**
1544 * ring_buffer_entries - get the number of entries in a buffer
1545 * @buffer: The ring buffer
1546 *
1547 * Returns the total number of entries in the ring buffer
1548 * (all CPU entries)
1549 */
1550unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1551{
1552 struct ring_buffer_per_cpu *cpu_buffer;
1553 unsigned long entries = 0;
1554 int cpu;
1555
1556 /* if you care about this being correct, lock the buffer */
1557 for_each_buffer_cpu(buffer, cpu) {
1558 cpu_buffer = buffer->buffers[cpu];
1559 entries += cpu_buffer->entries;
1560 }
1561
1562 return entries;
1563}
1564EXPORT_SYMBOL_GPL(ring_buffer_entries);
1565
1566/**
1567 * ring_buffer_overruns - get the number of overruns in the buffer
1568 * @buffer: The ring buffer
1569 *
1570 * Returns the total number of overruns in the ring buffer
1571 * (all CPU entries)
1572 */
1573unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1574{
1575 struct ring_buffer_per_cpu *cpu_buffer;
1576 unsigned long overruns = 0;
1577 int cpu;
1578
1579 /* if you care about this being correct, lock the buffer */
1580 for_each_buffer_cpu(buffer, cpu) {
1581 cpu_buffer = buffer->buffers[cpu];
1582 overruns += cpu_buffer->overrun;
1583 }
1584
1585 return overruns;
1586}
1587EXPORT_SYMBOL_GPL(ring_buffer_overruns);
1588
1589static void rb_iter_reset(struct ring_buffer_iter *iter)
1590{
1591 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1592
1593	/* Iterator usage is expected to have recording disabled */
1594 if (list_empty(&cpu_buffer->reader_page->list)) {
1595 iter->head_page = cpu_buffer->head_page;
1596 iter->head = cpu_buffer->head_page->read;
1597 } else {
1598 iter->head_page = cpu_buffer->reader_page;
1599 iter->head = cpu_buffer->reader_page->read;
1600 }
1601 if (iter->head)
1602 iter->read_stamp = cpu_buffer->read_stamp;
1603 else
1604 iter->read_stamp = iter->head_page->page->time_stamp;
1605}
1606
1607/**
1608 * ring_buffer_iter_reset - reset an iterator
1609 * @iter: The iterator to reset
1610 *
1611 * Resets the iterator, so that it will start from the beginning
1612 * again.
1613 */
1614void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1615{
1616 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1617 unsigned long flags;
1618
1619 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1620 rb_iter_reset(iter);
1621 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1622}
1623EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
1624
1625/**
1626 * ring_buffer_iter_empty - check if an iterator has no more to read
1627 * @iter: The iterator to check
1628 */
1629int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1630{
1631 struct ring_buffer_per_cpu *cpu_buffer;
1632
1633 cpu_buffer = iter->cpu_buffer;
1634
1635 return iter->head_page == cpu_buffer->commit_page &&
1636 iter->head == rb_commit_index(cpu_buffer);
1637}
1638EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
1639
1640static void
1641rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1642 struct ring_buffer_event *event)
1643{
1644 u64 delta;
1645
1646 switch (event->type) {
1647 case RINGBUF_TYPE_PADDING:
1648 return;
1649
1650 case RINGBUF_TYPE_TIME_EXTEND:
1651 delta = event->array[0];
1652 delta <<= TS_SHIFT;
1653 delta += event->time_delta;
1654 cpu_buffer->read_stamp += delta;
1655 return;
1656
1657 case RINGBUF_TYPE_TIME_STAMP:
1658 /* FIXME: not implemented */
1659 return;
1660
1661 case RINGBUF_TYPE_DATA:
1662 cpu_buffer->read_stamp += event->time_delta;
1663 return;
1664
1665 default:
1666 BUG();
1667 }
1668 return;
1669}
1670
1671static void
1672rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1673 struct ring_buffer_event *event)
1674{
1675 u64 delta;
1676
1677 switch (event->type) {
1678 case RINGBUF_TYPE_PADDING:
1679 return;
1680
1681 case RINGBUF_TYPE_TIME_EXTEND:
1682 delta = event->array[0];
1683 delta <<= TS_SHIFT;
1684 delta += event->time_delta;
1685 iter->read_stamp += delta;
1686 return;
1687
1688 case RINGBUF_TYPE_TIME_STAMP:
1689 /* FIXME: not implemented */
1690 return;
1691
1692 case RINGBUF_TYPE_DATA:
1693 iter->read_stamp += event->time_delta;
1694 return;
1695
1696 default:
1697 BUG();
1698 }
1699 return;
1700}
1701
1702static struct buffer_page *
1703rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1704{
1705 struct buffer_page *reader = NULL;
1706 unsigned long flags;
1707 int nr_loops = 0;
1708
1709 local_irq_save(flags);
1710 __raw_spin_lock(&cpu_buffer->lock);
1711
1712 again:
1713 /*
1714 * This should normally only loop twice. But because the
1715 * start of the reader inserts an empty page, it causes
1716 * a case where we will loop three times. There should be no
1717 * reason to loop four times (that I know of).
1718 */
1719 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
1720 reader = NULL;
1721 goto out;
1722 }
1723
1724 reader = cpu_buffer->reader_page;
1725
1726 /* If there's more to read, return this page */
1727 if (cpu_buffer->reader_page->read < rb_page_size(reader))
1728 goto out;
1729
1730 /* Never should we have an index greater than the size */
1731 if (RB_WARN_ON(cpu_buffer,
1732 cpu_buffer->reader_page->read > rb_page_size(reader)))
1733 goto out;
1734
1735 /* check if we caught up to the tail */
1736 reader = NULL;
1737 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
1738 goto out;
1739
1740 /*
1741 * Splice the empty reader page into the list around the head.
1742 * Reset the reader page to size zero.
1743 */
1744
1745 reader = cpu_buffer->head_page;
1746 cpu_buffer->reader_page->list.next = reader->list.next;
1747 cpu_buffer->reader_page->list.prev = reader->list.prev;
1748
1749 local_set(&cpu_buffer->reader_page->write, 0);
1750 local_set(&cpu_buffer->reader_page->page->commit, 0);
1751
1752 /* Make the reader page now replace the head */
1753 reader->list.prev->next = &cpu_buffer->reader_page->list;
1754 reader->list.next->prev = &cpu_buffer->reader_page->list;
1755
1756 /*
1757 * If the tail is on the reader, then we must set the head
1758 * to the inserted page, otherwise we set it one before.
1759 */
1760 cpu_buffer->head_page = cpu_buffer->reader_page;
1761
1762 if (cpu_buffer->commit_page != reader)
1763 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
1764
1765 /* Finally update the reader page to the new head */
1766 cpu_buffer->reader_page = reader;
1767 rb_reset_reader_page(cpu_buffer);
1768
1769 goto again;
1770
1771 out:
1772 __raw_spin_unlock(&cpu_buffer->lock);
1773 local_irq_restore(flags);
1774
1775 return reader;
1776}
1777
1778static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1779{
1780 struct ring_buffer_event *event;
1781 struct buffer_page *reader;
1782 unsigned length;
1783
1784 reader = rb_get_reader_page(cpu_buffer);
1785
1786 /* This function should not be called when buffer is empty */
1787 if (RB_WARN_ON(cpu_buffer, !reader))
1788 return;
1789
1790 event = rb_reader_event(cpu_buffer);
1791
1792 if (event->type == RINGBUF_TYPE_DATA)
1793 cpu_buffer->entries--;
1794
1795 rb_update_read_stamp(cpu_buffer, event);
1796
1797 length = rb_event_length(event);
1798 cpu_buffer->reader_page->read += length;
1799}
1800
1801static void rb_advance_iter(struct ring_buffer_iter *iter)
1802{
1803 struct ring_buffer *buffer;
1804 struct ring_buffer_per_cpu *cpu_buffer;
1805 struct ring_buffer_event *event;
1806 unsigned length;
1807
1808 cpu_buffer = iter->cpu_buffer;
1809 buffer = cpu_buffer->buffer;
1810
1811 /*
1812 * Check if we are at the end of the buffer.
1813 */
1814 if (iter->head >= rb_page_size(iter->head_page)) {
1815 if (RB_WARN_ON(buffer,
1816 iter->head_page == cpu_buffer->commit_page))
1817 return;
1818 rb_inc_iter(iter);
1819 return;
1820 }
1821
1822 event = rb_iter_head_event(iter);
1823
1824 length = rb_event_length(event);
1825
1826 /*
1827 * This should not be called to advance the header if we are
1828 * at the tail of the buffer.
1829 */
1830 if (RB_WARN_ON(cpu_buffer,
1831 (iter->head_page == cpu_buffer->commit_page) &&
1832 (iter->head + length > rb_commit_index(cpu_buffer))))
1833 return;
1834
1835 rb_update_iter_read_stamp(iter, event);
1836
1837 iter->head += length;
1838
1839 /* check for end of page padding */
1840 if ((iter->head >= rb_page_size(iter->head_page)) &&
1841 (iter->head_page != cpu_buffer->commit_page))
1842 rb_advance_iter(iter);
1843}
1844
1845static struct ring_buffer_event *
1846rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1847{
1848 struct ring_buffer_per_cpu *cpu_buffer;
1849 struct ring_buffer_event *event;
1850 struct buffer_page *reader;
1851 int nr_loops = 0;
1852
1853 if (!cpu_isset(cpu, buffer->cpumask))
1854 return NULL;
1855
1856 cpu_buffer = buffer->buffers[cpu];
1857
1858 again:
1859 /*
1860 * We repeat when a timestamp is encountered. It is possible
1861 * to get multiple timestamps from an interrupt entering just
1862 * as one timestamp is about to be written. The max times
1863 * that this can happen is the number of nested interrupts we
1864 * can have. Nesting 10 deep of interrupts is clearly
1865 * an anomaly.
1866 */
1867 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1868 return NULL;
1869
1870 reader = rb_get_reader_page(cpu_buffer);
1871 if (!reader)
1872 return NULL;
1873
1874 event = rb_reader_event(cpu_buffer);
1875
1876 switch (event->type) {
1877 case RINGBUF_TYPE_PADDING:
1878 RB_WARN_ON(cpu_buffer, 1);
1879 rb_advance_reader(cpu_buffer);
1880 return NULL;
1881
1882 case RINGBUF_TYPE_TIME_EXTEND:
1883 /* Internal data, OK to advance */
1884 rb_advance_reader(cpu_buffer);
1885 goto again;
1886
1887 case RINGBUF_TYPE_TIME_STAMP:
1888 /* FIXME: not implemented */
1889 rb_advance_reader(cpu_buffer);
1890 goto again;
1891
1892 case RINGBUF_TYPE_DATA:
1893 if (ts) {
1894 *ts = cpu_buffer->read_stamp + event->time_delta;
1895 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
1896 }
1897 return event;
1898
1899 default:
1900 BUG();
1901 }
1902
1903 return NULL;
1904}
1905EXPORT_SYMBOL_GPL(ring_buffer_peek);
1906
1907static struct ring_buffer_event *
1908rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1909{
1910 struct ring_buffer *buffer;
1911 struct ring_buffer_per_cpu *cpu_buffer;
1912 struct ring_buffer_event *event;
1913 int nr_loops = 0;
1914
1915 if (ring_buffer_iter_empty(iter))
1916 return NULL;
1917
1918 cpu_buffer = iter->cpu_buffer;
1919 buffer = cpu_buffer->buffer;
1920
1921 again:
1922 /*
1923 * We repeat when a timestamp is encountered. It is possible
1924 * to get multiple timestamps from an interrupt entering just
1925 * as one timestamp is about to be written. The max times
1926 * that this can happen is the number of nested interrupts we
1927 * can have. Nesting 10 deep of interrupts is clearly
1928 * an anomaly.
1929 */
1930 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1931 return NULL;
1932
1933 if (rb_per_cpu_empty(cpu_buffer))
1934 return NULL;
1935
1936 event = rb_iter_head_event(iter);
1937
1938 switch (event->type) {
1939 case RINGBUF_TYPE_PADDING:
1940 rb_inc_iter(iter);
1941 goto again;
1942
1943 case RINGBUF_TYPE_TIME_EXTEND:
1944 /* Internal data, OK to advance */
1945 rb_advance_iter(iter);
1946 goto again;
1947
1948 case RINGBUF_TYPE_TIME_STAMP:
1949 /* FIXME: not implemented */
1950 rb_advance_iter(iter);
1951 goto again;
1952
1953 case RINGBUF_TYPE_DATA:
1954 if (ts) {
1955 *ts = iter->read_stamp + event->time_delta;
1956 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
1957 }
1958 return event;
1959
1960 default:
1961 BUG();
1962 }
1963
1964 return NULL;
1965}
1966EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
1967
1968/**
1969 * ring_buffer_peek - peek at the next event to be read
1970 * @buffer: The ring buffer to read
1971 * @cpu: The cpu to peek at
1972 * @ts: The timestamp counter of this event.
1973 *
1974 * This will return the event that will be read next, but does
1975 * not consume the data.
1976 */
1977struct ring_buffer_event *
1978ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1979{
1980 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1981 struct ring_buffer_event *event;
1982 unsigned long flags;
1983
1984 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1985 event = rb_buffer_peek(buffer, cpu, ts);
1986 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1987
1988 return event;
1989}
1990
1991/**
1992 * ring_buffer_iter_peek - peek at the next event to be read
1993 * @iter: The ring buffer iterator
1994 * @ts: The timestamp counter of this event.
1995 *
1996 * This will return the event that will be read next, but does
1997 * not increment the iterator.
1998 */
1999struct ring_buffer_event *
2000ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2001{
2002 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2003 struct ring_buffer_event *event;
2004 unsigned long flags;
2005
2006 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2007 event = rb_iter_peek(iter, ts);
2008 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2009
2010 return event;
2011}
2012
2013/**
2014 * ring_buffer_consume - return an event and consume it
2015 * @buffer: The ring buffer to get the next event from
2016 *
2017 * Returns the next event in the ring buffer, and that event is consumed.
2018 * Meaning that sequential reads will keep returning a different event,
2019 * and eventually empty the ring buffer if the producer is slower.
2020 */
2021struct ring_buffer_event *
2022ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2023{
2024 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2025 struct ring_buffer_event *event;
2026 unsigned long flags;
2027
2028 if (!cpu_isset(cpu, buffer->cpumask))
2029 return NULL;
2030
2031 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2032
2033 event = rb_buffer_peek(buffer, cpu, ts);
2034 if (!event)
2035 goto out;
2036
2037 rb_advance_reader(cpu_buffer);
2038
2039 out:
2040 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2041
2042 return event;
2043}
2044EXPORT_SYMBOL_GPL(ring_buffer_consume);
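
/*
 * Editor's sketch, not part of the original file: draining one CPU's
 * events with the consuming reader.  It reuses the hypothetical
 * struct rb_sample from the write_sample() sketch above.
 */
static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	u64 ts;

	while ((event = ring_buffer_consume(buffer, cpu, &ts))) {
		struct rb_sample *entry = ring_buffer_event_data(event);

		printk(KERN_INFO "cpu%d ts=%llu id=%u\n", cpu,
		       (unsigned long long)ts, entry->id);
	}
}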
2045
2046/**
2047 * ring_buffer_read_start - start a non consuming read of the buffer
2048 * @buffer: The ring buffer to read from
2049 * @cpu: The cpu buffer to iterate over
2050 *
2051 * This starts up an iteration through the buffer. It also disables
2052 * the recording to the buffer until the reading is finished.
2053 * This prevents the reading from being corrupted. This is not
2054 * a consuming read, so a producer is not expected.
2055 *
2056 * Must be paired with ring_buffer_read_finish.
2057 */
2058struct ring_buffer_iter *
2059ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2060{
2061 struct ring_buffer_per_cpu *cpu_buffer;
2062 struct ring_buffer_iter *iter;
2063 unsigned long flags;
2064
2065 if (!cpu_isset(cpu, buffer->cpumask))
2066 return NULL;
2067
2068 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
2069 if (!iter)
2070 return NULL;
2071
2072 cpu_buffer = buffer->buffers[cpu];
2073
2074 iter->cpu_buffer = cpu_buffer;
2075
2076 atomic_inc(&cpu_buffer->record_disabled);
2077 synchronize_sched();
2078
2079 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2080 __raw_spin_lock(&cpu_buffer->lock);
2081 rb_iter_reset(iter);
2082 __raw_spin_unlock(&cpu_buffer->lock);
2083 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2084
2085 return iter;
2086}
2087EXPORT_SYMBOL_GPL(ring_buffer_read_start);
2088
2089/**
2090 * ring_buffer_read_finish - finish reading the iterator of the buffer
2091 * @iter: The iterator retrieved by ring_buffer_read_start
2092 *
2093 * This re-enables the recording to the buffer, and frees the
2094 * iterator.
2095 */
2096void
2097ring_buffer_read_finish(struct ring_buffer_iter *iter)
2098{
2099 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2100
2101 atomic_dec(&cpu_buffer->record_disabled);
2102 kfree(iter);
2103}
2104EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
2105
2106/**
2107 * ring_buffer_read - read the next item in the ring buffer by the iterator
2108 * @iter: The ring buffer iterator
2109 * @ts: The time stamp of the event read.
2110 *
2111 * This reads the next event in the ring buffer and increments the iterator.
2112 */
2113struct ring_buffer_event *
2114ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2115{
2116 struct ring_buffer_event *event;
2117 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2118 unsigned long flags;
2119
2120 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2121 event = rb_iter_peek(iter, ts);
2122 if (!event)
2123 goto out;
2124
2125 rb_advance_iter(iter);
2126 out:
2127 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2128
2129 return event;
2130}
2131EXPORT_SYMBOL_GPL(ring_buffer_read);
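
/*
 * Editor's sketch, not part of the original file: a non-consuming walk
 * over one CPU buffer with the iterator API above.  Recording on that
 * CPU stays disabled between read_start and read_finish.  dump_cpu()
 * is a hypothetical helper.
 */
static void dump_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_iter *iter;
	struct ring_buffer_event *event;
	u64 ts;

	iter = ring_buffer_read_start(buffer, cpu);
	if (!iter)
		return;

	while ((event = ring_buffer_read(iter, &ts)))
		printk(KERN_INFO "cpu%d ts=%llu delta=%u\n", cpu,
		       (unsigned long long)ts,
		       (unsigned int)event->time_delta);

	ring_buffer_read_finish(iter);
}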
2132
2133/**
2134 * ring_buffer_size - return the size of the ring buffer (in bytes)
2135 * @buffer: The ring buffer.
2136 */
2137unsigned long ring_buffer_size(struct ring_buffer *buffer)
2138{
2139 return BUF_PAGE_SIZE * buffer->pages;
2140}
2141EXPORT_SYMBOL_GPL(ring_buffer_size);
2142
2143static void
2144rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2145{
2146 cpu_buffer->head_page
2147 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
2148 local_set(&cpu_buffer->head_page->write, 0);
2149 local_set(&cpu_buffer->head_page->page->commit, 0);
2150
2151 cpu_buffer->head_page->read = 0;
2152
2153 cpu_buffer->tail_page = cpu_buffer->head_page;
2154 cpu_buffer->commit_page = cpu_buffer->head_page;
2155
2156 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
2157 local_set(&cpu_buffer->reader_page->write, 0);
2158 local_set(&cpu_buffer->reader_page->page->commit, 0);
2159 cpu_buffer->reader_page->read = 0;
2160
2161 cpu_buffer->overrun = 0;
2162 cpu_buffer->entries = 0;
2163}
2164
2165/**
2166 * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
2167 * @buffer: The ring buffer to reset a per cpu buffer of
2168 * @cpu: The CPU buffer to be reset
2169 */
2170void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2171{
2172 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2173 unsigned long flags;
2174
2175 if (!cpu_isset(cpu, buffer->cpumask))
2176 return;
2177
2178 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2179
2180 __raw_spin_lock(&cpu_buffer->lock);
2181
2182 rb_reset_cpu(cpu_buffer);
2183
2184 __raw_spin_unlock(&cpu_buffer->lock);
2185
2186 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2187}
2188EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2189
2190/**
2191 * ring_buffer_reset - reset a ring buffer
2192 * @buffer: The ring buffer to reset all cpu buffers
2193 */
2194void ring_buffer_reset(struct ring_buffer *buffer)
2195{
2196 int cpu;
2197
2198 for_each_buffer_cpu(buffer, cpu)
2199 ring_buffer_reset_cpu(buffer, cpu);
2200}
2201EXPORT_SYMBOL_GPL(ring_buffer_reset);
2202
2203/**
2204 * ring_buffer_empty - is the ring buffer empty?
2205 * @buffer: The ring buffer to test
2206 */
2207int ring_buffer_empty(struct ring_buffer *buffer)
2208{
2209 struct ring_buffer_per_cpu *cpu_buffer;
2210 int cpu;
2211
2212 /* yes this is racy, but if you don't like the race, lock the buffer */
2213 for_each_buffer_cpu(buffer, cpu) {
2214 cpu_buffer = buffer->buffers[cpu];
2215 if (!rb_per_cpu_empty(cpu_buffer))
2216 return 0;
2217 }
2218 return 1;
2219}
2220EXPORT_SYMBOL_GPL(ring_buffer_empty);
2221
2222/**
2223 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
2224 * @buffer: The ring buffer
2225 * @cpu: The CPU buffer to test
2226 */
2227int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2228{
2229 struct ring_buffer_per_cpu *cpu_buffer;
2230
2231 if (!cpu_isset(cpu, buffer->cpumask))
2232 return 1;
2233
2234 cpu_buffer = buffer->buffers[cpu];
2235 return rb_per_cpu_empty(cpu_buffer);
2236}
2237EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2238
2239/**
2240 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2241 * @buffer_a: One buffer to swap with
2242 * @buffer_b: The other buffer to swap with
2243 *
2244 * This function is useful for tracers that want to take a "snapshot"
2245 * of a CPU buffer and have another backup buffer lying around.
2246 * It is expected that the tracer handles the cpu buffer not being
2247 * used at the moment.
2248 */
2249int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2250 struct ring_buffer *buffer_b, int cpu)
2251{
2252 struct ring_buffer_per_cpu *cpu_buffer_a;
2253 struct ring_buffer_per_cpu *cpu_buffer_b;
2254
2255 if (!cpu_isset(cpu, buffer_a->cpumask) ||
2256 !cpu_isset(cpu, buffer_b->cpumask))
2257 return -EINVAL;
2258
2259 /* At least make sure the two buffers are somewhat the same */
2260 if (buffer_a->pages != buffer_b->pages)
2261 return -EINVAL;
2262
2263 cpu_buffer_a = buffer_a->buffers[cpu];
2264 cpu_buffer_b = buffer_b->buffers[cpu];
2265
2266 /*
2267 * We can't do a synchronize_sched here because this
2268 * function can be called in atomic context.
2269 * Normally this will be called from the same CPU as cpu.
2270 * If not it's up to the caller to protect this.
2271 */
2272 atomic_inc(&cpu_buffer_a->record_disabled);
2273 atomic_inc(&cpu_buffer_b->record_disabled);
2274
2275 buffer_a->buffers[cpu] = cpu_buffer_b;
2276 buffer_b->buffers[cpu] = cpu_buffer_a;
2277
2278 cpu_buffer_b->buffer = buffer_a;
2279 cpu_buffer_a->buffer = buffer_b;
2280
2281 atomic_dec(&cpu_buffer_a->record_disabled);
2282 atomic_dec(&cpu_buffer_b->record_disabled);
2283
2284 return 0;
2285}
2286EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
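
/*
 * Editor's sketch, not part of the original file: the "snapshot" use
 * described above.  The spare buffer is assumed to have been allocated
 * with the same number of pages; snapshot_cpu() is hypothetical.
 */
static int snapshot_cpu(struct ring_buffer *live, struct ring_buffer *spare,
			int cpu)
{
	int ret;

	ret = ring_buffer_swap_cpu(live, spare, cpu);
	if (ret)
		return ret;

	/* Events recorded so far on @cpu can now be read from @spare. */
	return 0;
}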
2287
2288static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2289 struct buffer_data_page *bpage)
2290{
2291 struct ring_buffer_event *event;
2292 unsigned long head;
2293
2294 __raw_spin_lock(&cpu_buffer->lock);
2295 for (head = 0; head < local_read(&bpage->commit);
2296 head += rb_event_length(event)) {
2297
2298 event = __rb_data_page_index(bpage, head);
2299 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2300 return;
2301 /* Only count data entries */
2302 if (event->type != RINGBUF_TYPE_DATA)
2303 continue;
2304 cpu_buffer->entries--;
2305 }
2306 __raw_spin_unlock(&cpu_buffer->lock);
2307}
2308
2309/**
2310 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2311 * @buffer: the buffer to allocate for.
2312 *
2313 * This function is used in conjunction with ring_buffer_read_page.
2314 * When reading a full page from the ring buffer, these functions
2315 * can be used to speed up the process. The calling function should
2316 * allocate a few pages first with this function. Then when it
2317 * needs to get pages from the ring buffer, it passes the result
2318 * of this function into ring_buffer_read_page, which will swap
2319 * the page that was allocated with the read page of the buffer.
2320 *
2321 * Returns:
2322 * The page allocated, or NULL on error.
2323 */
2324void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2325{
2326 unsigned long addr;
2327 struct buffer_data_page *bpage;
2328
2329 addr = __get_free_page(GFP_KERNEL);
2330 if (!addr)
2331 return NULL;
2332
2333 bpage = (void *)addr;
2334
2335 return bpage;
2336}
2337
2338/**
2339 * ring_buffer_free_read_page - free an allocated read page
2340 * @buffer: the buffer the page was allocated for
2341 * @data: the page to free
2342 *
2343 * Free a page allocated from ring_buffer_alloc_read_page.
2344 */
2345void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2346{
2347 free_page((unsigned long)data);
2348}
2349
2350/**
2351 * ring_buffer_read_page - extract a page from the ring buffer
2352 * @buffer: buffer to extract from
2353 * @data_page: the page to use, allocated from ring_buffer_alloc_read_page
2354 * @cpu: the cpu of the buffer to extract
2355 * @full: should the extraction only happen when the page is full.
2356 *
2357 * This function will pull out a page from the ring buffer and consume it.
2358 * @data_page must be the address of the variable that was returned
2359 * from ring_buffer_alloc_read_page. This is because the page might be used
2360 * to swap with a page in the ring buffer.
2361 *
2362 * for example:
2363 *	rpage = ring_buffer_alloc_read_page(buffer);
2364 * if (!rpage)
2365 * return error;
2366 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
2367 * if (ret)
2368 * process_page(rpage);
2369 *
2370 * When @full is set, the function will not return true unless
2371 * the writer is off the reader page.
2372 *
2373 * Note: it is up to the calling functions to handle sleeps and wakeups.
2374 * The ring buffer can be used anywhere in the kernel and can not
2375 * blindly call wake_up. The layer that uses the ring buffer must be
2376 * responsible for that.
2377 *
2378 * Returns:
2379 * 1 if data has been transferred
2380 * 0 if no data has been transferred.
2381 */
2382int ring_buffer_read_page(struct ring_buffer *buffer,
2383 void **data_page, int cpu, int full)
2384{
2385 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2386 struct ring_buffer_event *event;
2387 struct buffer_data_page *bpage;
2388 unsigned long flags;
2389 int ret = 0;
2390
2391 if (!data_page)
2392 return 0;
2393
2394 bpage = *data_page;
2395 if (!bpage)
2396 return 0;
2397
2398 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2399
2400 /*
2401 * rb_buffer_peek will get the next ring buffer if
2402 * the current reader page is empty.
2403 */
2404 event = rb_buffer_peek(buffer, cpu, NULL);
2405 if (!event)
2406 goto out;
2407
2408 /* check for data */
2409 if (!local_read(&cpu_buffer->reader_page->page->commit))
2410 goto out;
2411 /*
2412	 * If the writer is already off the reader page, then simply
2413 * switch the read page with the given page. Otherwise
2414 * we need to copy the data from the reader to the writer.
2415 */
2416 if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
2417 unsigned int read = cpu_buffer->reader_page->read;
2418
2419 if (full)
2420 goto out;
2421 /* The writer is still on the reader page, we must copy */
2422 bpage = cpu_buffer->reader_page->page;
2423 memcpy(bpage->data,
2424 cpu_buffer->reader_page->page->data + read,
2425 local_read(&bpage->commit) - read);
2426
2427 /* consume what was read */
2428 cpu_buffer->reader_page += read;
2429
2430 } else {
2431 /* swap the pages */
2432 rb_init_page(bpage);
2433 bpage = cpu_buffer->reader_page->page;
2434 cpu_buffer->reader_page->page = *data_page;
2435 cpu_buffer->reader_page->read = 0;
2436 *data_page = bpage;
2437 }
2438 ret = 1;
2439
2440 /* update the entry counter */
2441 rb_remove_entries(cpu_buffer, bpage);
2442 out:
2443 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2444
2445 return ret;
2446}
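
/*
 * Editor's sketch, not part of the original file: the alloc/read/free
 * cycle for whole-page reads described in the kerneldoc above.
 * copy_cpu_pages() is a hypothetical helper; it loops until the CPU
 * buffer is momentarily empty.
 */
static void copy_cpu_pages(struct ring_buffer *buffer, int cpu)
{
	void *rpage;

	rpage = ring_buffer_alloc_read_page(buffer);
	if (!rpage)
		return;

	/* @full == 0: accept partially filled reader pages as well. */
	while (ring_buffer_read_page(buffer, &rpage, cpu, 0)) {
		struct buffer_data_page *bpage = rpage;

		printk(KERN_INFO "cpu%d: got %ld bytes\n", cpu,
		       local_read(&bpage->commit));
	}

	ring_buffer_free_read_page(buffer, rpage);
}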
2447
2448static ssize_t
2449rb_simple_read(struct file *filp, char __user *ubuf,
2450 size_t cnt, loff_t *ppos)
2451{
2452 long *p = filp->private_data;
2453 char buf[64];
2454 int r;
2455
2456 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
2457 r = sprintf(buf, "permanently disabled\n");
2458 else
2459 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
2460
2461 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2462}
2463
2464static ssize_t
2465rb_simple_write(struct file *filp, const char __user *ubuf,
2466 size_t cnt, loff_t *ppos)
2467{
2468 long *p = filp->private_data;
2469 char buf[64];
2470 long val;
2471 int ret;
2472
2473 if (cnt >= sizeof(buf))
2474 return -EINVAL;
2475
2476 if (copy_from_user(&buf, ubuf, cnt))
2477 return -EFAULT;
2478
2479 buf[cnt] = 0;
2480
2481 ret = strict_strtoul(buf, 10, &val);
2482 if (ret < 0)
2483 return ret;
2484
2485 if (val)
2486 set_bit(RB_BUFFERS_ON_BIT, p);
2487 else
2488 clear_bit(RB_BUFFERS_ON_BIT, p);
2489
2490 (*ppos)++;
2491
2492 return cnt;
2493}
2494
2495static struct file_operations rb_simple_fops = {
2496 .open = tracing_open_generic,
2497 .read = rb_simple_read,
2498 .write = rb_simple_write,
2499};
2500
2501
2502static __init int rb_init_debugfs(void)
2503{
2504 struct dentry *d_tracer;
2505 struct dentry *entry;
2506
2507 d_tracer = tracing_init_dentry();
2508
2509 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2510 &ring_buffer_flags, &rb_simple_fops);
2511 if (!entry)
2512 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2513
2514 return 0;
2515}
2516
2517fs_initcall(rb_init_debugfs);
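
/*
 * Editor's note, not part of the original file: with debugfs mounted,
 * the file created above appears in the tracing directory (typically
 * /sys/kernel/debug/tracing/tracing_on).  Writing 0 or 1 clears or sets
 * RB_BUFFERS_ON_BIT via rb_simple_write(), and reading reports the
 * current state, or "permanently disabled" once
 * RB_BUFFERS_DISABLED_BIT has been set.
 */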
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f3fb3db61c3..4185d5221633 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -14,6 +14,7 @@
14#include <linux/utsrelease.h> 14#include <linux/utsrelease.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/notifier.h>
17#include <linux/debugfs.h> 18#include <linux/debugfs.h>
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
@@ -22,6 +23,7 @@
22#include <linux/ftrace.h> 23#include <linux/ftrace.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/kdebug.h>
25#include <linux/ctype.h> 27#include <linux/ctype.h>
26#include <linux/init.h> 28#include <linux/init.h>
27#include <linux/poll.h> 29#include <linux/poll.h>
@@ -31,24 +33,97 @@
31#include <linux/writeback.h> 33#include <linux/writeback.h>
32 34
33#include <linux/stacktrace.h> 35#include <linux/stacktrace.h>
36#include <linux/ring_buffer.h>
37#include <linux/irqflags.h>
34 38
35#include "trace.h" 39#include "trace.h"
36 40
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42
37unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
38unsigned long __read_mostly tracing_thresh; 44unsigned long __read_mostly tracing_thresh;
39 45
40static unsigned long __read_mostly tracing_nr_buffers; 46/*
47 * We need to change this state when a selftest is running.
48 * A selftest will look into the ring-buffer to count the
49 * entries inserted during the selftest although some concurrent
50 * insertions into the ring-buffer such as ftrace_printk could occur
51 * at the same time, giving false positive or negative results.
52 */
53static bool __read_mostly tracing_selftest_running;
54
55/* For tracers that don't implement custom flags */
56static struct tracer_opt dummy_tracer_opt[] = {
57 { }
58};
59
60static struct tracer_flags dummy_tracer_flags = {
61 .val = 0,
62 .opts = dummy_tracer_opt
63};
64
65static int dummy_set_flag(u32 old_flags, u32 bit, int set)
66{
67 return 0;
68}
69
70/*
71 * Kill all tracing for good (never come back).
72 * It is initialized to 1 but will turn to zero if the initialization
73 * of the tracer is successful. But that is the only place that sets
74 * this back to zero.
75 */
76int tracing_disabled = 1;
77
78static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
79
80static inline void ftrace_disable_cpu(void)
81{
82 preempt_disable();
83 local_inc(&__get_cpu_var(ftrace_cpu_disabled));
84}
85
86static inline void ftrace_enable_cpu(void)
87{
88 local_dec(&__get_cpu_var(ftrace_cpu_disabled));
89 preempt_enable();
90}
91
41static cpumask_t __read_mostly tracing_buffer_mask; 92static cpumask_t __read_mostly tracing_buffer_mask;
42 93
43#define for_each_tracing_cpu(cpu) \ 94#define for_each_tracing_cpu(cpu) \
44 for_each_cpu_mask(cpu, tracing_buffer_mask) 95 for_each_cpu_mask(cpu, tracing_buffer_mask)
45 96
46static int trace_alloc_page(void); 97/*
47static int trace_free_page(void); 98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
99 *
100 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
101 * is set, then ftrace_dump is called. This will output the contents
102 * of the ftrace buffers to the console. This is very useful for
103 * capturing traces that lead to crashes and outputting it to a
104 * serial console.
105 *
106 * It is default off, but you can enable it with either specifying
107 * "ftrace_dump_on_oops" in the kernel command line, or setting
108 * /proc/sys/kernel/ftrace_dump_on_oops to true.
109 */
110int ftrace_dump_on_oops;
48 111
49static int tracing_disabled = 1; 112static int tracing_set_tracer(char *buf);
50 113
51static unsigned long tracing_pages_allocated; 114static int __init set_ftrace(char *str)
115{
116 tracing_set_tracer(str);
117 return 1;
118}
119__setup("ftrace", set_ftrace);
120
121static int __init set_ftrace_dump_on_oops(char *str)
122{
123 ftrace_dump_on_oops = 1;
124 return 1;
125}
126__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
52 127
53long 128long
54ns2usecs(cycle_t nsec) 129ns2usecs(cycle_t nsec)
@@ -60,7 +135,9 @@ ns2usecs(cycle_t nsec)
60 135
61cycle_t ftrace_now(int cpu) 136cycle_t ftrace_now(int cpu)
62{ 137{
63 return cpu_clock(cpu); 138 u64 ts = ring_buffer_time_stamp(cpu);
139 ring_buffer_normalize_time_stamp(cpu, &ts);
140 return ts;
64} 141}
65 142
66/* 143/*
@@ -96,15 +173,35 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
96/* tracer_enabled is used to toggle activation of a tracer */ 173/* tracer_enabled is used to toggle activation of a tracer */
97static int tracer_enabled = 1; 174static int tracer_enabled = 1;
98 175
176/**
177 * tracing_is_enabled - return tracer_enabled status
178 *
179 * This function is used by other tracers to know the status
180 * of the tracer_enabled flag. Tracers may use this function
181 * to know if it should enable their features when starting
182 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
183 */
184int tracing_is_enabled(void)
185{
186 return tracer_enabled;
187}
188
99/* function tracing enabled */ 189/* function tracing enabled */
100int ftrace_function_enabled; 190int ftrace_function_enabled;
101 191
102/* 192/*
103 * trace_nr_entries is the number of entries that is allocated 193 * trace_buf_size is the size in bytes that is allocated
104 * for a buffer. Note, the number of entries is always rounded 194 * for a buffer. Note, the number of bytes is always rounded
105 * to ENTRIES_PER_PAGE. 195 * to page size.
196 *
197 * This number is purposely set to a low value of 16384:
198 * if the dump on oops happens, you do not want to wait for
199 * all of that output to scroll by. In any case, the size is
200 * configurable at both boot time and run time.
106 */ 201 */
107static unsigned long trace_nr_entries = 65536UL; 202#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */
203
204static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
108 205
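
For reference, the default works out to 16384 entries times 88 bytes, i.e. 1,441,792 bytes (about 1.4 MiB) per buffer, and set_buf_size() further down takes an override in bytes. A small user-space sketch for computing a trace_buf_size= value for a different entry count; the 4 KiB page size used for rounding is an assumption, not something this file states:

    #include <stdio.h>

    int main(void)
    {
        unsigned long entries = 65536;            /* desired entry count     */
        unsigned long entry_size = 88;            /* per the comment above   */
        unsigned long page = 4096;                /* assumed 4 KiB page size */
        unsigned long bytes = entries * entry_size;

        bytes = (bytes + page - 1) / page * page; /* round up to a full page */
        printf("trace_buf_size=%lu\n", bytes);
        return 0;
    }
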
109/* trace_types holds a link list of available tracers. */ 206/* trace_types holds a link list of available tracers. */
110static struct tracer *trace_types __read_mostly; 207static struct tracer *trace_types __read_mostly;
@@ -130,26 +227,9 @@ static DEFINE_MUTEX(trace_types_lock);
130/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 227/* trace_wait is a waitqueue for tasks blocked on trace_poll */
131static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 228static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
132 229
133/* trace_flags holds iter_ctrl options */ 230/* trace_flags holds trace_options default values */
134unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 231unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
135 232 TRACE_ITER_ANNOTATE;
136static notrace void no_trace_init(struct trace_array *tr)
137{
138 int cpu;
139
140 ftrace_function_enabled = 0;
141 if(tr->ctrl)
142 for_each_online_cpu(cpu)
143 tracing_reset(tr->data[cpu]);
144 tracer_enabled = 0;
145}
146
147/* dummy trace to disable tracing */
148static struct tracer no_tracer __read_mostly = {
149 .name = "none",
150 .init = no_trace_init
151};
152
153 233
154/** 234/**
155 * trace_wake_up - wake up tasks waiting for trace input 235 * trace_wake_up - wake up tasks waiting for trace input
@@ -167,51 +247,27 @@ void trace_wake_up(void)
167 wake_up(&trace_wait); 247 wake_up(&trace_wait);
168} 248}
169 249
170#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) 250static int __init set_buf_size(char *str)
171
172static int __init set_nr_entries(char *str)
173{ 251{
174 unsigned long nr_entries; 252 unsigned long buf_size;
175 int ret; 253 int ret;
176 254
177 if (!str) 255 if (!str)
178 return 0; 256 return 0;
179 ret = strict_strtoul(str, 0, &nr_entries); 257 ret = strict_strtoul(str, 0, &buf_size);
180 /* nr_entries can not be zero */ 258 /* nr_entries can not be zero */
181 if (ret < 0 || nr_entries == 0) 259 if (ret < 0 || buf_size == 0)
182 return 0; 260 return 0;
183 trace_nr_entries = nr_entries; 261 trace_buf_size = buf_size;
184 return 1; 262 return 1;
185} 263}
186__setup("trace_entries=", set_nr_entries); 264__setup("trace_buf_size=", set_buf_size);
187 265
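
Because set_buf_size() passes strict_strtoul() a base of 0, the boot parameter should accept the usual C number prefixes (decimal, 0x for hex, a leading 0 for octal). A user-space strtoul() analogue, shown only to illustrate that base-0 behaviour:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const char *args[] = { "1441792", "0x160000", "01000000" };

        for (int i = 0; i < 3; i++)               /* base 0: auto-detect prefix */
            printf("%-10s -> %lu bytes\n", args[i], strtoul(args[i], NULL, 0));
        return 0;
    }
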
188unsigned long nsecs_to_usecs(unsigned long nsecs) 266unsigned long nsecs_to_usecs(unsigned long nsecs)
189{ 267{
190 return nsecs / 1000; 268 return nsecs / 1000;
191} 269}
192 270
193/*
194 * trace_flag_type is an enumeration that holds different
195 * states when a trace occurs. These are:
196 * IRQS_OFF - interrupts were disabled
197 * NEED_RESCED - reschedule is requested
198 * HARDIRQ - inside an interrupt handler
199 * SOFTIRQ - inside a softirq handler
200 */
201enum trace_flag_type {
202 TRACE_FLAG_IRQS_OFF = 0x01,
203 TRACE_FLAG_NEED_RESCHED = 0x02,
204 TRACE_FLAG_HARDIRQ = 0x04,
205 TRACE_FLAG_SOFTIRQ = 0x08,
206};
207
208/*
209 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
210 * control the output of kernel symbols.
211 */
212#define TRACE_ITER_SYM_MASK \
213 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
214
215/* These must match the bit positions in trace_iterator_flags */ 271/* These must match the bit positions in trace_iterator_flags */
216static const char *trace_options[] = { 272static const char *trace_options[] = {
217 "print-parent", 273 "print-parent",
@@ -224,6 +280,13 @@ static const char *trace_options[] = {
224 "block", 280 "block",
225 "stacktrace", 281 "stacktrace",
226 "sched-tree", 282 "sched-tree",
283 "ftrace_printk",
284 "ftrace_preempt",
285 "branch",
286 "annotate",
287 "userstacktrace",
288 "sym-userobj",
289 "printk-msg-only",
227 NULL 290 NULL
228}; 291};
229 292
@@ -257,7 +320,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
257 320
258 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 321 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
259 data->pid = tsk->pid; 322 data->pid = tsk->pid;
260 data->uid = tsk->uid; 323 data->uid = task_uid(tsk);
261 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 324 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
262 data->policy = tsk->policy; 325 data->policy = tsk->policy;
263 data->rt_priority = tsk->rt_priority; 326 data->rt_priority = tsk->rt_priority;
@@ -266,54 +329,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
266 tracing_record_cmdline(current); 329 tracing_record_cmdline(current);
267} 330}
268 331
269#define CHECK_COND(cond) \
270 if (unlikely(cond)) { \
271 tracing_disabled = 1; \
272 WARN_ON(1); \
273 return -1; \
274 }
275
276/**
277 * check_pages - integrity check of trace buffers
278 *
279 * As a safety measure we check to make sure the data pages have not
280 * been corrupted.
281 */
282int check_pages(struct trace_array_cpu *data)
283{
284 struct page *page, *tmp;
285
286 CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
287 CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
288
289 list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
290 CHECK_COND(page->lru.next->prev != &page->lru);
291 CHECK_COND(page->lru.prev->next != &page->lru);
292 }
293
294 return 0;
295}
296
297/**
298 * head_page - page address of the first page in per_cpu buffer.
299 *
300 * head_page returns the page address of the first page in
301 * a per_cpu buffer. This also performs various consistency
302 * checks to make sure the buffer has not been corrupted.
303 */
304void *head_page(struct trace_array_cpu *data)
305{
306 struct page *page;
307
308 if (list_empty(&data->trace_pages))
309 return NULL;
310
311 page = list_entry(data->trace_pages.next, struct page, lru);
312 BUG_ON(&page->lru == &data->trace_pages);
313
314 return page_address(page);
315}
316
317/** 332/**
318 * trace_seq_printf - sequence printing of trace information 333 * trace_seq_printf - sequence printing of trace information
319 * @s: trace sequence descriptor 334 * @s: trace sequence descriptor
@@ -395,34 +410,51 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
395 return len; 410 return len;
396} 411}
397 412
398#define HEX_CHARS 17 413#define MAX_MEMHEX_BYTES 8
399static const char hex2asc[] = "0123456789abcdef"; 414#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
400 415
401static int 416static int
402trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) 417trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
403{ 418{
404 unsigned char hex[HEX_CHARS]; 419 unsigned char hex[HEX_CHARS];
405 unsigned char *data = mem; 420 unsigned char *data = mem;
406 unsigned char byte;
407 int i, j; 421 int i, j;
408 422
409 BUG_ON(len >= HEX_CHARS);
410
411#ifdef __BIG_ENDIAN 423#ifdef __BIG_ENDIAN
412 for (i = 0, j = 0; i < len; i++) { 424 for (i = 0, j = 0; i < len; i++) {
413#else 425#else
414 for (i = len-1, j = 0; i >= 0; i--) { 426 for (i = len-1, j = 0; i >= 0; i--) {
415#endif 427#endif
416 byte = data[i]; 428 hex[j++] = hex_asc_hi(data[i]);
417 429 hex[j++] = hex_asc_lo(data[i]);
418 hex[j++] = hex2asc[byte & 0x0f];
419 hex[j++] = hex2asc[byte >> 4];
420 } 430 }
421 hex[j++] = ' '; 431 hex[j++] = ' ';
422 432
423 return trace_seq_putmem(s, hex, j); 433 return trace_seq_putmem(s, hex, j);
424} 434}
425 435
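
The rewritten helper uses the generic hex_asc_hi()/hex_asc_lo() macros and, on little-endian hosts, walks the bytes from the most significant end down; note that it also emits the high nibble before the low one, unlike the removed hex2asc loop. A stand-alone user-space re-creation of the little-endian branch, for illustration only:

    #include <stdio.h>

    static const char hex_asc[] = "0123456789abcdef";

    static void putmem_hex(const void *mem, size_t len)
    {
        const unsigned char *data = mem;

        for (size_t i = len; i-- > 0; )           /* most significant byte first */
            printf("%c%c", hex_asc[data[i] >> 4], hex_asc[data[i] & 0x0f]);
        putchar(' ');
    }

    int main(void)
    {
        unsigned short v = 0x1234;

        putmem_hex(&v, sizeof(v));                /* prints "1234 " on LE hosts */
        putchar('\n');
        return 0;
    }
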
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440
441 if (s->len >= (PAGE_SIZE - 1))
442 return 0;
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
444 if (!IS_ERR(p)) {
445 p = mangle_path(s->buffer + s->len, p, "\n");
446 if (p) {
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454
455 return 0;
456}
457
426static void 458static void
427trace_seq_reset(struct trace_seq *s) 459trace_seq_reset(struct trace_seq *s)
428{ 460{
@@ -460,34 +492,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
460 trace_seq_reset(s); 492 trace_seq_reset(s);
461} 493}
462 494
463/*
464 * flip the trace buffers between two trace descriptors.
465 * This usually is the buffers between the global_trace and
466 * the max_tr to record a snapshot of a current trace.
467 *
468 * The ftrace_max_lock must be held.
469 */
470static void
471flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
472{
473 struct list_head flip_pages;
474
475 INIT_LIST_HEAD(&flip_pages);
476
477 memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
478 sizeof(struct trace_array_cpu) -
479 offsetof(struct trace_array_cpu, trace_head_idx));
480
481 check_pages(tr1);
482 check_pages(tr2);
483 list_splice_init(&tr1->trace_pages, &flip_pages);
484 list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
485 list_splice_init(&flip_pages, &tr2->trace_pages);
486 BUG_ON(!list_empty(&flip_pages));
487 check_pages(tr1);
488 check_pages(tr2);
489}
490
491/** 495/**
492 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 496 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
493 * @tr: tracer 497 * @tr: tracer
@@ -500,17 +504,17 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
500void 504void
501update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 505update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
502{ 506{
503 struct trace_array_cpu *data; 507 struct ring_buffer *buf = tr->buffer;
504 int i;
505 508
506 WARN_ON_ONCE(!irqs_disabled()); 509 WARN_ON_ONCE(!irqs_disabled());
507 __raw_spin_lock(&ftrace_max_lock); 510 __raw_spin_lock(&ftrace_max_lock);
508 /* clear out all the previous traces */ 511
509 for_each_tracing_cpu(i) { 512 tr->buffer = max_tr.buffer;
510 data = tr->data[i]; 513 max_tr.buffer = buf;
511 flip_trace(max_tr.data[i], data); 514
512 tracing_reset(data); 515 ftrace_disable_cpu();
513 } 516 ring_buffer_reset(tr->buffer);
517 ftrace_enable_cpu();
514 518
515 __update_max_tr(tr, tsk, cpu); 519 __update_max_tr(tr, tsk, cpu);
516 __raw_spin_unlock(&ftrace_max_lock); 520 __raw_spin_unlock(&ftrace_max_lock);
@@ -527,16 +531,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
527void 531void
528update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) 532update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
529{ 533{
530 struct trace_array_cpu *data = tr->data[cpu]; 534 int ret;
531 int i;
532 535
533 WARN_ON_ONCE(!irqs_disabled()); 536 WARN_ON_ONCE(!irqs_disabled());
534 __raw_spin_lock(&ftrace_max_lock); 537 __raw_spin_lock(&ftrace_max_lock);
535 for_each_tracing_cpu(i)
536 tracing_reset(max_tr.data[i]);
537 538
538 flip_trace(max_tr.data[cpu], data); 539 ftrace_disable_cpu();
539 tracing_reset(data); 540
541 ring_buffer_reset(max_tr.buffer);
542 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
543
544 ftrace_enable_cpu();
545
546 WARN_ON_ONCE(ret);
540 547
541 __update_max_tr(tr, tsk, cpu); 548 __update_max_tr(tr, tsk, cpu);
542 __raw_spin_unlock(&ftrace_max_lock); 549 __raw_spin_unlock(&ftrace_max_lock);
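
With the ring buffer, taking the "max" snapshot is now a swap of tr->buffer and max_tr.buffer rather than the old page-by-page flip. A rough sketch of the kind of caller these two helpers serve, a latency tracer recording a new worst case; tracing_max_latency is assumed here and is not part of this hunk:

    /* sketch: inside a latency tracer, interrupts already disabled */
    if (delta > tracing_max_latency) {
        tracing_max_latency = delta;
        update_max_tr(tr, current, cpu);    /* snapshot by swapping buffers */
    }
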
@@ -559,7 +566,17 @@ int register_tracer(struct tracer *type)
559 return -1; 566 return -1;
560 } 567 }
561 568
569 /*
570 * When this gets called we hold the BKL which means that
571 * preemption is disabled. Various trace selftests however
572 * need to disable and enable preemption for successful tests.
573 * So we drop the BKL here and grab it after the tests again.
574 */
575 unlock_kernel();
562 mutex_lock(&trace_types_lock); 576 mutex_lock(&trace_types_lock);
577
578 tracing_selftest_running = true;
579
563 for (t = trace_types; t; t = t->next) { 580 for (t = trace_types; t; t = t->next) {
564 if (strcmp(type->name, t->name) == 0) { 581 if (strcmp(type->name, t->name) == 0) {
565 /* already found */ 582 /* already found */
@@ -570,13 +587,20 @@ int register_tracer(struct tracer *type)
570 } 587 }
571 } 588 }
572 589
590 if (!type->set_flag)
591 type->set_flag = &dummy_set_flag;
592 if (!type->flags)
593 type->flags = &dummy_tracer_flags;
594 else
595 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt;
597
573#ifdef CONFIG_FTRACE_STARTUP_TEST 598#ifdef CONFIG_FTRACE_STARTUP_TEST
574 if (type->selftest) { 599 if (type->selftest) {
575 struct tracer *saved_tracer = current_trace; 600 struct tracer *saved_tracer = current_trace;
576 struct trace_array_cpu *data;
577 struct trace_array *tr = &global_trace; 601 struct trace_array *tr = &global_trace;
578 int saved_ctrl = tr->ctrl;
579 int i; 602 int i;
603
580 /* 604 /*
581 * Run a selftest on this tracer. 605 * Run a selftest on this tracer.
582 * Here we reset the trace buffer, and set the current 606 * Here we reset the trace buffer, and set the current
@@ -584,31 +608,23 @@ int register_tracer(struct tracer *type)
584 * internal tracing to verify that everything is in order. 608 * internal tracing to verify that everything is in order.
585 * If we fail, we do not register this tracer. 609 * If we fail, we do not register this tracer.
586 */ 610 */
587 for_each_tracing_cpu(i) { 611 for_each_tracing_cpu(i)
588 data = tr->data[i]; 612 tracing_reset(tr, i);
589 if (!head_page(data)) 613
590 continue;
591 tracing_reset(data);
592 }
593 current_trace = type; 614 current_trace = type;
594 tr->ctrl = 0;
595 /* the test is responsible for initializing and enabling */ 615 /* the test is responsible for initializing and enabling */
596 pr_info("Testing tracer %s: ", type->name); 616 pr_info("Testing tracer %s: ", type->name);
597 ret = type->selftest(type, tr); 617 ret = type->selftest(type, tr);
598 /* the test is responsible for resetting too */ 618 /* the test is responsible for resetting too */
599 current_trace = saved_tracer; 619 current_trace = saved_tracer;
600 tr->ctrl = saved_ctrl;
601 if (ret) { 620 if (ret) {
602 printk(KERN_CONT "FAILED!\n"); 621 printk(KERN_CONT "FAILED!\n");
603 goto out; 622 goto out;
604 } 623 }
605 /* Only reset on passing, to avoid touching corrupted buffers */ 624 /* Only reset on passing, to avoid touching corrupted buffers */
606 for_each_tracing_cpu(i) { 625 for_each_tracing_cpu(i)
607 data = tr->data[i]; 626 tracing_reset(tr, i);
608 if (!head_page(data)) 627
609 continue;
610 tracing_reset(data);
611 }
612 printk(KERN_CONT "PASSED\n"); 628 printk(KERN_CONT "PASSED\n");
613 } 629 }
614#endif 630#endif
@@ -620,7 +636,9 @@ int register_tracer(struct tracer *type)
620 max_tracer_type_len = len; 636 max_tracer_type_len = len;
621 637
622 out: 638 out:
639 tracing_selftest_running = false;
623 mutex_unlock(&trace_types_lock); 640 mutex_unlock(&trace_types_lock);
641 lock_kernel();
624 642
625 return ret; 643 return ret;
626} 644}
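
For a sense of what register_tracer() now expects: set_flag and flags are optional (the dummy ones defined at the top of the file are substituted), and the selftest only runs under CONFIG_FTRACE_STARTUP_TEST. A deliberately minimal, hypothetical tracer registration; fields beyond what this function touches are left out on purpose:

    static struct tracer example_tracer __read_mostly = {
        .name = "example",    /* must be unique among registered tracers */
        /* .selftest, .flags and .set_flag may all be left unset */
    };

    static int __init example_tracer_init(void)
    {
        return register_tracer(&example_tracer);
    }
    device_initcall(example_tracer_init);
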
@@ -653,13 +671,21 @@ void unregister_tracer(struct tracer *type)
653 mutex_unlock(&trace_types_lock); 671 mutex_unlock(&trace_types_lock);
654} 672}
655 673
656void tracing_reset(struct trace_array_cpu *data) 674void tracing_reset(struct trace_array *tr, int cpu)
657{ 675{
658 data->trace_idx = 0; 676 ftrace_disable_cpu();
659 data->overrun = 0; 677 ring_buffer_reset_cpu(tr->buffer, cpu);
660 data->trace_head = data->trace_tail = head_page(data); 678 ftrace_enable_cpu();
661 data->trace_head_idx = 0; 679}
662 data->trace_tail_idx = 0; 680
681void tracing_reset_online_cpus(struct trace_array *tr)
682{
683 int cpu;
684
685 tr->time_start = ftrace_now(tr->cpu);
686
687 for_each_online_cpu(cpu)
688 tracing_reset(tr, cpu);
663} 689}
664 690
665#define SAVED_CMDLINES 128 691#define SAVED_CMDLINES 128
@@ -679,6 +705,91 @@ static void trace_init_cmdlines(void)
679 cmdline_idx = 0; 705 cmdline_idx = 0;
680} 706}
681 707
708static int trace_stop_count;
709static DEFINE_SPINLOCK(tracing_start_lock);
710
711/**
712 * ftrace_off_permanent - disable all ftrace code permanently
713 *
714 * This should only be called when a serious anomaly has
715 * been detected. This will turn off function tracing,
716 * ring buffers, and other tracing utilities. It takes no
717 * locks and can be called from any context.
718 */
719void ftrace_off_permanent(void)
720{
721 tracing_disabled = 1;
722 ftrace_stop();
723 tracing_off_permanent();
724}
725
726/**
727 * tracing_start - quick start of the tracer
728 *
729 * If tracing is enabled but was stopped by tracing_stop,
730 * this will start the tracer back up.
731 */
732void tracing_start(void)
733{
734 struct ring_buffer *buffer;
735 unsigned long flags;
736
737 if (tracing_disabled)
738 return;
739
740 spin_lock_irqsave(&tracing_start_lock, flags);
741 if (--trace_stop_count)
742 goto out;
743
744 if (trace_stop_count < 0) {
745 /* Someone screwed up their debugging */
746 WARN_ON_ONCE(1);
747 trace_stop_count = 0;
748 goto out;
749 }
750
751
752 buffer = global_trace.buffer;
753 if (buffer)
754 ring_buffer_record_enable(buffer);
755
756 buffer = max_tr.buffer;
757 if (buffer)
758 ring_buffer_record_enable(buffer);
759
760 ftrace_start();
761 out:
762 spin_unlock_irqrestore(&tracing_start_lock, flags);
763}
764
765/**
766 * tracing_stop - quick stop of the tracer
767 *
768 * Lightweight way to stop tracing. Use it in conjunction with
769 * tracing_start.
770 */
771void tracing_stop(void)
772{
773 struct ring_buffer *buffer;
774 unsigned long flags;
775
776 ftrace_stop();
777 spin_lock_irqsave(&tracing_start_lock, flags);
778 if (trace_stop_count++)
779 goto out;
780
781 buffer = global_trace.buffer;
782 if (buffer)
783 ring_buffer_record_disable(buffer);
784
785 buffer = max_tr.buffer;
786 if (buffer)
787 ring_buffer_record_disable(buffer);
788
789 out:
790 spin_unlock_irqrestore(&tracing_start_lock, flags);
791}
792
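
tracing_stop() and tracing_start() nest by way of trace_stop_count, so callers can bracket arbitrary regions without caring whether someone else already paused tracing; only the outermost pair actually touches the ring buffers. Roughly:

    tracing_stop();     /* count 0 -> 1: ring buffers disabled   */
    tracing_stop();     /* count 1 -> 2: counter bump only       */
    tracing_start();    /* count 2 -> 1: still stopped           */
    tracing_start();    /* count 1 -> 0: buffers enabled again   */
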
682void trace_stop_cmdline_recording(void); 793void trace_stop_cmdline_recording(void);
683 794
684static void trace_save_cmdline(struct task_struct *tsk) 795static void trace_save_cmdline(struct task_struct *tsk)
@@ -716,7 +827,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
716 spin_unlock(&trace_cmdline_lock); 827 spin_unlock(&trace_cmdline_lock);
717} 828}
718 829
719static char *trace_find_cmdline(int pid) 830char *trace_find_cmdline(int pid)
720{ 831{
721 char *cmdline = "<...>"; 832 char *cmdline = "<...>";
722 unsigned map; 833 unsigned map;
@@ -745,82 +856,21 @@ void tracing_record_cmdline(struct task_struct *tsk)
745 trace_save_cmdline(tsk); 856 trace_save_cmdline(tsk);
746} 857}
747 858
748static inline struct list_head * 859void
749trace_next_list(struct trace_array_cpu *data, struct list_head *next) 860tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
750{ 861 int pc)
751 /*
752 * Round-robin - but skip the head (which is not a real page):
753 */
754 next = next->next;
755 if (unlikely(next == &data->trace_pages))
756 next = next->next;
757 BUG_ON(next == &data->trace_pages);
758
759 return next;
760}
761
762static inline void *
763trace_next_page(struct trace_array_cpu *data, void *addr)
764{
765 struct list_head *next;
766 struct page *page;
767
768 page = virt_to_page(addr);
769
770 next = trace_next_list(data, &page->lru);
771 page = list_entry(next, struct page, lru);
772
773 return page_address(page);
774}
775
776static inline struct trace_entry *
777tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
778{
779 unsigned long idx, idx_next;
780 struct trace_entry *entry;
781
782 data->trace_idx++;
783 idx = data->trace_head_idx;
784 idx_next = idx + 1;
785
786 BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
787
788 entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
789
790 if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
791 data->trace_head = trace_next_page(data, data->trace_head);
792 idx_next = 0;
793 }
794
795 if (data->trace_head == data->trace_tail &&
796 idx_next == data->trace_tail_idx) {
797 /* overrun */
798 data->overrun++;
799 data->trace_tail_idx++;
800 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
801 data->trace_tail =
802 trace_next_page(data, data->trace_tail);
803 data->trace_tail_idx = 0;
804 }
805 }
806
807 data->trace_head_idx = idx_next;
808
809 return entry;
810}
811
812static inline void
813tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
814{ 862{
815 struct task_struct *tsk = current; 863 struct task_struct *tsk = current;
816 unsigned long pc;
817 864
818 pc = preempt_count(); 865 entry->preempt_count = pc & 0xff;
819 866 entry->pid = (tsk) ? tsk->pid : 0;
820 entry->preempt_count = pc & 0xff; 867 entry->tgid = (tsk) ? tsk->tgid : 0;
821 entry->pid = (tsk) ? tsk->pid : 0; 868 entry->flags =
822 entry->t = ftrace_now(raw_smp_processor_id()); 869#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
823 entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 870 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
871#else
872 TRACE_FLAG_IRQS_NOSUPPORT |
873#endif
824 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 874 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
825 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 875 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
826 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 876 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
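
The flag bits written here are the ones the latency output decodes further down; the 0x01 through 0x08 values come from the trace_flag_type enum removed earlier in this diff, while the value of the new TRACE_FLAG_IRQS_NOSUPPORT bit is not shown in this hunk. A small user-space sketch that turns a flags byte into the familiar 'd'/'N'/'h'/'s' columns (the hardirq/softirq column is simplified):

    #include <stdio.h>

    enum {                          /* values from the removed enum */
        FLAG_IRQS_OFF     = 0x01,
        FLAG_NEED_RESCHED = 0x02,
        FLAG_HARDIRQ      = 0x04,
        FLAG_SOFTIRQ      = 0x08,
    };

    static void decode(unsigned char flags)
    {
        printf("%c%c%c\n",
               (flags & FLAG_IRQS_OFF)     ? 'd' : '.',
               (flags & FLAG_NEED_RESCHED) ? 'N' : '.',
               (flags & FLAG_HARDIRQ) ? 'h' :
               (flags & FLAG_SOFTIRQ) ? 's' : '.');
    }

    int main(void)
    {
        decode(0x05);               /* irqs off inside a hardirq: "d.h" */
        return 0;
    }
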
@@ -828,145 +878,233 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
828 878
829void 879void
830trace_function(struct trace_array *tr, struct trace_array_cpu *data, 880trace_function(struct trace_array *tr, struct trace_array_cpu *data,
831 unsigned long ip, unsigned long parent_ip, unsigned long flags) 881 unsigned long ip, unsigned long parent_ip, unsigned long flags,
882 int pc)
832{ 883{
833 struct trace_entry *entry; 884 struct ring_buffer_event *event;
885 struct ftrace_entry *entry;
834 unsigned long irq_flags; 886 unsigned long irq_flags;
835 887
836 raw_local_irq_save(irq_flags); 888 /* If we are reading the ring buffer, don't trace */
837 __raw_spin_lock(&data->lock); 889 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
838 entry = tracing_get_trace_entry(tr, data); 890 return;
839 tracing_generic_entry_update(entry, flags);
840 entry->type = TRACE_FN;
841 entry->fn.ip = ip;
842 entry->fn.parent_ip = parent_ip;
843 __raw_spin_unlock(&data->lock);
844 raw_local_irq_restore(irq_flags);
845}
846 891
847void 892 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
848ftrace(struct trace_array *tr, struct trace_array_cpu *data, 893 &irq_flags);
849 unsigned long ip, unsigned long parent_ip, unsigned long flags) 894 if (!event)
850{ 895 return;
851 if (likely(!atomic_read(&data->disabled))) 896 entry = ring_buffer_event_data(event);
852 trace_function(tr, data, ip, parent_ip, flags); 897 tracing_generic_entry_update(&entry->ent, flags, pc);
898 entry->ent.type = TRACE_FN;
899 entry->ip = ip;
900 entry->parent_ip = parent_ip;
901 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
902}
903
904#ifdef CONFIG_FUNCTION_GRAPH_TRACER
905static void __trace_graph_entry(struct trace_array *tr,
906 struct trace_array_cpu *data,
907 struct ftrace_graph_ent *trace,
908 unsigned long flags,
909 int pc)
910{
911 struct ring_buffer_event *event;
912 struct ftrace_graph_ent_entry *entry;
913 unsigned long irq_flags;
914
915 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
916 return;
917
918 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
919 &irq_flags);
920 if (!event)
921 return;
922 entry = ring_buffer_event_data(event);
923 tracing_generic_entry_update(&entry->ent, flags, pc);
924 entry->ent.type = TRACE_GRAPH_ENT;
925 entry->graph_ent = *trace;
926 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
853} 927}
854 928
855#ifdef CONFIG_MMIOTRACE 929static void __trace_graph_return(struct trace_array *tr,
856void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, 930 struct trace_array_cpu *data,
857 struct mmiotrace_rw *rw) 931 struct ftrace_graph_ret *trace,
932 unsigned long flags,
933 int pc)
858{ 934{
859 struct trace_entry *entry; 935 struct ring_buffer_event *event;
936 struct ftrace_graph_ret_entry *entry;
860 unsigned long irq_flags; 937 unsigned long irq_flags;
861 938
862 raw_local_irq_save(irq_flags); 939 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
863 __raw_spin_lock(&data->lock); 940 return;
864
865 entry = tracing_get_trace_entry(tr, data);
866 tracing_generic_entry_update(entry, 0);
867 entry->type = TRACE_MMIO_RW;
868 entry->mmiorw = *rw;
869 941
870 __raw_spin_unlock(&data->lock); 942 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
871 raw_local_irq_restore(irq_flags); 943 &irq_flags);
944 if (!event)
945 return;
946 entry = ring_buffer_event_data(event);
947 tracing_generic_entry_update(&entry->ent, flags, pc);
948 entry->ent.type = TRACE_GRAPH_RET;
949 entry->ret = *trace;
950 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
951}
952#endif
872 953
873 trace_wake_up(); 954void
955ftrace(struct trace_array *tr, struct trace_array_cpu *data,
956 unsigned long ip, unsigned long parent_ip, unsigned long flags,
957 int pc)
958{
959 if (likely(!atomic_read(&data->disabled)))
960 trace_function(tr, data, ip, parent_ip, flags, pc);
874} 961}
875 962
876void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, 963static void ftrace_trace_stack(struct trace_array *tr,
877 struct mmiotrace_map *map) 964 struct trace_array_cpu *data,
965 unsigned long flags,
966 int skip, int pc)
878{ 967{
879 struct trace_entry *entry; 968#ifdef CONFIG_STACKTRACE
969 struct ring_buffer_event *event;
970 struct stack_entry *entry;
971 struct stack_trace trace;
880 unsigned long irq_flags; 972 unsigned long irq_flags;
881 973
882 raw_local_irq_save(irq_flags); 974 if (!(trace_flags & TRACE_ITER_STACKTRACE))
883 __raw_spin_lock(&data->lock); 975 return;
884 976
885 entry = tracing_get_trace_entry(tr, data); 977 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
886 tracing_generic_entry_update(entry, 0); 978 &irq_flags);
887 entry->type = TRACE_MMIO_MAP; 979 if (!event)
888 entry->mmiomap = *map; 980 return;
981 entry = ring_buffer_event_data(event);
982 tracing_generic_entry_update(&entry->ent, flags, pc);
983 entry->ent.type = TRACE_STACK;
889 984
890 __raw_spin_unlock(&data->lock); 985 memset(&entry->caller, 0, sizeof(entry->caller));
891 raw_local_irq_restore(irq_flags);
892 986
893 trace_wake_up(); 987 trace.nr_entries = 0;
894} 988 trace.max_entries = FTRACE_STACK_ENTRIES;
989 trace.skip = skip;
990 trace.entries = entry->caller;
991
992 save_stack_trace(&trace);
993 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
895#endif 994#endif
995}
896 996
897void __trace_stack(struct trace_array *tr, 997void __trace_stack(struct trace_array *tr,
898 struct trace_array_cpu *data, 998 struct trace_array_cpu *data,
899 unsigned long flags, 999 unsigned long flags,
900 int skip) 1000 int skip)
901{ 1001{
902 struct trace_entry *entry; 1002 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
1003}
1004
1005static void ftrace_trace_userstack(struct trace_array *tr,
1006 struct trace_array_cpu *data,
1007 unsigned long flags, int pc)
1008{
1009#ifdef CONFIG_STACKTRACE
1010 struct ring_buffer_event *event;
1011 struct userstack_entry *entry;
903 struct stack_trace trace; 1012 struct stack_trace trace;
1013 unsigned long irq_flags;
904 1014
905 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1015 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
906 return; 1016 return;
907 1017
908 entry = tracing_get_trace_entry(tr, data); 1018 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
909 tracing_generic_entry_update(entry, flags); 1019 &irq_flags);
910 entry->type = TRACE_STACK; 1020 if (!event)
1021 return;
1022 entry = ring_buffer_event_data(event);
1023 tracing_generic_entry_update(&entry->ent, flags, pc);
1024 entry->ent.type = TRACE_USER_STACK;
911 1025
912 memset(&entry->stack, 0, sizeof(entry->stack)); 1026 memset(&entry->caller, 0, sizeof(entry->caller));
913 1027
914 trace.nr_entries = 0; 1028 trace.nr_entries = 0;
915 trace.max_entries = FTRACE_STACK_ENTRIES; 1029 trace.max_entries = FTRACE_STACK_ENTRIES;
916 trace.skip = skip; 1030 trace.skip = 0;
917 trace.entries = entry->stack.caller; 1031 trace.entries = entry->caller;
918 1032
919 save_stack_trace(&trace); 1033 save_stack_trace_user(&trace);
1034 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1035#endif
920} 1036}
921 1037
922void 1038void __trace_userstack(struct trace_array *tr,
923__trace_special(void *__tr, void *__data, 1039 struct trace_array_cpu *data,
924 unsigned long arg1, unsigned long arg2, unsigned long arg3) 1040 unsigned long flags)
925{ 1041{
1042 ftrace_trace_userstack(tr, data, flags, preempt_count());
1043}
1044
1045static void
1046ftrace_trace_special(void *__tr, void *__data,
1047 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1048 int pc)
1049{
1050 struct ring_buffer_event *event;
926 struct trace_array_cpu *data = __data; 1051 struct trace_array_cpu *data = __data;
927 struct trace_array *tr = __tr; 1052 struct trace_array *tr = __tr;
928 struct trace_entry *entry; 1053 struct special_entry *entry;
929 unsigned long irq_flags; 1054 unsigned long irq_flags;
930 1055
931 raw_local_irq_save(irq_flags); 1056 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
932 __raw_spin_lock(&data->lock); 1057 &irq_flags);
933 entry = tracing_get_trace_entry(tr, data); 1058 if (!event)
934 tracing_generic_entry_update(entry, 0); 1059 return;
935 entry->type = TRACE_SPECIAL; 1060 entry = ring_buffer_event_data(event);
936 entry->special.arg1 = arg1; 1061 tracing_generic_entry_update(&entry->ent, 0, pc);
937 entry->special.arg2 = arg2; 1062 entry->ent.type = TRACE_SPECIAL;
938 entry->special.arg3 = arg3; 1063 entry->arg1 = arg1;
939 __trace_stack(tr, data, irq_flags, 4); 1064 entry->arg2 = arg2;
940 __raw_spin_unlock(&data->lock); 1065 entry->arg3 = arg3;
941 raw_local_irq_restore(irq_flags); 1066 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1067 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1068 ftrace_trace_userstack(tr, data, irq_flags, pc);
942 1069
943 trace_wake_up(); 1070 trace_wake_up();
944} 1071}
945 1072
946void 1073void
1074__trace_special(void *__tr, void *__data,
1075 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1076{
1077 ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
1078}
1079
1080void
947tracing_sched_switch_trace(struct trace_array *tr, 1081tracing_sched_switch_trace(struct trace_array *tr,
948 struct trace_array_cpu *data, 1082 struct trace_array_cpu *data,
949 struct task_struct *prev, 1083 struct task_struct *prev,
950 struct task_struct *next, 1084 struct task_struct *next,
951 unsigned long flags) 1085 unsigned long flags, int pc)
952{ 1086{
953 struct trace_entry *entry; 1087 struct ring_buffer_event *event;
1088 struct ctx_switch_entry *entry;
954 unsigned long irq_flags; 1089 unsigned long irq_flags;
955 1090
956 raw_local_irq_save(irq_flags); 1091 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
957 __raw_spin_lock(&data->lock); 1092 &irq_flags);
958 entry = tracing_get_trace_entry(tr, data); 1093 if (!event)
959 tracing_generic_entry_update(entry, flags); 1094 return;
960 entry->type = TRACE_CTX; 1095 entry = ring_buffer_event_data(event);
961 entry->ctx.prev_pid = prev->pid; 1096 tracing_generic_entry_update(&entry->ent, flags, pc);
962 entry->ctx.prev_prio = prev->prio; 1097 entry->ent.type = TRACE_CTX;
963 entry->ctx.prev_state = prev->state; 1098 entry->prev_pid = prev->pid;
964 entry->ctx.next_pid = next->pid; 1099 entry->prev_prio = prev->prio;
965 entry->ctx.next_prio = next->prio; 1100 entry->prev_state = prev->state;
966 entry->ctx.next_state = next->state; 1101 entry->next_pid = next->pid;
967 __trace_stack(tr, data, flags, 5); 1102 entry->next_prio = next->prio;
968 __raw_spin_unlock(&data->lock); 1103 entry->next_state = next->state;
969 raw_local_irq_restore(irq_flags); 1104 entry->next_cpu = task_cpu(next);
1105 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1106 ftrace_trace_stack(tr, data, flags, 5, pc);
1107 ftrace_trace_userstack(tr, data, flags, pc);
970} 1108}
971 1109
972void 1110void
@@ -974,25 +1112,29 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
974 struct trace_array_cpu *data, 1112 struct trace_array_cpu *data,
975 struct task_struct *wakee, 1113 struct task_struct *wakee,
976 struct task_struct *curr, 1114 struct task_struct *curr,
977 unsigned long flags) 1115 unsigned long flags, int pc)
978{ 1116{
979 struct trace_entry *entry; 1117 struct ring_buffer_event *event;
1118 struct ctx_switch_entry *entry;
980 unsigned long irq_flags; 1119 unsigned long irq_flags;
981 1120
982 raw_local_irq_save(irq_flags); 1121 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
983 __raw_spin_lock(&data->lock); 1122 &irq_flags);
984 entry = tracing_get_trace_entry(tr, data); 1123 if (!event)
985 tracing_generic_entry_update(entry, flags); 1124 return;
986 entry->type = TRACE_WAKE; 1125 entry = ring_buffer_event_data(event);
987 entry->ctx.prev_pid = curr->pid; 1126 tracing_generic_entry_update(&entry->ent, flags, pc);
988 entry->ctx.prev_prio = curr->prio; 1127 entry->ent.type = TRACE_WAKE;
989 entry->ctx.prev_state = curr->state; 1128 entry->prev_pid = curr->pid;
990 entry->ctx.next_pid = wakee->pid; 1129 entry->prev_prio = curr->prio;
991 entry->ctx.next_prio = wakee->prio; 1130 entry->prev_state = curr->state;
992 entry->ctx.next_state = wakee->state; 1131 entry->next_pid = wakee->pid;
993 __trace_stack(tr, data, flags, 6); 1132 entry->next_prio = wakee->prio;
994 __raw_spin_unlock(&data->lock); 1133 entry->next_state = wakee->state;
995 raw_local_irq_restore(irq_flags); 1134 entry->next_cpu = task_cpu(wakee);
1135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1136 ftrace_trace_stack(tr, data, flags, 6, pc);
1137 ftrace_trace_userstack(tr, data, flags, pc);
996 1138
997 trace_wake_up(); 1139 trace_wake_up();
998} 1140}
@@ -1003,25 +1145,52 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1003 struct trace_array *tr = &global_trace; 1145 struct trace_array *tr = &global_trace;
1004 struct trace_array_cpu *data; 1146 struct trace_array_cpu *data;
1005 unsigned long flags; 1147 unsigned long flags;
1006 long disabled;
1007 int cpu; 1148 int cpu;
1149 int pc;
1008 1150
1009 if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) 1151 if (tracing_disabled)
1010 return; 1152 return;
1011 1153
1154 pc = preempt_count();
1012 local_irq_save(flags); 1155 local_irq_save(flags);
1013 cpu = raw_smp_processor_id(); 1156 cpu = raw_smp_processor_id();
1014 data = tr->data[cpu]; 1157 data = tr->data[cpu];
1158
1159 if (likely(atomic_inc_return(&data->disabled) == 1))
1160 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
1161
1162 atomic_dec(&data->disabled);
1163 local_irq_restore(flags);
1164}
1165
1166#ifdef CONFIG_FUNCTION_TRACER
1167static void
1168function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
1169{
1170 struct trace_array *tr = &global_trace;
1171 struct trace_array_cpu *data;
1172 unsigned long flags;
1173 long disabled;
1174 int cpu, resched;
1175 int pc;
1176
1177 if (unlikely(!ftrace_function_enabled))
1178 return;
1179
1180 pc = preempt_count();
1181 resched = ftrace_preempt_disable();
1182 local_save_flags(flags);
1183 cpu = raw_smp_processor_id();
1184 data = tr->data[cpu];
1015 disabled = atomic_inc_return(&data->disabled); 1185 disabled = atomic_inc_return(&data->disabled);
1016 1186
1017 if (likely(disabled == 1)) 1187 if (likely(disabled == 1))
1018 __trace_special(tr, data, arg1, arg2, arg3); 1188 trace_function(tr, data, ip, parent_ip, flags, pc);
1019 1189
1020 atomic_dec(&data->disabled); 1190 atomic_dec(&data->disabled);
1021 local_irq_restore(flags); 1191 ftrace_preempt_enable(resched);
1022} 1192}
1023 1193
1024#ifdef CONFIG_FTRACE
1025static void 1194static void
1026function_trace_call(unsigned long ip, unsigned long parent_ip) 1195function_trace_call(unsigned long ip, unsigned long parent_ip)
1027{ 1196{
@@ -1030,24 +1199,85 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
1030 unsigned long flags; 1199 unsigned long flags;
1031 long disabled; 1200 long disabled;
1032 int cpu; 1201 int cpu;
1202 int pc;
1033 1203
1034 if (unlikely(!ftrace_function_enabled)) 1204 if (unlikely(!ftrace_function_enabled))
1035 return; 1205 return;
1036 1206
1037 if (skip_trace(ip)) 1207 /*
1038 return; 1208 * Need to use raw, since this must be called before the
1209 * recursive protection is performed.
1210 */
1211 local_irq_save(flags);
1212 cpu = raw_smp_processor_id();
1213 data = tr->data[cpu];
1214 disabled = atomic_inc_return(&data->disabled);
1215
1216 if (likely(disabled == 1)) {
1217 pc = preempt_count();
1218 trace_function(tr, data, ip, parent_ip, flags, pc);
1219 }
1220
1221 atomic_dec(&data->disabled);
1222 local_irq_restore(flags);
1223}
1224
1225#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1226int trace_graph_entry(struct ftrace_graph_ent *trace)
1227{
1228 struct trace_array *tr = &global_trace;
1229 struct trace_array_cpu *data;
1230 unsigned long flags;
1231 long disabled;
1232 int cpu;
1233 int pc;
1234
1235 if (!ftrace_trace_task(current))
1236 return 0;
1237
1238 if (!ftrace_graph_addr(trace->func))
1239 return 0;
1039 1240
1040 local_irq_save(flags); 1241 local_irq_save(flags);
1041 cpu = raw_smp_processor_id(); 1242 cpu = raw_smp_processor_id();
1042 data = tr->data[cpu]; 1243 data = tr->data[cpu];
1043 disabled = atomic_inc_return(&data->disabled); 1244 disabled = atomic_inc_return(&data->disabled);
1245 if (likely(disabled == 1)) {
1246 pc = preempt_count();
1247 __trace_graph_entry(tr, data, trace, flags, pc);
1248 }
1249 /* Only do the atomic if it is not already set */
1250 if (!test_tsk_trace_graph(current))
1251 set_tsk_trace_graph(current);
1252 atomic_dec(&data->disabled);
1253 local_irq_restore(flags);
1044 1254
1045 if (likely(disabled == 1)) 1255 return 1;
1046 trace_function(tr, data, ip, parent_ip, flags); 1256}
1047 1257
1258void trace_graph_return(struct ftrace_graph_ret *trace)
1259{
1260 struct trace_array *tr = &global_trace;
1261 struct trace_array_cpu *data;
1262 unsigned long flags;
1263 long disabled;
1264 int cpu;
1265 int pc;
1266
1267 local_irq_save(flags);
1268 cpu = raw_smp_processor_id();
1269 data = tr->data[cpu];
1270 disabled = atomic_inc_return(&data->disabled);
1271 if (likely(disabled == 1)) {
1272 pc = preempt_count();
1273 __trace_graph_return(tr, data, trace, flags, pc);
1274 }
1275 if (!trace->depth)
1276 clear_tsk_trace_graph(current);
1048 atomic_dec(&data->disabled); 1277 atomic_dec(&data->disabled);
1049 local_irq_restore(flags); 1278 local_irq_restore(flags);
1050} 1279}
1280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1051 1281
1052static struct ftrace_ops trace_ops __read_mostly = 1282static struct ftrace_ops trace_ops __read_mostly =
1053{ 1283{
@@ -1057,9 +1287,14 @@ static struct ftrace_ops trace_ops __read_mostly =
1057void tracing_start_function_trace(void) 1287void tracing_start_function_trace(void)
1058{ 1288{
1059 ftrace_function_enabled = 0; 1289 ftrace_function_enabled = 0;
1290
1291 if (trace_flags & TRACE_ITER_PREEMPTONLY)
1292 trace_ops.func = function_trace_call_preempt_only;
1293 else
1294 trace_ops.func = function_trace_call;
1295
1060 register_ftrace_function(&trace_ops); 1296 register_ftrace_function(&trace_ops);
1061 if (tracer_enabled) 1297 ftrace_function_enabled = 1;
1062 ftrace_function_enabled = 1;
1063} 1298}
1064 1299
1065void tracing_stop_function_trace(void) 1300void tracing_stop_function_trace(void)
@@ -1071,113 +1306,99 @@ void tracing_stop_function_trace(void)
1071 1306
1072enum trace_file_type { 1307enum trace_file_type {
1073 TRACE_FILE_LAT_FMT = 1, 1308 TRACE_FILE_LAT_FMT = 1,
1309 TRACE_FILE_ANNOTATE = 2,
1074}; 1310};
1075 1311
1076static struct trace_entry * 1312static void trace_iterator_increment(struct trace_iterator *iter)
1077trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
1078 struct trace_iterator *iter, int cpu)
1079{ 1313{
1080 struct page *page; 1314 /* Don't allow ftrace to trace into the ring buffers */
1081 struct trace_entry *array; 1315 ftrace_disable_cpu();
1082 1316
1083 if (iter->next_idx[cpu] >= tr->entries || 1317 iter->idx++;
1084 iter->next_idx[cpu] >= data->trace_idx || 1318 if (iter->buffer_iter[iter->cpu])
1085 (data->trace_head == data->trace_tail && 1319 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1086 data->trace_head_idx == data->trace_tail_idx))
1087 return NULL;
1088 1320
1089 if (!iter->next_page[cpu]) { 1321 ftrace_enable_cpu();
1090 /* Initialize the iterator for this cpu trace buffer */ 1322}
1091 WARN_ON(!data->trace_tail);
1092 page = virt_to_page(data->trace_tail);
1093 iter->next_page[cpu] = &page->lru;
1094 iter->next_page_idx[cpu] = data->trace_tail_idx;
1095 }
1096 1323
1097 page = list_entry(iter->next_page[cpu], struct page, lru); 1324static struct trace_entry *
1098 BUG_ON(&data->trace_pages == &page->lru); 1325peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1326{
1327 struct ring_buffer_event *event;
1328 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1099 1329
1100 array = page_address(page); 1330 /* Don't allow ftrace to trace into the ring buffers */
1331 ftrace_disable_cpu();
1101 1332
1102 WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); 1333 if (buf_iter)
1103 return &array[iter->next_page_idx[cpu]]; 1334 event = ring_buffer_iter_peek(buf_iter, ts);
1335 else
1336 event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
1337
1338 ftrace_enable_cpu();
1339
1340 return event ? ring_buffer_event_data(event) : NULL;
1104} 1341}
1105 1342
1106static struct trace_entry * 1343static struct trace_entry *
1107find_next_entry(struct trace_iterator *iter, int *ent_cpu) 1344__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1108{ 1345{
1109 struct trace_array *tr = iter->tr; 1346 struct ring_buffer *buffer = iter->tr->buffer;
1110 struct trace_entry *ent, *next = NULL; 1347 struct trace_entry *ent, *next = NULL;
1348 u64 next_ts = 0, ts;
1111 int next_cpu = -1; 1349 int next_cpu = -1;
1112 int cpu; 1350 int cpu;
1113 1351
1114 for_each_tracing_cpu(cpu) { 1352 for_each_tracing_cpu(cpu) {
1115 if (!head_page(tr->data[cpu])) 1353
1354 if (ring_buffer_empty_cpu(buffer, cpu))
1116 continue; 1355 continue;
1117 ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); 1356
1357 ent = peek_next_entry(iter, cpu, &ts);
1358
1118 /* 1359 /*
1119 * Pick the entry with the smallest timestamp: 1360 * Pick the entry with the smallest timestamp:
1120 */ 1361 */
1121 if (ent && (!next || ent->t < next->t)) { 1362 if (ent && (!next || ts < next_ts)) {
1122 next = ent; 1363 next = ent;
1123 next_cpu = cpu; 1364 next_cpu = cpu;
1365 next_ts = ts;
1124 } 1366 }
1125 } 1367 }
1126 1368
1127 if (ent_cpu) 1369 if (ent_cpu)
1128 *ent_cpu = next_cpu; 1370 *ent_cpu = next_cpu;
1129 1371
1372 if (ent_ts)
1373 *ent_ts = next_ts;
1374
1130 return next; 1375 return next;
1131} 1376}
1132 1377
1133static void trace_iterator_increment(struct trace_iterator *iter) 1378/* Find the next real entry, without updating the iterator itself */
1379static struct trace_entry *
1380find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1134{ 1381{
1135 iter->idx++; 1382 return __find_next_entry(iter, ent_cpu, ent_ts);
1136 iter->next_idx[iter->cpu]++;
1137 iter->next_page_idx[iter->cpu]++;
1138
1139 if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
1140 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1141
1142 iter->next_page_idx[iter->cpu] = 0;
1143 iter->next_page[iter->cpu] =
1144 trace_next_list(data, iter->next_page[iter->cpu]);
1145 }
1146} 1383}
1147 1384
1148static void trace_consume(struct trace_iterator *iter) 1385/* Find the next real entry, and increment the iterator to the next entry */
1386static void *find_next_entry_inc(struct trace_iterator *iter)
1149{ 1387{
1150 struct trace_array_cpu *data = iter->tr->data[iter->cpu]; 1388 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
1151 1389
1152 data->trace_tail_idx++; 1390 if (iter->ent)
1153 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { 1391 trace_iterator_increment(iter);
1154 data->trace_tail = trace_next_page(data, data->trace_tail);
1155 data->trace_tail_idx = 0;
1156 }
1157 1392
1158 /* Check if we empty it, then reset the index */ 1393 return iter->ent ? iter : NULL;
1159 if (data->trace_head == data->trace_tail &&
1160 data->trace_head_idx == data->trace_tail_idx)
1161 data->trace_idx = 0;
1162} 1394}
1163 1395
1164static void *find_next_entry_inc(struct trace_iterator *iter) 1396static void trace_consume(struct trace_iterator *iter)
1165{ 1397{
1166 struct trace_entry *next; 1398 /* Don't allow ftrace to trace into the ring buffers */
1167 int next_cpu = -1; 1399 ftrace_disable_cpu();
1168 1400 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
1169 next = find_next_entry(iter, &next_cpu); 1401 ftrace_enable_cpu();
1170
1171 iter->prev_ent = iter->ent;
1172 iter->prev_cpu = iter->cpu;
1173
1174 iter->ent = next;
1175 iter->cpu = next_cpu;
1176
1177 if (next)
1178 trace_iterator_increment(iter);
1179
1180 return next ? iter : NULL;
1181} 1402}
1182 1403
1183static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1404static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1210,7 +1431,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1210 struct trace_iterator *iter = m->private; 1431 struct trace_iterator *iter = m->private;
1211 void *p = NULL; 1432 void *p = NULL;
1212 loff_t l = 0; 1433 loff_t l = 0;
1213 int i; 1434 int cpu;
1214 1435
1215 mutex_lock(&trace_types_lock); 1436 mutex_lock(&trace_types_lock);
1216 1437
@@ -1221,22 +1442,19 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1221 1442
1222 atomic_inc(&trace_record_cmdline_disabled); 1443 atomic_inc(&trace_record_cmdline_disabled);
1223 1444
1224 /* let the tracer grab locks here if needed */
1225 if (current_trace->start)
1226 current_trace->start(iter);
1227
1228 if (*pos != iter->pos) { 1445 if (*pos != iter->pos) {
1229 iter->ent = NULL; 1446 iter->ent = NULL;
1230 iter->cpu = 0; 1447 iter->cpu = 0;
1231 iter->idx = -1; 1448 iter->idx = -1;
1232 iter->prev_ent = NULL;
1233 iter->prev_cpu = -1;
1234 1449
1235 for_each_tracing_cpu(i) { 1450 ftrace_disable_cpu();
1236 iter->next_idx[i] = 0; 1451
1237 iter->next_page[i] = NULL; 1452 for_each_tracing_cpu(cpu) {
1453 ring_buffer_iter_reset(iter->buffer_iter[cpu]);
1238 } 1454 }
1239 1455
1456 ftrace_enable_cpu();
1457
1240 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1458 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1241 ; 1459 ;
1242 1460
@@ -1250,28 +1468,24 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1250 1468
1251static void s_stop(struct seq_file *m, void *p) 1469static void s_stop(struct seq_file *m, void *p)
1252{ 1470{
1253 struct trace_iterator *iter = m->private;
1254
1255 atomic_dec(&trace_record_cmdline_disabled); 1471 atomic_dec(&trace_record_cmdline_disabled);
1256
1257 /* let the tracer release locks here if needed */
1258 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1259 iter->trace->stop(iter);
1260
1261 mutex_unlock(&trace_types_lock); 1472 mutex_unlock(&trace_types_lock);
1262} 1473}
1263 1474
1264#define KRETPROBE_MSG "[unknown/kretprobe'd]"
1265
1266#ifdef CONFIG_KRETPROBES 1475#ifdef CONFIG_KRETPROBES
1267static inline int kretprobed(unsigned long addr) 1476static inline const char *kretprobed(const char *name)
1268{ 1477{
1269 return addr == (unsigned long)kretprobe_trampoline; 1478 static const char tramp_name[] = "kretprobe_trampoline";
1479 int size = sizeof(tramp_name);
1480
1481 if (strncmp(tramp_name, name, size) == 0)
1482 return "[unknown/kretprobe'd]";
1483 return name;
1270} 1484}
1271#else 1485#else
1272static inline int kretprobed(unsigned long addr) 1486static inline const char *kretprobed(const char *name)
1273{ 1487{
1274 return 0; 1488 return name;
1275} 1489}
1276#endif /* CONFIG_KRETPROBES */ 1490#endif /* CONFIG_KRETPROBES */
1277 1491
@@ -1280,10 +1494,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1280{ 1494{
1281#ifdef CONFIG_KALLSYMS 1495#ifdef CONFIG_KALLSYMS
1282 char str[KSYM_SYMBOL_LEN]; 1496 char str[KSYM_SYMBOL_LEN];
1497 const char *name;
1283 1498
1284 kallsyms_lookup(address, NULL, NULL, NULL, str); 1499 kallsyms_lookup(address, NULL, NULL, NULL, str);
1285 1500
1286 return trace_seq_printf(s, fmt, str); 1501 name = kretprobed(str);
1502
1503 return trace_seq_printf(s, fmt, name);
1287#endif 1504#endif
1288 return 1; 1505 return 1;
1289} 1506}
@@ -1294,9 +1511,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1294{ 1511{
1295#ifdef CONFIG_KALLSYMS 1512#ifdef CONFIG_KALLSYMS
1296 char str[KSYM_SYMBOL_LEN]; 1513 char str[KSYM_SYMBOL_LEN];
1514 const char *name;
1297 1515
1298 sprint_symbol(str, address); 1516 sprint_symbol(str, address);
1299 return trace_seq_printf(s, fmt, str); 1517 name = kretprobed(str);
1518
1519 return trace_seq_printf(s, fmt, name);
1300#endif 1520#endif
1301 return 1; 1521 return 1;
1302} 1522}
@@ -1307,7 +1527,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1307# define IP_FMT "%016lx" 1527# define IP_FMT "%016lx"
1308#endif 1528#endif
1309 1529
1310static int 1530int
1311seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 1531seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1312{ 1532{
1313 int ret; 1533 int ret;
@@ -1328,23 +1548,95 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1328 return ret; 1548 return ret;
1329} 1549}
1330 1550
1551static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1552 unsigned long ip, unsigned long sym_flags)
1553{
1554 struct file *file = NULL;
1555 unsigned long vmstart = 0;
1556 int ret = 1;
1557
1558 if (mm) {
1559 const struct vm_area_struct *vma;
1560
1561 down_read(&mm->mmap_sem);
1562 vma = find_vma(mm, ip);
1563 if (vma) {
1564 file = vma->vm_file;
1565 vmstart = vma->vm_start;
1566 }
1567 if (file) {
1568 ret = trace_seq_path(s, &file->f_path);
1569 if (ret)
1570 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1571 }
1572 up_read(&mm->mmap_sem);
1573 }
1574 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1575 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1576 return ret;
1577}
1578
1579static int
1580seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1581 unsigned long sym_flags)
1582{
1583 struct mm_struct *mm = NULL;
1584 int ret = 1;
1585 unsigned int i;
1586
1587 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1588 struct task_struct *task;
1589 /*
1590 * we do the lookup on the thread group leader,
1591 * since individual threads might have already quit!
1592 */
1593 rcu_read_lock();
1594 task = find_task_by_vpid(entry->ent.tgid);
1595 if (task)
1596 mm = get_task_mm(task);
1597 rcu_read_unlock();
1598 }
1599
1600 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1601 unsigned long ip = entry->caller[i];
1602
1603 if (ip == ULONG_MAX || !ret)
1604 break;
1605 if (i && ret)
1606 ret = trace_seq_puts(s, " <- ");
1607 if (!ip) {
1608 if (ret)
1609 ret = trace_seq_puts(s, "??");
1610 continue;
1611 }
1612 if (!ret)
1613 break;
1614 if (ret)
1615 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1616 }
1617
1618 if (mm)
1619 mmput(mm);
1620 return ret;
1621}
1622
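
With sym-userobj set and a successful vma lookup, each user frame is printed as the backing object plus its offset from the start of the mapping, frames joined by " <- ", with "??" for null entries and a bare <address> as the fallback. A purely hypothetical rendering, only to show the shape of a line; the file names and offsets are made up:

    /lib/libc-2.7.so[+0x3a120] <- /home/user/a.out[+0x4f2]
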
1331static void print_lat_help_header(struct seq_file *m) 1623static void print_lat_help_header(struct seq_file *m)
1332{ 1624{
1333 seq_puts(m, "# _------=> CPU# \n"); 1625 seq_puts(m, "# _------=> CPU# \n");
1334 seq_puts(m, "# / _-----=> irqs-off \n"); 1626 seq_puts(m, "# / _-----=> irqs-off \n");
1335 seq_puts(m, "# | / _----=> need-resched \n"); 1627 seq_puts(m, "# | / _----=> need-resched \n");
1336 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1628 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1337 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1629 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1338 seq_puts(m, "# |||| / \n"); 1630 seq_puts(m, "# |||| / \n");
1339 seq_puts(m, "# ||||| delay \n"); 1631 seq_puts(m, "# ||||| delay \n");
1340 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1632 seq_puts(m, "# cmd pid ||||| time | caller \n");
1341 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1633 seq_puts(m, "# \\ / ||||| \\ | / \n");
1342} 1634}
1343 1635
1344static void print_func_help_header(struct seq_file *m) 1636static void print_func_help_header(struct seq_file *m)
1345{ 1637{
1346 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1638 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1347 seq_puts(m, "# | | | | |\n"); 1639 seq_puts(m, "# | | | | |\n");
1348} 1640}
1349 1641
1350 1642
@@ -1355,23 +1647,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1355 struct trace_array *tr = iter->tr; 1647 struct trace_array *tr = iter->tr;
1356 struct trace_array_cpu *data = tr->data[tr->cpu]; 1648 struct trace_array_cpu *data = tr->data[tr->cpu];
1357 struct tracer *type = current_trace; 1649 struct tracer *type = current_trace;
1358 unsigned long total = 0; 1650 unsigned long total;
1359 unsigned long entries = 0; 1651 unsigned long entries;
1360 int cpu;
1361 const char *name = "preemption"; 1652 const char *name = "preemption";
1362 1653
1363 if (type) 1654 if (type)
1364 name = type->name; 1655 name = type->name;
1365 1656
1366 for_each_tracing_cpu(cpu) { 1657 entries = ring_buffer_entries(iter->tr->buffer);
1367 if (head_page(tr->data[cpu])) { 1658 total = entries +
1368 total += tr->data[cpu]->trace_idx; 1659 ring_buffer_overruns(iter->tr->buffer);
1369 if (tr->data[cpu]->trace_idx > tr->entries)
1370 entries += tr->entries;
1371 else
1372 entries += tr->data[cpu]->trace_idx;
1373 }
1374 }
1375 1660
1376 seq_printf(m, "%s latency trace v1.1.5 on %s\n", 1661 seq_printf(m, "%s latency trace v1.1.5 on %s\n",
1377 name, UTS_RELEASE); 1662 name, UTS_RELEASE);
@@ -1428,9 +1713,10 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1428 comm = trace_find_cmdline(entry->pid); 1713 comm = trace_find_cmdline(entry->pid);
1429 1714
1430 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); 1715 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1431 trace_seq_printf(s, "%d", cpu); 1716 trace_seq_printf(s, "%3d", cpu);
1432 trace_seq_printf(s, "%c%c", 1717 trace_seq_printf(s, "%c%c",
1433 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', 1718 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
1719 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
1434 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); 1720 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1435 1721
1436 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 1722 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
@@ -1457,7 +1743,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1457unsigned long preempt_mark_thresh = 100; 1743unsigned long preempt_mark_thresh = 100;
1458 1744
1459static void 1745static void
1460lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, 1746lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1461 unsigned long rel_usecs) 1747 unsigned long rel_usecs)
1462{ 1748{
1463 trace_seq_printf(s, " %4lldus", abs_usecs); 1749 trace_seq_printf(s, " %4lldus", abs_usecs);
@@ -1471,34 +1757,101 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
1471 1757
1472static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1758static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1473 1759
1474static int 1760static int task_state_char(unsigned long state)
1761{
1762 int bit = state ? __ffs(state) + 1 : 0;
1763
1764 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1765}
1766
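
task_state_char() maps the lowest set state bit to a letter of TASK_STATE_TO_CHAR_STR, with state 0 (running) taking the first slot and out-of-range bits collapsing to '?'. Assuming the conventional "RSDTtZX" ordering of that string, which is not shown in this hunk, a few worked values:

    state 0x00 -> index 0 -> 'R'   (running)
    state 0x01 -> index 1 -> 'S'   (interruptible sleep)
    state 0x02 -> index 2 -> 'D'   (uninterruptible sleep)
    any bit past the end of the string -> '?'
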
1767/*
1768 * The message is supposed to contain an ending newline.
1769 * If the printing stops prematurely, try to add a newline of our own.
1770 */
1771void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1772{
1773 struct trace_entry *ent;
1774 struct trace_field_cont *cont;
1775 bool ok = true;
1776
1777 ent = peek_next_entry(iter, iter->cpu, NULL);
1778 if (!ent || ent->type != TRACE_CONT) {
1779 trace_seq_putc(s, '\n');
1780 return;
1781 }
1782
1783 do {
1784 cont = (struct trace_field_cont *)ent;
1785 if (ok)
1786 ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
1787
1788 ftrace_disable_cpu();
1789
1790 if (iter->buffer_iter[iter->cpu])
1791 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1792 else
1793 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
1794
1795 ftrace_enable_cpu();
1796
1797 ent = peek_next_entry(iter, iter->cpu, NULL);
1798 } while (ent && ent->type == TRACE_CONT);
1799
1800 if (!ok)
1801 trace_seq_putc(s, '\n');
1802}
1803
1804static void test_cpu_buff_start(struct trace_iterator *iter)
1805{
1806 struct trace_seq *s = &iter->seq;
1807
1808 if (!(trace_flags & TRACE_ITER_ANNOTATE))
1809 return;
1810
1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1812 return;
1813
1814 if (cpu_isset(iter->cpu, iter->started))
1815 return;
1816
1817 cpu_set(iter->cpu, iter->started);
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819}
1820
1821static enum print_line_t
1475print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1822print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1476{ 1823{
1477 struct trace_seq *s = &iter->seq; 1824 struct trace_seq *s = &iter->seq;
1478 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1825 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1479 struct trace_entry *next_entry = find_next_entry(iter, NULL); 1826 struct trace_entry *next_entry;
1480 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 1827 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1481 struct trace_entry *entry = iter->ent; 1828 struct trace_entry *entry = iter->ent;
1482 unsigned long abs_usecs; 1829 unsigned long abs_usecs;
1483 unsigned long rel_usecs; 1830 unsigned long rel_usecs;
1831 u64 next_ts;
1484 char *comm; 1832 char *comm;
1485 int S, T; 1833 int S, T;
1486 int i; 1834 int i;
1487 unsigned state;
1488 1835
1836 if (entry->type == TRACE_CONT)
1837 return TRACE_TYPE_HANDLED;
1838
1839 test_cpu_buff_start(iter);
1840
1841 next_entry = find_next_entry(iter, NULL, &next_ts);
1489 if (!next_entry) 1842 if (!next_entry)
1490 next_entry = entry; 1843 next_ts = iter->ts;
1491 rel_usecs = ns2usecs(next_entry->t - entry->t); 1844 rel_usecs = ns2usecs(next_ts - iter->ts);
1492 abs_usecs = ns2usecs(entry->t - iter->tr->time_start); 1845 abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
1493 1846
1494 if (verbose) { 1847 if (verbose) {
1495 comm = trace_find_cmdline(entry->pid); 1848 comm = trace_find_cmdline(entry->pid);
1496 trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" 1849 trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
1497 " %ld.%03ldms (+%ld.%03ldms): ", 1850 " %ld.%03ldms (+%ld.%03ldms): ",
1498 comm, 1851 comm,
1499 entry->pid, cpu, entry->flags, 1852 entry->pid, cpu, entry->flags,
1500 entry->preempt_count, trace_idx, 1853 entry->preempt_count, trace_idx,
1501 ns2usecs(entry->t), 1854 ns2usecs(iter->ts),
1502 abs_usecs/1000, 1855 abs_usecs/1000,
1503 abs_usecs % 1000, rel_usecs/1000, 1856 abs_usecs % 1000, rel_usecs/1000,
1504 rel_usecs % 1000); 1857 rel_usecs % 1000);
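
task_state_char(), added in the hunk above, replaces the open-coded __ffs() lookups in the printers: the lowest set state bit selects one letter from the state string, state 0 means "running", and anything out of range prints '?'. A userspace sketch of the same mapping, with ffs() from <strings.h> standing in for the kernel's __ffs()+1 and a hard-coded string standing in for TASK_STATE_TO_CHAR_STR:

#include <stdio.h>
#include <strings.h>	/* ffs() */

static const char state_to_char[] = "RSDTtZX";	/* stand-in for TASK_STATE_TO_CHAR_STR */

static int task_state_char(unsigned long state)
{
	int bit = state ? ffs((int)state) : 0;	/* kernel: __ffs(state) + 1 */

	return bit < (int)sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
}

int main(void)
{
	printf("%c %c %c\n",
	       task_state_char(0),		/* 'R' - running */
	       task_state_char(1),		/* 'S' - interruptible sleep */
	       task_state_char(1UL << 10));	/* '?' - out of range */
	return 0;
}
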
@@ -1507,52 +1860,99 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1507 lat_print_timestamp(s, abs_usecs, rel_usecs); 1860 lat_print_timestamp(s, abs_usecs, rel_usecs);
1508 } 1861 }
1509 switch (entry->type) { 1862 switch (entry->type) {
1510 case TRACE_FN: 1863 case TRACE_FN: {
1511 seq_print_ip_sym(s, entry->fn.ip, sym_flags); 1864 struct ftrace_entry *field;
1865
1866 trace_assign_type(field, entry);
1867
1868 seq_print_ip_sym(s, field->ip, sym_flags);
1512 trace_seq_puts(s, " ("); 1869 trace_seq_puts(s, " (");
1513 if (kretprobed(entry->fn.parent_ip)) 1870 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1514 trace_seq_puts(s, KRETPROBE_MSG);
1515 else
1516 seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
1517 trace_seq_puts(s, ")\n"); 1871 trace_seq_puts(s, ")\n");
1518 break; 1872 break;
1873 }
1519 case TRACE_CTX: 1874 case TRACE_CTX:
1520 case TRACE_WAKE: 1875 case TRACE_WAKE: {
1521 T = entry->ctx.next_state < sizeof(state_to_char) ? 1876 struct ctx_switch_entry *field;
1522 state_to_char[entry->ctx.next_state] : 'X'; 1877
1523 1878 trace_assign_type(field, entry);
1524 state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; 1879
1525 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; 1880 T = task_state_char(field->next_state);
1526 comm = trace_find_cmdline(entry->ctx.next_pid); 1881 S = task_state_char(field->prev_state);
1527 trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", 1882 comm = trace_find_cmdline(field->next_pid);
1528 entry->ctx.prev_pid, 1883 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1529 entry->ctx.prev_prio, 1884 field->prev_pid,
1885 field->prev_prio,
1530 S, entry->type == TRACE_CTX ? "==>" : " +", 1886 S, entry->type == TRACE_CTX ? "==>" : " +",
1531 entry->ctx.next_pid, 1887 field->next_cpu,
1532 entry->ctx.next_prio, 1888 field->next_pid,
1889 field->next_prio,
1533 T, comm); 1890 T, comm);
1534 break; 1891 break;
1535 case TRACE_SPECIAL: 1892 }
1893 case TRACE_SPECIAL: {
1894 struct special_entry *field;
1895
1896 trace_assign_type(field, entry);
1897
1536 trace_seq_printf(s, "# %ld %ld %ld\n", 1898 trace_seq_printf(s, "# %ld %ld %ld\n",
1537 entry->special.arg1, 1899 field->arg1,
1538 entry->special.arg2, 1900 field->arg2,
1539 entry->special.arg3); 1901 field->arg3);
1540 break; 1902 break;
1541 case TRACE_STACK: 1903 }
1904 case TRACE_STACK: {
1905 struct stack_entry *field;
1906
1907 trace_assign_type(field, entry);
1908
1542 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1909 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1543 if (i) 1910 if (i)
1544 trace_seq_puts(s, " <= "); 1911 trace_seq_puts(s, " <= ");
1545 seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); 1912 seq_print_ip_sym(s, field->caller[i], sym_flags);
1546 } 1913 }
1547 trace_seq_puts(s, "\n"); 1914 trace_seq_puts(s, "\n");
1548 break; 1915 break;
1916 }
1917 case TRACE_PRINT: {
1918 struct print_entry *field;
1919
1920 trace_assign_type(field, entry);
1921
1922 seq_print_ip_sym(s, field->ip, sym_flags);
1923 trace_seq_printf(s, ": %s", field->buf);
1924 if (entry->flags & TRACE_FLAG_CONT)
1925 trace_seq_print_cont(s, iter);
1926 break;
1927 }
1928 case TRACE_BRANCH: {
1929 struct trace_branch *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1934 field->correct ? " ok " : " MISS ",
1935 field->func,
1936 field->file,
1937 field->line);
1938 break;
1939 }
1940 case TRACE_USER_STACK: {
1941 struct userstack_entry *field;
1942
1943 trace_assign_type(field, entry);
1944
1945 seq_print_userip_objs(field, s, sym_flags);
1946 trace_seq_putc(s, '\n');
1947 break;
1948 }
1549 default: 1949 default:
1550 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1950 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1551 } 1951 }
1552 return 1; 1952 return TRACE_TYPE_HANDLED;
1553} 1953}
1554 1954
1555static int print_trace_fmt(struct trace_iterator *iter) 1955static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1556{ 1956{
1557 struct trace_seq *s = &iter->seq; 1957 struct trace_seq *s = &iter->seq;
1558 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1958 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
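
The switch in print_lat_fmt() now recovers a per-type record with trace_assign_type() instead of reaching through a big union. A self-contained sketch of that layout: every record starts with a common header, and the printer casts back to the full type inside the switch. The struct and macro names here are illustrative, not the kernel's trace entry definitions:

#include <stdio.h>

enum entry_type { ENTRY_FN, ENTRY_CTX };

struct entry_header {
	enum entry_type type;
	int pid;
};

struct fn_entry {
	struct entry_header ent;	/* must be the first member */
	unsigned long ip, parent_ip;
};

struct ctx_entry {
	struct entry_header ent;	/* must be the first member */
	int prev_pid, next_pid;
};

#define assign_type(var, ent)	((var) = (void *)(ent))

static void print_entry(struct entry_header *ent)
{
	switch (ent->type) {
	case ENTRY_FN: {
		struct fn_entry *field;

		assign_type(field, ent);
		printf("fn %lx <- %lx\n", field->ip, field->parent_ip);
		break;
	}
	case ENTRY_CTX: {
		struct ctx_entry *field;

		assign_type(field, ent);
		printf("ctx %d ==> %d\n", field->prev_pid, field->next_pid);
		break;
	}
	}
}

int main(void)
{
	struct fn_entry f = { { ENTRY_FN, 1 }, 0x1000, 0x2000 };

	print_entry(&f.ent);
	return 0;
}
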
@@ -1567,90 +1967,154 @@ static int print_trace_fmt(struct trace_iterator *iter)
1567 1967
1568 entry = iter->ent; 1968 entry = iter->ent;
1569 1969
1970 if (entry->type == TRACE_CONT)
1971 return TRACE_TYPE_HANDLED;
1972
1973 test_cpu_buff_start(iter);
1974
1570 comm = trace_find_cmdline(iter->ent->pid); 1975 comm = trace_find_cmdline(iter->ent->pid);
1571 1976
1572 t = ns2usecs(entry->t); 1977 t = ns2usecs(iter->ts);
1573 usec_rem = do_div(t, 1000000ULL); 1978 usec_rem = do_div(t, 1000000ULL);
1574 secs = (unsigned long)t; 1979 secs = (unsigned long)t;
1575 1980
1576 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); 1981 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1577 if (!ret) 1982 if (!ret)
1578 return 0; 1983 return TRACE_TYPE_PARTIAL_LINE;
1579 ret = trace_seq_printf(s, "[%02d] ", iter->cpu); 1984 ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
1580 if (!ret) 1985 if (!ret)
1581 return 0; 1986 return TRACE_TYPE_PARTIAL_LINE;
1582 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); 1987 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1583 if (!ret) 1988 if (!ret)
1584 return 0; 1989 return TRACE_TYPE_PARTIAL_LINE;
1585 1990
1586 switch (entry->type) { 1991 switch (entry->type) {
1587 case TRACE_FN: 1992 case TRACE_FN: {
1588 ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); 1993 struct ftrace_entry *field;
1994
1995 trace_assign_type(field, entry);
1996
1997 ret = seq_print_ip_sym(s, field->ip, sym_flags);
1589 if (!ret) 1998 if (!ret)
1590 return 0; 1999 return TRACE_TYPE_PARTIAL_LINE;
1591 if ((sym_flags & TRACE_ITER_PRINT_PARENT) && 2000 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
1592 entry->fn.parent_ip) { 2001 field->parent_ip) {
1593 ret = trace_seq_printf(s, " <-"); 2002 ret = trace_seq_printf(s, " <-");
1594 if (!ret) 2003 if (!ret)
1595 return 0; 2004 return TRACE_TYPE_PARTIAL_LINE;
1596 if (kretprobed(entry->fn.parent_ip)) 2005 ret = seq_print_ip_sym(s,
1597 ret = trace_seq_puts(s, KRETPROBE_MSG); 2006 field->parent_ip,
1598 else 2007 sym_flags);
1599 ret = seq_print_ip_sym(s, entry->fn.parent_ip,
1600 sym_flags);
1601 if (!ret) 2008 if (!ret)
1602 return 0; 2009 return TRACE_TYPE_PARTIAL_LINE;
1603 } 2010 }
1604 ret = trace_seq_printf(s, "\n"); 2011 ret = trace_seq_printf(s, "\n");
1605 if (!ret) 2012 if (!ret)
1606 return 0; 2013 return TRACE_TYPE_PARTIAL_LINE;
1607 break; 2014 break;
2015 }
1608 case TRACE_CTX: 2016 case TRACE_CTX:
1609 case TRACE_WAKE: 2017 case TRACE_WAKE: {
1610 S = entry->ctx.prev_state < sizeof(state_to_char) ? 2018 struct ctx_switch_entry *field;
1611 state_to_char[entry->ctx.prev_state] : 'X'; 2019
1612 T = entry->ctx.next_state < sizeof(state_to_char) ? 2020 trace_assign_type(field, entry);
1613 state_to_char[entry->ctx.next_state] : 'X'; 2021
1614 ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", 2022 T = task_state_char(field->next_state);
1615 entry->ctx.prev_pid, 2023 S = task_state_char(field->prev_state);
1616 entry->ctx.prev_prio, 2024 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
2025 field->prev_pid,
2026 field->prev_prio,
1617 S, 2027 S,
1618 entry->type == TRACE_CTX ? "==>" : " +", 2028 entry->type == TRACE_CTX ? "==>" : " +",
1619 entry->ctx.next_pid, 2029 field->next_cpu,
1620 entry->ctx.next_prio, 2030 field->next_pid,
2031 field->next_prio,
1621 T); 2032 T);
1622 if (!ret) 2033 if (!ret)
1623 return 0; 2034 return TRACE_TYPE_PARTIAL_LINE;
1624 break; 2035 break;
1625 case TRACE_SPECIAL: 2036 }
2037 case TRACE_SPECIAL: {
2038 struct special_entry *field;
2039
2040 trace_assign_type(field, entry);
2041
1626 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 2042 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1627 entry->special.arg1, 2043 field->arg1,
1628 entry->special.arg2, 2044 field->arg2,
1629 entry->special.arg3); 2045 field->arg3);
1630 if (!ret) 2046 if (!ret)
1631 return 0; 2047 return TRACE_TYPE_PARTIAL_LINE;
1632 break; 2048 break;
1633 case TRACE_STACK: 2049 }
2050 case TRACE_STACK: {
2051 struct stack_entry *field;
2052
2053 trace_assign_type(field, entry);
2054
1634 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 2055 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1635 if (i) { 2056 if (i) {
1636 ret = trace_seq_puts(s, " <= "); 2057 ret = trace_seq_puts(s, " <= ");
1637 if (!ret) 2058 if (!ret)
1638 return 0; 2059 return TRACE_TYPE_PARTIAL_LINE;
1639 } 2060 }
1640 ret = seq_print_ip_sym(s, entry->stack.caller[i], 2061 ret = seq_print_ip_sym(s, field->caller[i],
1641 sym_flags); 2062 sym_flags);
1642 if (!ret) 2063 if (!ret)
1643 return 0; 2064 return TRACE_TYPE_PARTIAL_LINE;
1644 } 2065 }
1645 ret = trace_seq_puts(s, "\n"); 2066 ret = trace_seq_puts(s, "\n");
1646 if (!ret) 2067 if (!ret)
1647 return 0; 2068 return TRACE_TYPE_PARTIAL_LINE;
1648 break; 2069 break;
1649 } 2070 }
1650 return 1; 2071 case TRACE_PRINT: {
2072 struct print_entry *field;
2073
2074 trace_assign_type(field, entry);
2075
2076 seq_print_ip_sym(s, field->ip, sym_flags);
2077 trace_seq_printf(s, ": %s", field->buf);
2078 if (entry->flags & TRACE_FLAG_CONT)
2079 trace_seq_print_cont(s, iter);
2080 break;
2081 }
2082 case TRACE_GRAPH_RET: {
2083 return print_graph_function(iter);
2084 }
2085 case TRACE_GRAPH_ENT: {
2086 return print_graph_function(iter);
2087 }
2088 case TRACE_BRANCH: {
2089 struct trace_branch *field;
2090
2091 trace_assign_type(field, entry);
2092
2093 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2094 field->correct ? " ok " : " MISS ",
2095 field->func,
2096 field->file,
2097 field->line);
2098 break;
2099 }
2100 case TRACE_USER_STACK: {
2101 struct userstack_entry *field;
2102
2103 trace_assign_type(field, entry);
2104
2105 ret = seq_print_userip_objs(field, s, sym_flags);
2106 if (!ret)
2107 return TRACE_TYPE_PARTIAL_LINE;
2108 ret = trace_seq_putc(s, '\n');
2109 if (!ret)
2110 return TRACE_TYPE_PARTIAL_LINE;
2111 break;
2112 }
2113 }
2114 return TRACE_TYPE_HANDLED;
1651} 2115}
1652 2116
1653static int print_raw_fmt(struct trace_iterator *iter) 2117static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1654{ 2118{
1655 struct trace_seq *s = &iter->seq; 2119 struct trace_seq *s = &iter->seq;
1656 struct trace_entry *entry; 2120 struct trace_entry *entry;
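
print_trace_fmt() above derives the printed timestamp from iter->ts and splits it into whole seconds and a six-digit microsecond remainder for the "%5lu.%06lu" output. A plain C sketch of that conversion, with ordinary 64-bit division standing in for ns2usecs()/do_div():

#include <stdio.h>
#include <inttypes.h>

static void print_timestamp(uint64_t ns)
{
	uint64_t us = ns / 1000;				/* ns2usecs() */
	unsigned long secs = (unsigned long)(us / 1000000ULL);
	unsigned long usec_rem = (unsigned long)(us % 1000000ULL);

	printf("%5lu.%06lu\n", secs, usec_rem);
}

int main(void)
{
	print_timestamp(1234567890123ULL);	/* prints " 1234.567890" */
	return 0;
}
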
@@ -1659,47 +2123,75 @@ static int print_raw_fmt(struct trace_iterator *iter)
1659 2123
1660 entry = iter->ent; 2124 entry = iter->ent;
1661 2125
2126 if (entry->type == TRACE_CONT)
2127 return TRACE_TYPE_HANDLED;
2128
1662 ret = trace_seq_printf(s, "%d %d %llu ", 2129 ret = trace_seq_printf(s, "%d %d %llu ",
1663 entry->pid, iter->cpu, entry->t); 2130 entry->pid, iter->cpu, iter->ts);
1664 if (!ret) 2131 if (!ret)
1665 return 0; 2132 return TRACE_TYPE_PARTIAL_LINE;
1666 2133
1667 switch (entry->type) { 2134 switch (entry->type) {
1668 case TRACE_FN: 2135 case TRACE_FN: {
2136 struct ftrace_entry *field;
2137
2138 trace_assign_type(field, entry);
2139
1669 ret = trace_seq_printf(s, "%x %x\n", 2140 ret = trace_seq_printf(s, "%x %x\n",
1670 entry->fn.ip, entry->fn.parent_ip); 2141 field->ip,
2142 field->parent_ip);
1671 if (!ret) 2143 if (!ret)
1672 return 0; 2144 return TRACE_TYPE_PARTIAL_LINE;
1673 break; 2145 break;
2146 }
1674 case TRACE_CTX: 2147 case TRACE_CTX:
1675 case TRACE_WAKE: 2148 case TRACE_WAKE: {
1676 S = entry->ctx.prev_state < sizeof(state_to_char) ? 2149 struct ctx_switch_entry *field;
1677 state_to_char[entry->ctx.prev_state] : 'X'; 2150
1678 T = entry->ctx.next_state < sizeof(state_to_char) ? 2151 trace_assign_type(field, entry);
1679 state_to_char[entry->ctx.next_state] : 'X'; 2152
1680 if (entry->type == TRACE_WAKE) 2153 T = task_state_char(field->next_state);
1681 S = '+'; 2154 S = entry->type == TRACE_WAKE ? '+' :
1682 ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", 2155 task_state_char(field->prev_state);
1683 entry->ctx.prev_pid, 2156 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
1684 entry->ctx.prev_prio, 2157 field->prev_pid,
2158 field->prev_prio,
1685 S, 2159 S,
1686 entry->ctx.next_pid, 2160 field->next_cpu,
1687 entry->ctx.next_prio, 2161 field->next_pid,
2162 field->next_prio,
1688 T); 2163 T);
1689 if (!ret) 2164 if (!ret)
1690 return 0; 2165 return TRACE_TYPE_PARTIAL_LINE;
1691 break; 2166 break;
2167 }
1692 case TRACE_SPECIAL: 2168 case TRACE_SPECIAL:
1693 case TRACE_STACK: 2169 case TRACE_USER_STACK:
2170 case TRACE_STACK: {
2171 struct special_entry *field;
2172
2173 trace_assign_type(field, entry);
2174
1694 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 2175 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1695 entry->special.arg1, 2176 field->arg1,
1696 entry->special.arg2, 2177 field->arg2,
1697 entry->special.arg3); 2178 field->arg3);
1698 if (!ret) 2179 if (!ret)
1699 return 0; 2180 return TRACE_TYPE_PARTIAL_LINE;
1700 break; 2181 break;
1701 } 2182 }
1702 return 1; 2183 case TRACE_PRINT: {
2184 struct print_entry *field;
2185
2186 trace_assign_type(field, entry);
2187
2188 trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
2189 if (entry->flags & TRACE_FLAG_CONT)
2190 trace_seq_print_cont(s, iter);
2191 break;
2192 }
2193 }
2194 return TRACE_TYPE_HANDLED;
1703} 2195}
1704 2196
1705#define SEQ_PUT_FIELD_RET(s, x) \ 2197#define SEQ_PUT_FIELD_RET(s, x) \
@@ -1710,11 +2202,12 @@ do { \
1710 2202
1711#define SEQ_PUT_HEX_FIELD_RET(s, x) \ 2203#define SEQ_PUT_HEX_FIELD_RET(s, x) \
1712do { \ 2204do { \
2205 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
1713 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ 2206 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
1714 return 0; \ 2207 return 0; \
1715} while (0) 2208} while (0)
1716 2209
1717static int print_hex_fmt(struct trace_iterator *iter) 2210static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1718{ 2211{
1719 struct trace_seq *s = &iter->seq; 2212 struct trace_seq *s = &iter->seq;
1720 unsigned char newline = '\n'; 2213 unsigned char newline = '\n';
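
The SEQ_PUT_HEX_FIELD_RET change above adds a compile-time guard on the field size before hex-dumping it. A userspace sketch of the same statement-macro pattern: the macro bails out of the enclosing function on overflow, and a C11 _Static_assert stands in for BUILD_BUG_ON; MAX_HEX_BYTES and the tiny seq buffer are invented for the example:

#include <stdio.h>

#define MAX_HEX_BYTES	8

struct seq {
	char buf[32];
	size_t len;
};

static int seq_putmem_hex(struct seq *s, const void *mem, size_t len)
{
	const unsigned char *p = mem;
	size_t i;

	if (s->len + 2 * len >= sizeof(s->buf))
		return 0;			/* no room: report failure */
	for (i = 0; i < len; i++)
		s->len += sprintf(s->buf + s->len, "%02x", p[i]);
	return 1;
}

#define SEQ_PUT_HEX_FIELD_RET(s, x)					\
do {									\
	_Static_assert(sizeof(x) <= MAX_HEX_BYTES, "field too large");	\
	if (!seq_putmem_hex((s), &(x), sizeof(x)))			\
		return 0;						\
} while (0)

static int emit(struct seq *s, int pid, unsigned long ip)
{
	SEQ_PUT_HEX_FIELD_RET(s, pid);
	SEQ_PUT_HEX_FIELD_RET(s, ip);
	return 1;
}

int main(void)
{
	struct seq s = { .len = 0 };

	if (emit(&s, 42, 0xdeadbeefUL))
		printf("%s\n", s.buf);
	return 0;
}
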
@@ -1723,97 +2216,162 @@ static int print_hex_fmt(struct trace_iterator *iter)
1723 2216
1724 entry = iter->ent; 2217 entry = iter->ent;
1725 2218
2219 if (entry->type == TRACE_CONT)
2220 return TRACE_TYPE_HANDLED;
2221
1726 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 2222 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1727 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 2223 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1728 SEQ_PUT_HEX_FIELD_RET(s, entry->t); 2224 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
1729 2225
1730 switch (entry->type) { 2226 switch (entry->type) {
1731 case TRACE_FN: 2227 case TRACE_FN: {
1732 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); 2228 struct ftrace_entry *field;
1733 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); 2229
2230 trace_assign_type(field, entry);
2231
2232 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
2233 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
1734 break; 2234 break;
2235 }
1735 case TRACE_CTX: 2236 case TRACE_CTX:
1736 case TRACE_WAKE: 2237 case TRACE_WAKE: {
1737 S = entry->ctx.prev_state < sizeof(state_to_char) ? 2238 struct ctx_switch_entry *field;
1738 state_to_char[entry->ctx.prev_state] : 'X'; 2239
1739 T = entry->ctx.next_state < sizeof(state_to_char) ? 2240 trace_assign_type(field, entry);
1740 state_to_char[entry->ctx.next_state] : 'X'; 2241
1741 if (entry->type == TRACE_WAKE) 2242 T = task_state_char(field->next_state);
1742 S = '+'; 2243 S = entry->type == TRACE_WAKE ? '+' :
1743 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); 2244 task_state_char(field->prev_state);
1744 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); 2245 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
1745 SEQ_PUT_HEX_FIELD_RET(s, S); 2247 SEQ_PUT_HEX_FIELD_RET(s, S);
1746 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); 2248 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
1747 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); 2249 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
1748 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); 2250 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
1749 SEQ_PUT_HEX_FIELD_RET(s, T); 2251 SEQ_PUT_HEX_FIELD_RET(s, T);
1750 break; 2252 break;
2253 }
1751 case TRACE_SPECIAL: 2254 case TRACE_SPECIAL:
1752 case TRACE_STACK: 2255 case TRACE_USER_STACK:
1753 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); 2256 case TRACE_STACK: {
1754 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); 2257 struct special_entry *field;
1755 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); 2258
2259 trace_assign_type(field, entry);
2260
2261 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
2262 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
2263 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1756 break; 2264 break;
1757 } 2265 }
2266 }
1758 SEQ_PUT_FIELD_RET(s, newline); 2267 SEQ_PUT_FIELD_RET(s, newline);
1759 2268
1760 return 1; 2269 return TRACE_TYPE_HANDLED;
2270}
2271
2272static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2273{
2274 struct trace_seq *s = &iter->seq;
2275 struct trace_entry *entry = iter->ent;
2276 struct print_entry *field;
2277 int ret;
2278
2279 trace_assign_type(field, entry);
2280
2281 ret = trace_seq_printf(s, field->buf);
2282 if (!ret)
2283 return TRACE_TYPE_PARTIAL_LINE;
2284
2285 if (entry->flags & TRACE_FLAG_CONT)
2286 trace_seq_print_cont(s, iter);
2287
2288 return TRACE_TYPE_HANDLED;
1761} 2289}
1762 2290
1763static int print_bin_fmt(struct trace_iterator *iter) 2291static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1764{ 2292{
1765 struct trace_seq *s = &iter->seq; 2293 struct trace_seq *s = &iter->seq;
1766 struct trace_entry *entry; 2294 struct trace_entry *entry;
1767 2295
1768 entry = iter->ent; 2296 entry = iter->ent;
1769 2297
2298 if (entry->type == TRACE_CONT)
2299 return TRACE_TYPE_HANDLED;
2300
1770 SEQ_PUT_FIELD_RET(s, entry->pid); 2301 SEQ_PUT_FIELD_RET(s, entry->pid);
1771 SEQ_PUT_FIELD_RET(s, entry->cpu); 2302 SEQ_PUT_FIELD_RET(s, entry->cpu);
1772 SEQ_PUT_FIELD_RET(s, entry->t); 2303 SEQ_PUT_FIELD_RET(s, iter->ts);
1773 2304
1774 switch (entry->type) { 2305 switch (entry->type) {
1775 case TRACE_FN: 2306 case TRACE_FN: {
1776 SEQ_PUT_FIELD_RET(s, entry->fn.ip); 2307 struct ftrace_entry *field;
1777 SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); 2308
2309 trace_assign_type(field, entry);
2310
2311 SEQ_PUT_FIELD_RET(s, field->ip);
2312 SEQ_PUT_FIELD_RET(s, field->parent_ip);
1778 break; 2313 break;
1779 case TRACE_CTX: 2314 }
1780 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); 2315 case TRACE_CTX: {
1781 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); 2316 struct ctx_switch_entry *field;
1782 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); 2317
1783 SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); 2318 trace_assign_type(field, entry);
1784 SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); 2319
1785 SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); 2320 SEQ_PUT_FIELD_RET(s, field->prev_pid);
2321 SEQ_PUT_FIELD_RET(s, field->prev_prio);
2322 SEQ_PUT_FIELD_RET(s, field->prev_state);
2323 SEQ_PUT_FIELD_RET(s, field->next_pid);
2324 SEQ_PUT_FIELD_RET(s, field->next_prio);
2325 SEQ_PUT_FIELD_RET(s, field->next_state);
1786 break; 2326 break;
2327 }
1787 case TRACE_SPECIAL: 2328 case TRACE_SPECIAL:
1788 case TRACE_STACK: 2329 case TRACE_USER_STACK:
1789 SEQ_PUT_FIELD_RET(s, entry->special.arg1); 2330 case TRACE_STACK: {
1790 SEQ_PUT_FIELD_RET(s, entry->special.arg2); 2331 struct special_entry *field;
1791 SEQ_PUT_FIELD_RET(s, entry->special.arg3); 2332
2333 trace_assign_type(field, entry);
2334
2335 SEQ_PUT_FIELD_RET(s, field->arg1);
2336 SEQ_PUT_FIELD_RET(s, field->arg2);
2337 SEQ_PUT_FIELD_RET(s, field->arg3);
1792 break; 2338 break;
1793 } 2339 }
2340 }
1794 return 1; 2341 return 1;
1795} 2342}
1796 2343
1797static int trace_empty(struct trace_iterator *iter) 2344static int trace_empty(struct trace_iterator *iter)
1798{ 2345{
1799 struct trace_array_cpu *data;
1800 int cpu; 2346 int cpu;
1801 2347
1802 for_each_tracing_cpu(cpu) { 2348 for_each_tracing_cpu(cpu) {
1803 data = iter->tr->data[cpu]; 2349 if (iter->buffer_iter[cpu]) {
1804 2350 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
1805 if (head_page(data) && data->trace_idx && 2351 return 0;
1806 (data->trace_tail != data->trace_head || 2352 } else {
1807 data->trace_tail_idx != data->trace_head_idx)) 2353 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
1808 return 0; 2354 return 0;
2355 }
1809 } 2356 }
2357
1810 return 1; 2358 return 1;
1811} 2359}
1812 2360
1813static int print_trace_line(struct trace_iterator *iter) 2361static enum print_line_t print_trace_line(struct trace_iterator *iter)
1814{ 2362{
1815 if (iter->trace && iter->trace->print_line) 2363 enum print_line_t ret;
1816 return iter->trace->print_line(iter); 2364
2365 if (iter->trace && iter->trace->print_line) {
2366 ret = iter->trace->print_line(iter);
2367 if (ret != TRACE_TYPE_UNHANDLED)
2368 return ret;
2369 }
2370
2371 if (iter->ent->type == TRACE_PRINT &&
2372 trace_flags & TRACE_ITER_PRINTK &&
2373 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2374 return print_printk_msg_only(iter);
1817 2375
1818 if (trace_flags & TRACE_ITER_BIN) 2376 if (trace_flags & TRACE_ITER_BIN)
1819 return print_bin_fmt(iter); 2377 return print_bin_fmt(iter);
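
print_trace_line() now gives the current tracer's own print_line() callback first refusal and only falls back to the generic formats when it answers TRACE_TYPE_UNHANDLED. A small sketch of that dispatch order, with invented names and a trivial fallback in place of the bin/hex/raw printers:

#include <stdio.h>

enum print_line_t { TYPE_PARTIAL_LINE, TYPE_HANDLED, TYPE_UNHANDLED };

struct tracer {
	enum print_line_t (*print_line)(void *iter);
};

static enum print_line_t print_default_fmt(void *iter)
{
	printf("default format\n");
	return TYPE_HANDLED;
}

static enum print_line_t print_line(struct tracer *t, void *iter)
{
	if (t && t->print_line) {
		enum print_line_t ret = t->print_line(iter);

		if (ret != TYPE_UNHANDLED)
			return ret;	/* the tracer produced (or truncated) the line */
	}
	return print_default_fmt(iter);	/* generic fallback */
}

static enum print_line_t refuse(void *iter)
{
	return TYPE_UNHANDLED;		/* let the generic code handle it */
}

int main(void)
{
	struct tracer t = { .print_line = refuse };

	print_line(&t, NULL);		/* falls through to the default format */
	return 0;
}
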
@@ -1839,7 +2397,9 @@ static int s_show(struct seq_file *m, void *v)
1839 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2397 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1840 seq_puts(m, "#\n"); 2398 seq_puts(m, "#\n");
1841 } 2399 }
1842 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2400 if (iter->trace && iter->trace->print_header)
2401 iter->trace->print_header(m);
2402 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1843 /* print nothing if the buffers are empty */ 2403 /* print nothing if the buffers are empty */
1844 if (trace_empty(iter)) 2404 if (trace_empty(iter))
1845 return 0; 2405 return 0;
@@ -1869,6 +2429,8 @@ static struct trace_iterator *
1869__tracing_open(struct inode *inode, struct file *file, int *ret) 2429__tracing_open(struct inode *inode, struct file *file, int *ret)
1870{ 2430{
1871 struct trace_iterator *iter; 2431 struct trace_iterator *iter;
2432 struct seq_file *m;
2433 int cpu;
1872 2434
1873 if (tracing_disabled) { 2435 if (tracing_disabled) {
1874 *ret = -ENODEV; 2436 *ret = -ENODEV;
@@ -1889,28 +2451,49 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1889 iter->trace = current_trace; 2451 iter->trace = current_trace;
1890 iter->pos = -1; 2452 iter->pos = -1;
1891 2453
2454 /* Notify the tracer early; before we stop tracing. */
2455 if (iter->trace && iter->trace->open)
2456 iter->trace->open(iter);
2457
2458 /* Annotate start of buffers if we had overruns */
2459 if (ring_buffer_overruns(iter->tr->buffer))
2460 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2461
2462
2463 for_each_tracing_cpu(cpu) {
2464
2465 iter->buffer_iter[cpu] =
2466 ring_buffer_read_start(iter->tr->buffer, cpu);
2467
2468 if (!iter->buffer_iter[cpu])
2469 goto fail_buffer;
2470 }
2471
1892 /* TODO stop tracer */ 2472 /* TODO stop tracer */
1893 *ret = seq_open(file, &tracer_seq_ops); 2473 *ret = seq_open(file, &tracer_seq_ops);
1894 if (!*ret) { 2474 if (*ret)
1895 struct seq_file *m = file->private_data; 2475 goto fail_buffer;
1896 m->private = iter;
1897 2476
1898 /* stop the trace while dumping */ 2477 m = file->private_data;
1899 if (iter->tr->ctrl) { 2478 m->private = iter;
1900 tracer_enabled = 0; 2479
1901 ftrace_function_enabled = 0; 2480 /* stop the trace while dumping */
1902 } 2481 tracing_stop();
1903 2482
1904 if (iter->trace && iter->trace->open)
1905 iter->trace->open(iter);
1906 } else {
1907 kfree(iter);
1908 iter = NULL;
1909 }
1910 mutex_unlock(&trace_types_lock); 2483 mutex_unlock(&trace_types_lock);
1911 2484
1912 out: 2485 out:
1913 return iter; 2486 return iter;
2487
2488 fail_buffer:
2489 for_each_tracing_cpu(cpu) {
2490 if (iter->buffer_iter[cpu])
2491 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2492 }
2493 mutex_unlock(&trace_types_lock);
2494 kfree(iter);
2495
2496 return ERR_PTR(-ENOMEM);
1914} 2497}
1915 2498
1916int tracing_open_generic(struct inode *inode, struct file *filp) 2499int tracing_open_generic(struct inode *inode, struct file *filp)
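
The fail_buffer path added to __tracing_open() is the usual goto-based unwinding: acquire one reader per CPU, and if any acquisition fails, release the ones already taken before returning an error. A self-contained sketch of the pattern, with malloc/free standing in for ring_buffer_read_start()/ring_buffer_read_finish():

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS	4

struct iterator {
	void *buffer_iter[NR_CPUS];
};

static struct iterator *open_iterator(void)
{
	struct iterator *iter;
	int cpu;

	iter = calloc(1, sizeof(*iter));
	if (!iter)
		return NULL;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		iter->buffer_iter[cpu] = malloc(64);	/* per-cpu reader */
		if (!iter->buffer_iter[cpu])
			goto fail_buffer;
	}
	return iter;

fail_buffer:
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		free(iter->buffer_iter[cpu]);		/* free(NULL) is a no-op */
	free(iter);
	return NULL;
}

int main(void)
{
	struct iterator *iter = open_iterator();
	int cpu;

	printf("%s\n", iter ? "opened" : "failed");
	if (iter) {
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			free(iter->buffer_iter[cpu]);
		free(iter);
	}
	return 0;
}
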
@@ -1926,20 +2509,19 @@ int tracing_release(struct inode *inode, struct file *file)
1926{ 2509{
1927 struct seq_file *m = (struct seq_file *)file->private_data; 2510 struct seq_file *m = (struct seq_file *)file->private_data;
1928 struct trace_iterator *iter = m->private; 2511 struct trace_iterator *iter = m->private;
2512 int cpu;
1929 2513
1930 mutex_lock(&trace_types_lock); 2514 mutex_lock(&trace_types_lock);
2515 for_each_tracing_cpu(cpu) {
2516 if (iter->buffer_iter[cpu])
2517 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2518 }
2519
1931 if (iter->trace && iter->trace->close) 2520 if (iter->trace && iter->trace->close)
1932 iter->trace->close(iter); 2521 iter->trace->close(iter);
1933 2522
1934 /* reenable tracing if it was previously enabled */ 2523 /* reenable tracing if it was previously enabled */
1935 if (iter->tr->ctrl) { 2524 tracing_start();
1936 tracer_enabled = 1;
1937 /*
1938 * It is safe to enable function tracing even if it
1939 * isn't used
1940 */
1941 ftrace_function_enabled = 1;
1942 }
1943 mutex_unlock(&trace_types_lock); 2525 mutex_unlock(&trace_types_lock);
1944 2526
1945 seq_release(inode, file); 2527 seq_release(inode, file);
@@ -2117,7 +2699,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2117 if (err) 2699 if (err)
2118 goto err_unlock; 2700 goto err_unlock;
2119 2701
2120 raw_local_irq_disable(); 2702 local_irq_disable();
2121 __raw_spin_lock(&ftrace_max_lock); 2703 __raw_spin_lock(&ftrace_max_lock);
2122 for_each_tracing_cpu(cpu) { 2704 for_each_tracing_cpu(cpu) {
2123 /* 2705 /*
@@ -2134,7 +2716,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2134 } 2716 }
2135 } 2717 }
2136 __raw_spin_unlock(&ftrace_max_lock); 2718 __raw_spin_unlock(&ftrace_max_lock);
2137 raw_local_irq_enable(); 2719 local_irq_enable();
2138 2720
2139 tracing_cpumask = tracing_cpumask_new; 2721 tracing_cpumask = tracing_cpumask_new;
2140 2722
@@ -2155,13 +2737,16 @@ static struct file_operations tracing_cpumask_fops = {
2155}; 2737};
2156 2738
2157static ssize_t 2739static ssize_t
2158tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, 2740tracing_trace_options_read(struct file *filp, char __user *ubuf,
2159 size_t cnt, loff_t *ppos) 2741 size_t cnt, loff_t *ppos)
2160{ 2742{
2743 int i;
2161 char *buf; 2744 char *buf;
2162 int r = 0; 2745 int r = 0;
2163 int len = 0; 2746 int len = 0;
2164 int i; 2747 u32 tracer_flags = current_trace->flags->val;
2748 struct tracer_opt *trace_opts = current_trace->flags->opts;
2749
2165 2750
2166 /* calculate max size */ 2751 /* calculate max size */
2166 /* calculate max size */ 2751 /* calculate max size */
2167 for (i = 0; trace_options[i]; i++) { 2752 for (i = 0; trace_options[i]; i++) {
@@ -2169,6 +2754,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2169 len += 3; /* "no" and space */ 2754 len += 3; /* "no" and space */
2170 } 2755 }
2171 2756
2757 /*
2758 * Increase the size with the names of the options specific
2759 * to the current tracer.
2760 */
2761 for (i = 0; trace_opts[i].name; i++) {
2762 len += strlen(trace_opts[i].name);
2763 len += 3; /* "no" and space */
2764 }
2765
2172 /* +2 for \n and \0 */ 2766 /* +2 for \n and \0 */
2173 buf = kmalloc(len + 2, GFP_KERNEL); 2767 buf = kmalloc(len + 2, GFP_KERNEL);
2174 if (!buf) 2768 if (!buf)
@@ -2181,6 +2775,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2181 r += sprintf(buf + r, "no%s ", trace_options[i]); 2775 r += sprintf(buf + r, "no%s ", trace_options[i]);
2182 } 2776 }
2183 2777
2778 for (i = 0; trace_opts[i].name; i++) {
2779 if (tracer_flags & trace_opts[i].bit)
2780 r += sprintf(buf + r, "%s ",
2781 trace_opts[i].name);
2782 else
2783 r += sprintf(buf + r, "no%s ",
2784 trace_opts[i].name);
2785 }
2786
2184 r += sprintf(buf + r, "\n"); 2787 r += sprintf(buf + r, "\n");
2185 WARN_ON(r >= len + 2); 2788 WARN_ON(r >= len + 2);
2186 2789
@@ -2191,13 +2794,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2191 return r; 2794 return r;
2192} 2795}
2193 2796
2797/* Try to assign a tracer specific option */
2798static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2799{
2800 struct tracer_flags *trace_flags = trace->flags;
2801 struct tracer_opt *opts = NULL;
2802 int ret = 0, i = 0;
2803 int len;
2804
2805 for (i = 0; trace_flags->opts[i].name; i++) {
2806 opts = &trace_flags->opts[i];
2807 len = strlen(opts->name);
2808
2809 if (strncmp(cmp, opts->name, len) == 0) {
2810 ret = trace->set_flag(trace_flags->val,
2811 opts->bit, !neg);
2812 break;
2813 }
2814 }
2815 /* Not found */
2816 if (!trace_flags->opts[i].name)
2817 return -EINVAL;
2818
2819 /* Refused to handle */
2820 if (ret)
2821 return ret;
2822
2823 if (neg)
2824 trace_flags->val &= ~opts->bit;
2825 else
2826 trace_flags->val |= opts->bit;
2827
2828 return 0;
2829}
2830
2194static ssize_t 2831static ssize_t
2195tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, 2832tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos) 2833 size_t cnt, loff_t *ppos)
2197{ 2834{
2198 char buf[64]; 2835 char buf[64];
2199 char *cmp = buf; 2836 char *cmp = buf;
2200 int neg = 0; 2837 int neg = 0;
2838 int ret;
2201 int i; 2839 int i;
2202 2840
2203 if (cnt >= sizeof(buf)) 2841 if (cnt >= sizeof(buf))
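
set_tracer_option() above extends the trace_options scheme to tracer-specific flags: a written token sets a bit ("opt") or clears it ("noopt"), matched against a name/bit table. In the kernel the "no" prefix is stripped by the write handler before set_tracer_option() runs; this userspace sketch folds the prefix handling and the bit toggle together, with an invented option table:

#include <stdio.h>
#include <string.h>

struct opt {
	const char *name;
	unsigned bit;
};

static const struct opt opts[] = {
	{ "print-parent", 1 << 0 },
	{ "verbose",      1 << 1 },
	{ NULL, 0 },
};

static int set_option(unsigned *flags, const char *cmp)
{
	int neg = 0;
	int i;

	if (strncmp(cmp, "no", 2) == 0) {
		neg = 1;
		cmp += 2;
	}

	for (i = 0; opts[i].name; i++) {
		if (strcmp(cmp, opts[i].name) == 0) {
			if (neg)
				*flags &= ~opts[i].bit;
			else
				*flags |= opts[i].bit;
			return 0;
		}
	}
	return -1;	/* -EINVAL in the kernel */
}

int main(void)
{
	unsigned flags = 0;

	set_option(&flags, "print-parent");
	set_option(&flags, "noverbose");
	printf("flags = %#x\n", flags);		/* flags = 0x1 */
	return 0;
}
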
@@ -2224,11 +2862,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2224 break; 2862 break;
2225 } 2863 }
2226 } 2864 }
2227 /* 2865
2228 * If no option could be set, return an error: 2866 /* If no option could be set, test the specific tracer options */
2229 */ 2867 if (!trace_options[i]) {
2230 if (!trace_options[i]) 2868 ret = set_tracer_option(current_trace, cmp, neg);
2231 return -EINVAL; 2869 if (ret)
2870 return ret;
2871 }
2232 2872
2233 filp->f_pos += cnt; 2873 filp->f_pos += cnt;
2234 2874
@@ -2237,8 +2877,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2237 2877
2238static struct file_operations tracing_iter_fops = { 2878static struct file_operations tracing_iter_fops = {
2239 .open = tracing_open_generic, 2879 .open = tracing_open_generic,
2240 .read = tracing_iter_ctrl_read, 2880 .read = tracing_trace_options_read,
2241 .write = tracing_iter_ctrl_write, 2881 .write = tracing_trace_options_write,
2242}; 2882};
2243 2883
2244static const char readme_msg[] = 2884static const char readme_msg[] =
@@ -2252,9 +2892,9 @@ static const char readme_msg[] =
2252 "# echo sched_switch > /debug/tracing/current_tracer\n" 2892 "# echo sched_switch > /debug/tracing/current_tracer\n"
2253 "# cat /debug/tracing/current_tracer\n" 2893 "# cat /debug/tracing/current_tracer\n"
2254 "sched_switch\n" 2894 "sched_switch\n"
2255 "# cat /debug/tracing/iter_ctrl\n" 2895 "# cat /debug/tracing/trace_options\n"
2256 "noprint-parent nosym-offset nosym-addr noverbose\n" 2896 "noprint-parent nosym-offset nosym-addr noverbose\n"
2257 "# echo print-parent > /debug/tracing/iter_ctrl\n" 2897 "# echo print-parent > /debug/tracing/trace_options\n"
2258 "# echo 1 > /debug/tracing/tracing_enabled\n" 2898 "# echo 1 > /debug/tracing/tracing_enabled\n"
2259 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2899 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2260 "echo 0 > /debug/tracing/tracing_enabled\n" 2900 "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2277,11 +2917,10 @@ static ssize_t
2277tracing_ctrl_read(struct file *filp, char __user *ubuf, 2917tracing_ctrl_read(struct file *filp, char __user *ubuf,
2278 size_t cnt, loff_t *ppos) 2918 size_t cnt, loff_t *ppos)
2279{ 2919{
2280 struct trace_array *tr = filp->private_data;
2281 char buf[64]; 2920 char buf[64];
2282 int r; 2921 int r;
2283 2922
2284 r = sprintf(buf, "%ld\n", tr->ctrl); 2923 r = sprintf(buf, "%u\n", tracer_enabled);
2285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2924 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2286} 2925}
2287 2926
@@ -2309,16 +2948,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2309 val = !!val; 2948 val = !!val;
2310 2949
2311 mutex_lock(&trace_types_lock); 2950 mutex_lock(&trace_types_lock);
2312 if (tr->ctrl ^ val) { 2951 if (tracer_enabled ^ val) {
2313 if (val) 2952 if (val) {
2314 tracer_enabled = 1; 2953 tracer_enabled = 1;
2315 else 2954 if (current_trace->start)
2955 current_trace->start(tr);
2956 tracing_start();
2957 } else {
2316 tracer_enabled = 0; 2958 tracer_enabled = 0;
2317 2959 tracing_stop();
2318 tr->ctrl = val; 2960 if (current_trace->stop)
2319 2961 current_trace->stop(tr);
2320 if (current_trace && current_trace->ctrl_update) 2962 }
2321 current_trace->ctrl_update(tr);
2322 } 2963 }
2323 mutex_unlock(&trace_types_lock); 2964 mutex_unlock(&trace_types_lock);
2324 2965
@@ -2344,14 +2985,52 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2344 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2345} 2986}
2346 2987
2988static int tracing_set_tracer(char *buf)
2989{
2990 struct trace_array *tr = &global_trace;
2991 struct tracer *t;
2992 int ret = 0;
2993
2994 mutex_lock(&trace_types_lock);
2995 for (t = trace_types; t; t = t->next) {
2996 if (strcmp(t->name, buf) == 0)
2997 break;
2998 }
2999 if (!t) {
3000 ret = -EINVAL;
3001 goto out;
3002 }
3003 if (t == current_trace)
3004 goto out;
3005
3006 trace_branch_disable();
3007 if (current_trace && current_trace->reset)
3008 current_trace->reset(tr);
3009
3010 current_trace = t;
3011 if (t->init) {
3012 ret = t->init(tr);
3013 if (ret)
3014 goto out;
3015 }
3016
3017 trace_branch_enable(tr);
3018 out:
3019 mutex_unlock(&trace_types_lock);
3020
3021 return ret;
3022}
3023
2347static ssize_t 3024static ssize_t
2348tracing_set_trace_write(struct file *filp, const char __user *ubuf, 3025tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2349 size_t cnt, loff_t *ppos) 3026 size_t cnt, loff_t *ppos)
2350{ 3027{
2351 struct trace_array *tr = &global_trace;
2352 struct tracer *t;
2353 char buf[max_tracer_type_len+1]; 3028 char buf[max_tracer_type_len+1];
2354 int i; 3029 int i;
3030 size_t ret;
3031 int err;
3032
3033 ret = cnt;
2355 3034
2356 if (cnt > max_tracer_type_len) 3035 if (cnt > max_tracer_type_len)
2357 cnt = max_tracer_type_len; 3036 cnt = max_tracer_type_len;
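
tracing_set_tracer(), factored out above, looks the named tracer up on the registered list, resets the old one, and only reports success if the new tracer's init() succeeds; the trace_branch_disable()/enable() bracketing is omitted here. A stand-in sketch of that lookup-and-switch flow with a one-entry tracer list:

#include <stdio.h>
#include <string.h>

struct tracer {
	const char *name;
	int (*init)(void);
	void (*reset)(void);
	struct tracer *next;
};

static struct tracer *trace_types;	/* registered tracers */
static struct tracer *current_trace;

static int set_tracer(const char *buf)
{
	struct tracer *t;

	for (t = trace_types; t; t = t->next)
		if (strcmp(t->name, buf) == 0)
			break;
	if (!t)
		return -1;		/* -EINVAL */
	if (t == current_trace)
		return 0;

	if (current_trace && current_trace->reset)
		current_trace->reset();

	current_trace = t;
	if (t->init)
		return t->init();	/* an init error is propagated to the writer */
	return 0;
}

static int nop_init(void) { return 0; }

static struct tracer nop = { .name = "nop", .init = nop_init };

int main(void)
{
	trace_types = &nop;
	printf("%d %d\n", set_tracer("nop"), set_tracer("unknown"));	/* 0 -1 */
	return 0;
}
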
@@ -2365,27 +3044,13 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2365 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) 3044 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2366 buf[i] = 0; 3045 buf[i] = 0;
2367 3046
2368 mutex_lock(&trace_types_lock); 3047 err = tracing_set_tracer(buf);
2369 for (t = trace_types; t; t = t->next) { 3048 if (err)
2370 if (strcmp(t->name, buf) == 0) 3049 return err;
2371 break;
2372 }
2373 if (!t || t == current_trace)
2374 goto out;
2375
2376 if (current_trace && current_trace->reset)
2377 current_trace->reset(tr);
2378
2379 current_trace = t;
2380 if (t->init)
2381 t->init(tr);
2382 3050
2383 out: 3051 filp->f_pos += ret;
2384 mutex_unlock(&trace_types_lock);
2385 3052
2386 filp->f_pos += cnt; 3053 return ret;
2387
2388 return cnt;
2389} 3054}
2390 3055
2391static ssize_t 3056static ssize_t
@@ -2450,6 +3115,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2450 return -ENOMEM; 3115 return -ENOMEM;
2451 3116
2452 mutex_lock(&trace_types_lock); 3117 mutex_lock(&trace_types_lock);
3118
3119 /* trace pipe does not show start of buffer */
3120 cpus_setall(iter->started);
3121
2453 iter->tr = &global_trace; 3122 iter->tr = &global_trace;
2454 iter->trace = current_trace; 3123 iter->trace = current_trace;
2455 filp->private_data = iter; 3124 filp->private_data = iter;
@@ -2500,20 +3169,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2500 size_t cnt, loff_t *ppos) 3169 size_t cnt, loff_t *ppos)
2501{ 3170{
2502 struct trace_iterator *iter = filp->private_data; 3171 struct trace_iterator *iter = filp->private_data;
2503 struct trace_array_cpu *data;
2504 static cpumask_t mask;
2505 unsigned long flags;
2506#ifdef CONFIG_FTRACE
2507 int ftrace_save;
2508#endif
2509 int cpu;
2510 ssize_t sret; 3172 ssize_t sret;
2511 3173
2512 /* return any leftover data */ 3174 /* return any leftover data */
2513 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 3175 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2514 if (sret != -EBUSY) 3176 if (sret != -EBUSY)
2515 return sret; 3177 return sret;
2516 sret = 0;
2517 3178
2518 trace_seq_reset(&iter->seq); 3179 trace_seq_reset(&iter->seq);
2519 3180
@@ -2524,6 +3185,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2524 goto out; 3185 goto out;
2525 } 3186 }
2526 3187
3188waitagain:
3189 sret = 0;
2527 while (trace_empty(iter)) { 3190 while (trace_empty(iter)) {
2528 3191
2529 if ((filp->f_flags & O_NONBLOCK)) { 3192 if ((filp->f_flags & O_NONBLOCK)) {
@@ -2588,46 +3251,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2588 offsetof(struct trace_iterator, seq)); 3251 offsetof(struct trace_iterator, seq));
2589 iter->pos = -1; 3252 iter->pos = -1;
2590 3253
2591 /*
2592 * We need to stop all tracing on all CPUS to read the
2593 * the next buffer. This is a bit expensive, but is
2594 * not done often. We fill all what we can read,
2595 * and then release the locks again.
2596 */
2597
2598 cpus_clear(mask);
2599 local_irq_save(flags);
2600#ifdef CONFIG_FTRACE
2601 ftrace_save = ftrace_enabled;
2602 ftrace_enabled = 0;
2603#endif
2604 smp_wmb();
2605 for_each_tracing_cpu(cpu) {
2606 data = iter->tr->data[cpu];
2607
2608 if (!head_page(data) || !data->trace_idx)
2609 continue;
2610
2611 atomic_inc(&data->disabled);
2612 cpu_set(cpu, mask);
2613 }
2614
2615 for_each_cpu_mask(cpu, mask) {
2616 data = iter->tr->data[cpu];
2617 __raw_spin_lock(&data->lock);
2618
2619 if (data->overrun > iter->last_overrun[cpu])
2620 iter->overrun[cpu] +=
2621 data->overrun - iter->last_overrun[cpu];
2622 iter->last_overrun[cpu] = data->overrun;
2623 }
2624
2625 while (find_next_entry_inc(iter) != NULL) { 3254 while (find_next_entry_inc(iter) != NULL) {
2626 int ret; 3255 enum print_line_t ret;
2627 int len = iter->seq.len; 3256 int len = iter->seq.len;
2628 3257
2629 ret = print_trace_line(iter); 3258 ret = print_trace_line(iter);
2630 if (!ret) { 3259 if (ret == TRACE_TYPE_PARTIAL_LINE) {
2631 /* don't print partial lines */ 3260 /* don't print partial lines */
2632 iter->seq.len = len; 3261 iter->seq.len = len;
2633 break; 3262 break;
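
The read loop above remembers the sequence length before rendering each entry and, when the renderer reports a partial line, rolls the length back so only whole lines reach user space. A self-contained sketch of that truncate-on-partial loop over a small fixed buffer:

#include <stdio.h>
#include <string.h>

enum print_line_t { TYPE_PARTIAL_LINE, TYPE_HANDLED };

struct seq {
	char buf[32];
	size_t len;
};

/* Append one line; report PARTIAL if it does not fit completely. */
static enum print_line_t print_one(struct seq *s, const char *line)
{
	size_t n = strlen(line);

	if (s->len + n >= sizeof(s->buf)) {
		size_t room = sizeof(s->buf) - 1 - s->len;

		memcpy(s->buf + s->len, line, room);	/* caller will discard this */
		s->len += room;
		return TYPE_PARTIAL_LINE;
	}
	memcpy(s->buf + s->len, line, n);
	s->len += n;
	return TYPE_HANDLED;
}

int main(void)
{
	static const char *lines[] = { "one\n", "two\n", "a rather long third line\n" };
	struct seq s = { .len = 0 };
	size_t i;

	for (i = 0; i < sizeof(lines) / sizeof(lines[0]); i++) {
		size_t len = s.len;			/* remember before rendering */

		if (print_one(&s, lines[i]) == TYPE_PARTIAL_LINE) {
			s.len = len;			/* don't print partial lines */
			break;
		}
	}
	s.buf[s.len] = '\0';
	fputs(s.buf, stdout);				/* prints "one" and "two" only */
	return 0;
}
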
@@ -2639,26 +3268,17 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2639 break; 3268 break;
2640 } 3269 }
2641 3270
2642 for_each_cpu_mask(cpu, mask) {
2643 data = iter->tr->data[cpu];
2644 __raw_spin_unlock(&data->lock);
2645 }
2646
2647 for_each_cpu_mask(cpu, mask) {
2648 data = iter->tr->data[cpu];
2649 atomic_dec(&data->disabled);
2650 }
2651#ifdef CONFIG_FTRACE
2652 ftrace_enabled = ftrace_save;
2653#endif
2654 local_irq_restore(flags);
2655
2656 /* Now copy what we have to the user */ 3271 /* Now copy what we have to the user */
2657 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 3272 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2658 if (iter->seq.readpos >= iter->seq.len) 3273 if (iter->seq.readpos >= iter->seq.len)
2659 trace_seq_reset(&iter->seq); 3274 trace_seq_reset(&iter->seq);
3275
3276 /*
3277 * If there was nothing to send to user, in spite of consuming trace
3278 * entries, go back to wait for more entries.
3279 */
2660 if (sret == -EBUSY) 3280 if (sret == -EBUSY)
2661 sret = 0; 3281 goto waitagain;
2662 3282
2663out: 3283out:
2664 mutex_unlock(&trace_types_lock); 3284 mutex_unlock(&trace_types_lock);
@@ -2674,7 +3294,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
2674 char buf[64]; 3294 char buf[64];
2675 int r; 3295 int r;
2676 3296
2677 r = sprintf(buf, "%lu\n", tr->entries); 3297 r = sprintf(buf, "%lu\n", tr->entries >> 10);
2678 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3298 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2679} 3299}
2680 3300
@@ -2684,7 +3304,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2684{ 3304{
2685 unsigned long val; 3305 unsigned long val;
2686 char buf[64]; 3306 char buf[64];
2687 int i, ret; 3307 int ret, cpu;
2688 3308
2689 if (cnt >= sizeof(buf)) 3309 if (cnt >= sizeof(buf))
2690 return -EINVAL; 3310 return -EINVAL;
@@ -2704,71 +3324,109 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2704 3324
2705 mutex_lock(&trace_types_lock); 3325 mutex_lock(&trace_types_lock);
2706 3326
2707 if (current_trace != &no_tracer) { 3327 tracing_stop();
2708 cnt = -EBUSY;
2709 pr_info("ftrace: set current_tracer to none"
2710 " before modifying buffer size\n");
2711 goto out;
2712 }
2713
2714 if (val > global_trace.entries) {
2715 long pages_requested;
2716 unsigned long freeable_pages;
2717
2718 /* make sure we have enough memory before mapping */
2719 pages_requested =
2720 (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
2721
2722 /* account for each buffer (and max_tr) */
2723 pages_requested *= tracing_nr_buffers * 2;
2724 3328
2725 /* Check for overflow */ 3329 /* disable all cpu buffers */
2726 if (pages_requested < 0) { 3330 for_each_tracing_cpu(cpu) {
2727 cnt = -ENOMEM; 3331 if (global_trace.data[cpu])
2728 goto out; 3332 atomic_inc(&global_trace.data[cpu]->disabled);
2729 } 3333 if (max_tr.data[cpu])
3334 atomic_inc(&max_tr.data[cpu]->disabled);
3335 }
2730 3336
2731 freeable_pages = determine_dirtyable_memory(); 3337 /* value is in KB */
3338 val <<= 10;
2732 3339
2733 /* we only allow to request 1/4 of useable memory */ 3340 if (val != global_trace.entries) {
2734 if (pages_requested > 3341 ret = ring_buffer_resize(global_trace.buffer, val);
2735 ((freeable_pages + tracing_pages_allocated) / 4)) { 3342 if (ret < 0) {
2736 cnt = -ENOMEM; 3343 cnt = ret;
2737 goto out; 3344 goto out;
2738 } 3345 }
2739 3346
2740 while (global_trace.entries < val) { 3347 ret = ring_buffer_resize(max_tr.buffer, val);
2741 if (trace_alloc_page()) { 3348 if (ret < 0) {
2742 cnt = -ENOMEM; 3349 int r;
2743 goto out; 3350 cnt = ret;
3351 r = ring_buffer_resize(global_trace.buffer,
3352 global_trace.entries);
3353 if (r < 0) {
3354 /* AARGH! We are left with different
3355 * size max buffer!!!! */
3356 WARN_ON(1);
3357 tracing_disabled = 1;
2744 } 3358 }
2745 /* double check that we don't go over the known pages */ 3359 goto out;
2746 if (tracing_pages_allocated > pages_requested)
2747 break;
2748 } 3360 }
2749 3361
2750 } else { 3362 global_trace.entries = val;
2751 /* include the number of entries in val (inc of page entries) */
2752 while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
2753 trace_free_page();
2754 } 3363 }
2755 3364
2756 /* check integrity */
2757 for_each_tracing_cpu(i)
2758 check_pages(global_trace.data[i]);
2759
2760 filp->f_pos += cnt; 3365 filp->f_pos += cnt;
2761 3366
2762 /* If check pages failed, return ENOMEM */ 3367 /* If check pages failed, return ENOMEM */
2763 if (tracing_disabled) 3368 if (tracing_disabled)
2764 cnt = -ENOMEM; 3369 cnt = -ENOMEM;
2765 out: 3370 out:
3371 for_each_tracing_cpu(cpu) {
3372 if (global_trace.data[cpu])
3373 atomic_dec(&global_trace.data[cpu]->disabled);
3374 if (max_tr.data[cpu])
3375 atomic_dec(&max_tr.data[cpu]->disabled);
3376 }
3377
3378 tracing_start();
2766 max_tr.entries = global_trace.entries; 3379 max_tr.entries = global_trace.entries;
2767 mutex_unlock(&trace_types_lock); 3380 mutex_unlock(&trace_types_lock);
2768 3381
2769 return cnt; 3382 return cnt;
2770} 3383}
2771 3384
3385static int mark_printk(const char *fmt, ...)
3386{
3387 int ret;
3388 va_list args;
3389 va_start(args, fmt);
3390 ret = trace_vprintk(0, -1, fmt, args);
3391 va_end(args);
3392 return ret;
3393}
3394
3395static ssize_t
3396tracing_mark_write(struct file *filp, const char __user *ubuf,
3397 size_t cnt, loff_t *fpos)
3398{
3399 char *buf;
3400 char *end;
3401
3402 if (tracing_disabled)
3403 return -EINVAL;
3404
3405 if (cnt > TRACE_BUF_SIZE)
3406 cnt = TRACE_BUF_SIZE;
3407
3408 buf = kmalloc(cnt + 1, GFP_KERNEL);
3409 if (buf == NULL)
3410 return -ENOMEM;
3411
3412 if (copy_from_user(buf, ubuf, cnt)) {
3413 kfree(buf);
3414 return -EFAULT;
3415 }
3416
3417 /* Cut from the first nil or newline. */
3418 buf[cnt] = '\0';
3419 end = strchr(buf, '\n');
3420 if (end)
3421 *end = '\0';
3422
3423 cnt = mark_printk("%s\n", buf);
3424 kfree(buf);
3425 *fpos += cnt;
3426
3427 return cnt;
3428}
3429
2772static struct file_operations tracing_max_lat_fops = { 3430static struct file_operations tracing_max_lat_fops = {
2773 .open = tracing_open_generic, 3431 .open = tracing_open_generic,
2774 .read = tracing_max_lat_read, 3432 .read = tracing_max_lat_read,
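
The buffer_size_kb write path above takes the value in kilobytes (shifting it up by 10), resizes both the live buffer and the max-latency snapshot buffer, and on a failure of the second resize tries to shrink the first one back so the two never disagree. A simplified sketch of that sequence; resize_buffer() is a stand-in for ring_buffer_resize() and always succeeds here:

#include <stdio.h>

static unsigned long trace_entries = 16384;	/* live buffer size, bytes */
static unsigned long max_entries   = 16384;	/* snapshot buffer size, bytes */

static int resize_buffer(unsigned long *entries, unsigned long val)
{
	*entries = val;		/* pretend the allocation always succeeds */
	return 0;
}

static int set_buffer_size_kb(unsigned long kb)
{
	unsigned long val = kb << 10;		/* value is in KB */
	unsigned long old = trace_entries;
	int ret;

	if (val == trace_entries)
		return 0;

	ret = resize_buffer(&trace_entries, val);
	if (ret < 0)
		return ret;

	ret = resize_buffer(&max_entries, val);
	if (ret < 0) {
		/* try to shrink the first buffer back to its old size */
		if (resize_buffer(&trace_entries, old) < 0)
			return -2;	/* sizes now disagree: tracing would be disabled */
		return ret;
	}
	return 0;
}

int main(void)
{
	set_buffer_size_kb(1024);
	printf("%lu bytes (%lu KB)\n", trace_entries, trace_entries >> 10);
	return 0;
}
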
@@ -2800,24 +3458,45 @@ static struct file_operations tracing_entries_fops = {
2800 .write = tracing_entries_write, 3458 .write = tracing_entries_write,
2801}; 3459};
2802 3460
3461static struct file_operations tracing_mark_fops = {
3462 .open = tracing_open_generic,
3463 .write = tracing_mark_write,
3464};
3465
2803#ifdef CONFIG_DYNAMIC_FTRACE 3466#ifdef CONFIG_DYNAMIC_FTRACE
2804 3467
3468int __weak ftrace_arch_read_dyn_info(char *buf, int size)
3469{
3470 return 0;
3471}
3472
2805static ssize_t 3473static ssize_t
2806tracing_read_long(struct file *filp, char __user *ubuf, 3474tracing_read_dyn_info(struct file *filp, char __user *ubuf,
2807 size_t cnt, loff_t *ppos) 3475 size_t cnt, loff_t *ppos)
2808{ 3476{
3477 static char ftrace_dyn_info_buffer[1024];
3478 static DEFINE_MUTEX(dyn_info_mutex);
2809 unsigned long *p = filp->private_data; 3479 unsigned long *p = filp->private_data;
2810 char buf[64]; 3480 char *buf = ftrace_dyn_info_buffer;
3481 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
2811 int r; 3482 int r;
2812 3483
2813 r = sprintf(buf, "%ld\n", *p); 3484 mutex_lock(&dyn_info_mutex);
3485 r = sprintf(buf, "%ld ", *p);
2814 3486
2815 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3487 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
3488 buf[r++] = '\n';
3489
3490 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3491
3492 mutex_unlock(&dyn_info_mutex);
3493
3494 return r;
2816} 3495}
2817 3496
2818static struct file_operations tracing_read_long_fops = { 3497static struct file_operations tracing_dyn_info_fops = {
2819 .open = tracing_open_generic, 3498 .open = tracing_open_generic,
2820 .read = tracing_read_long, 3499 .read = tracing_read_dyn_info,
2821}; 3500};
2822#endif 3501#endif
2823 3502
@@ -2846,7 +3525,7 @@ struct dentry *tracing_init_dentry(void)
2846#include "trace_selftest.c" 3525#include "trace_selftest.c"
2847#endif 3526#endif
2848 3527
2849static __init void tracer_init_debugfs(void) 3528static __init int tracer_init_debugfs(void)
2850{ 3529{
2851 struct dentry *d_tracer; 3530 struct dentry *d_tracer;
2852 struct dentry *entry; 3531 struct dentry *entry;
@@ -2858,10 +3537,10 @@ static __init void tracer_init_debugfs(void)
2858 if (!entry) 3537 if (!entry)
2859 pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); 3538 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2860 3539
2861 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, 3540 entry = debugfs_create_file("trace_options", 0644, d_tracer,
2862 NULL, &tracing_iter_fops); 3541 NULL, &tracing_iter_fops);
2863 if (!entry) 3542 if (!entry)
2864 pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); 3543 pr_warning("Could not create debugfs 'trace_options' entry\n");
2865 3544
2866 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3545 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2867 NULL, &tracing_cpumask_fops); 3546 NULL, &tracing_cpumask_fops);
@@ -2881,12 +3560,12 @@ static __init void tracer_init_debugfs(void)
2881 entry = debugfs_create_file("available_tracers", 0444, d_tracer, 3560 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
2882 &global_trace, &show_traces_fops); 3561 &global_trace, &show_traces_fops);
2883 if (!entry) 3562 if (!entry)
2884 pr_warning("Could not create debugfs 'trace' entry\n"); 3563 pr_warning("Could not create debugfs 'available_tracers' entry\n");
2885 3564
2886 entry = debugfs_create_file("current_tracer", 0444, d_tracer, 3565 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
2887 &global_trace, &set_tracer_fops); 3566 &global_trace, &set_tracer_fops);
2888 if (!entry) 3567 if (!entry)
2889 pr_warning("Could not create debugfs 'trace' entry\n"); 3568 pr_warning("Could not create debugfs 'current_tracer' entry\n");
2890 3569
2891 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, 3570 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
2892 &tracing_max_latency, 3571 &tracing_max_latency,
@@ -2899,7 +3578,7 @@ static __init void tracer_init_debugfs(void)
2899 &tracing_thresh, &tracing_max_lat_fops); 3578 &tracing_thresh, &tracing_max_lat_fops);
2900 if (!entry) 3579 if (!entry)
2901 pr_warning("Could not create debugfs " 3580 pr_warning("Could not create debugfs "
2902 "'tracing_threash' entry\n"); 3581 "'tracing_thresh' entry\n");
2903 entry = debugfs_create_file("README", 0644, d_tracer, 3582 entry = debugfs_create_file("README", 0644, d_tracer,
2904 NULL, &tracing_readme_fops); 3583 NULL, &tracing_readme_fops);
2905 if (!entry) 3584 if (!entry)
@@ -2909,18 +3588,24 @@ static __init void tracer_init_debugfs(void)
2909 NULL, &tracing_pipe_fops); 3588 NULL, &tracing_pipe_fops);
2910 if (!entry) 3589 if (!entry)
2911 pr_warning("Could not create debugfs " 3590 pr_warning("Could not create debugfs "
2912 "'tracing_threash' entry\n"); 3591 "'trace_pipe' entry\n");
2913 3592
2914 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 3593 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
2915 &global_trace, &tracing_entries_fops); 3594 &global_trace, &tracing_entries_fops);
2916 if (!entry) 3595 if (!entry)
2917 pr_warning("Could not create debugfs " 3596 pr_warning("Could not create debugfs "
2918 "'tracing_threash' entry\n"); 3597 "'buffer_size_kb' entry\n");
3598
3599 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
3600 NULL, &tracing_mark_fops);
3601 if (!entry)
3602 pr_warning("Could not create debugfs "
3603 "'trace_marker' entry\n");
2919 3604
2920#ifdef CONFIG_DYNAMIC_FTRACE 3605#ifdef CONFIG_DYNAMIC_FTRACE
2921 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 3606 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2922 &ftrace_update_tot_cnt, 3607 &ftrace_update_tot_cnt,
2923 &tracing_read_long_fops); 3608 &tracing_dyn_info_fops);
2924 if (!entry) 3609 if (!entry)
2925 pr_warning("Could not create debugfs " 3610 pr_warning("Could not create debugfs "
2926 "'dyn_ftrace_total_info' entry\n"); 3611 "'dyn_ftrace_total_info' entry\n");
@@ -2928,230 +3613,268 @@ static __init void tracer_init_debugfs(void)
2928#ifdef CONFIG_SYSPROF_TRACER 3613#ifdef CONFIG_SYSPROF_TRACER
2929 init_tracer_sysprof_debugfs(d_tracer); 3614 init_tracer_sysprof_debugfs(d_tracer);
2930#endif 3615#endif
3616 return 0;
2931} 3617}
2932 3618
2933static int trace_alloc_page(void) 3619int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
2934{ 3620{
3621 static DEFINE_SPINLOCK(trace_buf_lock);
3622 static char trace_buf[TRACE_BUF_SIZE];
3623
3624 struct ring_buffer_event *event;
3625 struct trace_array *tr = &global_trace;
2935 struct trace_array_cpu *data; 3626 struct trace_array_cpu *data;
2936 struct page *page, *tmp; 3627 int cpu, len = 0, size, pc;
2937 LIST_HEAD(pages); 3628 struct print_entry *entry;
2938 void *array; 3629 unsigned long irq_flags;
2939 unsigned pages_allocated = 0;
2940 int i;
2941 3630
2942 /* first allocate a page for each CPU */ 3631 if (tracing_disabled || tracing_selftest_running)
2943 for_each_tracing_cpu(i) { 3632 return 0;
2944 array = (void *)__get_free_page(GFP_KERNEL);
2945 if (array == NULL) {
2946 printk(KERN_ERR "tracer: failed to allocate page"
2947 "for trace buffer!\n");
2948 goto free_pages;
2949 }
2950 3633
2951 pages_allocated++; 3634 pc = preempt_count();
2952 page = virt_to_page(array); 3635 preempt_disable_notrace();
2953 list_add(&page->lru, &pages); 3636 cpu = raw_smp_processor_id();
3637 data = tr->data[cpu];
2954 3638
2955/* Only allocate if we are actually using the max trace */ 3639 if (unlikely(atomic_read(&data->disabled)))
2956#ifdef CONFIG_TRACER_MAX_TRACE 3640 goto out;
2957 array = (void *)__get_free_page(GFP_KERNEL);
2958 if (array == NULL) {
2959 printk(KERN_ERR "tracer: failed to allocate page"
2960 "for trace buffer!\n");
2961 goto free_pages;
2962 }
2963 pages_allocated++;
2964 page = virt_to_page(array);
2965 list_add(&page->lru, &pages);
2966#endif
2967 }
2968 3641
2969 /* Now that we successfully allocate a page per CPU, add them */ 3642 pause_graph_tracing();
2970 for_each_tracing_cpu(i) { 3643 spin_lock_irqsave(&trace_buf_lock, irq_flags);
2971 data = global_trace.data[i]; 3644 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
2972 page = list_entry(pages.next, struct page, lru); 3645
2973 list_del_init(&page->lru); 3646 len = min(len, TRACE_BUF_SIZE-1);
2974 list_add_tail(&page->lru, &data->trace_pages); 3647 trace_buf[len] = 0;
2975 ClearPageLRU(page); 3648
3649 size = sizeof(*entry) + len + 1;
3650 event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
3651 if (!event)
3652 goto out_unlock;
3653 entry = ring_buffer_event_data(event);
3654 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3655 entry->ent.type = TRACE_PRINT;
3656 entry->ip = ip;
3657 entry->depth = depth;
3658
3659 memcpy(&entry->buf, trace_buf, len);
3660 entry->buf[len] = 0;
3661 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3662
3663 out_unlock:
3664 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3665 unpause_graph_tracing();
3666 out:
3667 preempt_enable_notrace();
2976 3668
2977#ifdef CONFIG_TRACER_MAX_TRACE 3669 return len;
2978 data = max_tr.data[i]; 3670}
2979 page = list_entry(pages.next, struct page, lru); 3671EXPORT_SYMBOL_GPL(trace_vprintk);
2980 list_del_init(&page->lru);
2981 list_add_tail(&page->lru, &data->trace_pages);
2982 SetPageLRU(page);
2983#endif
2984 }
2985 tracing_pages_allocated += pages_allocated;
2986 global_trace.entries += ENTRIES_PER_PAGE;
2987 3672
2988 return 0; 3673int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3674{
3675 int ret;
3676 va_list ap;
2989 3677
2990 free_pages: 3678 if (!(trace_flags & TRACE_ITER_PRINTK))
2991 list_for_each_entry_safe(page, tmp, &pages, lru) { 3679 return 0;
2992 list_del_init(&page->lru); 3680
2993 __free_page(page); 3681 va_start(ap, fmt);
3682 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3683 va_end(ap);
3684 return ret;
3685}
3686EXPORT_SYMBOL_GPL(__ftrace_printk);
3687
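For illustration, __ftrace_printk() is normally reached through the ftrace_printk() wrapper in <linux/ftrace.h>; a minimal, hedged sketch of a caller (the function below is illustrative and not part of this diff):

/* Hedged sketch: emit a formatted TRACE_PRINT entry into the ring buffer.
 * Assumes the ftrace_printk() wrapper from <linux/ftrace.h>. */
#include <linux/ftrace.h>
#include <linux/smp.h>

static void example_trace_event(int irq)
{
	/* Only recorded while the "printk" bit in trace_options is set. */
	ftrace_printk("handled irq %d on cpu %d\n", irq, smp_processor_id());
}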
3688static int trace_panic_handler(struct notifier_block *this,
3689 unsigned long event, void *unused)
3690{
3691 if (ftrace_dump_on_oops)
3692 ftrace_dump();
3693 return NOTIFY_OK;
3694}
3695
3696static struct notifier_block trace_panic_notifier = {
3697 .notifier_call = trace_panic_handler,
3698 .next = NULL,
3699 .priority = 150 /* priority: INT_MAX >= x >= 0 */
3700};
3701
3702static int trace_die_handler(struct notifier_block *self,
3703 unsigned long val,
3704 void *data)
3705{
3706 switch (val) {
3707 case DIE_OOPS:
3708 if (ftrace_dump_on_oops)
3709 ftrace_dump();
3710 break;
3711 default:
3712 break;
2994 } 3713 }
2995 return -ENOMEM; 3714 return NOTIFY_OK;
3715}
3716
3717static struct notifier_block trace_die_notifier = {
3718 .notifier_call = trace_die_handler,
3719 .priority = 200
3720};
3721
3722/*
3723 * printk is set to max of 1024, we really don't need it that big.
3724 * Nothing should be printing 1000 characters anyway.
3725 */
3726#define TRACE_MAX_PRINT 1000
3727
3728/*
 3729 * Define KERN_TRACE here so that we have one place to modify
3730 * it if we decide to change what log level the ftrace dump
3731 * should be at.
3732 */
3733#define KERN_TRACE KERN_INFO
3734
3735static void
3736trace_printk_seq(struct trace_seq *s)
3737{
3738 /* Probably should print a warning here. */
3739 if (s->len >= 1000)
3740 s->len = 1000;
3741
 3742	/* should be zero-terminated, but we are paranoid. */
3743 s->buffer[s->len] = 0;
3744
3745 printk(KERN_TRACE "%s", s->buffer);
3746
3747 trace_seq_reset(s);
2996} 3748}
2997 3749
2998static int trace_free_page(void) 3750void ftrace_dump(void)
2999{ 3751{
3000 struct trace_array_cpu *data; 3752 static DEFINE_SPINLOCK(ftrace_dump_lock);
3001 struct page *page; 3753 /* use static because iter can be a bit big for the stack */
3002 struct list_head *p; 3754 static struct trace_iterator iter;
3003 int i; 3755 static cpumask_t mask;
3004 int ret = 0; 3756 static int dump_ran;
3757 unsigned long flags;
3758 int cnt = 0, cpu;
3005 3759
3006 /* free one page from each buffer */ 3760 /* only one dump */
3007 for_each_tracing_cpu(i) { 3761 spin_lock_irqsave(&ftrace_dump_lock, flags);
3008 data = global_trace.data[i]; 3762 if (dump_ran)
3009 p = data->trace_pages.next; 3763 goto out;
3010 if (p == &data->trace_pages) {
3011 /* should never happen */
3012 WARN_ON(1);
3013 tracing_disabled = 1;
3014 ret = -1;
3015 break;
3016 }
3017 page = list_entry(p, struct page, lru);
3018 ClearPageLRU(page);
3019 list_del(&page->lru);
3020 tracing_pages_allocated--;
3021 tracing_pages_allocated--;
3022 __free_page(page);
3023 3764
3024 tracing_reset(data); 3765 dump_ran = 1;
3025 3766
3026#ifdef CONFIG_TRACER_MAX_TRACE 3767 /* No turning back! */
3027 data = max_tr.data[i]; 3768 ftrace_kill();
3028 p = data->trace_pages.next; 3769
3029 if (p == &data->trace_pages) { 3770 for_each_tracing_cpu(cpu) {
3030 /* should never happen */ 3771 atomic_inc(&global_trace.data[cpu]->disabled);
3031 WARN_ON(1); 3772 }
3032 tracing_disabled = 1; 3773
3033 ret = -1; 3774 /* don't look at user memory in panic mode */
3034 break; 3775 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3776
3777 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3778
3779 iter.tr = &global_trace;
3780 iter.trace = current_trace;
3781
3782 /*
 3783	 * We need to stop all tracing on all CPUs to read
 3784	 * the next buffer. This is a bit expensive, but is
 3785	 * not done often. We fill in all that we can read,
3786 * and then release the locks again.
3787 */
3788
3789 cpus_clear(mask);
3790
3791 while (!trace_empty(&iter)) {
3792
3793 if (!cnt)
3794 printk(KERN_TRACE "---------------------------------\n");
3795
3796 cnt++;
3797
3798 /* reset all but tr, trace, and overruns */
3799 memset(&iter.seq, 0,
3800 sizeof(struct trace_iterator) -
3801 offsetof(struct trace_iterator, seq));
3802 iter.iter_flags |= TRACE_FILE_LAT_FMT;
3803 iter.pos = -1;
3804
3805 if (find_next_entry_inc(&iter) != NULL) {
3806 print_trace_line(&iter);
3807 trace_consume(&iter);
3035 } 3808 }
3036 page = list_entry(p, struct page, lru);
3037 ClearPageLRU(page);
3038 list_del(&page->lru);
3039 __free_page(page);
3040 3809
3041 tracing_reset(data); 3810 trace_printk_seq(&iter.seq);
3042#endif
3043 } 3811 }
3044 global_trace.entries -= ENTRIES_PER_PAGE;
3045 3812
3046 return ret; 3813 if (!cnt)
3814 printk(KERN_TRACE " (ftrace buffer empty)\n");
3815 else
3816 printk(KERN_TRACE "---------------------------------\n");
3817
3818 out:
3819 spin_unlock_irqrestore(&ftrace_dump_lock, flags);
3047} 3820}
3048 3821
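Besides being wired to the panic and die notifiers defined above, ftrace_dump() can be called directly from ad-hoc debugging code; a minimal, hedged sketch (helper name and condition are illustrative):

/* Hedged sketch: dump the whole trace buffer when an invariant breaks.
 * ftrace_dump() is one-shot - it disables tracing and prints via printk.
 * Assumes the declaration in <linux/ftrace.h>. */
#include <linux/ftrace.h>
#include <linux/kernel.h>

static void example_check_or_dump(int state_ok)
{
	if (WARN_ON(!state_ok))
		ftrace_dump();
}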
3049__init static int tracer_alloc_buffers(void) 3822__init static int tracer_alloc_buffers(void)
3050{ 3823{
3051 struct trace_array_cpu *data; 3824 struct trace_array_cpu *data;
3052 void *array;
3053 struct page *page;
3054 int pages = 0;
3055 int ret = -ENOMEM;
3056 int i; 3825 int i;
3057 3826
3058 /* TODO: make the number of buffers hot pluggable with CPUS */ 3827 /* TODO: make the number of buffers hot pluggable with CPUS */
3059 tracing_nr_buffers = num_possible_cpus();
3060 tracing_buffer_mask = cpu_possible_map; 3828 tracing_buffer_mask = cpu_possible_map;
3061 3829
3062 /* Allocate the first page for all buffers */ 3830 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3063 for_each_tracing_cpu(i) { 3831 TRACE_BUFFER_FLAGS);
3064 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 3832 if (!global_trace.buffer) {
3065 max_tr.data[i] = &per_cpu(max_data, i); 3833 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3066 3834 WARN_ON(1);
3067 array = (void *)__get_free_page(GFP_KERNEL); 3835 return 0;
3068 if (array == NULL) { 3836 }
3069 printk(KERN_ERR "tracer: failed to allocate page" 3837 global_trace.entries = ring_buffer_size(global_trace.buffer);
3070 "for trace buffer!\n");
3071 goto free_buffers;
3072 }
3073
3074 /* set the array to the list */
3075 INIT_LIST_HEAD(&data->trace_pages);
3076 page = virt_to_page(array);
3077 list_add(&page->lru, &data->trace_pages);
3078 /* use the LRU flag to differentiate the two buffers */
3079 ClearPageLRU(page);
3080
3081 data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3082 max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3083 3838
3084/* Only allocate if we are actually using the max trace */
3085#ifdef CONFIG_TRACER_MAX_TRACE 3839#ifdef CONFIG_TRACER_MAX_TRACE
3086 array = (void *)__get_free_page(GFP_KERNEL); 3840 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3087 if (array == NULL) { 3841 TRACE_BUFFER_FLAGS);
3088 printk(KERN_ERR "tracer: failed to allocate page" 3842 if (!max_tr.buffer) {
3089 "for trace buffer!\n"); 3843 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3090 goto free_buffers; 3844 WARN_ON(1);
3091 } 3845 ring_buffer_free(global_trace.buffer);
3092 3846 return 0;
3093 INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
3094 page = virt_to_page(array);
3095 list_add(&page->lru, &max_tr.data[i]->trace_pages);
3096 SetPageLRU(page);
3097#endif
3098 } 3847 }
3848 max_tr.entries = ring_buffer_size(max_tr.buffer);
3849 WARN_ON(max_tr.entries != global_trace.entries);
3850#endif
3099 3851
3100 /* 3852 /* Allocate the first page for all buffers */
3101 * Since we allocate by orders of pages, we may be able to 3853 for_each_tracing_cpu(i) {
3102 * round up a bit. 3854 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
3103 */ 3855 max_tr.data[i] = &per_cpu(max_data, i);
3104 global_trace.entries = ENTRIES_PER_PAGE;
3105 pages++;
3106
3107 while (global_trace.entries < trace_nr_entries) {
3108 if (trace_alloc_page())
3109 break;
3110 pages++;
3111 } 3856 }
3112 max_tr.entries = global_trace.entries;
3113
3114 pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
3115 pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
3116 pr_info(" actual entries %ld\n", global_trace.entries);
3117
3118 tracer_init_debugfs();
3119 3857
3120 trace_init_cmdlines(); 3858 trace_init_cmdlines();
3121 3859
3122 register_tracer(&no_tracer); 3860 register_tracer(&nop_trace);
3123 current_trace = &no_tracer; 3861#ifdef CONFIG_BOOT_TRACER
3862 register_tracer(&boot_tracer);
3863 current_trace = &boot_tracer;
3864 current_trace->init(&global_trace);
3865#else
3866 current_trace = &nop_trace;
3867#endif
3124 3868
3125 /* All seems OK, enable tracing */ 3869 /* All seems OK, enable tracing */
3126 global_trace.ctrl = tracer_enabled;
3127 tracing_disabled = 0; 3870 tracing_disabled = 0;
3128 3871
3129 return 0; 3872 atomic_notifier_chain_register(&panic_notifier_list,
3873 &trace_panic_notifier);
3130 3874
3131 free_buffers: 3875 register_die_notifier(&trace_die_notifier);
3132 for (i-- ; i >= 0; i--) {
3133 struct page *page, *tmp;
3134 struct trace_array_cpu *data = global_trace.data[i];
3135
3136 if (data) {
3137 list_for_each_entry_safe(page, tmp,
3138 &data->trace_pages, lru) {
3139 list_del_init(&page->lru);
3140 __free_page(page);
3141 }
3142 }
3143 3876
3144#ifdef CONFIG_TRACER_MAX_TRACE 3877 return 0;
3145 data = max_tr.data[i];
3146 if (data) {
3147 list_for_each_entry_safe(page, tmp,
3148 &data->trace_pages, lru) {
3149 list_del_init(&page->lru);
3150 __free_page(page);
3151 }
3152 }
3153#endif
3154 }
3155 return ret;
3156} 3878}
3157fs_initcall(tracer_alloc_buffers); 3879early_initcall(tracer_alloc_buffers);
3880fs_initcall(tracer_init_debugfs);
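The split initcall levels above follow from initcall ordering: early_initcall() runs before fs_initcall(), so the ring buffers are allocated before early users such as the boot tracer need them, while the debugfs files only have to appear once tracer_init_debugfs() runs at fs_initcall time.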
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f86788c2b..cc7a4f864036 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,7 +5,10 @@
5#include <asm/atomic.h> 5#include <asm/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h>
8#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h>
11#include <trace/boot.h>
9 12
10enum trace_type { 13enum trace_type {
11 __TRACE_FIRST_TYPE = 0, 14 __TRACE_FIRST_TYPE = 0,
@@ -13,38 +16,80 @@ enum trace_type {
13 TRACE_FN, 16 TRACE_FN,
14 TRACE_CTX, 17 TRACE_CTX,
15 TRACE_WAKE, 18 TRACE_WAKE,
19 TRACE_CONT,
16 TRACE_STACK, 20 TRACE_STACK,
21 TRACE_PRINT,
17 TRACE_SPECIAL, 22 TRACE_SPECIAL,
18 TRACE_MMIO_RW, 23 TRACE_MMIO_RW,
19 TRACE_MMIO_MAP, 24 TRACE_MMIO_MAP,
25 TRACE_BRANCH,
26 TRACE_BOOT_CALL,
27 TRACE_BOOT_RET,
28 TRACE_GRAPH_RET,
29 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK,
31 TRACE_HW_BRANCHES,
32 TRACE_POWER,
20 33
21 __TRACE_LAST_TYPE 34 __TRACE_LAST_TYPE
22}; 35};
23 36
24/* 37/*
38 * The trace entry - the most basic unit of tracing. This is what
39 * is printed in the end as a single line in the trace output, such as:
40 *
41 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
42 */
43struct trace_entry {
44 unsigned char type;
45 unsigned char cpu;
46 unsigned char flags;
47 unsigned char preempt_count;
48 int pid;
49 int tgid;
50};
51
52/*
 25 * Function trace entry - function address and parent function address: 53 * Function trace entry - function address and parent function address:
26 */ 54 */
27struct ftrace_entry { 55struct ftrace_entry {
56 struct trace_entry ent;
28 unsigned long ip; 57 unsigned long ip;
29 unsigned long parent_ip; 58 unsigned long parent_ip;
30}; 59};
31 60
61/* Function call entry */
62struct ftrace_graph_ent_entry {
63 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent;
65};
66
67/* Function return entry */
68struct ftrace_graph_ret_entry {
69 struct trace_entry ent;
70 struct ftrace_graph_ret ret;
71};
72extern struct tracer boot_tracer;
73
32/* 74/*
33 * Context switch trace entry - which task (and prio) we switched from/to: 75 * Context switch trace entry - which task (and prio) we switched from/to:
34 */ 76 */
35struct ctx_switch_entry { 77struct ctx_switch_entry {
78 struct trace_entry ent;
36 unsigned int prev_pid; 79 unsigned int prev_pid;
37 unsigned char prev_prio; 80 unsigned char prev_prio;
38 unsigned char prev_state; 81 unsigned char prev_state;
39 unsigned int next_pid; 82 unsigned int next_pid;
40 unsigned char next_prio; 83 unsigned char next_prio;
41 unsigned char next_state; 84 unsigned char next_state;
85 unsigned int next_cpu;
42}; 86};
43 87
44/* 88/*
45 * Special (free-form) trace entry: 89 * Special (free-form) trace entry:
46 */ 90 */
47struct special_entry { 91struct special_entry {
92 struct trace_entry ent;
48 unsigned long arg1; 93 unsigned long arg1;
49 unsigned long arg2; 94 unsigned long arg2;
50 unsigned long arg3; 95 unsigned long arg3;
@@ -57,33 +102,94 @@ struct special_entry {
57#define FTRACE_STACK_ENTRIES 8 102#define FTRACE_STACK_ENTRIES 8
58 103
59struct stack_entry { 104struct stack_entry {
105 struct trace_entry ent;
106 unsigned long caller[FTRACE_STACK_ENTRIES];
107};
108
109struct userstack_entry {
110 struct trace_entry ent;
60 unsigned long caller[FTRACE_STACK_ENTRIES]; 111 unsigned long caller[FTRACE_STACK_ENTRIES];
61}; 112};
62 113
63/* 114/*
64 * The trace entry - the most basic unit of tracing. This is what 115 * ftrace_printk entry:
65 * is printed in the end as a single line in the trace output, such as:
66 *
67 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
68 */ 116 */
69struct trace_entry { 117struct print_entry {
70 char type; 118 struct trace_entry ent;
71 char cpu; 119 unsigned long ip;
72 char flags; 120 int depth;
73 char preempt_count; 121 char buf[];
74 int pid; 122};
75 cycle_t t; 123
76 union { 124#define TRACE_OLD_SIZE 88
77 struct ftrace_entry fn; 125
78 struct ctx_switch_entry ctx; 126struct trace_field_cont {
79 struct special_entry special; 127 unsigned char type;
80 struct stack_entry stack; 128 /* Temporary till we get rid of this completely */
81 struct mmiotrace_rw mmiorw; 129 char buf[TRACE_OLD_SIZE - 1];
82 struct mmiotrace_map mmiomap;
83 };
84}; 130};
85 131
86#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) 132struct trace_mmiotrace_rw {
133 struct trace_entry ent;
134 struct mmiotrace_rw rw;
135};
136
137struct trace_mmiotrace_map {
138 struct trace_entry ent;
139 struct mmiotrace_map map;
140};
141
142struct trace_boot_call {
143 struct trace_entry ent;
144 struct boot_trace_call boot_call;
145};
146
147struct trace_boot_ret {
148 struct trace_entry ent;
149 struct boot_trace_ret boot_ret;
150};
151
152#define TRACE_FUNC_SIZE 30
153#define TRACE_FILE_SIZE 20
154struct trace_branch {
155 struct trace_entry ent;
156 unsigned line;
157 char func[TRACE_FUNC_SIZE+1];
158 char file[TRACE_FILE_SIZE+1];
159 char correct;
160};
161
162struct hw_branch_entry {
163 struct trace_entry ent;
164 u64 from;
165 u64 to;
166};
167
168struct trace_power {
169 struct trace_entry ent;
170 struct power_trace state_data;
171};
172
173/*
174 * trace_flag_type is an enumeration that holds different
175 * states when a trace occurs. These are:
176 * IRQS_OFF - interrupts were disabled
177 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
 178 * NEED_RESCHED - reschedule is requested
179 * HARDIRQ - inside an interrupt handler
180 * SOFTIRQ - inside a softirq handler
181 * CONT - multiple entries hold the trace item
182 */
183enum trace_flag_type {
184 TRACE_FLAG_IRQS_OFF = 0x01,
185 TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
186 TRACE_FLAG_NEED_RESCHED = 0x04,
187 TRACE_FLAG_HARDIRQ = 0x08,
188 TRACE_FLAG_SOFTIRQ = 0x10,
189 TRACE_FLAG_CONT = 0x20,
190};
191
192#define TRACE_BUF_SIZE 1024
87 193
88/* 194/*
89 * The CPU trace array - it consists of thousands of trace entries 195 * The CPU trace array - it consists of thousands of trace entries
@@ -91,16 +197,9 @@ struct trace_entry {
91 * the trace, etc.) 197 * the trace, etc.)
92 */ 198 */
93struct trace_array_cpu { 199struct trace_array_cpu {
94 struct list_head trace_pages;
95 atomic_t disabled; 200 atomic_t disabled;
96 raw_spinlock_t lock;
97 struct lock_class_key lock_key;
98 201
99 /* these fields get copied into max-trace: */ 202 /* these fields get copied into max-trace: */
100 unsigned trace_head_idx;
101 unsigned trace_tail_idx;
102 void *trace_head; /* producer */
103 void *trace_tail; /* consumer */
104 unsigned long trace_idx; 203 unsigned long trace_idx;
105 unsigned long overrun; 204 unsigned long overrun;
106 unsigned long saved_latency; 205 unsigned long saved_latency;
@@ -124,37 +223,123 @@ struct trace_iterator;
124 * They have on/off state as well: 223 * They have on/off state as well:
125 */ 224 */
126struct trace_array { 225struct trace_array {
226 struct ring_buffer *buffer;
127 unsigned long entries; 227 unsigned long entries;
128 long ctrl;
129 int cpu; 228 int cpu;
130 cycle_t time_start; 229 cycle_t time_start;
131 struct task_struct *waiter; 230 struct task_struct *waiter;
132 struct trace_array_cpu *data[NR_CPUS]; 231 struct trace_array_cpu *data[NR_CPUS];
133}; 232};
134 233
234#define FTRACE_CMP_TYPE(var, type) \
235 __builtin_types_compatible_p(typeof(var), type *)
236
237#undef IF_ASSIGN
238#define IF_ASSIGN(var, entry, etype, id) \
239 if (FTRACE_CMP_TYPE(var, etype)) { \
240 var = (typeof(var))(entry); \
241 WARN_ON(id && (entry)->type != id); \
242 break; \
243 }
244
245/* Will cause compile errors if type is not found. */
246extern void __ftrace_bad_type(void);
247
248/*
249 * The trace_assign_type is a verifier that the entry type is
250 * the same as the type being assigned. To add new types simply
251 * add a line with the following format:
252 *
253 * IF_ASSIGN(var, ent, type, id);
254 *
255 * Where "type" is the trace type that includes the trace_entry
256 * as the "ent" item. And "id" is the trace identifier that is
257 * used in the trace_type enum.
258 *
259 * If the type can have more than one id, then use zero.
260 */
261#define trace_assign_type(var, ent) \
262 do { \
263 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
269 IF_ASSIGN(var, ent, struct special_entry, 0); \
270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
271 TRACE_MMIO_RW); \
272 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
273 TRACE_MMIO_MAP); \
274 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
275 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
276 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
277 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
278 TRACE_GRAPH_ENT); \
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
283 __ftrace_bad_type(); \
284 } while (0)
285
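As a hedged usage sketch of trace_assign_type() (the handler below is illustrative; the record types are the ones defined earlier in this header, and "trace.h" is assumed to be included):

/* Hedged sketch: an output routine narrows the generic trace_entry to its
 * concrete record type before touching type-specific fields. */
#include <linux/kernel.h>
#include "trace.h"

static void example_handle_entry(struct trace_entry *ent)
{
	struct ftrace_entry *field;

	/* Picks the IF_ASSIGN() branch whose pointer type matches 'field'
	 * and warns if ent->type is not TRACE_FN. */
	trace_assign_type(field, ent);

	pr_debug("fn %pS called from %pS\n",
		 (void *)field->ip, (void *)field->parent_ip);
}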
286/* Return values for print_line callback */
287enum print_line_t {
288 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
289 TRACE_TYPE_HANDLED = 1,
290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
291};
292
293
294/*
295 * An option specific to a tracer. This is a boolean value.
296 * The bit is the bit index that sets its value on the
297 * flags value in struct tracer_flags.
298 */
299struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */
302};
303
304/*
305 * The set of specific options for a tracer. Your tracer
306 * have to set the initial value of the flags val.
307 */
308struct tracer_flags {
309 u32 val;
310 struct tracer_opt *opts;
311};
312
 313/* Makes it easier to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b
315
135/* 316/*
136 * A specific tracer, represented by methods that operate on a trace array: 317 * A specific tracer, represented by methods that operate on a trace array:
137 */ 318 */
138struct tracer { 319struct tracer {
139 const char *name; 320 const char *name;
140 void (*init)(struct trace_array *tr); 321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr);
141 void (*reset)(struct trace_array *tr); 323 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr);
142 void (*open)(struct trace_iterator *iter); 326 void (*open)(struct trace_iterator *iter);
143 void (*pipe_open)(struct trace_iterator *iter); 327 void (*pipe_open)(struct trace_iterator *iter);
144 void (*close)(struct trace_iterator *iter); 328 void (*close)(struct trace_iterator *iter);
145 void (*start)(struct trace_iterator *iter);
146 void (*stop)(struct trace_iterator *iter);
147 ssize_t (*read)(struct trace_iterator *iter, 329 ssize_t (*read)(struct trace_iterator *iter,
148 struct file *filp, char __user *ubuf, 330 struct file *filp, char __user *ubuf,
149 size_t cnt, loff_t *ppos); 331 size_t cnt, loff_t *ppos);
150 void (*ctrl_update)(struct trace_array *tr);
151#ifdef CONFIG_FTRACE_STARTUP_TEST 332#ifdef CONFIG_FTRACE_STARTUP_TEST
152 int (*selftest)(struct tracer *trace, 333 int (*selftest)(struct tracer *trace,
153 struct trace_array *tr); 334 struct trace_array *tr);
154#endif 335#endif
155 int (*print_line)(struct trace_iterator *iter); 336 void (*print_header)(struct seq_file *m);
337 enum print_line_t (*print_line)(struct trace_iterator *iter);
338 /* If you handled the flag setting, return 0 */
339 int (*set_flag)(u32 old_flags, u32 bit, int set);
156 struct tracer *next; 340 struct tracer *next;
157 int print_max; 341 int print_max;
342 struct tracer_flags *flags;
158}; 343};
159 344
160struct trace_seq { 345struct trace_seq {
@@ -171,60 +356,72 @@ struct trace_iterator {
171 struct trace_array *tr; 356 struct trace_array *tr;
172 struct tracer *trace; 357 struct tracer *trace;
173 void *private; 358 void *private;
174 long last_overrun[NR_CPUS]; 359 struct ring_buffer_iter *buffer_iter[NR_CPUS];
175 long overrun[NR_CPUS];
176 360
177 /* The below is zeroed out in pipe_read */ 361 /* The below is zeroed out in pipe_read */
178 struct trace_seq seq; 362 struct trace_seq seq;
179 struct trace_entry *ent; 363 struct trace_entry *ent;
180 int cpu; 364 int cpu;
181 365 u64 ts;
182 struct trace_entry *prev_ent;
183 int prev_cpu;
184 366
185 unsigned long iter_flags; 367 unsigned long iter_flags;
186 loff_t pos; 368 loff_t pos;
187 unsigned long next_idx[NR_CPUS];
188 struct list_head *next_page[NR_CPUS];
189 unsigned next_page_idx[NR_CPUS];
190 long idx; 369 long idx;
370
371 cpumask_t started;
191}; 372};
192 373
193void tracing_reset(struct trace_array_cpu *data); 374int tracing_is_enabled(void);
375void trace_wake_up(void);
376void tracing_reset(struct trace_array *tr, int cpu);
377void tracing_reset_online_cpus(struct trace_array *tr);
194int tracing_open_generic(struct inode *inode, struct file *filp); 378int tracing_open_generic(struct inode *inode, struct file *filp);
195struct dentry *tracing_init_dentry(void); 379struct dentry *tracing_init_dentry(void);
196void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 380void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
197 381
382struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
383 struct trace_array_cpu *data);
384void tracing_generic_entry_update(struct trace_entry *entry,
385 unsigned long flags,
386 int pc);
387
198void ftrace(struct trace_array *tr, 388void ftrace(struct trace_array *tr,
199 struct trace_array_cpu *data, 389 struct trace_array_cpu *data,
200 unsigned long ip, 390 unsigned long ip,
201 unsigned long parent_ip, 391 unsigned long parent_ip,
202 unsigned long flags); 392 unsigned long flags, int pc);
203void tracing_sched_switch_trace(struct trace_array *tr, 393void tracing_sched_switch_trace(struct trace_array *tr,
204 struct trace_array_cpu *data, 394 struct trace_array_cpu *data,
205 struct task_struct *prev, 395 struct task_struct *prev,
206 struct task_struct *next, 396 struct task_struct *next,
207 unsigned long flags); 397 unsigned long flags, int pc);
208void tracing_record_cmdline(struct task_struct *tsk); 398void tracing_record_cmdline(struct task_struct *tsk);
209 399
210void tracing_sched_wakeup_trace(struct trace_array *tr, 400void tracing_sched_wakeup_trace(struct trace_array *tr,
211 struct trace_array_cpu *data, 401 struct trace_array_cpu *data,
212 struct task_struct *wakee, 402 struct task_struct *wakee,
213 struct task_struct *cur, 403 struct task_struct *cur,
214 unsigned long flags); 404 unsigned long flags, int pc);
215void trace_special(struct trace_array *tr, 405void trace_special(struct trace_array *tr,
216 struct trace_array_cpu *data, 406 struct trace_array_cpu *data,
217 unsigned long arg1, 407 unsigned long arg1,
218 unsigned long arg2, 408 unsigned long arg2,
219 unsigned long arg3); 409 unsigned long arg3, int pc);
220void trace_function(struct trace_array *tr, 410void trace_function(struct trace_array *tr,
221 struct trace_array_cpu *data, 411 struct trace_array_cpu *data,
222 unsigned long ip, 412 unsigned long ip,
223 unsigned long parent_ip, 413 unsigned long parent_ip,
224 unsigned long flags); 414 unsigned long flags, int pc);
415
416void trace_graph_return(struct ftrace_graph_ret *trace);
417int trace_graph_entry(struct ftrace_graph_ent *trace);
418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
225 419
226void tracing_start_cmdline_record(void); 420void tracing_start_cmdline_record(void);
227void tracing_stop_cmdline_record(void); 421void tracing_stop_cmdline_record(void);
422void tracing_sched_switch_assign_trace(struct trace_array *tr);
423void tracing_stop_sched_switch_record(void);
424void tracing_start_sched_switch_record(void);
228int register_tracer(struct tracer *type); 425int register_tracer(struct tracer *type);
229void unregister_tracer(struct tracer *type); 426void unregister_tracer(struct tracer *type);
230 427
@@ -239,7 +436,7 @@ void update_max_tr_single(struct trace_array *tr,
239 436
240extern cycle_t ftrace_now(int cpu); 437extern cycle_t ftrace_now(int cpu);
241 438
242#ifdef CONFIG_FTRACE 439#ifdef CONFIG_FUNCTION_TRACER
243void tracing_start_function_trace(void); 440void tracing_start_function_trace(void);
244void tracing_stop_function_trace(void); 441void tracing_stop_function_trace(void);
245#else 442#else
@@ -260,6 +457,7 @@ struct tracer_switch_ops {
260 struct tracer_switch_ops *next; 457 struct tracer_switch_ops *next;
261}; 458};
262 459
460char *trace_find_cmdline(int pid);
263#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 461#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
264 462
265#ifdef CONFIG_DYNAMIC_FTRACE 463#ifdef CONFIG_DYNAMIC_FTRACE
@@ -268,54 +466,96 @@ extern unsigned long ftrace_update_tot_cnt;
268extern int DYN_FTRACE_TEST_NAME(void); 466extern int DYN_FTRACE_TEST_NAME(void);
269#endif 467#endif
270 468
271#ifdef CONFIG_MMIOTRACE
272extern void __trace_mmiotrace_rw(struct trace_array *tr,
273 struct trace_array_cpu *data,
274 struct mmiotrace_rw *rw);
275extern void __trace_mmiotrace_map(struct trace_array *tr,
276 struct trace_array_cpu *data,
277 struct mmiotrace_map *map);
278#endif
279
280#ifdef CONFIG_FTRACE_STARTUP_TEST 469#ifdef CONFIG_FTRACE_STARTUP_TEST
281#ifdef CONFIG_FTRACE
282extern int trace_selftest_startup_function(struct tracer *trace, 470extern int trace_selftest_startup_function(struct tracer *trace,
283 struct trace_array *tr); 471 struct trace_array *tr);
284#endif
285#ifdef CONFIG_IRQSOFF_TRACER
286extern int trace_selftest_startup_irqsoff(struct tracer *trace, 472extern int trace_selftest_startup_irqsoff(struct tracer *trace,
287 struct trace_array *tr); 473 struct trace_array *tr);
288#endif
289#ifdef CONFIG_PREEMPT_TRACER
290extern int trace_selftest_startup_preemptoff(struct tracer *trace, 474extern int trace_selftest_startup_preemptoff(struct tracer *trace,
291 struct trace_array *tr); 475 struct trace_array *tr);
292#endif
293#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
294extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, 476extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
295 struct trace_array *tr); 477 struct trace_array *tr);
296#endif
297#ifdef CONFIG_SCHED_TRACER
298extern int trace_selftest_startup_wakeup(struct tracer *trace, 478extern int trace_selftest_startup_wakeup(struct tracer *trace,
299 struct trace_array *tr); 479 struct trace_array *tr);
300#endif 480extern int trace_selftest_startup_nop(struct tracer *trace,
301#ifdef CONFIG_CONTEXT_SWITCH_TRACER 481 struct trace_array *tr);
302extern int trace_selftest_startup_sched_switch(struct tracer *trace, 482extern int trace_selftest_startup_sched_switch(struct tracer *trace,
303 struct trace_array *tr); 483 struct trace_array *tr);
304#endif
305#ifdef CONFIG_SYSPROF_TRACER
306extern int trace_selftest_startup_sysprof(struct tracer *trace, 484extern int trace_selftest_startup_sysprof(struct tracer *trace,
307 struct trace_array *tr); 485 struct trace_array *tr);
308#endif 486extern int trace_selftest_startup_branch(struct tracer *trace,
487 struct trace_array *tr);
309#endif /* CONFIG_FTRACE_STARTUP_TEST */ 488#endif /* CONFIG_FTRACE_STARTUP_TEST */
310 489
311extern void *head_page(struct trace_array_cpu *data); 490extern void *head_page(struct trace_array_cpu *data);
312extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 491extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
492extern void trace_seq_print_cont(struct trace_seq *s,
493 struct trace_iterator *iter);
494
495extern int
496seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
497 unsigned long sym_flags);
313extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 498extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
314 size_t cnt); 499 size_t cnt);
315extern long ns2usecs(cycle_t nsec); 500extern long ns2usecs(cycle_t nsec);
501extern int
502trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
316 503
317extern unsigned long trace_flags; 504extern unsigned long trace_flags;
318 505
506/* Standard output formatting function used for function return traces */
507#ifdef CONFIG_FUNCTION_GRAPH_TRACER
508extern enum print_line_t print_graph_function(struct trace_iterator *iter);
509
510#ifdef CONFIG_DYNAMIC_FTRACE
511/* TODO: make this variable */
512#define FTRACE_GRAPH_MAX_FUNCS 32
513extern int ftrace_graph_count;
514extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
515
516static inline int ftrace_graph_addr(unsigned long addr)
517{
518 int i;
519
520 if (!ftrace_graph_count || test_tsk_trace_graph(current))
521 return 1;
522
523 for (i = 0; i < ftrace_graph_count; i++) {
524 if (addr == ftrace_graph_funcs[i])
525 return 1;
526 }
527
528 return 0;
529}
530#else
531static inline int ftrace_trace_addr(unsigned long addr)
532{
533 return 1;
534}
535static inline int ftrace_graph_addr(unsigned long addr)
536{
537 return 1;
538}
539#endif /* CONFIG_DYNAMIC_FTRACE */
540
541#else /* CONFIG_FUNCTION_GRAPH_TRACER */
542static inline enum print_line_t
543print_graph_function(struct trace_iterator *iter)
544{
545 return TRACE_TYPE_UNHANDLED;
546}
547#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
548
549extern struct pid *ftrace_pid_trace;
550
551static inline int ftrace_trace_task(struct task_struct *task)
552{
553 if (!ftrace_pid_trace)
554 return 1;
555
556 return test_tsk_trace_trace(task);
557}
558
319/* 559/*
320 * trace_iterator_flags is an enumeration that defines bit 560 * trace_iterator_flags is an enumeration that defines bit
321 * positions into trace_flags that controls the output. 561 * positions into trace_flags that controls the output.
@@ -334,6 +574,94 @@ enum trace_iterator_flags {
334 TRACE_ITER_BLOCK = 0x80, 574 TRACE_ITER_BLOCK = 0x80,
335 TRACE_ITER_STACKTRACE = 0x100, 575 TRACE_ITER_STACKTRACE = 0x100,
336 TRACE_ITER_SCHED_TREE = 0x200, 576 TRACE_ITER_SCHED_TREE = 0x200,
577 TRACE_ITER_PRINTK = 0x400,
578 TRACE_ITER_PREEMPTONLY = 0x800,
579 TRACE_ITER_BRANCH = 0x1000,
580 TRACE_ITER_ANNOTATE = 0x2000,
581 TRACE_ITER_USERSTACKTRACE = 0x4000,
582 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000
337}; 584};
338 585
586/*
587 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
588 * control the output of kernel symbols.
589 */
590#define TRACE_ITER_SYM_MASK \
591 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
592
593extern struct tracer nop_trace;
594
595/**
596 * ftrace_preempt_disable - disable preemption scheduler safe
597 *
 598 * When tracing can happen inside the scheduler, there are
 599 * cases where the tracing might happen before the need_resched
600 * flag is checked. If this happens and the tracer calls
601 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion.
603 *
 604 * To prevent this, we read the need_resched flag before
605 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable.
608 *
 609 * The rationale for doing the above is that if need_resched is set
610 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched.
613 */
614static inline int ftrace_preempt_disable(void)
615{
616 int resched;
617
618 resched = need_resched();
619 preempt_disable_notrace();
620
621 return resched;
622}
623
624/**
625 * ftrace_preempt_enable - enable preemption scheduler safe
626 * @resched: the return value from ftrace_preempt_disable
627 *
628 * This is a scheduler safe way to enable preemption and not miss
 629 * any preemption checks. The disable call saved the preemption state.
 630 * If resched is set, then we were either inside an atomic section or
631 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead.
634 */
635static inline void ftrace_preempt_enable(int resched)
636{
637 if (resched)
638 preempt_enable_no_resched_notrace();
639 else
640 preempt_enable_notrace();
641}
642
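A hedged sketch of the calling pattern the two helpers above are designed for (the hook below is illustrative; "trace.h" is assumed to be included):

/* Hedged sketch: a tracing hook that must not trigger a recursive
 * reschedule uses the paired helpers instead of plain preempt_disable(). */
#include "trace.h"

static void example_trace_hook(unsigned long ip, unsigned long parent_ip)
{
	int resched;

	resched = ftrace_preempt_disable();

	/* ... record the event while preemption is off ... */

	ftrace_preempt_enable(resched);
}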
643#ifdef CONFIG_BRANCH_TRACER
644extern int enable_branch_tracing(struct trace_array *tr);
645extern void disable_branch_tracing(void);
646static inline int trace_branch_enable(struct trace_array *tr)
647{
648 if (trace_flags & TRACE_ITER_BRANCH)
649 return enable_branch_tracing(tr);
650 return 0;
651}
652static inline void trace_branch_disable(void)
653{
654 /* due to races, always disable */
655 disable_branch_tracing();
656}
657#else
658static inline int trace_branch_enable(struct trace_array *tr)
659{
660 return 0;
661}
662static inline void trace_branch_disable(void)
663{
664}
665#endif /* CONFIG_BRANCH_TRACER */
666
339#endif /* _LINUX_KERNEL_TRACE_H */ 667#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 000000000000..3ccebde28482
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,186 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12
13#include "trace.h"
14
15static struct trace_array *boot_trace;
16static bool pre_initcalls_finished;
17
 18/* Tells the boot tracer that the pre_smp_initcalls are finished,
 19 * so we are ready.
 20 * It doesn't enable sched-event tracing, however;
 21 * you have to call enable_boot_trace to do so.
 22 */
23void start_boot_trace(void)
24{
25 pre_initcalls_finished = true;
26}
27
28void enable_boot_trace(void)
29{
30 if (pre_initcalls_finished)
31 tracing_start_sched_switch_record();
32}
33
34void disable_boot_trace(void)
35{
36 if (pre_initcalls_finished)
37 tracing_stop_sched_switch_record();
38}
39
40static int boot_trace_init(struct trace_array *tr)
41{
42 int cpu;
43 boot_trace = tr;
44
45 for_each_cpu_mask(cpu, cpu_possible_map)
46 tracing_reset(tr, cpu);
47
48 tracing_sched_switch_assign_trace(tr);
49 return 0;
50}
51
52static enum print_line_t
53initcall_call_print_line(struct trace_iterator *iter)
54{
55 struct trace_entry *entry = iter->ent;
56 struct trace_seq *s = &iter->seq;
57 struct trace_boot_call *field;
58 struct boot_trace_call *call;
59 u64 ts;
60 unsigned long nsec_rem;
61 int ret;
62
63 trace_assign_type(field, entry);
64 call = &field->boot_call;
65 ts = iter->ts;
66 nsec_rem = do_div(ts, 1000000000);
67
68 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
69 (unsigned long)ts, nsec_rem, call->func, call->caller);
70
71 if (!ret)
72 return TRACE_TYPE_PARTIAL_LINE;
73 else
74 return TRACE_TYPE_HANDLED;
75}
76
77static enum print_line_t
78initcall_ret_print_line(struct trace_iterator *iter)
79{
80 struct trace_entry *entry = iter->ent;
81 struct trace_seq *s = &iter->seq;
82 struct trace_boot_ret *field;
83 struct boot_trace_ret *init_ret;
84 u64 ts;
85 unsigned long nsec_rem;
86 int ret;
87
88 trace_assign_type(field, entry);
89 init_ret = &field->boot_ret;
90 ts = iter->ts;
91 nsec_rem = do_div(ts, 1000000000);
92
93 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
94 "returned %d after %llu msecs\n",
95 (unsigned long) ts,
96 nsec_rem,
97 init_ret->func, init_ret->result, init_ret->duration);
98
99 if (!ret)
100 return TRACE_TYPE_PARTIAL_LINE;
101 else
102 return TRACE_TYPE_HANDLED;
103}
104
105static enum print_line_t initcall_print_line(struct trace_iterator *iter)
106{
107 struct trace_entry *entry = iter->ent;
108
109 switch (entry->type) {
110 case TRACE_BOOT_CALL:
111 return initcall_call_print_line(iter);
112 case TRACE_BOOT_RET:
113 return initcall_ret_print_line(iter);
114 default:
115 return TRACE_TYPE_UNHANDLED;
116 }
117}
118
119struct tracer boot_tracer __read_mostly =
120{
121 .name = "initcall",
122 .init = boot_trace_init,
123 .reset = tracing_reset_online_cpus,
124 .print_line = initcall_print_line,
125};
126
127void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
128{
129 struct ring_buffer_event *event;
130 struct trace_boot_call *entry;
131 unsigned long irq_flags;
132 struct trace_array *tr = boot_trace;
133
134 if (!pre_initcalls_finished)
135 return;
136
137 /* Get its name now since this function could
138 * disappear because it is in the .init section.
139 */
140 sprint_symbol(bt->func, (unsigned long)fn);
141 preempt_disable();
142
143 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
144 &irq_flags);
145 if (!event)
146 goto out;
147 entry = ring_buffer_event_data(event);
148 tracing_generic_entry_update(&entry->ent, 0, 0);
149 entry->ent.type = TRACE_BOOT_CALL;
150 entry->boot_call = *bt;
151 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
152
153 trace_wake_up();
154
155 out:
156 preempt_enable();
157}
158
159void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
160{
161 struct ring_buffer_event *event;
162 struct trace_boot_ret *entry;
163 unsigned long irq_flags;
164 struct trace_array *tr = boot_trace;
165
166 if (!pre_initcalls_finished)
167 return;
168
169 sprint_symbol(bt->func, (unsigned long)fn);
170 preempt_disable();
171
172 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
173 &irq_flags);
174 if (!event)
175 goto out;
176 entry = ring_buffer_event_data(event);
177 tracing_generic_entry_update(&entry->ent, 0, 0);
178 entry->ent.type = TRACE_BOOT_RET;
179 entry->boot_ret = *bt;
180 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
181
182 trace_wake_up();
183
184 out:
185 preempt_enable();
186}
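For context, the two probes above are meant to be driven from the initcall path; a rough, hedged sketch of the expected caller (the real hook sits around do_one_initcall() in init/main.c and is not part of this diff; duration accounting is elided):

/* Hedged sketch: running one initcall with boot tracing around it.
 * Assumes the declarations in <trace/boot.h>. */
#include <linux/init.h>
#include <linux/sched.h>
#include <trace/boot.h>

static int example_run_initcall(initcall_t fn)
{
	struct boot_trace_call call;
	struct boot_trace_ret ret;
	int result;

	call.caller = task_pid_nr(current);
	trace_boot_call(&call, fn);	/* resolves call.func, logs the entry */

	result = fn();

	ret.result = result;
	ret.duration = 0;		/* the real caller measures this */
	trace_boot_ret(&ret, fn);	/* logs the return value */
	return result;
}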
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000000..6c00feb3bac7
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,342 @@
1/*
2 * unlikely profiler
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h>
12#include <linux/module.h>
13#include <linux/ftrace.h>
14#include <linux/hash.h>
15#include <linux/fs.h>
16#include <asm/local.h>
17#include "trace.h"
18
19#ifdef CONFIG_BRANCH_TRACER
20
21static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex);
23static struct trace_array *branch_tracer;
24
25static void
26probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
27{
28 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event;
30 struct trace_branch *entry;
31 unsigned long flags, irq_flags;
32 int cpu, pc;
33 const char *p;
34
35 /*
36 * I would love to save just the ftrace_likely_data pointer, but
37 * this code can also be used by modules. Ugly things can happen
38 * if the module is unloaded, and then we go and read the
39 * pointer. This is slower, but much safer.
40 */
41
42 if (unlikely(!tr))
43 return;
44
45 local_irq_save(flags);
46 cpu = raw_smp_processor_id();
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
51 &irq_flags);
52 if (!event)
53 goto out;
54
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59
60 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file);
62 while (p >= f->file && *p != '/')
63 p--;
64 p++;
65
66 strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
67 strncpy(entry->file, p, TRACE_FILE_SIZE);
68 entry->func[TRACE_FUNC_SIZE] = 0;
69 entry->file[TRACE_FILE_SIZE] = 0;
70 entry->line = f->line;
71 entry->correct = val == expect;
72
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
74
75 out:
76 atomic_dec(&tr->data[cpu]->disabled);
77 local_irq_restore(flags);
78}
79
80static inline
81void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
82{
83 if (!branch_tracing_enabled)
84 return;
85
86 probe_likely_condition(f, val, expect);
87}
88
89int enable_branch_tracing(struct trace_array *tr)
90{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr;
95 /*
 96	 * Must be visible before enabling. The read side is a plain
 97	 * conditional, so we do not need a matching rmb().
98 */
99 smp_wmb();
100 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex);
102
103 return ret;
104}
105
106void disable_branch_tracing(void)
107{
108 mutex_lock(&branch_tracing_mutex);
109
110 if (!branch_tracing_enabled)
111 goto out_unlock;
112
113 branch_tracing_enabled--;
114
115 out_unlock:
116 mutex_unlock(&branch_tracing_mutex);
117}
118
119static void start_branch_trace(struct trace_array *tr)
120{
121 enable_branch_tracing(tr);
122}
123
124static void stop_branch_trace(struct trace_array *tr)
125{
126 disable_branch_tracing();
127}
128
129static int branch_trace_init(struct trace_array *tr)
130{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr);
137 return 0;
138}
139
140static void branch_trace_reset(struct trace_array *tr)
141{
142 stop_branch_trace(tr);
143}
144
145struct tracer branch_trace __read_mostly =
146{
147 .name = "branch",
148 .init = branch_trace_init,
149 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch,
152#endif
153};
154
155__init static int init_branch_trace(void)
156{
157 return register_tracer(&branch_trace);
158}
159
160device_initcall(init_branch_trace);
161#else
162static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
164{
165}
166#endif /* CONFIG_BRANCH_TRACER */
167
168void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
169{
170 /*
171 * I would love to have a trace point here instead, but the
172 * trace point code is so inundated with unlikely and likely
173 * conditions that the recursive nightmare that exists is too
174 * much to try to get working. At least for now.
175 */
176 trace_likely_condition(f, val, expect);
177
178 /* FIXME: Make this atomic! */
179 if (val == expect)
180 f->correct++;
181 else
182 f->incorrect++;
183}
184EXPORT_SYMBOL(ftrace_likely_update);
185
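For context, ftrace_likely_update() is fed by instrumented likely()/unlikely() sites; a rough, hedged sketch of the kind of wrapper involved (the real macro lives in <linux/compiler.h> and is not part of this diff; the names and section string below are illustrative):

/* Hedged sketch: each annotated branch site owns a static
 * ftrace_branch_data record placed in a dedicated section, and reports
 * the observed truth value against the expected one. */
#include <linux/compiler.h>

#define example_branch_check(cond, expect)				\
	({								\
		int _r;							\
		static struct ftrace_branch_data _f			\
			__attribute__((section("_ftrace_annotated_branch"))) = { \
			.func = __func__,				\
			.file = __FILE__,				\
			.line = __LINE__,				\
		};							\
		_r = !!(cond);						\
		ftrace_likely_update(&_f, _r, expect);			\
		_r;							\
	})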
186struct ftrace_pointer {
187 void *start;
188 void *stop;
189 int hit;
190};
191
192static void *
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{
195 const struct ftrace_pointer *f = m->private;
196 struct ftrace_branch_data *p = v;
197
198 (*pos)++;
199
200 if (v == (void *)1)
201 return f->start;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209}
210
211static void *t_start(struct seq_file *m, loff_t *pos)
212{
213 void *t = (void *)1;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218
219 return t;
220}
221
222static void t_stop(struct seq_file *m, void *p)
223{
224}
225
226static int t_show(struct seq_file *m, void *v)
227{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v;
230 const char *f;
231 long percent;
232
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/')
249 f--;
250 f++;
251
252 /*
 253	 * The miss is overlaid on correct, and the hit on incorrect.
254 */
255 if (p->correct) {
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0)
263 seq_printf(m, " X ");
264 else
265 seq_printf(m, "%3ld ", percent);
266 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
267 return 0;
268}
269
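Worked example for the percentage computed above (values illustrative): with correct = 300 and incorrect = 100, percent = 100 * 100 / (300 + 100) = 25, i.e. the annotation was wrong 25% of the time. With correct = 0 the column shows 100 if any misses were recorded, and "X" (no data) when both counters are zero.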
270static struct seq_operations tracing_likely_seq_ops = {
271 .start = t_start,
272 .next = t_next,
273 .stop = t_stop,
274 .show = t_show,
275};
276
277static int tracing_branch_open(struct inode *inode, struct file *file)
278{
279 int ret;
280
281 ret = seq_open(file, &tracing_likely_seq_ops);
282 if (!ret) {
283 struct seq_file *m = file->private_data;
284 m->private = (void *)inode->i_private;
285 }
286
287 return ret;
288}
289
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295
296#ifdef CONFIG_PROFILE_ALL_BRANCHES
297extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[];
299
300static const struct ftrace_pointer ftrace_branch_pos = {
301 .start = __start_branch_profile,
302 .stop = __stop_branch_profile,
303 .hit = 1,
304};
305
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */
307
308extern unsigned long __start_annotated_branch_profile[];
309extern unsigned long __stop_annotated_branch_profile[];
310
311static const struct ftrace_pointer ftrace_annotated_branch_pos = {
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315
316static __init int ftrace_branch_init(void)
317{
318 struct dentry *d_tracer;
319 struct dentry *entry;
320
321 d_tracer = tracing_init_dentry();
322
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
324 (void *)&ftrace_annotated_branch_pos,
325 &tracing_branch_fops);
326 if (!entry)
327 pr_warning("Could not create debugfs "
 328			"'profile_annotated_branch' entry\n");
329
330#ifdef CONFIG_PROFILE_ALL_BRANCHES
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer,
332 (void *)&ftrace_branch_pos,
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338
339 return 0;
340}
341
342device_initcall(ftrace_branch_init);
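Once this initcall has run, the collected statistics can be read back through debugfs; assuming debugfs is mounted at /sys/kernel/debug (a common but not guaranteed location), reading tracing/profile_annotated_branch there produces the "correct incorrect % Function File Line" table formatted by t_show() above.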
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 312144897970..9236d7e25a16 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,20 +16,10 @@
16 16
17#include "trace.h" 17#include "trace.h"
18 18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr->data[cpu]);
27}
28
29static void start_function_trace(struct trace_array *tr) 19static void start_function_trace(struct trace_array *tr)
30{ 20{
31 tr->cpu = get_cpu(); 21 tr->cpu = get_cpu();
32 function_reset(tr); 22 tracing_reset_online_cpus(tr);
33 put_cpu(); 23 put_cpu();
34 24
35 tracing_start_cmdline_record(); 25 tracing_start_cmdline_record();
@@ -42,32 +32,28 @@ static void stop_function_trace(struct trace_array *tr)
42 tracing_stop_cmdline_record(); 32 tracing_stop_cmdline_record();
43} 33}
44 34
45static void function_trace_init(struct trace_array *tr) 35static int function_trace_init(struct trace_array *tr)
46{ 36{
47 if (tr->ctrl) 37 start_function_trace(tr);
48 start_function_trace(tr); 38 return 0;
49} 39}
50 40
51static void function_trace_reset(struct trace_array *tr) 41static void function_trace_reset(struct trace_array *tr)
52{ 42{
53 if (tr->ctrl) 43 stop_function_trace(tr);
54 stop_function_trace(tr);
55} 44}
56 45
57static void function_trace_ctrl_update(struct trace_array *tr) 46static void function_trace_start(struct trace_array *tr)
58{ 47{
59 if (tr->ctrl) 48 tracing_reset_online_cpus(tr);
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63} 49}
64 50
65static struct tracer function_trace __read_mostly = 51static struct tracer function_trace __read_mostly =
66{ 52{
67 .name = "ftrace", 53 .name = "function",
68 .init = function_trace_init, 54 .init = function_trace_init,
69 .reset = function_trace_reset, 55 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 56 .start = function_trace_start,
71#ifdef CONFIG_FTRACE_SELFTEST 57#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function, 58 .selftest = trace_selftest_startup_function,
73#endif 59#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 000000000000..4bf39fcae97a
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,669 @@
1/*
2 *
3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 *
8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/fs.h>
13
14#include "trace.h"
15
16#define TRACE_GRAPH_INDENT 2
17
18/* Flag options */
19#define TRACE_GRAPH_PRINT_OVERRUN 0x1
20#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8
23
24static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
29 /* Display Overhead ? */
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
33 { } /* Empty entry */
34};
35
36static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
39 .opts = trace_opts
40};
41
42/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44
45static int graph_trace_init(struct trace_array *tr)
46{
47 int cpu, ret;
48
49 for_each_online_cpu(cpu)
50 tracing_reset(tr, cpu);
51
52 ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry);
54 if (ret)
55 return ret;
56 tracing_start_cmdline_record();
57
58 return 0;
59}
60
61static void graph_trace_reset(struct trace_array *tr)
62{
63 tracing_stop_cmdline_record();
64 unregister_ftrace_graph();
65}
66
67static inline int log10_cpu(int nb)
68{
69 if (nb / 100)
70 return 3;
71 if (nb / 10)
72 return 2;
73 return 1;
74}
75
76static enum print_line_t
77print_graph_cpu(struct trace_seq *s, int cpu)
78{
79 int i;
80 int ret;
81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
83
84
85 /*
86 * Start with a space character - to make it stand out
87 * to the right a bit when trace output is pasted into
88 * email:
89 */
90 ret = trace_seq_printf(s, " ");
91
92 /*
93 * Tricky - we space the CPU field according to the max
94 * number of online CPUs. On a 2-cpu system it would take
95 * a maximum of 1 digit - on a 128 cpu system it would
96 * take up to 3 digits:
97 */
98 for (i = 0; i < log10_all - log10_this; i++) {
99 ret = trace_seq_printf(s, " ");
100 if (!ret)
101 return TRACE_TYPE_PARTIAL_LINE;
102 }
103 ret = trace_seq_printf(s, "%d) ", cpu);
104 if (!ret)
105 return TRACE_TYPE_PARTIAL_LINE;
106
107 return TRACE_TYPE_HANDLED;
108}
109
110#define TRACE_GRAPH_PROCINFO_LENGTH 14
111
112static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid)
114{
115 int i;
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11];
122
123 strncpy(comm, trace_find_cmdline(pid), 7);
124 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid);
126
127 /* 1 stands for the "-" character */
128 len = strlen(comm) + strlen(pid_str) + 1;
129
130 if (len < TRACE_GRAPH_PROCINFO_LENGTH)
131 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
132
133 /* First spaces to align center */
134 for (i = 0; i < spaces / 2; i++) {
135 ret = trace_seq_printf(s, " ");
136 if (!ret)
137 return TRACE_TYPE_PARTIAL_LINE;
138 }
139
140 ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
141 if (!ret)
142 return TRACE_TYPE_PARTIAL_LINE;
143
144 /* Last spaces to align center */
145 for (i = 0; i < spaces - (spaces / 2); i++) {
146 ret = trace_seq_printf(s, " ");
147 if (!ret)
148 return TRACE_TYPE_PARTIAL_LINE;
149 }
150 return TRACE_TYPE_HANDLED;
151}
152
153
154/* If the pid changed since the last trace, output this event */
155static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu)
157{
158 pid_t prev_pid;
159 int ret;
160
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
162 return TRACE_TYPE_HANDLED;
163
164 prev_pid = last_pid[cpu];
165 last_pid[cpu] = pid;
166
167/*
168 * Context-switch trace line:
169
170 ------------------------------------------
171 | 1) migration/0--1 => sshd-1755
172 ------------------------------------------
173
174 */
175 ret = trace_seq_printf(s,
176 " ------------------------------------------\n");
177 if (!ret)
 178		return TRACE_TYPE_PARTIAL_LINE;
179
180 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE)
 182		return TRACE_TYPE_PARTIAL_LINE;
183
184 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE)
 186		return TRACE_TYPE_PARTIAL_LINE;
187
188 ret = trace_seq_printf(s, " => ");
189 if (!ret)
 190		return TRACE_TYPE_PARTIAL_LINE;
191
192 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE)
 194		return TRACE_TYPE_PARTIAL_LINE;
195
196 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n");
198 if (!ret)
 199		return TRACE_TYPE_PARTIAL_LINE;
200
201 return ret;
202}
203
204static bool
205trace_branch_is_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr)
207{
208 struct ring_buffer_iter *ring_iter;
209 struct ring_buffer_event *event;
210 struct ftrace_graph_ret_entry *next;
211
212 ring_iter = iter->buffer_iter[iter->cpu];
213
214 if (!ring_iter)
215 return false;
216
217 event = ring_buffer_iter_peek(ring_iter, NULL);
218
219 if (!event)
220 return false;
221
222 next = ring_buffer_event_data(event);
223
224 if (next->ent.type != TRACE_GRAPH_RET)
225 return false;
226
227 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func)
229 return false;
230
231 return true;
232}
233
234static enum print_line_t
235print_graph_irq(struct trace_seq *s, unsigned long addr,
236 enum trace_type type, int cpu, pid_t pid)
237{
238 int ret;
239
240 if (addr < (unsigned long)__irqentry_text_start ||
241 addr >= (unsigned long)__irqentry_text_end)
242 return TRACE_TYPE_UNHANDLED;
243
244 if (type == TRACE_GRAPH_ENT) {
245 ret = trace_seq_printf(s, "==========> | ");
246 } else {
247 /* Cpu */
248 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
249 ret = print_graph_cpu(s, cpu);
250 if (ret == TRACE_TYPE_PARTIAL_LINE)
251 return TRACE_TYPE_PARTIAL_LINE;
252 }
253 /* Proc */
254 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
255 ret = print_graph_proc(s, pid);
256 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 return TRACE_TYPE_PARTIAL_LINE;
258
259 ret = trace_seq_printf(s, " | ");
260 if (!ret)
261 return TRACE_TYPE_PARTIAL_LINE;
262 }
263
264 /* No overhead */
265 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
266 ret = trace_seq_printf(s, " ");
267 if (!ret)
268 return TRACE_TYPE_PARTIAL_LINE;
269 }
270
271 ret = trace_seq_printf(s, "<========== |\n");
272 }
273 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE;
275 return TRACE_TYPE_HANDLED;
276}
277
278static enum print_line_t
279print_graph_duration(unsigned long long duration, struct trace_seq *s)
280{
281 unsigned long nsecs_rem = do_div(duration, 1000);
282 /* log10(ULONG_MAX) + '\0' */
283 char msecs_str[21];
284 char nsecs_str[5];
285 int ret, len;
286 int i;
287
288 sprintf(msecs_str, "%lu", (unsigned long) duration);
289
290 /* Print msecs */
291 ret = trace_seq_printf(s, msecs_str);
292 if (!ret)
293 return TRACE_TYPE_PARTIAL_LINE;
294
295 len = strlen(msecs_str);
296
297 /* Print nsecs (we don't want to exceed 7 digits) */
298 if (len < 7) {
299 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
300 ret = trace_seq_printf(s, ".%s", nsecs_str);
301 if (!ret)
302 return TRACE_TYPE_PARTIAL_LINE;
303 len += strlen(nsecs_str);
304 }
305
306 ret = trace_seq_printf(s, " us ");
307 if (!ret)
308 return TRACE_TYPE_PARTIAL_LINE;
309
310 /* Print remaining spaces to fit the row's width */
311 for (i = len; i < 7; i++) {
312 ret = trace_seq_printf(s, " ");
313 if (!ret)
314 return TRACE_TYPE_PARTIAL_LINE;
315 }
316
317 ret = trace_seq_printf(s, "| ");
318 if (!ret)
319 return TRACE_TYPE_PARTIAL_LINE;
320 return TRACE_TYPE_HANDLED;
321
322}
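/*
 * Worked example (sketch): for duration == 12345 ns, do_div() leaves
 * duration == 12 and nsecs_rem == 345, so the column reads "12.345 us"
 * padded with trailing spaces up to 7 characters, followed by "| ".
 */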
323
324/* Signal an execution-time overhead to the output */
325static int
326print_graph_overhead(unsigned long long duration, struct trace_seq *s)
327{
328 /* Duration exceeded 100 usecs */
329 if (duration > 100000ULL)
330 return trace_seq_printf(s, "! ");
331
332 /* Duration exceeded 10 usecs */
333 if (duration > 10000ULL)
334 return trace_seq_printf(s, "+ ");
335
336 return trace_seq_printf(s, " ");
337}
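/*
 * In the rendered trace these markers sit just before the duration
 * column, e.g. (illustrative): " 1)  + 23.456 us   |  }".
 */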
338
339/* Case of a leaf function on its call entry */
340static enum print_line_t
341print_graph_entry_leaf(struct trace_iterator *iter,
342 struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
343{
344 struct ftrace_graph_ret_entry *ret_entry;
345 struct ftrace_graph_ret *graph_ret;
346 struct ring_buffer_event *event;
347 struct ftrace_graph_ent *call;
348 unsigned long long duration;
349 int ret;
350 int i;
351
352 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
353 ret_entry = ring_buffer_event_data(event);
354 graph_ret = &ret_entry->ret;
355 call = &entry->graph_ent;
356 duration = graph_ret->rettime - graph_ret->calltime;
357
358 /* Overhead */
359 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
360 ret = print_graph_overhead(duration, s);
361 if (!ret)
362 return TRACE_TYPE_PARTIAL_LINE;
363 }
364
365 /* Duration */
366 ret = print_graph_duration(duration, s);
367 if (ret == TRACE_TYPE_PARTIAL_LINE)
368 return TRACE_TYPE_PARTIAL_LINE;
369
370 /* Function */
371 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
372 ret = trace_seq_printf(s, " ");
373 if (!ret)
374 return TRACE_TYPE_PARTIAL_LINE;
375 }
376
377 ret = seq_print_ip_sym(s, call->func, 0);
378 if (!ret)
379 return TRACE_TYPE_PARTIAL_LINE;
380
381 ret = trace_seq_printf(s, "();\n");
382 if (!ret)
383 return TRACE_TYPE_PARTIAL_LINE;
384
385 return TRACE_TYPE_HANDLED;
386}
387
388static enum print_line_t
389print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
390 struct trace_seq *s, pid_t pid, int cpu)
391{
392 int i;
393 int ret;
394 struct ftrace_graph_ent *call = &entry->graph_ent;
395
396 /* No overhead */
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
398 ret = trace_seq_printf(s, " ");
399 if (!ret)
400 return TRACE_TYPE_PARTIAL_LINE;
401 }
402
403 /* Interrupt */
404 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
405 if (ret == TRACE_TYPE_UNHANDLED) {
406 /* No time */
407 ret = trace_seq_printf(s, " | ");
408 if (!ret)
409 return TRACE_TYPE_PARTIAL_LINE;
410 } else {
411 if (ret == TRACE_TYPE_PARTIAL_LINE)
412 return TRACE_TYPE_PARTIAL_LINE;
413 }
414
415
416 /* Function */
417 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
418 ret = trace_seq_printf(s, " ");
419 if (!ret)
420 return TRACE_TYPE_PARTIAL_LINE;
421 }
422
423 ret = seq_print_ip_sym(s, call->func, 0);
424 if (!ret)
425 return TRACE_TYPE_PARTIAL_LINE;
426
427 ret = trace_seq_printf(s, "() {\n");
428 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE;
430
431 return TRACE_TYPE_HANDLED;
432}
433
434static enum print_line_t
435print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
436 struct trace_iterator *iter, int cpu)
437{
438 int ret;
439 struct trace_entry *ent = iter->ent;
440
441 /* Pid */
442 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
443 return TRACE_TYPE_PARTIAL_LINE;
444
445 /* Cpu */
446 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
447 ret = print_graph_cpu(s, cpu);
448 if (ret == TRACE_TYPE_PARTIAL_LINE)
449 return TRACE_TYPE_PARTIAL_LINE;
450 }
451
452 /* Proc */
453 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
454 ret = print_graph_proc(s, ent->pid);
455 if (ret == TRACE_TYPE_PARTIAL_LINE)
456 return TRACE_TYPE_PARTIAL_LINE;
457
458 ret = trace_seq_printf(s, " | ");
459 if (!ret)
460 return TRACE_TYPE_PARTIAL_LINE;
461 }
462
463 if (trace_branch_is_leaf(iter, field))
464 return print_graph_entry_leaf(iter, field, s);
465 else
466 return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
467
468}
469
470static enum print_line_t
471print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
472 struct trace_entry *ent, int cpu)
473{
474 int i;
475 int ret;
476 unsigned long long duration = trace->rettime - trace->calltime;
477
478 /* Pid */
479 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
480 return TRACE_TYPE_PARTIAL_LINE;
481
482 /* Cpu */
483 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
484 ret = print_graph_cpu(s, cpu);
485 if (ret == TRACE_TYPE_PARTIAL_LINE)
486 return TRACE_TYPE_PARTIAL_LINE;
487 }
488
489 /* Proc */
490 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
491 ret = print_graph_proc(s, ent->pid);
492 if (ret == TRACE_TYPE_PARTIAL_LINE)
493 return TRACE_TYPE_PARTIAL_LINE;
494
495 ret = trace_seq_printf(s, " | ");
496 if (!ret)
497 return TRACE_TYPE_PARTIAL_LINE;
498 }
499
500 /* Overhead */
501 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
502 ret = print_graph_overhead(duration, s);
503 if (!ret)
504 return TRACE_TYPE_PARTIAL_LINE;
505 }
506
507 /* Duration */
508 ret = print_graph_duration(duration, s);
509 if (ret == TRACE_TYPE_PARTIAL_LINE)
510 return TRACE_TYPE_PARTIAL_LINE;
511
512 /* Closing brace */
513 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
514 ret = trace_seq_printf(s, " ");
515 if (!ret)
516 return TRACE_TYPE_PARTIAL_LINE;
517 }
518
519 ret = trace_seq_printf(s, "}\n");
520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE;
522
523 /* Overrun */
524 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
525 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
526 trace->overrun);
527 if (!ret)
528 return TRACE_TYPE_PARTIAL_LINE;
529 }
530
531 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
532 if (ret == TRACE_TYPE_PARTIAL_LINE)
533 return TRACE_TYPE_PARTIAL_LINE;
534
535 return TRACE_TYPE_HANDLED;
536}
537
538static enum print_line_t
539print_graph_comment(struct print_entry *trace, struct trace_seq *s,
540 struct trace_entry *ent, struct trace_iterator *iter)
541{
542 int i;
543 int ret;
544
545 /* Pid */
546 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
547 return TRACE_TYPE_PARTIAL_LINE;
548
549 /* Cpu */
550 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
551 ret = print_graph_cpu(s, iter->cpu);
552 if (ret == TRACE_TYPE_PARTIAL_LINE)
553 return TRACE_TYPE_PARTIAL_LINE;
554 }
555
556 /* Proc */
557 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
558 ret = print_graph_proc(s, ent->pid);
559 if (ret == TRACE_TYPE_PARTIAL_LINE)
560 return TRACE_TYPE_PARTIAL_LINE;
561
562 ret = trace_seq_printf(s, " | ");
563 if (!ret)
564 return TRACE_TYPE_PARTIAL_LINE;
565 }
566
567 /* No overhead */
568 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
569 ret = trace_seq_printf(s, " ");
570 if (!ret)
571 return TRACE_TYPE_PARTIAL_LINE;
572 }
573
574 /* No time */
575 ret = trace_seq_printf(s, " | ");
576 if (!ret)
577 return TRACE_TYPE_PARTIAL_LINE;
578
579 /* Indentation */
580 if (trace->depth > 0)
581 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
582 ret = trace_seq_printf(s, " ");
583 if (!ret)
584 return TRACE_TYPE_PARTIAL_LINE;
585 }
586
587 /* The comment */
588 ret = trace_seq_printf(s, "/* %s", trace->buf);
589 if (!ret)
590 return TRACE_TYPE_PARTIAL_LINE;
591
592 if (ent->flags & TRACE_FLAG_CONT)
593 trace_seq_print_cont(s, iter);
594
595 ret = trace_seq_printf(s, " */\n");
596 if (!ret)
597 return TRACE_TYPE_PARTIAL_LINE;
598
599 return TRACE_TYPE_HANDLED;
600}
601
602
603enum print_line_t
604print_graph_function(struct trace_iterator *iter)
605{
606 struct trace_seq *s = &iter->seq;
607 struct trace_entry *entry = iter->ent;
608
609 switch (entry->type) {
610 case TRACE_GRAPH_ENT: {
611 struct ftrace_graph_ent_entry *field;
612 trace_assign_type(field, entry);
613 return print_graph_entry(field, s, iter,
614 iter->cpu);
615 }
616 case TRACE_GRAPH_RET: {
617 struct ftrace_graph_ret_entry *field;
618 trace_assign_type(field, entry);
619 return print_graph_return(&field->ret, s, entry, iter->cpu);
620 }
621 case TRACE_PRINT: {
622 struct print_entry *field;
623 trace_assign_type(field, entry);
624 return print_graph_comment(field, s, entry, iter);
625 }
626 default:
627 return TRACE_TYPE_UNHANDLED;
628 }
629}
630
631static void print_graph_headers(struct seq_file *s)
632{
633 /* 1st line */
634 seq_printf(s, "# ");
635 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
636 seq_printf(s, "CPU ");
637 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
638 seq_printf(s, "TASK/PID ");
639 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
640 seq_printf(s, "OVERHEAD/");
641 seq_printf(s, "DURATION FUNCTION CALLS\n");
642
643 /* 2nd line */
644 seq_printf(s, "# ");
645 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
646 seq_printf(s, "| ");
647 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
648 seq_printf(s, "| | ");
649 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
650 seq_printf(s, "| ");
651 seq_printf(s, "| | | | |\n");
652 } else
653 seq_printf(s, " | | | | |\n");
654}
655static struct tracer graph_trace __read_mostly = {
656 .name = "function_graph",
657 .init = graph_trace_init,
658 .reset = graph_trace_reset,
659 .print_line = print_graph_function,
660 .print_header = print_graph_headers,
661 .flags = &tracer_flags,
662};
663
664static __init int init_graph_trace(void)
665{
666 return register_tracer(&graph_trace);
667}
668
669device_initcall(init_graph_trace);
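/*
 * Put together, the helpers above render output along these lines
 * (an illustrative sketch, not captured from a real trace; column
 * widths are approximate):
 *
 *  1)               |  sys_open() {
 *  1)   0.535 us    |    get_unused_fd();
 *  1)               |    do_filp_open() {
 *  1) + 21.354 us   |    }
 *  1) ! 128.473 us  |  }
 */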
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
new file mode 100644
index 000000000000..b6a3e20a49a9
--- /dev/null
+++ b/kernel/trace/trace_hw_branches.c
@@ -0,0 +1,195 @@
1/*
2 * h/w branch tracer for x86 based on bts
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
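/* i.e. 8 KiB of BTS records per CPU (see the per-CPU 'buffer' below) */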
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28static void bts_trace_start_cpu(void *arg)
29{
30 if (this_tracer)
31 ds_release_bts(this_tracer);
32
33 this_tracer =
34 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
35 /* ovfl = */ NULL, /* th = */ (size_t)-1,
36 BTS_KERNEL);
37 if (IS_ERR(this_tracer)) {
38 this_tracer = NULL;
39 return;
40 }
41}
42
43static void bts_trace_start(struct trace_array *tr)
44{
45 int cpu;
46
47 tracing_reset_online_cpus(tr);
48
49 for_each_cpu_mask(cpu, cpu_possible_map)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51}
52
53static void bts_trace_stop_cpu(void *arg)
54{
55 if (this_tracer) {
56 ds_release_bts(this_tracer);
57 this_tracer = NULL;
58 }
59}
60
61static void bts_trace_stop(struct trace_array *tr)
62{
63 int cpu;
64
65 for_each_cpu_mask(cpu, cpu_possible_map)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67}
68
69static int bts_trace_init(struct trace_array *tr)
70{
71 tracing_reset_online_cpus(tr);
72 bts_trace_start(tr);
73
74 return 0;
75}
76
77static void bts_trace_print_header(struct seq_file *m)
78{
79 seq_puts(m,
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83}
84
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
86{
87 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it;
90
91 trace_assign_type(it, entry);
92
93 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
96 it->from, it->to) &&
97 (!it->from ||
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;
102 }
103 return TRACE_TYPE_UNHANDLED;
104}
105
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
107{
108 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry;
110 unsigned long irq;
111
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
113 if (!event)
114 return;
115 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from;
120 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq);
122}
123
124static void trace_bts_at(struct trace_array *tr,
125 const struct bts_trace *trace, void *at)
126{
127 struct bts_struct bts;
128 int err = 0;
129
130 WARN_ON_ONCE(!trace->read);
131 if (!trace->read)
132 return;
133
134 err = trace->read(this_tracer, at, &bts);
135 if (err < 0)
136 return;
137
138 switch (bts.qualifier) {
139 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
141 break;
142 }
143}
144
145static void trace_bts_cpu(void *arg)
146{
147 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace;
149 unsigned char *at;
150
151 if (!this_tracer)
152 return;
153
154 ds_suspend_bts(this_tracer);
155 trace = ds_read_bts(this_tracer);
156 if (!trace)
157 goto out;
158
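 /*
 * The BTS area behaves like a circular buffer; the two passes below walk
 * it from the oldest record to the newest: top..end first, then
 * begin..top.
 */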
159 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size)
161 trace_bts_at(tr, trace, at);
162
163 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size)
165 trace_bts_at(tr, trace, at);
166
167out:
168 ds_resume_bts(this_tracer);
169}
170
171static void trace_bts_prepare(struct trace_iterator *iter)
172{
173 int cpu;
174
175 for_each_cpu_mask(cpu, cpu_possible_map)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177}
178
179struct tracer bts_tracer __read_mostly =
180{
181 .name = "hw-branch-tracer",
182 .init = bts_trace_init,
183 .reset = bts_trace_stop,
184 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line,
186 .start = bts_trace_start,
187 .stop = bts_trace_stop,
188 .open = trace_bts_prepare
189};
190
191__init static int init_bts_trace(void)
192{
193 return register_tracer(&bts_tracer);
194}
195device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ece6cfb649fa..7c2e326bbc8b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -63,7 +63,7 @@ irq_trace(void)
63 */ 63 */
64static __cacheline_aligned_in_smp unsigned long max_sequence; 64static __cacheline_aligned_in_smp unsigned long max_sequence;
65 65
66#ifdef CONFIG_FTRACE 66#ifdef CONFIG_FUNCTION_TRACER
67/* 67/*
68 * irqsoff uses its own tracer function to keep the overhead down: 68 * irqsoff uses its own tracer function to keep the overhead down:
69 */ 69 */
@@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
95 disabled = atomic_inc_return(&data->disabled); 95 disabled = atomic_inc_return(&data->disabled);
96 96
97 if (likely(disabled == 1)) 97 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags); 98 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
99 99
100 atomic_dec(&data->disabled); 100 atomic_dec(&data->disabled);
101} 101}
@@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly =
104{ 104{
105 .func = irqsoff_tracer_call, 105 .func = irqsoff_tracer_call,
106}; 106};
107#endif /* CONFIG_FTRACE */ 107#endif /* CONFIG_FUNCTION_TRACER */
108 108
109/* 109/*
110 * Should this new latency be reported/recorded? 110 * Should this new latency be reported/recorded?
@@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr,
130 unsigned long latency, t0, t1; 130 unsigned long latency, t0, t1;
131 cycle_t T0, T1, delta; 131 cycle_t T0, T1, delta;
132 unsigned long flags; 132 unsigned long flags;
133 int pc;
133 134
134 /* 135 /*
135 * usecs conversion is slow so we try to delay the conversion 136 * usecs conversion is slow so we try to delay the conversion
@@ -141,6 +142,8 @@ check_critical_timing(struct trace_array *tr,
141 142
142 local_save_flags(flags); 143 local_save_flags(flags);
143 144
145 pc = preempt_count();
146
144 if (!report_latency(delta)) 147 if (!report_latency(delta))
145 goto out; 148 goto out;
146 149
@@ -150,7 +153,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 153 if (!report_latency(delta))
151 goto out_unlock; 154 goto out_unlock;
152 155
153 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); 156 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
154 157
155 latency = nsecs_to_usecs(delta); 158 latency = nsecs_to_usecs(delta);
156 159
@@ -173,8 +176,8 @@ out_unlock:
173out: 176out:
174 data->critical_sequence = max_sequence; 177 data->critical_sequence = max_sequence;
175 data->preempt_timestamp = ftrace_now(cpu); 178 data->preempt_timestamp = ftrace_now(cpu);
176 tracing_reset(data); 179 tracing_reset(tr, cpu);
177 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); 180 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
178} 181}
179 182
180static inline void 183static inline void
@@ -203,11 +206,11 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
203 data->critical_sequence = max_sequence; 206 data->critical_sequence = max_sequence;
204 data->preempt_timestamp = ftrace_now(cpu); 207 data->preempt_timestamp = ftrace_now(cpu);
205 data->critical_start = parent_ip ? : ip; 208 data->critical_start = parent_ip ? : ip;
206 tracing_reset(data); 209 tracing_reset(tr, cpu);
207 210
208 local_save_flags(flags); 211 local_save_flags(flags);
209 212
210 trace_function(tr, data, ip, parent_ip, flags); 213 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
211 214
212 per_cpu(tracing_cpu, cpu) = 1; 215 per_cpu(tracing_cpu, cpu) = 1;
213 216
@@ -234,14 +237,14 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
234 237
235 data = tr->data[cpu]; 238 data = tr->data[cpu];
236 239
237 if (unlikely(!data) || unlikely(!head_page(data)) || 240 if (unlikely(!data) ||
238 !data->critical_start || atomic_read(&data->disabled)) 241 !data->critical_start || atomic_read(&data->disabled))
239 return; 242 return;
240 243
241 atomic_inc(&data->disabled); 244 atomic_inc(&data->disabled);
242 245
243 local_save_flags(flags); 246 local_save_flags(flags);
244 trace_function(tr, data, ip, parent_ip, flags); 247 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
245 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 248 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
246 data->critical_start = 0; 249 data->critical_start = 0;
247 atomic_dec(&data->disabled); 250 atomic_dec(&data->disabled);
@@ -350,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
350} 353}
351#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
352 355
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
353static void start_irqsoff_tracer(struct trace_array *tr) 362static void start_irqsoff_tracer(struct trace_array *tr)
354{ 363{
355 register_ftrace_function(&trace_ops); 364 register_ftrace_function(&trace_ops);
356 tracer_enabled = 1; 365 if (tracing_is_enabled()) {
366 tracer_enabled = 1;
367 save_tracer_enabled = 1;
368 } else {
369 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
357} 372}
358 373
359static void stop_irqsoff_tracer(struct trace_array *tr) 374static void stop_irqsoff_tracer(struct trace_array *tr)
360{ 375{
361 tracer_enabled = 0; 376 tracer_enabled = 0;
377 save_tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 378 unregister_ftrace_function(&trace_ops);
363} 379}
364 380
@@ -367,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
367 irqsoff_trace = tr; 383 irqsoff_trace = tr;
368 /* make sure that the tracer is visible */ 384 /* make sure that the tracer is visible */
369 smp_wmb(); 385 smp_wmb();
370 386 start_irqsoff_tracer(tr);
371 if (tr->ctrl)
372 start_irqsoff_tracer(tr);
373} 387}
374 388
375static void irqsoff_tracer_reset(struct trace_array *tr) 389static void irqsoff_tracer_reset(struct trace_array *tr)
376{ 390{
377 if (tr->ctrl) 391 stop_irqsoff_tracer(tr);
378 stop_irqsoff_tracer(tr);
379} 392}
380 393
381static void irqsoff_tracer_ctrl_update(struct trace_array *tr) 394static void irqsoff_tracer_start(struct trace_array *tr)
382{ 395{
383 if (tr->ctrl) 396 tracer_enabled = 1;
384 start_irqsoff_tracer(tr); 397 save_tracer_enabled = 1;
385 else 398}
386 stop_irqsoff_tracer(tr); 399
400static void irqsoff_tracer_stop(struct trace_array *tr)
401{
402 tracer_enabled = 0;
403 save_tracer_enabled = 0;
387} 404}
388 405
389static void irqsoff_tracer_open(struct trace_iterator *iter) 406static void irqsoff_tracer_open(struct trace_iterator *iter)
390{ 407{
391 /* stop the trace while dumping */ 408 /* stop the trace while dumping */
392 if (iter->tr->ctrl) 409 tracer_enabled = 0;
393 stop_irqsoff_tracer(iter->tr);
394} 410}
395 411
396static void irqsoff_tracer_close(struct trace_iterator *iter) 412static void irqsoff_tracer_close(struct trace_iterator *iter)
397{ 413{
398 if (iter->tr->ctrl) 414 /* restart tracing */
399 start_irqsoff_tracer(iter->tr); 415 tracer_enabled = save_tracer_enabled;
400} 416}
401 417
402#ifdef CONFIG_IRQSOFF_TRACER 418#ifdef CONFIG_IRQSOFF_TRACER
403static void irqsoff_tracer_init(struct trace_array *tr) 419static int irqsoff_tracer_init(struct trace_array *tr)
404{ 420{
405 trace_type = TRACER_IRQS_OFF; 421 trace_type = TRACER_IRQS_OFF;
406 422
407 __irqsoff_tracer_init(tr); 423 __irqsoff_tracer_init(tr);
424 return 0;
408} 425}
409static struct tracer irqsoff_tracer __read_mostly = 426static struct tracer irqsoff_tracer __read_mostly =
410{ 427{
411 .name = "irqsoff", 428 .name = "irqsoff",
412 .init = irqsoff_tracer_init, 429 .init = irqsoff_tracer_init,
413 .reset = irqsoff_tracer_reset, 430 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop,
414 .open = irqsoff_tracer_open, 433 .open = irqsoff_tracer_open,
415 .close = irqsoff_tracer_close, 434 .close = irqsoff_tracer_close,
416 .ctrl_update = irqsoff_tracer_ctrl_update,
417 .print_max = 1, 435 .print_max = 1,
418#ifdef CONFIG_FTRACE_SELFTEST 436#ifdef CONFIG_FTRACE_SELFTEST
419 .selftest = trace_selftest_startup_irqsoff, 437 .selftest = trace_selftest_startup_irqsoff,
@@ -425,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
425#endif 443#endif
426 444
427#ifdef CONFIG_PREEMPT_TRACER 445#ifdef CONFIG_PREEMPT_TRACER
428static void preemptoff_tracer_init(struct trace_array *tr) 446static int preemptoff_tracer_init(struct trace_array *tr)
429{ 447{
430 trace_type = TRACER_PREEMPT_OFF; 448 trace_type = TRACER_PREEMPT_OFF;
431 449
432 __irqsoff_tracer_init(tr); 450 __irqsoff_tracer_init(tr);
451 return 0;
433} 452}
434 453
435static struct tracer preemptoff_tracer __read_mostly = 454static struct tracer preemptoff_tracer __read_mostly =
@@ -437,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
437 .name = "preemptoff", 456 .name = "preemptoff",
438 .init = preemptoff_tracer_init, 457 .init = preemptoff_tracer_init,
439 .reset = irqsoff_tracer_reset, 458 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop,
440 .open = irqsoff_tracer_open, 461 .open = irqsoff_tracer_open,
441 .close = irqsoff_tracer_close, 462 .close = irqsoff_tracer_close,
442 .ctrl_update = irqsoff_tracer_ctrl_update,
443 .print_max = 1, 463 .print_max = 1,
444#ifdef CONFIG_FTRACE_SELFTEST 464#ifdef CONFIG_FTRACE_SELFTEST
445 .selftest = trace_selftest_startup_preemptoff, 465 .selftest = trace_selftest_startup_preemptoff,
@@ -453,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
453#if defined(CONFIG_IRQSOFF_TRACER) && \ 473#if defined(CONFIG_IRQSOFF_TRACER) && \
454 defined(CONFIG_PREEMPT_TRACER) 474 defined(CONFIG_PREEMPT_TRACER)
455 475
456static void preemptirqsoff_tracer_init(struct trace_array *tr) 476static int preemptirqsoff_tracer_init(struct trace_array *tr)
457{ 477{
458 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 478 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
459 479
460 __irqsoff_tracer_init(tr); 480 __irqsoff_tracer_init(tr);
481 return 0;
461} 482}
462 483
463static struct tracer preemptirqsoff_tracer __read_mostly = 484static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -465,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
465 .name = "preemptirqsoff", 486 .name = "preemptirqsoff",
466 .init = preemptirqsoff_tracer_init, 487 .init = preemptirqsoff_tracer_init,
467 .reset = irqsoff_tracer_reset, 488 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop,
468 .open = irqsoff_tracer_open, 491 .open = irqsoff_tracer_open,
469 .close = irqsoff_tracer_close, 492 .close = irqsoff_tracer_close,
470 .ctrl_update = irqsoff_tracer_ctrl_update,
471 .print_max = 1, 493 .print_max = 1,
472#ifdef CONFIG_FTRACE_SELFTEST 494#ifdef CONFIG_FTRACE_SELFTEST
473 .selftest = trace_selftest_startup_preemptirqsoff, 495 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b13dc19dcbb4..fffcb069f1dc 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -18,46 +18,39 @@ struct header_iter {
18 18
19static struct trace_array *mmio_trace_array; 19static struct trace_array *mmio_trace_array;
20static bool overrun_detected; 20static bool overrun_detected;
21static unsigned long prev_overruns;
21 22
22static void mmio_reset_data(struct trace_array *tr) 23static void mmio_reset_data(struct trace_array *tr)
23{ 24{
24 int cpu;
25
26 overrun_detected = false; 25 overrun_detected = false;
27 tr->time_start = ftrace_now(tr->cpu); 26 prev_overruns = 0;
28 27
29 for_each_online_cpu(cpu) 28 tracing_reset_online_cpus(tr);
30 tracing_reset(tr->data[cpu]);
31} 29}
32 30
33static void mmio_trace_init(struct trace_array *tr) 31static int mmio_trace_init(struct trace_array *tr)
34{ 32{
35 pr_debug("in %s\n", __func__); 33 pr_debug("in %s\n", __func__);
36 mmio_trace_array = tr; 34 mmio_trace_array = tr;
37 if (tr->ctrl) { 35
38 mmio_reset_data(tr); 36 mmio_reset_data(tr);
39 enable_mmiotrace(); 37 enable_mmiotrace();
40 } 38 return 0;
41} 39}
42 40
43static void mmio_trace_reset(struct trace_array *tr) 41static void mmio_trace_reset(struct trace_array *tr)
44{ 42{
45 pr_debug("in %s\n", __func__); 43 pr_debug("in %s\n", __func__);
46 if (tr->ctrl) 44
47 disable_mmiotrace(); 45 disable_mmiotrace();
48 mmio_reset_data(tr); 46 mmio_reset_data(tr);
49 mmio_trace_array = NULL; 47 mmio_trace_array = NULL;
50} 48}
51 49
52static void mmio_trace_ctrl_update(struct trace_array *tr) 50static void mmio_trace_start(struct trace_array *tr)
53{ 51{
54 pr_debug("in %s\n", __func__); 52 pr_debug("in %s\n", __func__);
55 if (tr->ctrl) { 53 mmio_reset_data(tr);
56 mmio_reset_data(tr);
57 enable_mmiotrace();
58 } else {
59 disable_mmiotrace();
60 }
61} 54}
62 55
63static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 56static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -128,12 +121,12 @@ static void mmio_close(struct trace_iterator *iter)
128 121
129static unsigned long count_overruns(struct trace_iterator *iter) 122static unsigned long count_overruns(struct trace_iterator *iter)
130{ 123{
131 int cpu;
132 unsigned long cnt = 0; 124 unsigned long cnt = 0;
133 for_each_online_cpu(cpu) { 125 unsigned long over = ring_buffer_overruns(iter->tr->buffer);
134 cnt += iter->overrun[cpu]; 126
135 iter->overrun[cpu] = 0; 127 if (over > prev_overruns)
136 } 128 cnt = over - prev_overruns;
129 prev_overruns = over;
137 return cnt; 130 return cnt;
138} 131}
139 132
@@ -171,17 +164,21 @@ print_out:
171 return (ret == -EBUSY) ? 0 : ret; 164 return (ret == -EBUSY) ? 0 : ret;
172} 165}
173 166
174static int mmio_print_rw(struct trace_iterator *iter) 167static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
175{ 168{
176 struct trace_entry *entry = iter->ent; 169 struct trace_entry *entry = iter->ent;
177 struct mmiotrace_rw *rw = &entry->mmiorw; 170 struct trace_mmiotrace_rw *field;
171 struct mmiotrace_rw *rw;
178 struct trace_seq *s = &iter->seq; 172 struct trace_seq *s = &iter->seq;
179 unsigned long long t = ns2usecs(entry->t); 173 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, 1000000ULL); 174 unsigned long usec_rem = do_div(t, 1000000ULL);
181 unsigned secs = (unsigned long)t; 175 unsigned secs = (unsigned long)t;
182 int ret = 1; 176 int ret = 1;
183 177
184 switch (entry->mmiorw.opcode) { 178 trace_assign_type(field, entry);
179 rw = &field->rw;
180
181 switch (rw->opcode) {
185 case MMIO_READ: 182 case MMIO_READ:
186 ret = trace_seq_printf(s, 183 ret = trace_seq_printf(s,
187 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 184 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
@@ -209,21 +206,25 @@ static int mmio_print_rw(struct trace_iterator *iter)
209 break; 206 break;
210 } 207 }
211 if (ret) 208 if (ret)
212 return 1; 209 return TRACE_TYPE_HANDLED;
213 return 0; 210 return TRACE_TYPE_PARTIAL_LINE;
214} 211}
215 212
216static int mmio_print_map(struct trace_iterator *iter) 213static enum print_line_t mmio_print_map(struct trace_iterator *iter)
217{ 214{
218 struct trace_entry *entry = iter->ent; 215 struct trace_entry *entry = iter->ent;
219 struct mmiotrace_map *m = &entry->mmiomap; 216 struct trace_mmiotrace_map *field;
217 struct mmiotrace_map *m;
220 struct trace_seq *s = &iter->seq; 218 struct trace_seq *s = &iter->seq;
221 unsigned long long t = ns2usecs(entry->t); 219 unsigned long long t = ns2usecs(iter->ts);
222 unsigned long usec_rem = do_div(t, 1000000ULL); 220 unsigned long usec_rem = do_div(t, 1000000ULL);
223 unsigned secs = (unsigned long)t; 221 unsigned secs = (unsigned long)t;
224 int ret = 1; 222 int ret;
225 223
226 switch (entry->mmiorw.opcode) { 224 trace_assign_type(field, entry);
225 m = &field->map;
226
227 switch (m->opcode) {
227 case MMIO_PROBE: 228 case MMIO_PROBE:
228 ret = trace_seq_printf(s, 229 ret = trace_seq_printf(s,
229 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 230 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
@@ -241,20 +242,43 @@ static int mmio_print_map(struct trace_iterator *iter)
241 break; 242 break;
242 } 243 }
243 if (ret) 244 if (ret)
244 return 1; 245 return TRACE_TYPE_HANDLED;
245 return 0; 246 return TRACE_TYPE_PARTIAL_LINE;
247}
248
249static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
250{
251 struct trace_entry *entry = iter->ent;
252 struct print_entry *print = (struct print_entry *)entry;
253 const char *msg = print->buf;
254 struct trace_seq *s = &iter->seq;
255 unsigned long long t = ns2usecs(iter->ts);
256 unsigned long usec_rem = do_div(t, 1000000ULL);
257 unsigned secs = (unsigned long)t;
258 int ret;
259
260 /* The trailing newline must be in the message. */
261 ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
262 if (!ret)
263 return TRACE_TYPE_PARTIAL_LINE;
264
265 if (entry->flags & TRACE_FLAG_CONT)
266 trace_seq_print_cont(s, iter);
267
268 return TRACE_TYPE_HANDLED;
246} 269}
247 270
248/* return 0 to abort printing without consuming current entry in pipe mode */ 271static enum print_line_t mmio_print_line(struct trace_iterator *iter)
249static int mmio_print_line(struct trace_iterator *iter)
250{ 272{
251 switch (iter->ent->type) { 273 switch (iter->ent->type) {
252 case TRACE_MMIO_RW: 274 case TRACE_MMIO_RW:
253 return mmio_print_rw(iter); 275 return mmio_print_rw(iter);
254 case TRACE_MMIO_MAP: 276 case TRACE_MMIO_MAP:
255 return mmio_print_map(iter); 277 return mmio_print_map(iter);
278 case TRACE_PRINT:
279 return mmio_print_mark(iter);
256 default: 280 default:
257 return 1; /* ignore unknown entries */ 281 return TRACE_TYPE_HANDLED; /* ignore unknown entries */
258 } 282 }
259} 283}
260 284
@@ -263,10 +287,10 @@ static struct tracer mmio_tracer __read_mostly =
263 .name = "mmiotrace", 287 .name = "mmiotrace",
264 .init = mmio_trace_init, 288 .init = mmio_trace_init,
265 .reset = mmio_trace_reset, 289 .reset = mmio_trace_reset,
290 .start = mmio_trace_start,
266 .pipe_open = mmio_pipe_open, 291 .pipe_open = mmio_pipe_open,
267 .close = mmio_close, 292 .close = mmio_close,
268 .read = mmio_read, 293 .read = mmio_read,
269 .ctrl_update = mmio_trace_ctrl_update,
270 .print_line = mmio_print_line, 294 .print_line = mmio_print_line,
271}; 295};
272 296
@@ -276,6 +300,27 @@ __init static int init_mmio_trace(void)
276} 300}
277device_initcall(init_mmio_trace); 301device_initcall(init_mmio_trace);
278 302
303static void __trace_mmiotrace_rw(struct trace_array *tr,
304 struct trace_array_cpu *data,
305 struct mmiotrace_rw *rw)
306{
307 struct ring_buffer_event *event;
308 struct trace_mmiotrace_rw *entry;
309 unsigned long irq_flags;
310
311 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
312 &irq_flags);
313 if (!event)
314 return;
315 entry = ring_buffer_event_data(event);
316 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
317 entry->ent.type = TRACE_MMIO_RW;
318 entry->rw = *rw;
319 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
320
321 trace_wake_up();
322}
323
279void mmio_trace_rw(struct mmiotrace_rw *rw) 324void mmio_trace_rw(struct mmiotrace_rw *rw)
280{ 325{
281 struct trace_array *tr = mmio_trace_array; 326 struct trace_array *tr = mmio_trace_array;
@@ -283,6 +328,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw)
283 __trace_mmiotrace_rw(tr, data, rw); 328 __trace_mmiotrace_rw(tr, data, rw);
284} 329}
285 330
331static void __trace_mmiotrace_map(struct trace_array *tr,
332 struct trace_array_cpu *data,
333 struct mmiotrace_map *map)
334{
335 struct ring_buffer_event *event;
336 struct trace_mmiotrace_map *entry;
337 unsigned long irq_flags;
338
339 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
340 &irq_flags);
341 if (!event)
342 return;
343 entry = ring_buffer_event_data(event);
344 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
345 entry->ent.type = TRACE_MMIO_MAP;
346 entry->map = *map;
347 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
348
349 trace_wake_up();
350}
351
286void mmio_trace_mapping(struct mmiotrace_map *map) 352void mmio_trace_mapping(struct mmiotrace_map *map)
287{ 353{
288 struct trace_array *tr = mmio_trace_array; 354 struct trace_array *tr = mmio_trace_array;
@@ -293,3 +359,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
293 __trace_mmiotrace_map(tr, data, map); 359 __trace_mmiotrace_map(tr, data, map);
294 preempt_enable(); 360 preempt_enable();
295} 361}
362
363int mmio_trace_printk(const char *fmt, va_list args)
364{
365 return trace_vprintk(0, -1, fmt, args);
366}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
new file mode 100644
index 000000000000..b9767acd30ac
--- /dev/null
+++ b/kernel/trace/trace_nop.c
@@ -0,0 +1,105 @@
1/*
2 * nop tracer
3 *
4 * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12
13#include "trace.h"
14
15/* Our two options */
16enum {
17 TRACE_NOP_OPT_ACCEPT = 0x1,
18 TRACE_NOP_OPT_REFUSE = 0x2
19};
20
21/* Options for the tracer (see trace_options file) */
22static struct tracer_opt nop_opts[] = {
23 /* Option that will be accepted by set_flag callback */
24 { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
25 /* Option that will be refused by set_flag callback */
26 { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
27 { } /* Always set a last empty entry */
28};
29
30static struct tracer_flags nop_flags = {
31 /* You can check your flags value here when you want. */
32 .val = 0, /* By default: all flags disabled */
33 .opts = nop_opts
34};
35
36static struct trace_array *ctx_trace;
37
38static void start_nop_trace(struct trace_array *tr)
39{
40 /* Nothing to do! */
41}
42
43static void stop_nop_trace(struct trace_array *tr)
44{
45 /* Nothing to do! */
46}
47
48static int nop_trace_init(struct trace_array *tr)
49{
50 int cpu;
51 ctx_trace = tr;
52
53 for_each_online_cpu(cpu)
54 tracing_reset(tr, cpu);
55
56 start_nop_trace(tr);
57 return 0;
58}
59
60static void nop_trace_reset(struct trace_array *tr)
61{
62 stop_nop_trace(tr);
63}
64
65/* This only serves as an example of a set_flag callback, used to
 66 * accept or refuse the setting of a flag.
67 * If you don't implement it, then the flag setting will be
68 * automatically accepted.
69 */
70static int nop_set_flag(u32 old_flags, u32 bit, int set)
71{
72 /*
73 * Note that you don't need to update nop_flags.val yourself.
74 * The tracing API will do it automatically if you return 0.
75 */
76 if (bit == TRACE_NOP_OPT_ACCEPT) {
77 printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
78 " Now cat trace_options to see the result\n",
79 set);
80 return 0;
81 }
82
83 if (bit == TRACE_NOP_OPT_REFUSE) {
84 printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
85 "Now cat trace_options to see the result\n",
86 set);
87 return -EINVAL;
88 }
89
90 return 0;
91}
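/*
 * Usage sketch (assuming the usual ftrace debugfs layout; paths may vary):
 *
 *	echo nop > /sys/kernel/debug/tracing/current_tracer
 *	echo test_nop_accept > /sys/kernel/debug/tracing/trace_options
 *	echo test_nop_refuse > /sys/kernel/debug/tracing/trace_options   <-- rejected (-EINVAL)
 */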
92
93
94struct tracer nop_trace __read_mostly =
95{
96 .name = "nop",
97 .init = nop_trace_init,
98 .reset = nop_trace_reset,
99#ifdef CONFIG_FTRACE_SELFTEST
100 .selftest = trace_selftest_startup_nop,
101#endif
102 .flags = &nop_flags,
103 .set_flag = nop_set_flag
104};
105
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 000000000000..a7172a352f62
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <linux/ftrace.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19
20static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled;
22
23
24static void start_power_trace(struct trace_array *tr)
25{
26 trace_power_enabled = 1;
27}
28
29static void stop_power_trace(struct trace_array *tr)
30{
31 trace_power_enabled = 0;
32}
33
34
35static int power_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 power_trace = tr;
39
40 trace_power_enabled = 1;
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu);
44 return 0;
45}
46
47static enum print_line_t power_print_line(struct trace_iterator *iter)
48{
49 int ret = 0;
50 struct trace_entry *entry = iter->ent;
51 struct trace_power *field;
52 struct power_trace *it;
53 struct trace_seq *s = &iter->seq;
54 struct timespec stamp;
55 struct timespec duration;
56
57 trace_assign_type(field, entry);
58 it = &field->state_data;
59 stamp = ktime_to_timespec(it->stamp);
60 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
61
62 if (entry->type == TRACE_POWER) {
63 if (it->type == POWER_CSTATE)
64 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
65 stamp.tv_sec,
66 stamp.tv_nsec,
67 it->state, iter->cpu,
68 duration.tv_sec,
69 duration.tv_nsec);
70 if (it->type == POWER_PSTATE)
71 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
72 stamp.tv_sec,
73 stamp.tv_nsec,
74 it->state, iter->cpu);
75 if (!ret)
76 return TRACE_TYPE_PARTIAL_LINE;
77 return TRACE_TYPE_HANDLED;
78 }
79 return TRACE_TYPE_UNHANDLED;
80}
81
82static struct tracer power_tracer __read_mostly =
83{
84 .name = "power",
85 .init = power_trace_init,
86 .start = start_power_trace,
87 .stop = stop_power_trace,
88 .reset = stop_power_trace,
89 .print_line = power_print_line,
90};
91
92static int init_power_trace(void)
93{
94 return register_tracer(&power_tracer);
95}
96device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
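/*
 * Sketch of the intended caller pattern (assumption: idle/cpufreq code
 * calls these around a power-state transition; the hooks are not in
 * this file):
 *
 *	struct power_trace it;
 *
 *	trace_power_start(&it, POWER_CSTATE, target_state);
 *	... enter the C-state ...
 *	trace_power_end(&it);
 */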
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb817a209aa0..df175cb4564f 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -9,25 +9,27 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/marker.h>
13#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
17static struct trace_array *ctx_trace; 17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex);
20 21
21static void 22static void
22sched_switch_func(void *private, void *__rq, struct task_struct *prev, 23probe_sched_switch(struct rq *__rq, struct task_struct *prev,
23 struct task_struct *next) 24 struct task_struct *next)
24{ 25{
25 struct trace_array **ptr = private;
26 struct trace_array *tr = *ptr;
27 struct trace_array_cpu *data; 26 struct trace_array_cpu *data;
28 unsigned long flags; 27 unsigned long flags;
29 long disabled;
30 int cpu; 28 int cpu;
29 int pc;
30
31 if (!sched_ref)
32 return;
31 33
32 tracing_record_cmdline(prev); 34 tracing_record_cmdline(prev);
33 tracing_record_cmdline(next); 35 tracing_record_cmdline(next);
@@ -35,183 +37,95 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
35 if (!tracer_enabled) 37 if (!tracer_enabled)
36 return; 38 return;
37 39
40 pc = preempt_count();
38 local_irq_save(flags); 41 local_irq_save(flags);
39 cpu = raw_smp_processor_id(); 42 cpu = raw_smp_processor_id();
40 data = tr->data[cpu]; 43 data = ctx_trace->data[cpu];
41 disabled = atomic_inc_return(&data->disabled);
42 44
43 if (likely(disabled == 1)) 45 if (likely(!atomic_read(&data->disabled)))
44 tracing_sched_switch_trace(tr, data, prev, next, flags); 46 tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
45 47
46 atomic_dec(&data->disabled);
47 local_irq_restore(flags); 48 local_irq_restore(flags);
48} 49}
49 50
50static notrace void
51sched_switch_callback(void *probe_data, void *call_data,
52 const char *format, va_list *args)
53{
54 struct task_struct *prev;
55 struct task_struct *next;
56 struct rq *__rq;
57
58 if (!atomic_read(&sched_ref))
59 return;
60
61 /* skip prev_pid %d next_pid %d prev_state %ld */
62 (void)va_arg(*args, int);
63 (void)va_arg(*args, int);
64 (void)va_arg(*args, long);
65 __rq = va_arg(*args, typeof(__rq));
66 prev = va_arg(*args, typeof(prev));
67 next = va_arg(*args, typeof(next));
68
69 /*
70 * If tracer_switch_func only points to the local
71 * switch func, it still needs the ptr passed to it.
72 */
73 sched_switch_func(probe_data, __rq, prev, next);
74}
75
76static void 51static void
77wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct 52probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
78 task_struct *curr)
79{ 53{
80 struct trace_array **ptr = private;
81 struct trace_array *tr = *ptr;
82 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
83 unsigned long flags; 55 unsigned long flags;
84 long disabled; 56 int cpu, pc;
85 int cpu;
86 57
87 if (!tracer_enabled) 58 if (!likely(tracer_enabled))
88 return; 59 return;
89 60
90 tracing_record_cmdline(curr); 61 pc = preempt_count();
62 tracing_record_cmdline(current);
91 63
92 local_irq_save(flags); 64 local_irq_save(flags);
93 cpu = raw_smp_processor_id(); 65 cpu = raw_smp_processor_id();
94 data = tr->data[cpu]; 66 data = ctx_trace->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96 67
97 if (likely(disabled == 1)) 68 if (likely(!atomic_read(&data->disabled)))
98 tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); 69 tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
70 flags, pc);
99 71
100 atomic_dec(&data->disabled);
101 local_irq_restore(flags); 72 local_irq_restore(flags);
102} 73}
103 74
104static notrace void
105wake_up_callback(void *probe_data, void *call_data,
106 const char *format, va_list *args)
107{
108 struct task_struct *curr;
109 struct task_struct *task;
110 struct rq *__rq;
111
112 if (likely(!tracer_enabled))
113 return;
114
115 /* Skip pid %d state %ld */
116 (void)va_arg(*args, int);
117 (void)va_arg(*args, long);
118 /* now get the meat: "rq %p task %p rq->curr %p" */
119 __rq = va_arg(*args, typeof(__rq));
120 task = va_arg(*args, typeof(task));
121 curr = va_arg(*args, typeof(curr));
122
123 tracing_record_cmdline(task);
124 tracing_record_cmdline(curr);
125
126 wakeup_func(probe_data, __rq, task, curr);
127}
128
129static void sched_switch_reset(struct trace_array *tr)
130{
131 int cpu;
132
133 tr->time_start = ftrace_now(tr->cpu);
134
135 for_each_online_cpu(cpu)
136 tracing_reset(tr->data[cpu]);
137}
138
139static int tracing_sched_register(void) 75static int tracing_sched_register(void)
140{ 76{
141 int ret; 77 int ret;
142 78
143 ret = marker_probe_register("kernel_sched_wakeup", 79 ret = register_trace_sched_wakeup(probe_sched_wakeup);
144 "pid %d state %ld ## rq %p task %p rq->curr %p",
145 wake_up_callback,
146 &ctx_trace);
147 if (ret) { 80 if (ret) {
148 pr_info("wakeup trace: Couldn't add marker" 81 pr_info("wakeup trace: Couldn't activate tracepoint"
149 " probe to kernel_sched_wakeup\n"); 82 " probe to kernel_sched_wakeup\n");
150 return ret; 83 return ret;
151 } 84 }
152 85
153 ret = marker_probe_register("kernel_sched_wakeup_new", 86 ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
154 "pid %d state %ld ## rq %p task %p rq->curr %p",
155 wake_up_callback,
156 &ctx_trace);
157 if (ret) { 87 if (ret) {
158 pr_info("wakeup trace: Couldn't add marker" 88 pr_info("wakeup trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_wakeup_new\n"); 89 " probe to kernel_sched_wakeup_new\n");
160 goto fail_deprobe; 90 goto fail_deprobe;
161 } 91 }
162 92
163 ret = marker_probe_register("kernel_sched_schedule", 93 ret = register_trace_sched_switch(probe_sched_switch);
164 "prev_pid %d next_pid %d prev_state %ld "
165 "## rq %p prev %p next %p",
166 sched_switch_callback,
167 &ctx_trace);
168 if (ret) { 94 if (ret) {
169 pr_info("sched trace: Couldn't add marker" 95 pr_info("sched trace: Couldn't activate tracepoint"
170 " probe to kernel_sched_schedule\n"); 96 " probe to kernel_sched_schedule\n");
171 goto fail_deprobe_wake_new; 97 goto fail_deprobe_wake_new;
172 } 98 }
173 99
174 return ret; 100 return ret;
175fail_deprobe_wake_new: 101fail_deprobe_wake_new:
176 marker_probe_unregister("kernel_sched_wakeup_new", 102 unregister_trace_sched_wakeup_new(probe_sched_wakeup);
177 wake_up_callback,
178 &ctx_trace);
179fail_deprobe: 103fail_deprobe:
180 marker_probe_unregister("kernel_sched_wakeup", 104 unregister_trace_sched_wakeup(probe_sched_wakeup);
181 wake_up_callback,
182 &ctx_trace);
183 return ret; 105 return ret;
184} 106}
185 107
186static void tracing_sched_unregister(void) 108static void tracing_sched_unregister(void)
187{ 109{
188 marker_probe_unregister("kernel_sched_schedule", 110 unregister_trace_sched_switch(probe_sched_switch);
189 sched_switch_callback, 111 unregister_trace_sched_wakeup_new(probe_sched_wakeup);
190 &ctx_trace); 112 unregister_trace_sched_wakeup(probe_sched_wakeup);
191 marker_probe_unregister("kernel_sched_wakeup_new",
192 wake_up_callback,
193 &ctx_trace);
194 marker_probe_unregister("kernel_sched_wakeup",
195 wake_up_callback,
196 &ctx_trace);
197} 113}
198 114
199static void tracing_start_sched_switch(void) 115static void tracing_start_sched_switch(void)
200{ 116{
201 long ref; 117 mutex_lock(&sched_register_mutex);
202 118 if (!(sched_ref++))
203 ref = atomic_inc_return(&sched_ref);
204 if (ref == 1)
205 tracing_sched_register(); 119 tracing_sched_register();
120 mutex_unlock(&sched_register_mutex);
206} 121}
207 122
208static void tracing_stop_sched_switch(void) 123static void tracing_stop_sched_switch(void)
209{ 124{
210 long ref; 125 mutex_lock(&sched_register_mutex);
211 126 if (!(--sched_ref))
212 ref = atomic_dec_and_test(&sched_ref);
213 if (ref)
214 tracing_sched_unregister(); 127 tracing_sched_unregister();
128 mutex_unlock(&sched_register_mutex);
215} 129}
216 130
217void tracing_start_cmdline_record(void) 131void tracing_start_cmdline_record(void)
@@ -224,40 +138,86 @@ void tracing_stop_cmdline_record(void)
224 tracing_stop_sched_switch(); 138 tracing_stop_sched_switch();
225} 139}
226 140
141/**
142 * tracing_start_sched_switch_record - start tracing context switches
143 *
144 * Turns on context switch tracing for a tracer.
145 */
146void tracing_start_sched_switch_record(void)
147{
148 if (unlikely(!ctx_trace)) {
149 WARN_ON(1);
150 return;
151 }
152
153 tracing_start_sched_switch();
154
155 mutex_lock(&sched_register_mutex);
156 tracer_enabled++;
157 mutex_unlock(&sched_register_mutex);
158}
159
160/**
161 * tracing_stop_sched_switch_record - stop tracing context switches
162 *
163 * Turns off context switch tracing for a tracer.
164 */
165void tracing_stop_sched_switch_record(void)
166{
167 mutex_lock(&sched_register_mutex);
168 tracer_enabled--;
169 WARN_ON(tracer_enabled < 0);
170 mutex_unlock(&sched_register_mutex);
171
172 tracing_stop_sched_switch();
173}
174
175/**
176 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
177 * @tr: trace array pointer to assign
178 *
179 * Some tracers might want to record the context switches in their
180 * trace. This function lets those tracers assign the trace array
181 * to use.
182 */
183void tracing_sched_switch_assign_trace(struct trace_array *tr)
184{
185 ctx_trace = tr;
186}
187
227static void start_sched_trace(struct trace_array *tr) 188static void start_sched_trace(struct trace_array *tr)
228{ 189{
229 sched_switch_reset(tr); 190 tracing_reset_online_cpus(tr);
230 tracing_start_cmdline_record(); 191 tracing_start_sched_switch_record();
231 tracer_enabled = 1;
232} 192}
233 193
234static void stop_sched_trace(struct trace_array *tr) 194static void stop_sched_trace(struct trace_array *tr)
235{ 195{
236 tracer_enabled = 0; 196 tracing_stop_sched_switch_record();
237 tracing_stop_cmdline_record();
238} 197}
239 198
240static void sched_switch_trace_init(struct trace_array *tr) 199static int sched_switch_trace_init(struct trace_array *tr)
241{ 200{
242 ctx_trace = tr; 201 ctx_trace = tr;
243 202 start_sched_trace(tr);
244 if (tr->ctrl) 203 return 0;
245 start_sched_trace(tr);
246} 204}
247 205
248static void sched_switch_trace_reset(struct trace_array *tr) 206static void sched_switch_trace_reset(struct trace_array *tr)
249{ 207{
250 if (tr->ctrl) 208 if (sched_ref)
251 stop_sched_trace(tr); 209 stop_sched_trace(tr);
252} 210}
253 211
254static void sched_switch_trace_ctrl_update(struct trace_array *tr) 212static void sched_switch_trace_start(struct trace_array *tr)
255{ 213{
256 /* When starting a new trace, reset the buffers */ 214 tracing_reset_online_cpus(tr);
257 if (tr->ctrl) 215 tracing_start_sched_switch();
258 start_sched_trace(tr); 216}
259 else 217
260 stop_sched_trace(tr); 218static void sched_switch_trace_stop(struct trace_array *tr)
219{
220 tracing_stop_sched_switch();
261} 221}
262 222
263static struct tracer sched_switch_trace __read_mostly = 223static struct tracer sched_switch_trace __read_mostly =
@@ -265,7 +225,8 @@ static struct tracer sched_switch_trace __read_mostly =
265 .name = "sched_switch", 225 .name = "sched_switch",
266 .init = sched_switch_trace_init, 226 .init = sched_switch_trace_init,
267 .reset = sched_switch_trace_reset, 227 .reset = sched_switch_trace_reset,
268 .ctrl_update = sched_switch_trace_ctrl_update, 228 .start = sched_switch_trace_start,
229 .stop = sched_switch_trace_stop,
269#ifdef CONFIG_FTRACE_SELFTEST 230#ifdef CONFIG_FTRACE_SELFTEST
270 .selftest = trace_selftest_startup_sched_switch, 231 .selftest = trace_selftest_startup_sched_switch,
271#endif 232#endif
@@ -273,14 +234,7 @@ static struct tracer sched_switch_trace __read_mostly =
273 234
274__init static int init_sched_switch_trace(void) 235__init static int init_sched_switch_trace(void)
275{ 236{
276 int ret = 0;
277
278 if (atomic_read(&sched_ref))
279 ret = tracing_sched_register();
280 if (ret) {
281 pr_info("error registering scheduler trace\n");
282 return ret;
283 }
284 return register_tracer(&sched_switch_trace); 237 return register_tracer(&sched_switch_trace);
285} 238}
286device_initcall(init_sched_switch_trace); 239device_initcall(init_sched_switch_trace);
240
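
With ->init() now returning an error code and ->start()/->stop() taking over from the removed ->ctrl_update() callback, a new tracer reduces to the shape sketched below. This is only an illustrative sketch: the "example" name and function names are hypothetical, and only the struct fields and helpers that appear in the hunks above are assumed.

#include "trace.h"

/* Illustrative tracer skeleton (names hypothetical); it only clears the
 * per-CPU buffers on init and leaves start/stop to the core defaults. */
static int example_trace_init(struct trace_array *tr)
{
	tracing_reset_online_cpus(tr);	/* start from empty buffers */
	return 0;			/* a non-zero value would fail registration/selftest */
}

static void example_trace_reset(struct trace_array *tr)
{
	/* nothing to tear down in this sketch */
}

static struct tracer example_trace __read_mostly =
{
	.name	= "example",
	.init	= example_trace_init,
	.reset	= example_trace_reset,
};

__init static int init_example_trace(void)
{
	return register_tracer(&example_trace);
}
device_initcall(init_example_trace);
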
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e303ccb62cdf..43586b689e31 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/marker.h> 18#include <trace/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock =
31 31
32static void __wakeup_reset(struct trace_array *tr); 32static void __wakeup_reset(struct trace_array *tr);
33 33
34#ifdef CONFIG_FTRACE 34#ifdef CONFIG_FUNCTION_TRACER
35/* 35/*
36 * irqsoff uses its own tracer function to keep the overhead down: 36 * irqsoff uses its own tracer function to keep the overhead down:
37 */ 37 */
@@ -44,12 +44,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
44 long disabled; 44 long disabled;
45 int resched; 45 int resched;
46 int cpu; 46 int cpu;
47 int pc;
47 48
48 if (likely(!wakeup_task)) 49 if (likely(!wakeup_task))
49 return; 50 return;
50 51
51 resched = need_resched(); 52 pc = preempt_count();
52 preempt_disable_notrace(); 53 resched = ftrace_preempt_disable();
53 54
54 cpu = raw_smp_processor_id(); 55 cpu = raw_smp_processor_id();
55 data = tr->data[cpu]; 56 data = tr->data[cpu];
@@ -70,7 +71,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
70 if (task_cpu(wakeup_task) != cpu) 71 if (task_cpu(wakeup_task) != cpu)
71 goto unlock; 72 goto unlock;
72 73
73 trace_function(tr, data, ip, parent_ip, flags); 74 trace_function(tr, data, ip, parent_ip, flags, pc);
74 75
75 unlock: 76 unlock:
76 __raw_spin_unlock(&wakeup_lock); 77 __raw_spin_unlock(&wakeup_lock);
@@ -79,22 +80,14 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
79 out: 80 out:
80 atomic_dec(&data->disabled); 81 atomic_dec(&data->disabled);
81 82
82 /* 83 ftrace_preempt_enable(resched);
83 * To prevent recursion from the scheduler, if the
84 * resched flag was set before we entered, then
85 * don't reschedule.
86 */
87 if (resched)
88 preempt_enable_no_resched_notrace();
89 else
90 preempt_enable_notrace();
91} 84}
92 85
93static struct ftrace_ops trace_ops __read_mostly = 86static struct ftrace_ops trace_ops __read_mostly =
94{ 87{
95 .func = wakeup_tracer_call, 88 .func = wakeup_tracer_call,
96}; 89};
97#endif /* CONFIG_FTRACE */ 90#endif /* CONFIG_FUNCTION_TRACER */
98 91
99/* 92/*
100 * Should this new latency be reported/recorded? 93 * Should this new latency be reported/recorded?
@@ -112,17 +105,18 @@ static int report_latency(cycle_t delta)
112} 105}
113 106
114static void notrace 107static void notrace
115wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, 108probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
116 struct task_struct *next) 109 struct task_struct *next)
117{ 110{
118 unsigned long latency = 0, t0 = 0, t1 = 0; 111 unsigned long latency = 0, t0 = 0, t1 = 0;
119 struct trace_array **ptr = private;
120 struct trace_array *tr = *ptr;
121 struct trace_array_cpu *data; 112 struct trace_array_cpu *data;
122 cycle_t T0, T1, delta; 113 cycle_t T0, T1, delta;
123 unsigned long flags; 114 unsigned long flags;
124 long disabled; 115 long disabled;
125 int cpu; 116 int cpu;
117 int pc;
118
119 tracing_record_cmdline(prev);
126 120
127 if (unlikely(!tracer_enabled)) 121 if (unlikely(!tracer_enabled))
128 return; 122 return;
@@ -139,12 +133,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
139 if (next != wakeup_task) 133 if (next != wakeup_task)
140 return; 134 return;
141 135
136 pc = preempt_count();
137
142 /* The task we are waiting for is waking up */ 138 /* The task we are waiting for is waking up */
143 data = tr->data[wakeup_cpu]; 139 data = wakeup_trace->data[wakeup_cpu];
144 140
145 /* disable local data, not wakeup_cpu data */ 141 /* disable local data, not wakeup_cpu data */
146 cpu = raw_smp_processor_id(); 142 cpu = raw_smp_processor_id();
147 disabled = atomic_inc_return(&tr->data[cpu]->disabled); 143 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
148 if (likely(disabled != 1)) 144 if (likely(disabled != 1))
149 goto out; 145 goto out;
150 146
@@ -155,7 +151,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
155 if (unlikely(!tracer_enabled || next != wakeup_task)) 151 if (unlikely(!tracer_enabled || next != wakeup_task))
156 goto out_unlock; 152 goto out_unlock;
157 153
158 trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); 154 trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
159 155
160 /* 156 /*
161 * usecs conversion is slow so we try to delay the conversion 157 * usecs conversion is slow so we try to delay the conversion
@@ -174,39 +170,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
174 t0 = nsecs_to_usecs(T0); 170 t0 = nsecs_to_usecs(T0);
175 t1 = nsecs_to_usecs(T1); 171 t1 = nsecs_to_usecs(T1);
176 172
177 update_max_tr(tr, wakeup_task, wakeup_cpu); 173 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 174
179out_unlock: 175out_unlock:
180 __wakeup_reset(tr); 176 __wakeup_reset(wakeup_trace);
181 __raw_spin_unlock(&wakeup_lock); 177 __raw_spin_unlock(&wakeup_lock);
182 local_irq_restore(flags); 178 local_irq_restore(flags);
183out: 179out:
184 atomic_dec(&tr->data[cpu]->disabled); 180 atomic_dec(&wakeup_trace->data[cpu]->disabled);
185}
186
187static notrace void
188sched_switch_callback(void *probe_data, void *call_data,
189 const char *format, va_list *args)
190{
191 struct task_struct *prev;
192 struct task_struct *next;
193 struct rq *__rq;
194
195 /* skip prev_pid %d next_pid %d prev_state %ld */
196 (void)va_arg(*args, int);
197 (void)va_arg(*args, int);
198 (void)va_arg(*args, long);
199 __rq = va_arg(*args, typeof(__rq));
200 prev = va_arg(*args, typeof(prev));
201 next = va_arg(*args, typeof(next));
202
203 tracing_record_cmdline(prev);
204
205 /*
206 * If tracer_switch_func only points to the local
207 * switch func, it still needs the ptr passed to it.
208 */
209 wakeup_sched_switch(probe_data, __rq, prev, next);
210} 181}
211 182
212static void __wakeup_reset(struct trace_array *tr) 183static void __wakeup_reset(struct trace_array *tr)
@@ -216,7 +187,7 @@ static void __wakeup_reset(struct trace_array *tr)
216 187
217 for_each_possible_cpu(cpu) { 188 for_each_possible_cpu(cpu) {
218 data = tr->data[cpu]; 189 data = tr->data[cpu];
219 tracing_reset(data); 190 tracing_reset(tr, cpu);
220 } 191 }
221 192
222 wakeup_cpu = -1; 193 wakeup_cpu = -1;
@@ -240,19 +211,26 @@ static void wakeup_reset(struct trace_array *tr)
240} 211}
241 212
242static void 213static void
243wakeup_check_start(struct trace_array *tr, struct task_struct *p, 214probe_wakeup(struct rq *rq, struct task_struct *p, int success)
244 struct task_struct *curr)
245{ 215{
246 int cpu = smp_processor_id(); 216 int cpu = smp_processor_id();
247 unsigned long flags; 217 unsigned long flags;
248 long disabled; 218 long disabled;
219 int pc;
220
221 if (likely(!tracer_enabled))
222 return;
223
224 tracing_record_cmdline(p);
225 tracing_record_cmdline(current);
249 226
250 if (likely(!rt_task(p)) || 227 if (likely(!rt_task(p)) ||
251 p->prio >= wakeup_prio || 228 p->prio >= wakeup_prio ||
252 p->prio >= curr->prio) 229 p->prio >= current->prio)
253 return; 230 return;
254 231
255 disabled = atomic_inc_return(&tr->data[cpu]->disabled); 232 pc = preempt_count();
233 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
256 if (unlikely(disabled != 1)) 234 if (unlikely(disabled != 1))
257 goto out; 235 goto out;
258 236
@@ -264,7 +242,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
264 goto out_locked; 242 goto out_locked;
265 243
266 /* reset the trace */ 244 /* reset the trace */
267 __wakeup_reset(tr); 245 __wakeup_reset(wakeup_trace);
268 246
269 wakeup_cpu = task_cpu(p); 247 wakeup_cpu = task_cpu(p);
270 wakeup_prio = p->prio; 248 wakeup_prio = p->prio;
@@ -274,74 +252,43 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
274 252
275 local_save_flags(flags); 253 local_save_flags(flags);
276 254
277 tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); 255 wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
278 trace_function(tr, tr->data[wakeup_cpu], 256 trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
279 CALLER_ADDR1, CALLER_ADDR2, flags); 257 CALLER_ADDR1, CALLER_ADDR2, flags, pc);
280 258
281out_locked: 259out_locked:
282 __raw_spin_unlock(&wakeup_lock); 260 __raw_spin_unlock(&wakeup_lock);
283out: 261out:
284 atomic_dec(&tr->data[cpu]->disabled); 262 atomic_dec(&wakeup_trace->data[cpu]->disabled);
285} 263}
286 264
287static notrace void 265/*
288wake_up_callback(void *probe_data, void *call_data, 266 * save_tracer_enabled is used to save the state of the tracer_enabled
289 const char *format, va_list *args) 267 * variable when we disable it while opening a trace output file.
290{ 268 */
291 struct trace_array **ptr = probe_data; 269static int save_tracer_enabled;
292 struct trace_array *tr = *ptr;
293 struct task_struct *curr;
294 struct task_struct *task;
295 struct rq *__rq;
296
297 if (likely(!tracer_enabled))
298 return;
299
300 /* Skip pid %d state %ld */
301 (void)va_arg(*args, int);
302 (void)va_arg(*args, long);
303 /* now get the meat: "rq %p task %p rq->curr %p" */
304 __rq = va_arg(*args, typeof(__rq));
305 task = va_arg(*args, typeof(task));
306 curr = va_arg(*args, typeof(curr));
307
308 tracing_record_cmdline(task);
309 tracing_record_cmdline(curr);
310
311 wakeup_check_start(tr, task, curr);
312}
313 270
314static void start_wakeup_tracer(struct trace_array *tr) 271static void start_wakeup_tracer(struct trace_array *tr)
315{ 272{
316 int ret; 273 int ret;
317 274
318 ret = marker_probe_register("kernel_sched_wakeup", 275 ret = register_trace_sched_wakeup(probe_wakeup);
319 "pid %d state %ld ## rq %p task %p rq->curr %p",
320 wake_up_callback,
321 &wakeup_trace);
322 if (ret) { 276 if (ret) {
323 pr_info("wakeup trace: Couldn't add marker" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
324 " probe to kernel_sched_wakeup\n"); 278 " probe to kernel_sched_wakeup\n");
325 return; 279 return;
326 } 280 }
327 281
328 ret = marker_probe_register("kernel_sched_wakeup_new", 282 ret = register_trace_sched_wakeup_new(probe_wakeup);
329 "pid %d state %ld ## rq %p task %p rq->curr %p",
330 wake_up_callback,
331 &wakeup_trace);
332 if (ret) { 283 if (ret) {
333 pr_info("wakeup trace: Couldn't add marker" 284 pr_info("wakeup trace: Couldn't activate tracepoint"
334 " probe to kernel_sched_wakeup_new\n"); 285 " probe to kernel_sched_wakeup_new\n");
335 goto fail_deprobe; 286 goto fail_deprobe;
336 } 287 }
337 288
338 ret = marker_probe_register("kernel_sched_schedule", 289 ret = register_trace_sched_switch(probe_wakeup_sched_switch);
339 "prev_pid %d next_pid %d prev_state %ld "
340 "## rq %p prev %p next %p",
341 sched_switch_callback,
342 &wakeup_trace);
343 if (ret) { 290 if (ret) {
344 pr_info("sched trace: Couldn't add marker" 291 pr_info("sched trace: Couldn't activate tracepoint"
345 " probe to kernel_sched_schedule\n"); 292 " probe to kernel_sched_schedule\n");
346 goto fail_deprobe_wake_new; 293 goto fail_deprobe_wake_new;
347 } 294 }
@@ -359,71 +306,71 @@ static void start_wakeup_tracer(struct trace_array *tr)
359 306
360 register_ftrace_function(&trace_ops); 307 register_ftrace_function(&trace_ops);
361 308
362 tracer_enabled = 1; 309 if (tracing_is_enabled()) {
310 tracer_enabled = 1;
311 save_tracer_enabled = 1;
312 } else {
313 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
363 316
364 return; 317 return;
365fail_deprobe_wake_new: 318fail_deprobe_wake_new:
366 marker_probe_unregister("kernel_sched_wakeup_new", 319 unregister_trace_sched_wakeup_new(probe_wakeup);
367 wake_up_callback,
368 &wakeup_trace);
369fail_deprobe: 320fail_deprobe:
370 marker_probe_unregister("kernel_sched_wakeup", 321 unregister_trace_sched_wakeup(probe_wakeup);
371 wake_up_callback,
372 &wakeup_trace);
373} 322}
374 323
375static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
376{ 325{
377 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
378 unregister_ftrace_function(&trace_ops); 328 unregister_ftrace_function(&trace_ops);
379 marker_probe_unregister("kernel_sched_schedule", 329 unregister_trace_sched_switch(probe_wakeup_sched_switch);
380 sched_switch_callback, 330 unregister_trace_sched_wakeup_new(probe_wakeup);
381 &wakeup_trace); 331 unregister_trace_sched_wakeup(probe_wakeup);
382 marker_probe_unregister("kernel_sched_wakeup_new",
383 wake_up_callback,
384 &wakeup_trace);
385 marker_probe_unregister("kernel_sched_wakeup",
386 wake_up_callback,
387 &wakeup_trace);
388} 332}
389 333
390static void wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
391{ 335{
392 wakeup_trace = tr; 336 wakeup_trace = tr;
393 337 start_wakeup_tracer(tr);
394 if (tr->ctrl) 338 return 0;
395 start_wakeup_tracer(tr);
396} 339}
397 340
398static void wakeup_tracer_reset(struct trace_array *tr) 341static void wakeup_tracer_reset(struct trace_array *tr)
399{ 342{
400 if (tr->ctrl) { 343 stop_wakeup_tracer(tr);
401 stop_wakeup_tracer(tr); 344 /* make sure we put back any tasks we are tracing */
402 /* make sure we put back any tasks we are tracing */ 345 wakeup_reset(tr);
403 wakeup_reset(tr);
404 }
405} 346}
406 347
407static void wakeup_tracer_ctrl_update(struct trace_array *tr) 348static void wakeup_tracer_start(struct trace_array *tr)
408{ 349{
409 if (tr->ctrl) 350 wakeup_reset(tr);
410 start_wakeup_tracer(tr); 351 tracer_enabled = 1;
411 else 352 save_tracer_enabled = 1;
412 stop_wakeup_tracer(tr); 353}
354
355static void wakeup_tracer_stop(struct trace_array *tr)
356{
357 tracer_enabled = 0;
358 save_tracer_enabled = 0;
413} 359}
414 360
415static void wakeup_tracer_open(struct trace_iterator *iter) 361static void wakeup_tracer_open(struct trace_iterator *iter)
416{ 362{
417 /* stop the trace while dumping */ 363 /* stop the trace while dumping */
418 if (iter->tr->ctrl) 364 tracer_enabled = 0;
419 stop_wakeup_tracer(iter->tr);
420} 365}
421 366
422static void wakeup_tracer_close(struct trace_iterator *iter) 367static void wakeup_tracer_close(struct trace_iterator *iter)
423{ 368{
424 /* forget about any processes we were recording */ 369 /* forget about any processes we were recording */
425 if (iter->tr->ctrl) 370 if (save_tracer_enabled) {
426 start_wakeup_tracer(iter->tr); 371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
427} 374}
428 375
429static struct tracer wakeup_tracer __read_mostly = 376static struct tracer wakeup_tracer __read_mostly =
@@ -431,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
431 .name = "wakeup", 378 .name = "wakeup",
432 .init = wakeup_tracer_init, 379 .init = wakeup_tracer_init,
433 .reset = wakeup_tracer_reset, 380 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop,
434 .open = wakeup_tracer_open, 383 .open = wakeup_tracer_open,
435 .close = wakeup_tracer_close, 384 .close = wakeup_tracer_close,
436 .ctrl_update = wakeup_tracer_ctrl_update,
437 .print_max = 1, 385 .print_max = 1,
438#ifdef CONFIG_FTRACE_SELFTEST 386#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_wakeup, 387 .selftest = trace_selftest_startup_wakeup,
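
Both the wakeup tracer above and the stack tracer added below route their function-trace callbacks through ftrace_preempt_disable()/ftrace_preempt_enable(), which, judging from the open-coded logic they replace, preserve the "don't reschedule if a resched was already pending" rule. A minimal sketch of that callback shape (names hypothetical):

static void example_tracer_call(unsigned long ip, unsigned long parent_ip)
{
	int resched;

	/* disable preemption and remember whether a resched was already pending */
	resched = ftrace_preempt_disable();

	/* ... record the hit for raw_smp_processor_id() here ... */

	/* re-enable preemption without scheduling if resched was already set */
	ftrace_preempt_enable(resched);
}

static struct ftrace_ops example_trace_ops __read_mostly =
{
	.func = example_tracer_call,
};

The ops would be hooked in with register_ftrace_function(&example_trace_ops) and removed with unregister_ftrace_function(), as the wakeup tracer does above.
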
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0911b7e073bf..88c8eb70f54a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,65 +9,30 @@ static inline int trace_valid_entry(struct trace_entry *entry)
9 case TRACE_FN: 9 case TRACE_FN:
10 case TRACE_CTX: 10 case TRACE_CTX:
11 case TRACE_WAKE: 11 case TRACE_WAKE:
12 case TRACE_CONT:
12 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT:
13 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH:
14 return 1; 17 return 1;
15 } 18 }
16 return 0; 19 return 0;
17} 20}
18 21
19static int 22static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
20trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
21{ 23{
22 struct trace_entry *entries; 24 struct ring_buffer_event *event;
23 struct page *page; 25 struct trace_entry *entry;
24 int idx = 0;
25 int i;
26 26
27 BUG_ON(list_empty(&data->trace_pages)); 27 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
28 page = list_entry(data->trace_pages.next, struct page, lru); 28 entry = ring_buffer_event_data(event);
29 entries = page_address(page);
30 29
31 check_pages(data); 30 if (!trace_valid_entry(entry)) {
32 if (head_page(data) != entries)
33 goto failed;
34
35 /*
36 * The starting trace buffer always has valid elements,
37 * if any element exists.
38 */
39 entries = head_page(data);
40
41 for (i = 0; i < tr->entries; i++) {
42
43 if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
44 printk(KERN_CONT ".. invalid entry %d ", 31 printk(KERN_CONT ".. invalid entry %d ",
45 entries[idx].type); 32 entry->type);
46 goto failed; 33 goto failed;
47 } 34 }
48
49 idx++;
50 if (idx >= ENTRIES_PER_PAGE) {
51 page = virt_to_page(entries);
52 if (page->lru.next == &data->trace_pages) {
53 if (i != tr->entries - 1) {
54 printk(KERN_CONT ".. entries buffer mismatch");
55 goto failed;
56 }
57 } else {
58 page = list_entry(page->lru.next, struct page, lru);
59 entries = page_address(page);
60 }
61 idx = 0;
62 }
63 } 35 }
64
65 page = virt_to_page(entries);
66 if (page->lru.next != &data->trace_pages) {
67 printk(KERN_CONT ".. too many entries");
68 goto failed;
69 }
70
71 return 0; 36 return 0;
72 37
73 failed: 38 failed:
@@ -87,20 +52,18 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
87 int cpu, ret = 0; 52 int cpu, ret = 0;
88 53
89 /* Don't allow flipping of max traces now */ 54 /* Don't allow flipping of max traces now */
90 raw_local_irq_save(flags); 55 local_irq_save(flags);
91 __raw_spin_lock(&ftrace_max_lock); 56 __raw_spin_lock(&ftrace_max_lock);
92 for_each_possible_cpu(cpu) {
93 if (!head_page(tr->data[cpu]))
94 continue;
95 57
96 cnt += tr->data[cpu]->trace_idx; 58 cnt = ring_buffer_entries(tr->buffer);
97 59
98 ret = trace_test_buffer_cpu(tr, tr->data[cpu]); 60 for_each_possible_cpu(cpu) {
61 ret = trace_test_buffer_cpu(tr, cpu);
99 if (ret) 62 if (ret)
100 break; 63 break;
101 } 64 }
102 __raw_spin_unlock(&ftrace_max_lock); 65 __raw_spin_unlock(&ftrace_max_lock);
103 raw_local_irq_restore(flags); 66 local_irq_restore(flags);
104 67
105 if (count) 68 if (count)
106 *count = cnt; 69 *count = cnt;
@@ -108,7 +71,12 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
108 return ret; 71 return ret;
109} 72}
110 73
111#ifdef CONFIG_FTRACE 74static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
75{
76 printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
77 trace->name, init_ret);
78}
79#ifdef CONFIG_FUNCTION_TRACER
112 80
113#ifdef CONFIG_DYNAMIC_FTRACE 81#ifdef CONFIG_DYNAMIC_FTRACE
114 82
@@ -120,11 +88,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
120 struct trace_array *tr, 88 struct trace_array *tr,
121 int (*func)(void)) 89 int (*func)(void))
122{ 90{
123 unsigned long count;
124 int ret;
125 int save_ftrace_enabled = ftrace_enabled; 91 int save_ftrace_enabled = ftrace_enabled;
126 int save_tracer_enabled = tracer_enabled; 92 int save_tracer_enabled = tracer_enabled;
93 unsigned long count;
127 char *func_name; 94 char *func_name;
95 int ret;
128 96
129 /* The ftrace test PASSED */ 97 /* The ftrace test PASSED */
130 printk(KERN_CONT "PASSED\n"); 98 printk(KERN_CONT "PASSED\n");
@@ -137,13 +105,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
137 /* passed in by parameter to fool gcc from optimizing */ 105 /* passed in by parameter to fool gcc from optimizing */
138 func(); 106 func();
139 107
140 /* update the records */
141 ret = ftrace_force_update();
142 if (ret) {
143 printk(KERN_CONT ".. ftraced failed .. ");
144 return ret;
145 }
146
147 /* 108 /*
148 * Some archs *cough*PowerPC*cough* add characters to the 109
149 * start of the function names. We simply put a '*' to 110 * start of the function names. We simply put a '*' to
@@ -155,8 +116,12 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
155 ftrace_set_filter(func_name, strlen(func_name), 1); 116 ftrace_set_filter(func_name, strlen(func_name), 1);
156 117
157 /* enable tracing */ 118 /* enable tracing */
158 tr->ctrl = 1; 119 ret = trace->init(tr);
159 trace->init(tr); 120 if (ret) {
121 warn_failed_init_tracer(trace, ret);
122 goto out;
123 }
124
160 /* Sleep for a 1/10 of a second */ 125 /* Sleep for a 1/10 of a second */
161 msleep(100); 126 msleep(100);
162 127
@@ -178,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
178 msleep(100); 143 msleep(100);
179 144
180 /* stop the tracing. */ 145 /* stop the tracing. */
181 tr->ctrl = 0; 146 tracing_stop();
182 trace->ctrl_update(tr);
183 ftrace_enabled = 0; 147 ftrace_enabled = 0;
184 148
185 /* check the trace buffer */ 149 /* check the trace buffer */
186 ret = trace_test_buffer(tr, &count); 150 ret = trace_test_buffer(tr, &count);
187 trace->reset(tr); 151 trace->reset(tr);
152 tracing_start();
188 153
189 /* we should only have one item */ 154 /* we should only have one item */
190 if (!ret && count != 1) { 155 if (!ret && count != 1) {
@@ -192,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
192 ret = -1; 157 ret = -1;
193 goto out; 158 goto out;
194 } 159 }
160
195 out: 161 out:
196 ftrace_enabled = save_ftrace_enabled; 162 ftrace_enabled = save_ftrace_enabled;
197 tracer_enabled = save_tracer_enabled; 163 tracer_enabled = save_tracer_enabled;
@@ -212,37 +178,34 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
212int 178int
213trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 179trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
214{ 180{
215 unsigned long count;
216 int ret;
217 int save_ftrace_enabled = ftrace_enabled; 181 int save_ftrace_enabled = ftrace_enabled;
218 int save_tracer_enabled = tracer_enabled; 182 int save_tracer_enabled = tracer_enabled;
183 unsigned long count;
184 int ret;
219 185
220 /* make sure msleep has been recorded */ 186 /* make sure msleep has been recorded */
221 msleep(1); 187 msleep(1);
222 188
223 /* force the recorded functions to be traced */
224 ret = ftrace_force_update();
225 if (ret) {
226 printk(KERN_CONT ".. ftraced failed .. ");
227 return ret;
228 }
229
230 /* start the tracing */ 189 /* start the tracing */
231 ftrace_enabled = 1; 190 ftrace_enabled = 1;
232 tracer_enabled = 1; 191 tracer_enabled = 1;
233 192
234 tr->ctrl = 1; 193 ret = trace->init(tr);
235 trace->init(tr); 194 if (ret) {
195 warn_failed_init_tracer(trace, ret);
196 goto out;
197 }
198
236 /* Sleep for a 1/10 of a second */ 199 /* Sleep for a 1/10 of a second */
237 msleep(100); 200 msleep(100);
238 /* stop the tracing. */ 201 /* stop the tracing. */
239 tr->ctrl = 0; 202 tracing_stop();
240 trace->ctrl_update(tr);
241 ftrace_enabled = 0; 203 ftrace_enabled = 0;
242 204
243 /* check the trace buffer */ 205 /* check the trace buffer */
244 ret = trace_test_buffer(tr, &count); 206 ret = trace_test_buffer(tr, &count);
245 trace->reset(tr); 207 trace->reset(tr);
208 tracing_start();
246 209
247 if (!ret && !count) { 210 if (!ret && !count) {
248 printk(KERN_CONT ".. no entries found .."); 211 printk(KERN_CONT ".. no entries found ..");
@@ -263,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
263 226
264 return ret; 227 return ret;
265} 228}
266#endif /* CONFIG_FTRACE */ 229#endif /* CONFIG_FUNCTION_TRACER */
267 230
268#ifdef CONFIG_IRQSOFF_TRACER 231#ifdef CONFIG_IRQSOFF_TRACER
269int 232int
@@ -274,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
274 int ret; 237 int ret;
275 238
276 /* start the tracing */ 239 /* start the tracing */
277 tr->ctrl = 1; 240 ret = trace->init(tr);
278 trace->init(tr); 241 if (ret) {
242 warn_failed_init_tracer(trace, ret);
243 return ret;
244 }
245
279 /* reset the max latency */ 246 /* reset the max latency */
280 tracing_max_latency = 0; 247 tracing_max_latency = 0;
281 /* disable interrupts for a bit */ 248 /* disable interrupts for a bit */
@@ -283,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
283 udelay(100); 250 udelay(100);
284 local_irq_enable(); 251 local_irq_enable();
285 /* stop the tracing. */ 252 /* stop the tracing. */
286 tr->ctrl = 0; 253 tracing_stop();
287 trace->ctrl_update(tr);
288 /* check both trace buffers */ 254 /* check both trace buffers */
289 ret = trace_test_buffer(tr, NULL); 255 ret = trace_test_buffer(tr, NULL);
290 if (!ret) 256 if (!ret)
291 ret = trace_test_buffer(&max_tr, &count); 257 ret = trace_test_buffer(&max_tr, &count);
292 trace->reset(tr); 258 trace->reset(tr);
259 tracing_start();
293 260
294 if (!ret && !count) { 261 if (!ret && !count) {
295 printk(KERN_CONT ".. no entries found .."); 262 printk(KERN_CONT ".. no entries found ..");
@@ -310,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
310 unsigned long count; 277 unsigned long count;
311 int ret; 278 int ret;
312 279
280 /*
281 * Now that the big kernel lock is no longer preemptible,
282 * and this is called with the BKL held, it will always
283 * fail. If preemption is already disabled, simply
284 * pass the test. When the BKL is removed, or becomes
285 * preemptible again, we will once again test this,
286 * so keep it in.
287 */
288 if (preempt_count()) {
289 printk(KERN_CONT "can not test ... force ");
290 return 0;
291 }
292
313 /* start the tracing */ 293 /* start the tracing */
314 tr->ctrl = 1; 294 ret = trace->init(tr);
315 trace->init(tr); 295 if (ret) {
296 warn_failed_init_tracer(trace, ret);
297 return ret;
298 }
299
316 /* reset the max latency */ 300 /* reset the max latency */
317 tracing_max_latency = 0; 301 tracing_max_latency = 0;
318 /* disable preemption for a bit */ 302 /* disable preemption for a bit */
@@ -320,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
320 udelay(100); 304 udelay(100);
321 preempt_enable(); 305 preempt_enable();
322 /* stop the tracing. */ 306 /* stop the tracing. */
323 tr->ctrl = 0; 307 tracing_stop();
324 trace->ctrl_update(tr);
325 /* check both trace buffers */ 308 /* check both trace buffers */
326 ret = trace_test_buffer(tr, NULL); 309 ret = trace_test_buffer(tr, NULL);
327 if (!ret) 310 if (!ret)
328 ret = trace_test_buffer(&max_tr, &count); 311 ret = trace_test_buffer(&max_tr, &count);
329 trace->reset(tr); 312 trace->reset(tr);
313 tracing_start();
330 314
331 if (!ret && !count) { 315 if (!ret && !count) {
332 printk(KERN_CONT ".. no entries found .."); 316 printk(KERN_CONT ".. no entries found ..");
@@ -347,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
347 unsigned long count; 331 unsigned long count;
348 int ret; 332 int ret;
349 333
334 /*
335 * Now that the big kernel lock is no longer preemptible,
336 * and this is called with the BKL held, it will always
337 * fail. If preemption is already disabled, simply
338 * pass the test. When the BKL is removed, or becomes
339 * preemptible again, we will once again test this,
340 * so keep it in.
341 */
342 if (preempt_count()) {
343 printk(KERN_CONT "can not test ... force ");
344 return 0;
345 }
346
350 /* start the tracing */ 347 /* start the tracing */
351 tr->ctrl = 1; 348 ret = trace->init(tr);
352 trace->init(tr); 349 if (ret) {
350 warn_failed_init_tracer(trace, ret);
351 goto out;
352 }
353 353
354 /* reset the max latency */ 354 /* reset the max latency */
355 tracing_max_latency = 0; 355 tracing_max_latency = 0;
@@ -363,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
363 local_irq_enable(); 363 local_irq_enable();
364 364
365 /* stop the tracing. */ 365 /* stop the tracing. */
366 tr->ctrl = 0; 366 tracing_stop();
367 trace->ctrl_update(tr);
368 /* check both trace buffers */ 367 /* check both trace buffers */
369 ret = trace_test_buffer(tr, NULL); 368 ret = trace_test_buffer(tr, NULL);
370 if (ret) 369 if (ret) {
370 tracing_start();
371 goto out; 371 goto out;
372 }
372 373
373 ret = trace_test_buffer(&max_tr, &count); 374 ret = trace_test_buffer(&max_tr, &count);
374 if (ret) 375 if (ret) {
376 tracing_start();
375 goto out; 377 goto out;
378 }
376 379
377 if (!ret && !count) { 380 if (!ret && !count) {
378 printk(KERN_CONT ".. no entries found .."); 381 printk(KERN_CONT ".. no entries found ..");
379 ret = -1; 382 ret = -1;
383 tracing_start();
380 goto out; 384 goto out;
381 } 385 }
382 386
383 /* do the test by disabling interrupts first this time */ 387 /* do the test by disabling interrupts first this time */
384 tracing_max_latency = 0; 388 tracing_max_latency = 0;
385 tr->ctrl = 1; 389 tracing_start();
386 trace->ctrl_update(tr);
387 preempt_disable(); 390 preempt_disable();
388 local_irq_disable(); 391 local_irq_disable();
389 udelay(100); 392 udelay(100);
@@ -392,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
392 local_irq_enable(); 395 local_irq_enable();
393 396
394 /* stop the tracing. */ 397 /* stop the tracing. */
395 tr->ctrl = 0; 398 tracing_stop();
396 trace->ctrl_update(tr);
397 /* check both trace buffers */ 399 /* check both trace buffers */
398 ret = trace_test_buffer(tr, NULL); 400 ret = trace_test_buffer(tr, NULL);
399 if (ret) 401 if (ret)
@@ -409,12 +411,22 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
409 411
410 out: 412 out:
411 trace->reset(tr); 413 trace->reset(tr);
414 tracing_start();
412 tracing_max_latency = save_max; 415 tracing_max_latency = save_max;
413 416
414 return ret; 417 return ret;
415} 418}
416#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ 419#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
417 420
421#ifdef CONFIG_NOP_TRACER
422int
423trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
424{
425 /* What could possibly go wrong? */
426 return 0;
427}
428#endif
429
418#ifdef CONFIG_SCHED_TRACER 430#ifdef CONFIG_SCHED_TRACER
419static int trace_wakeup_test_thread(void *data) 431static int trace_wakeup_test_thread(void *data)
420{ 432{
@@ -465,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
465 wait_for_completion(&isrt); 477 wait_for_completion(&isrt);
466 478
467 /* start the tracing */ 479 /* start the tracing */
468 tr->ctrl = 1; 480 ret = trace->init(tr);
469 trace->init(tr); 481 if (ret) {
482 warn_failed_init_tracer(trace, ret);
483 return ret;
484 }
485
470 /* reset the max latency */ 486 /* reset the max latency */
471 tracing_max_latency = 0; 487 tracing_max_latency = 0;
472 488
@@ -486,9 +502,11 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
486 502
487 wake_up_process(p); 503 wake_up_process(p);
488 504
505 /* give a little time to let the thread wake up */
506 msleep(100);
507
489 /* stop the tracing. */ 508 /* stop the tracing. */
490 tr->ctrl = 0; 509 tracing_stop();
491 trace->ctrl_update(tr);
492 /* check both trace buffers */ 510 /* check both trace buffers */
493 ret = trace_test_buffer(tr, NULL); 511 ret = trace_test_buffer(tr, NULL);
494 if (!ret) 512 if (!ret)
@@ -496,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
496 514
497 515
498 trace->reset(tr); 516 trace->reset(tr);
517 tracing_start();
499 518
500 tracing_max_latency = save_max; 519 tracing_max_latency = save_max;
501 520
@@ -519,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
519 int ret; 538 int ret;
520 539
521 /* start the tracing */ 540 /* start the tracing */
522 tr->ctrl = 1; 541 ret = trace->init(tr);
523 trace->init(tr); 542 if (ret) {
543 warn_failed_init_tracer(trace, ret);
544 return ret;
545 }
546
524 /* Sleep for a 1/10 of a second */ 547 /* Sleep for a 1/10 of a second */
525 msleep(100); 548 msleep(100);
526 /* stop the tracing. */ 549 /* stop the tracing. */
527 tr->ctrl = 0; 550 tracing_stop();
528 trace->ctrl_update(tr);
529 /* check the trace buffer */ 551 /* check the trace buffer */
530 ret = trace_test_buffer(tr, &count); 552 ret = trace_test_buffer(tr, &count);
531 trace->reset(tr); 553 trace->reset(tr);
554 tracing_start();
532 555
533 if (!ret && !count) { 556 if (!ret && !count) {
534 printk(KERN_CONT ".. no entries found .."); 557 printk(KERN_CONT ".. no entries found ..");
@@ -547,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
547 int ret; 570 int ret;
548 571
549 /* start the tracing */ 572 /* start the tracing */
550 tr->ctrl = 1; 573 ret = trace->init(tr);
551 trace->init(tr); 574 if (ret) {
575 warn_failed_init_tracer(trace, ret);
576 return 0;
577 }
578
552 /* Sleep for a 1/10 of a second */ 579 /* Sleep for a 1/10 of a second */
553 msleep(100); 580 msleep(100);
554 /* stop the tracing. */ 581 /* stop the tracing. */
555 tr->ctrl = 0; 582 tracing_stop();
556 trace->ctrl_update(tr);
557 /* check the trace buffer */ 583 /* check the trace buffer */
558 ret = trace_test_buffer(tr, &count); 584 ret = trace_test_buffer(tr, &count);
559 trace->reset(tr); 585 trace->reset(tr);
586 tracing_start();
560 587
561 return ret; 588 return ret;
562} 589}
563#endif /* CONFIG_SYSPROF_TRACER */ 590#endif /* CONFIG_SYSPROF_TRACER */
591
592#ifdef CONFIG_BRANCH_TRACER
593int
594trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
595{
596 unsigned long count;
597 int ret;
598
599 /* start the tracing */
600 ret = trace->init(tr);
601 if (ret) {
602 warn_failed_init_tracer(trace, ret);
603 return ret;
604 }
605
606 /* Sleep for a 1/10 of a second */
607 msleep(100);
608 /* stop the tracing. */
609 tracing_stop();
610 /* check the trace buffer */
611 ret = trace_test_buffer(tr, &count);
612 trace->reset(tr);
613 tracing_start();
614
615 return ret;
616}
617#endif /* CONFIG_BRANCH_TRACER */
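
After the conversion, every startup selftest in this file follows the same pattern: initialize the tracer, let it run briefly, freeze the buffers with tracing_stop() while they are checked, then restart tracing. A condensed sketch of that pattern, using only calls visible in the hunks above (the workload is just the msleep() the real tests use, and the function name is hypothetical):

static int example_startup_selftest(struct tracer *trace, struct trace_array *tr)
{
	unsigned long count;
	int ret;

	/* start the tracing; ->init() now reports failures */
	ret = trace->init(tr);
	if (ret) {
		warn_failed_init_tracer(trace, ret);
		return ret;
	}

	/* give the tracer something to record */
	msleep(100);

	/* stop the tracing while the buffer contents are verified */
	tracing_stop();
	ret = trace_test_buffer(tr, &count);
	trace->reset(tr);
	tracing_start();

	if (!ret && !count) {
		printk(KERN_CONT ".. no entries found ..");
		ret = -1;
	}
	return ret;
}
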
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
new file mode 100644
index 000000000000..d0871bc0aca5
--- /dev/null
+++ b/kernel/trace/trace_stack.c
@@ -0,0 +1,360 @@
1/*
2 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
3 *
4 */
5#include <linux/stacktrace.h>
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/uaccess.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/module.h>
13#include <linux/sysctl.h>
14#include <linux/init.h>
15#include <linux/fs.h>
16#include "trace.h"
17
18#define STACK_TRACE_ENTRIES 500
19
20static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
21 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
22static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
23
24static struct stack_trace max_stack_trace = {
25 .max_entries = STACK_TRACE_ENTRIES,
26 .entries = stack_dump_trace,
27};
28
29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
32
33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active);
35static DEFINE_MUTEX(stack_sysctl_mutex);
36
37int stack_tracer_enabled;
38static int last_stack_tracer_enabled;
39
40static inline void check_stack(void)
41{
42 unsigned long this_size, flags;
43 unsigned long *p, *top, *start;
44 int i;
45
46 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
47 this_size = THREAD_SIZE - this_size;
48
49 if (this_size <= max_stack_size)
50 return;
51
52 /* we do not handle interrupt stacks yet */
53 if (!object_is_on_stack(&this_size))
54 return;
55
56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock);
58
59 /* a race could have already updated it */
60 if (this_size <= max_stack_size)
61 goto out;
62
63 max_stack_size = this_size;
64
65 max_stack_trace.nr_entries = 0;
66 max_stack_trace.skip = 3;
67
68 save_stack_trace(&max_stack_trace);
69
70 /*
71 * Now find where in the stack these are.
72 */
73 i = 0;
74 start = &this_size;
75 top = (unsigned long *)
76 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
77
78 /*
79 * Loop through all the entries. Some of the entries may
80 * for some reason be missing from the stack, so we may
81 * have to account for that. If they are all there, this
82 * loop will only happen once. This code only takes place
83 * on a new max, so it is far from a fast path.
84 */
85 while (i < max_stack_trace.nr_entries) {
86 int found = 0;
87
88 stack_dump_index[i] = this_size;
89 p = start;
90
91 for (; p < top && i < max_stack_trace.nr_entries; p++) {
92 if (*p == stack_dump_trace[i]) {
93 this_size = stack_dump_index[i++] =
94 (top - p) * sizeof(unsigned long);
95 found = 1;
96 /* Start the search from here */
97 start = p + 1;
98 }
99 }
100
101 if (!found)
102 i++;
103 }
104
105 out:
106 __raw_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags);
108}
109
110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{
113 int cpu, resched;
114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return;
117
118 resched = ftrace_preempt_disable();
119
120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */
122 if (per_cpu(trace_active, cpu)++ != 0)
123 goto out;
124
125 check_stack();
126
127 out:
128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched);
131}
132
133static struct ftrace_ops trace_ops __read_mostly =
134{
135 .func = stack_trace_call,
136};
137
138static ssize_t
139stack_max_size_read(struct file *filp, char __user *ubuf,
140 size_t count, loff_t *ppos)
141{
142 unsigned long *ptr = filp->private_data;
143 char buf[64];
144 int r;
145
146 r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
147 if (r > sizeof(buf))
148 r = sizeof(buf);
149 return simple_read_from_buffer(ubuf, count, ppos, buf, r);
150}
151
152static ssize_t
153stack_max_size_write(struct file *filp, const char __user *ubuf,
154 size_t count, loff_t *ppos)
155{
156 long *ptr = filp->private_data;
157 unsigned long val, flags;
158 char buf[64];
159 int ret;
160
161 if (count >= sizeof(buf))
162 return -EINVAL;
163
164 if (copy_from_user(&buf, ubuf, count))
165 return -EFAULT;
166
167 buf[count] = 0;
168
169 ret = strict_strtoul(buf, 10, &val);
170 if (ret < 0)
171 return ret;
172
173 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock);
175 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock);
177 local_irq_restore(flags);
178
179 return count;
180}
181
182static const struct file_operations stack_max_size_fops = {
183 .open = tracing_open_generic,
184 .read = stack_max_size_read,
185 .write = stack_max_size_write,
186};
187
188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos)
190{
191 long i;
192
193 (*pos)++;
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201
202 if (i >= max_stack_trace.nr_entries ||
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL;
205
206 m->private = (void *)i;
207
208 return &m->private;
209}
210
211static void *t_start(struct seq_file *m, loff_t *pos)
212{
213 void *t = SEQ_START_TOKEN;
214 loff_t l = 0;
215
216 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock);
218
219 if (*pos == 0)
220 return SEQ_START_TOKEN;
221
222 for (; t && l < *pos; t = t_next(m, t, &l))
223 ;
224
225 return t;
226}
227
228static void t_stop(struct seq_file *m, void *p)
229{
230 __raw_spin_unlock(&max_stack_lock);
231 local_irq_enable();
232}
233
234static int trace_lookup_stack(struct seq_file *m, long i)
235{
236 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239
240 sprint_symbol(str, addr);
241
242 return seq_printf(m, "%s\n", str);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246}
247
248static int t_show(struct seq_file *m, void *v)
249{
250 long i;
251 int size;
252
253 if (v == SEQ_START_TOKEN) {
254 seq_printf(m, " Depth Size Location"
255 " (%d entries)\n"
256 " ----- ---- --------\n",
257 max_stack_trace.nr_entries);
258 return 0;
259 }
260
261 i = *(long *)v;
262
263 if (i >= max_stack_trace.nr_entries ||
264 stack_dump_trace[i] == ULONG_MAX)
265 return 0;
266
267 if (i+1 == max_stack_trace.nr_entries ||
268 stack_dump_trace[i+1] == ULONG_MAX)
269 size = stack_dump_index[i];
270 else
271 size = stack_dump_index[i] - stack_dump_index[i+1];
272
273 seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
274
275 trace_lookup_stack(m, i);
276
277 return 0;
278}
279
280static const struct seq_operations stack_trace_seq_ops = {
281 .start = t_start,
282 .next = t_next,
283 .stop = t_stop,
284 .show = t_show,
285};
286
287static int stack_trace_open(struct inode *inode, struct file *file)
288{
289 int ret;
290
291 ret = seq_open(file, &stack_trace_seq_ops);
292
293 return ret;
294}
295
296static const struct file_operations stack_trace_fops = {
297 .open = stack_trace_open,
298 .read = seq_read,
299 .llseek = seq_lseek,
300};
301
302int
303stack_trace_sysctl(struct ctl_table *table, int write,
304 struct file *file, void __user *buffer, size_t *lenp,
305 loff_t *ppos)
306{
307 int ret;
308
309 mutex_lock(&stack_sysctl_mutex);
310
311 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
312
313 if (ret || !write ||
314 (last_stack_tracer_enabled == stack_tracer_enabled))
315 goto out;
316
317 last_stack_tracer_enabled = stack_tracer_enabled;
318
319 if (stack_tracer_enabled)
320 register_ftrace_function(&trace_ops);
321 else
322 unregister_ftrace_function(&trace_ops);
323
324 out:
325 mutex_unlock(&stack_sysctl_mutex);
326 return ret;
327}
328
329static __init int enable_stacktrace(char *str)
330{
331 stack_tracer_enabled = 1;
332 last_stack_tracer_enabled = 1;
333 return 1;
334}
335__setup("stacktrace", enable_stacktrace);
336
337static __init int stack_trace_init(void)
338{
339 struct dentry *d_tracer;
340 struct dentry *entry;
341
342 d_tracer = tracing_init_dentry();
343
344 entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
345 &max_stack_size, &stack_max_size_fops);
346 if (!entry)
347 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
348
349 entry = debugfs_create_file("stack_trace", 0444, d_tracer,
350 NULL, &stack_trace_fops);
351 if (!entry)
352 pr_warning("Could not create debugfs 'stack_trace' entry\n");
353
354 if (stack_tracer_enabled)
355 register_ftrace_function(&trace_ops);
356
357 return 0;
358}
359
360device_initcall(stack_trace_init);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index db58fb66a135..a5779bd975db 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,6 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 205
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 207}
@@ -234,20 +233,10 @@ static void stop_stack_timers(void)
234 stop_stack_timer(cpu); 233 stop_stack_timer(cpu);
235} 234}
236 235
237static void stack_reset(struct trace_array *tr)
238{
239 int cpu;
240
241 tr->time_start = ftrace_now(tr->cpu);
242
243 for_each_online_cpu(cpu)
244 tracing_reset(tr->data[cpu]);
245}
246
247static void start_stack_trace(struct trace_array *tr) 236static void start_stack_trace(struct trace_array *tr)
248{ 237{
249 mutex_lock(&sample_timer_lock); 238 mutex_lock(&sample_timer_lock);
250 stack_reset(tr); 239 tracing_reset_online_cpus(tr);
251 start_stack_timers(); 240 start_stack_timers();
252 tracer_enabled = 1; 241 tracer_enabled = 1;
253 mutex_unlock(&sample_timer_lock); 242 mutex_unlock(&sample_timer_lock);
@@ -261,27 +250,17 @@ static void stop_stack_trace(struct trace_array *tr)
261 mutex_unlock(&sample_timer_lock); 250 mutex_unlock(&sample_timer_lock);
262} 251}
263 252
264static void stack_trace_init(struct trace_array *tr) 253static int stack_trace_init(struct trace_array *tr)
265{ 254{
266 sysprof_trace = tr; 255 sysprof_trace = tr;
267 256
268 if (tr->ctrl) 257 start_stack_trace(tr);
269 start_stack_trace(tr); 258 return 0;
270} 259}
271 260
272static void stack_trace_reset(struct trace_array *tr) 261static void stack_trace_reset(struct trace_array *tr)
273{ 262{
274 if (tr->ctrl) 263 stop_stack_trace(tr);
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285} 264}
286 265
287static struct tracer stack_trace __read_mostly = 266static struct tracer stack_trace __read_mostly =
@@ -289,7 +268,6 @@ static struct tracer stack_trace __read_mostly =
289 .name = "sysprof", 268 .name = "sysprof",
290 .init = stack_trace_init, 269 .init = stack_trace_init,
291 .reset = stack_trace_reset, 270 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST 271#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof, 272 .selftest = trace_selftest_startup_sysprof,
295#endif 273#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 000000000000..79602740bbb5
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,576 @@
1/*
2 * Copyright (C) 2008 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/tracepoint.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[];
30
31/* Set to 1 to enable tracepoint debug output */
32static const int tracepoint_debug;
33
34/*
35 * tracepoints_mutex nests inside module_mutex. The tracepoints mutex protects
36 * the built-in and module tracepoints and the hash table.
37 */
38static DEFINE_MUTEX(tracepoints_mutex);
39
40/*
41 * Tracepoint hash table, containing the active tracepoints.
42 * Protected by tracepoints_mutex.
43 */
44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
47
48/*
49 * Note about RCU:
50 * It is used to delay the freeing of multiple probe arrays until a quiescent
51 * state is reached.
52 * Tracepoint entries modifications are protected by the tracepoints_mutex.
53 */
54struct tracepoint_entry {
55 struct hlist_node hlist;
56 void **funcs;
57 int refcount; /* Number of times armed. 0 if disarmed. */
58 char name[0];
59};
60
61struct tp_probes {
62 union {
63 struct rcu_head rcu;
64 struct list_head list;
65 } u;
66 void *probes[0];
67};
68
69static inline void *allocate_probes(int count)
70{
71 struct tp_probes *p = kmalloc(count * sizeof(void *)
72 + sizeof(struct tp_probes), GFP_KERNEL);
73 return p == NULL ? NULL : p->probes;
74}
75
76static void rcu_free_old_probes(struct rcu_head *head)
77{
78 kfree(container_of(head, struct tp_probes, u.rcu));
79}
80
81static inline void release_probes(void *old)
82{
83 if (old) {
84 struct tp_probes *tp_probes = container_of(old,
85 struct tp_probes, probes[0]);
86 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
87 }
88}
89
90static void debug_print_probes(struct tracepoint_entry *entry)
91{
92 int i;
93
94 if (!tracepoint_debug || !entry->funcs)
95 return;
96
97 for (i = 0; entry->funcs[i]; i++)
98 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
99}
100
101static void *
102tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
103{
104 int nr_probes = 0;
105 void **old, **new;
106
107 WARN_ON(!probe);
108
109 debug_print_probes(entry);
110 old = entry->funcs;
111 if (old) {
112 /* (N -> N+1), (N != 0, 1) probes */
113 for (nr_probes = 0; old[nr_probes]; nr_probes++)
114 if (old[nr_probes] == probe)
115 return ERR_PTR(-EEXIST);
116 }
117 /* + 2 : one for new probe, one for NULL func */
118 new = allocate_probes(nr_probes + 2);
119 if (new == NULL)
120 return ERR_PTR(-ENOMEM);
121 if (old)
122 memcpy(new, old, nr_probes * sizeof(void *));
123 new[nr_probes] = probe;
124 new[nr_probes + 1] = NULL;
125 entry->refcount = nr_probes + 1;
126 entry->funcs = new;
127 debug_print_probes(entry);
128 return old;
129}
130
131static void *
132tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
133{
134 int nr_probes = 0, nr_del = 0, i;
135 void **old, **new;
136
137 old = entry->funcs;
138
139 if (!old)
140 return ERR_PTR(-ENOENT);
141
142 debug_print_probes(entry);
143 /* (N -> M), (N > 1, M >= 0) probes */
144 for (nr_probes = 0; old[nr_probes]; nr_probes++) {
145 if ((!probe || old[nr_probes] == probe))
146 nr_del++;
147 }
148
149 if (nr_probes - nr_del == 0) {
150 /* N -> 0, (N > 1) */
151 entry->funcs = NULL;
152 entry->refcount = 0;
153 debug_print_probes(entry);
154 return old;
155 } else {
156 int j = 0;
157 /* N -> M, (N > 1, M > 0) */
158 /* + 1 for NULL */
159 new = allocate_probes(nr_probes - nr_del + 1);
160 if (new == NULL)
161 return ERR_PTR(-ENOMEM);
162 for (i = 0; old[i]; i++)
163 if ((probe && old[i] != probe))
164 new[j++] = old[i];
165 new[nr_probes - nr_del] = NULL;
166 entry->refcount = nr_probes - nr_del;
167 entry->funcs = new;
168 }
169 debug_print_probes(entry);
170 return old;
171}
172
173/*
174 * Get tracepoint if the tracepoint is present in the tracepoint hash table.
175 * Must be called with tracepoints_mutex held.
176 * Returns NULL if not present.
177 */
178static struct tracepoint_entry *get_tracepoint(const char *name)
179{
180 struct hlist_head *head;
181 struct hlist_node *node;
182 struct tracepoint_entry *e;
183 u32 hash = jhash(name, strlen(name), 0);
184
185 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
186 hlist_for_each_entry(e, node, head, hlist) {
187 if (!strcmp(name, e->name))
188 return e;
189 }
190 return NULL;
191}
192
193/*
194 * Add the tracepoint to the tracepoint hash table. Must be called with
195 * tracepoints_mutex held.
196 */
197static struct tracepoint_entry *add_tracepoint(const char *name)
198{
199 struct hlist_head *head;
200 struct hlist_node *node;
201 struct tracepoint_entry *e;
202 size_t name_len = strlen(name) + 1;
203 u32 hash = jhash(name, name_len-1, 0);
204
205 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
206 hlist_for_each_entry(e, node, head, hlist) {
207 if (!strcmp(name, e->name)) {
208 printk(KERN_NOTICE
209 "tracepoint %s busy\n", name);
210 return ERR_PTR(-EEXIST); /* Already there */
211 }
212 }
213 /*
214 * Using kmalloc here to allocate a variable length element. Could
215 * cause some memory fragmentation if overused.
216 */
217 e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
218 if (!e)
219 return ERR_PTR(-ENOMEM);
220 memcpy(&e->name[0], name, name_len);
221 e->funcs = NULL;
222 e->refcount = 0;
223 hlist_add_head(&e->hlist, head);
224 return e;
225}
226
227/*
228 * Remove the tracepoint from the tracepoint hash table. Must be called with
229 * mutex_lock held.
230 */
231static inline void remove_tracepoint(struct tracepoint_entry *e)
232{
233 hlist_del(&e->hlist);
234 kfree(e);
235}
236
237/*
238 * Sets the probe callback corresponding to one tracepoint.
239 */
240static void set_tracepoint(struct tracepoint_entry **entry,
241 struct tracepoint *elem, int active)
242{
243 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
244
245 /*
246 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
247 * probe callbacks array is consistent before setting a pointer to it.
248 * This array is referenced by __DO_TRACE from
249 * include/linux/tracepoint.h. A matching smp_read_barrier_depends()
250 * is used.
251 */
252 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
253 elem->state = active;
254}
255
256/*
257 * Disable a tracepoint and its probe callback.
258 * Note: waiting for just one RCU grace period after setting elem->call to the
259 * empty function ensures that the original callback is not used anymore; this
260 * is guaranteed by the preempt_disable around the call site.
261 */
262static void disable_tracepoint(struct tracepoint *elem)
263{
264 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL);
266}
267
268/**
269 * tracepoint_update_probe_range - Update a probe range
270 * @begin: beginning of the range
271 * @end: end of the range
272 *
273 * Updates the probe callback corresponding to a range of tracepoints.
274 */
275void tracepoint_update_probe_range(struct tracepoint *begin,
276 struct tracepoint *end)
277{
278 struct tracepoint *iter;
279 struct tracepoint_entry *mark_entry;
280
281 mutex_lock(&tracepoints_mutex);
282 for (iter = begin; iter < end; iter++) {
283 mark_entry = get_tracepoint(iter->name);
284 if (mark_entry) {
285 set_tracepoint(&mark_entry, iter,
286 !!mark_entry->refcount);
287 } else {
288 disable_tracepoint(iter);
289 }
290 }
291 mutex_unlock(&tracepoints_mutex);
292}
293
294/*
295 * Update probes, removing the faulty probes.
296 */
297static void tracepoint_update_probes(void)
298{
299 /* Core kernel tracepoints */
300 tracepoint_update_probe_range(__start___tracepoints,
301 __stop___tracepoints);
302 /* tracepoints in modules. */
303 module_update_tracepoints();
304}
305
306static void *tracepoint_add_probe(const char *name, void *probe)
307{
308 struct tracepoint_entry *entry;
309 void *old;
310
311 entry = get_tracepoint(name);
312 if (!entry) {
313 entry = add_tracepoint(name);
314 if (IS_ERR(entry))
315 return entry;
316 }
317 old = tracepoint_entry_add_probe(entry, probe);
318 if (IS_ERR(old) && !entry->refcount)
319 remove_tracepoint(entry);
320 return old;
321}
322
323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name
326 * @probe: probe handler
327 *
328 * Returns 0 if ok, error value on error.
329 * The probe address must at least be aligned on the architecture pointer size.
330 */
331int tracepoint_probe_register(const char *name, void *probe)
332{
333 void *old;
334
335 mutex_lock(&tracepoints_mutex);
336 old = tracepoint_add_probe(name, probe);
337 mutex_unlock(&tracepoints_mutex);
338 if (IS_ERR(old))
339 return PTR_ERR(old);
340
341 tracepoint_update_probes(); /* may update entry */
342 release_probes(old);
343 return 0;
344}
345EXPORT_SYMBOL_GPL(tracepoint_probe_register);
346
347static void *tracepoint_remove_probe(const char *name, void *probe)
348{
349 struct tracepoint_entry *entry;
350 void *old;
351
352 entry = get_tracepoint(name);
353 if (!entry)
354 return ERR_PTR(-ENOENT);
355 old = tracepoint_entry_remove_probe(entry, probe);
356 if (IS_ERR(old))
357 return old;
358 if (!entry->refcount)
359 remove_tracepoint(entry);
360 return old;
361}
362
363/**
364 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
365 * @name: tracepoint name
366 * @probe: probe function pointer
367 *
368 * We do not need to call a synchronize_sched to make sure the probes have
369 * finished running before doing a module unload, because the module unload
370 * itself uses stop_machine(), which ensures that every preempt-disabled section
371 * has finished.
372 */
373int tracepoint_probe_unregister(const char *name, void *probe)
374{
375 void *old;
376
377 mutex_lock(&tracepoints_mutex);
378 old = tracepoint_remove_probe(name, probe);
379 mutex_unlock(&tracepoints_mutex);
380 if (IS_ERR(old))
381 return PTR_ERR(old);
382
383 tracepoint_update_probes(); /* may update entry */
384 release_probes(old);
385 return 0;
386}
387EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
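The matching teardown, sketched below under the same assumptions as above, pairs the unregister call with tracepoint_synchronize_unregister() so that any probe invocation still in flight completes before probe-private data is freed.

static void __exit probe_exit(void)
{
	tracepoint_probe_unregister("subsys_eventname",
				    (void *)probe_subsys_eventname);
	/* Wait for in-flight probe calls before tearing anything down. */
	tracepoint_synchronize_unregister();
}
module_exit(probe_exit);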
388
389static LIST_HEAD(old_probes);
390static int need_update;
391
392static void tracepoint_add_old_probes(void *old)
393{
394 need_update = 1;
395 if (old) {
396 struct tp_probes *tp_probes = container_of(old,
397 struct tp_probes, probes[0]);
398 list_add(&tp_probes->u.list, &old_probes);
399 }
400}
401
402/**
 403 * tracepoint_probe_register_noupdate - register a probe but do not connect it
404 * @name: tracepoint name
405 * @probe: probe handler
406 *
 407 * The caller must then call tracepoint_probe_update_all() to connect it.
408 */
409int tracepoint_probe_register_noupdate(const char *name, void *probe)
410{
411 void *old;
412
413 mutex_lock(&tracepoints_mutex);
414 old = tracepoint_add_probe(name, probe);
415 if (IS_ERR(old)) {
416 mutex_unlock(&tracepoints_mutex);
417 return PTR_ERR(old);
418 }
419 tracepoint_add_old_probes(old);
420 mutex_unlock(&tracepoints_mutex);
421 return 0;
422}
423EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
424
425/**
 426 * tracepoint_probe_unregister_noupdate - remove a probe but do not disconnect it
427 * @name: tracepoint name
428 * @probe: probe function pointer
429 *
 430 * The caller must then call tracepoint_probe_update_all() to disconnect it.
431 */
432int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
433{
434 void *old;
435
436 mutex_lock(&tracepoints_mutex);
437 old = tracepoint_remove_probe(name, probe);
438 if (IS_ERR(old)) {
439 mutex_unlock(&tracepoints_mutex);
440 return PTR_ERR(old);
441 }
442 tracepoint_add_old_probes(old);
443 mutex_unlock(&tracepoints_mutex);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
447
448/**
449 * tracepoint_probe_update_all - update tracepoints
450 */
451void tracepoint_probe_update_all(void)
452{
453 LIST_HEAD(release_probes);
454 struct tp_probes *pos, *next;
455
456 mutex_lock(&tracepoints_mutex);
457 if (!need_update) {
458 mutex_unlock(&tracepoints_mutex);
459 return;
460 }
461 if (!list_empty(&old_probes))
462 list_replace_init(&old_probes, &release_probes);
463 need_update = 0;
464 mutex_unlock(&tracepoints_mutex);
465
466 tracepoint_update_probes();
467 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
468 list_del(&pos->u.list);
469 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
470 }
471}
472EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
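The *_noupdate variants exist so that a caller changing many probes can defer the relatively expensive tracepoint-site update and the RCU batching to a single call. A hypothetical sketch (the names and probe functions are placeholders):

/* Queue several probe changes without touching any tracepoint site... */
tracepoint_probe_register_noupdate("subsys_event_a", (void *)probe_a);
tracepoint_probe_register_noupdate("subsys_event_b", (void *)probe_b);
tracepoint_probe_unregister_noupdate("subsys_event_c", (void *)probe_c);

/* ...then update every site and release the old probe arrays in one pass. */
tracepoint_probe_update_all();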
473
474/**
 475 * tracepoint_get_iter_range - Get the next tracepoint in a given range.
 476 * @tracepoint: current tracepoint (in), next tracepoint (out)
477 * @begin: beginning of the range
478 * @end: end of the range
479 *
480 * Returns whether a next tracepoint has been found (1) or not (0).
481 * Will return the first tracepoint in the range if the input tracepoint is
482 * NULL.
483 */
484int tracepoint_get_iter_range(struct tracepoint **tracepoint,
485 struct tracepoint *begin, struct tracepoint *end)
486{
487 if (!*tracepoint && begin != end) {
488 *tracepoint = begin;
489 return 1;
490 }
491 if (*tracepoint >= begin && *tracepoint < end)
492 return 1;
493 return 0;
494}
495EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
496
497static void tracepoint_get_iter(struct tracepoint_iter *iter)
498{
499 int found = 0;
500
501 /* Core kernel tracepoints */
502 if (!iter->module) {
503 found = tracepoint_get_iter_range(&iter->tracepoint,
504 __start___tracepoints, __stop___tracepoints);
505 if (found)
506 goto end;
507 }
508 /* tracepoints in modules. */
509 found = module_get_iter_tracepoints(iter);
510end:
511 if (!found)
512 tracepoint_iter_reset(iter);
513}
514
515void tracepoint_iter_start(struct tracepoint_iter *iter)
516{
517 tracepoint_get_iter(iter);
518}
519EXPORT_SYMBOL_GPL(tracepoint_iter_start);
520
521void tracepoint_iter_next(struct tracepoint_iter *iter)
522{
523 iter->tracepoint++;
524 /*
525 * iter->tracepoint may be invalid because we blindly incremented it.
 526 * Make sure it is valid by checking it against the known tracepoint
 527 * ranges, moving on to the following modules' tracepoints if necessary.
528 */
529 tracepoint_get_iter(iter);
530}
531EXPORT_SYMBOL_GPL(tracepoint_iter_next);
532
533void tracepoint_iter_stop(struct tracepoint_iter *iter)
534{
535}
536EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
537
538void tracepoint_iter_reset(struct tracepoint_iter *iter)
539{
540 iter->module = NULL;
541 iter->tracepoint = NULL;
542}
543EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
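Taken together, the iterator functions support a walk over every core and module tracepoint, for example to back a debugfs listing. A hedged sketch:

static void list_all_tracepoints(void)
{
	struct tracepoint_iter iter;

	tracepoint_iter_reset(&iter);
	tracepoint_iter_start(&iter);
	while (iter.tracepoint) {
		printk(KERN_INFO "%s (%s)\n", iter.tracepoint->name,
		       iter.tracepoint->state ? "active" : "inactive");
		tracepoint_iter_next(&iter);
	}
	tracepoint_iter_stop(&iter);
}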
544
545#ifdef CONFIG_MODULES
546
547int tracepoint_module_notify(struct notifier_block *self,
548 unsigned long val, void *data)
549{
550 struct module *mod = data;
551
552 switch (val) {
553 case MODULE_STATE_COMING:
554 tracepoint_update_probe_range(mod->tracepoints,
555 mod->tracepoints + mod->num_tracepoints);
556 break;
557 case MODULE_STATE_GOING:
558 tracepoint_update_probe_range(mod->tracepoints,
559 mod->tracepoints + mod->num_tracepoints);
560 break;
561 }
562 return 0;
563}
564
565struct notifier_block tracepoint_module_nb = {
566 .notifier_call = tracepoint_module_notify,
567 .priority = 0,
568};
569
570static int init_tracepoints(void)
571{
572 return register_module_notifier(&tracepoint_module_nb);
573}
574__initcall(init_tracepoints);
575
576#endif /* CONFIG_MODULES */
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 8ebcd8532dfb..2dc06ab35716 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -27,6 +27,7 @@
27 */ 27 */
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 const struct cred *tcred;
30 struct timespec uptime, ts; 31 struct timespec uptime, ts;
31 u64 ac_etime; 32 u64 ac_etime;
32 33
@@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
53 stats->ac_flag |= AXSIG; 54 stats->ac_flag |= AXSIG;
54 stats->ac_nice = task_nice(tsk); 55 stats->ac_nice = task_nice(tsk);
55 stats->ac_sched = tsk->policy; 56 stats->ac_sched = tsk->policy;
56 stats->ac_uid = tsk->uid;
57 stats->ac_gid = tsk->gid;
58 stats->ac_pid = tsk->pid; 57 stats->ac_pid = tsk->pid;
59 rcu_read_lock(); 58 rcu_read_lock();
59 tcred = __task_cred(tsk);
60 stats->ac_uid = tcred->uid;
61 stats->ac_gid = tcred->gid;
60 stats->ac_ppid = pid_alive(tsk) ? 62 stats->ac_ppid = pid_alive(tsk) ?
61 rcu_dereference(tsk->real_parent)->tgid : 0; 63 rcu_dereference(tsk->real_parent)->tgid : 0;
62 rcu_read_unlock(); 64 rcu_read_unlock();
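The hunk above reflects the new credentials API: a task's uid/gid are no longer read directly from task_struct but from its cred record under RCU. A minimal sketch of the access pattern (fragment; assumes <linux/cred.h>, with tsk standing for any task pointer):

const struct cred *tcred;
uid_t uid;
gid_t gid;

rcu_read_lock();
tcred = __task_cred(tsk);	/* RCU-protected, read-only view */
uid = tcred->uid;
gid = tcred->gid;
rcu_read_unlock();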
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 3e41c1673e2f..2460c3199b5a 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
84 84
85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
86{ 86{
87 const struct cred *cred = current_cred();
87 int retval; 88 int retval;
88 89
89 if (!(retval = put_user(high2lowuid(current->uid), ruid)) && 90 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
90 !(retval = put_user(high2lowuid(current->euid), euid))) 91 !(retval = put_user(high2lowuid(cred->euid), euid)))
91 retval = put_user(high2lowuid(current->suid), suid); 92 retval = put_user(high2lowuid(cred->suid), suid);
92 93
93 return retval; 94 return retval;
94} 95}
@@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
104 105
105asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
106{ 107{
108 const struct cred *cred = current_cred();
107 int retval; 109 int retval;
108 110
109 if (!(retval = put_user(high2lowgid(current->gid), rgid)) && 111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
110 !(retval = put_user(high2lowgid(current->egid), egid))) 112 !(retval = put_user(high2lowgid(cred->egid), egid)))
111 retval = put_user(high2lowgid(current->sgid), sgid); 113 retval = put_user(high2lowgid(cred->sgid), sgid);
112 114
113 return retval; 115 return retval;
114} 116}
@@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info,
161 163
162asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) 164asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
163{ 165{
164 int i = 0; 166 const struct cred *cred = current_cred();
167 int i;
165 168
166 if (gidsetsize < 0) 169 if (gidsetsize < 0)
167 return -EINVAL; 170 return -EINVAL;
168 171
169 get_group_info(current->group_info); 172 i = cred->group_info->ngroups;
170 i = current->group_info->ngroups;
171 if (gidsetsize) { 173 if (gidsetsize) {
172 if (i > gidsetsize) { 174 if (i > gidsetsize) {
173 i = -EINVAL; 175 i = -EINVAL;
174 goto out; 176 goto out;
175 } 177 }
176 if (groups16_to_user(grouplist, current->group_info)) { 178 if (groups16_to_user(grouplist, cred->group_info)) {
177 i = -EFAULT; 179 i = -EFAULT;
178 goto out; 180 goto out;
179 } 181 }
180 } 182 }
181out: 183out:
182 put_group_info(current->group_info);
183 return i; 184 return i;
184} 185}
185 186
@@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
210 211
211asmlinkage long sys_getuid16(void) 212asmlinkage long sys_getuid16(void)
212{ 213{
213 return high2lowuid(current->uid); 214 return high2lowuid(current_uid());
214} 215}
215 216
216asmlinkage long sys_geteuid16(void) 217asmlinkage long sys_geteuid16(void)
217{ 218{
218 return high2lowuid(current->euid); 219 return high2lowuid(current_euid());
219} 220}
220 221
221asmlinkage long sys_getgid16(void) 222asmlinkage long sys_getgid16(void)
222{ 223{
223 return high2lowgid(current->gid); 224 return high2lowgid(current_gid());
224} 225}
225 226
226asmlinkage long sys_getegid16(void) 227asmlinkage long sys_getegid16(void)
227{ 228{
228 return high2lowgid(current->egid); 229 return high2lowgid(current_egid());
229} 230}
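Both access styles used in this file are equivalent; the per-field helpers are shorthand for dereferencing current_cred(). A small sketch for reference (fragment; assumes <linux/cred.h>):

const struct cred *cred = current_cred();

uid_t uid  = cred->uid;		/* same value as current_uid() */
gid_t egid = current_egid();	/* shorthand for current_cred()->egid */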
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae43..477b6660f447 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,12 +16,13 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
19 20
20struct user_namespace init_user_ns = { 21struct user_namespace init_user_ns = {
21 .kref = { 22 .kref = {
22 .refcount = ATOMIC_INIT(2), 23 .refcount = ATOMIC_INIT(1),
23 }, 24 },
24 .root_user = &root_user, 25 .creator = &root_user,
25}; 26};
26EXPORT_SYMBOL_GPL(init_user_ns); 27EXPORT_SYMBOL_GPL(init_user_ns);
27 28
@@ -47,12 +48,14 @@ static struct kmem_cache *uid_cachep;
47 */ 48 */
48static DEFINE_SPINLOCK(uidhash_lock); 49static DEFINE_SPINLOCK(uidhash_lock);
49 50
51/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
50struct user_struct root_user = { 52struct user_struct root_user = {
51 .__count = ATOMIC_INIT(1), 53 .__count = ATOMIC_INIT(2),
52 .processes = ATOMIC_INIT(1), 54 .processes = ATOMIC_INIT(1),
53 .files = ATOMIC_INIT(0), 55 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns,
56#ifdef CONFIG_USER_SCHED 59#ifdef CONFIG_USER_SCHED
57 .tg = &init_task_group, 60 .tg = &init_task_group,
58#endif 61#endif
@@ -101,19 +104,15 @@ static int sched_create_user(struct user_struct *up)
101 if (IS_ERR(up->tg)) 104 if (IS_ERR(up->tg))
102 rc = -ENOMEM; 105 rc = -ENOMEM;
103 106
104 return rc; 107 set_tg_uid(up);
105}
106 108
107static void sched_switch_user(struct task_struct *p) 109 return rc;
108{
109 sched_move_task(p);
110} 110}
111 111
112#else /* CONFIG_USER_SCHED */ 112#else /* CONFIG_USER_SCHED */
113 113
114static void sched_destroy_user(struct user_struct *up) { } 114static void sched_destroy_user(struct user_struct *up) { }
115static int sched_create_user(struct user_struct *up) { return 0; } 115static int sched_create_user(struct user_struct *up) { return 0; }
116static void sched_switch_user(struct task_struct *p) { }
117 116
118#endif /* CONFIG_USER_SCHED */ 117#endif /* CONFIG_USER_SCHED */
119 118
@@ -242,13 +241,21 @@ static struct kobj_type uids_ktype = {
242 .release = uids_release, 241 .release = uids_release,
243}; 242};
244 243
245/* create /sys/kernel/uids/<uid>/cpu_share file for this user */ 244/*
245 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
246 * We do not create this file for users in a user namespace (until
247 * sysfs tagging is implemented).
248 *
249 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
250 */
246static int uids_user_create(struct user_struct *up) 251static int uids_user_create(struct user_struct *up)
247{ 252{
248 struct kobject *kobj = &up->kobj; 253 struct kobject *kobj = &up->kobj;
249 int error; 254 int error;
250 255
251 memset(kobj, 0, sizeof(struct kobject)); 256 memset(kobj, 0, sizeof(struct kobject));
257 if (up->user_ns != &init_user_ns)
258 return 0;
252 kobj->kset = uids_kset; 259 kobj->kset = uids_kset;
253 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); 260 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
254 if (error) { 261 if (error) {
@@ -284,6 +291,8 @@ static void remove_user_sysfs_dir(struct work_struct *w)
284 unsigned long flags; 291 unsigned long flags;
285 int remove_user = 0; 292 int remove_user = 0;
286 293
294 if (up->user_ns != &init_user_ns)
295 return;
287 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() 296 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
288 * atomic. 297 * atomic.
289 */ 298 */
@@ -319,12 +328,13 @@ done:
319 * IRQ state (as stored in flags) is restored and uidhash_lock released 328 * IRQ state (as stored in flags) is restored and uidhash_lock released
320 * upon function exit. 329 * upon function exit.
321 */ 330 */
322static inline void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
323{ 332{
324 /* restore back the count */ 333 /* restore back the count */
325 atomic_inc(&up->__count); 334 atomic_inc(&up->__count);
326 spin_unlock_irqrestore(&uidhash_lock, flags); 335 spin_unlock_irqrestore(&uidhash_lock, flags);
327 336
337 put_user_ns(up->user_ns);
328 INIT_WORK(&up->work, remove_user_sysfs_dir); 338 INIT_WORK(&up->work, remove_user_sysfs_dir);
329 schedule_work(&up->work); 339 schedule_work(&up->work);
330} 340}
@@ -340,13 +350,14 @@ static inline void uids_mutex_unlock(void) { }
340 * IRQ state (as stored in flags) is restored and uidhash_lock released 350 * IRQ state (as stored in flags) is restored and uidhash_lock released
341 * upon function exit. 351 * upon function exit.
342 */ 352 */
343static inline void free_user(struct user_struct *up, unsigned long flags) 353static void free_user(struct user_struct *up, unsigned long flags)
344{ 354{
345 uid_hash_remove(up); 355 uid_hash_remove(up);
346 spin_unlock_irqrestore(&uidhash_lock, flags); 356 spin_unlock_irqrestore(&uidhash_lock, flags);
347 sched_destroy_user(up); 357 sched_destroy_user(up);
348 key_put(up->uid_keyring); 358 key_put(up->uid_keyring);
349 key_put(up->session_keyring); 359 key_put(up->session_keyring);
360 put_user_ns(up->user_ns);
350 kmem_cache_free(uid_cachep, up); 361 kmem_cache_free(uid_cachep, up);
351} 362}
352 363
@@ -362,7 +373,7 @@ struct user_struct *find_user(uid_t uid)
362{ 373{
363 struct user_struct *ret; 374 struct user_struct *ret;
364 unsigned long flags; 375 unsigned long flags;
365 struct user_namespace *ns = current->nsproxy->user_ns; 376 struct user_namespace *ns = current_user_ns();
366 377
367 spin_lock_irqsave(&uidhash_lock, flags); 378 spin_lock_irqsave(&uidhash_lock, flags);
368 ret = uid_hash_find(uid, uidhashentry(ns, uid)); 379 ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -409,6 +420,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
409 if (sched_create_user(new) < 0) 420 if (sched_create_user(new) < 0)
410 goto out_free_user; 421 goto out_free_user;
411 422
423 new->user_ns = get_user_ns(ns);
424
412 if (uids_user_create(new)) 425 if (uids_user_create(new))
413 goto out_destoy_sched; 426 goto out_destoy_sched;
414 427
@@ -432,7 +445,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
432 up = new; 445 up = new;
433 } 446 }
434 spin_unlock_irq(&uidhash_lock); 447 spin_unlock_irq(&uidhash_lock);
435
436 } 448 }
437 449
438 uids_mutex_unlock(); 450 uids_mutex_unlock();
@@ -441,6 +453,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
441 453
442out_destoy_sched: 454out_destoy_sched:
443 sched_destroy_user(new); 455 sched_destroy_user(new);
456 put_user_ns(new->user_ns);
444out_free_user: 457out_free_user:
445 kmem_cache_free(uid_cachep, new); 458 kmem_cache_free(uid_cachep, new);
446out_unlock: 459out_unlock:
@@ -448,63 +461,6 @@ out_unlock:
448 return NULL; 461 return NULL;
449} 462}
450 463
451void switch_uid(struct user_struct *new_user)
452{
453 struct user_struct *old_user;
454
455 /* What if a process setreuid()'s and this brings the
456 * new uid over his NPROC rlimit? We can check this now
457 * cheaply with the new uid cache, so if it matters
458 * we should be checking for it. -DaveM
459 */
460 old_user = current->user;
461 atomic_inc(&new_user->processes);
462 atomic_dec(&old_user->processes);
463 switch_uid_keyring(new_user);
464 current->user = new_user;
465 sched_switch_user(current);
466
467 /*
468 * We need to synchronize with __sigqueue_alloc()
469 * doing a get_uid(p->user).. If that saw the old
470 * user value, we need to wait until it has exited
471 * its critical region before we can free the old
472 * structure.
473 */
474 smp_mb();
475 spin_unlock_wait(&current->sighand->siglock);
476
477 free_uid(old_user);
478 suid_keys(current);
479}
480
481#ifdef CONFIG_USER_NS
482void release_uids(struct user_namespace *ns)
483{
484 int i;
485 unsigned long flags;
486 struct hlist_head *head;
487 struct hlist_node *nd;
488
489 spin_lock_irqsave(&uidhash_lock, flags);
490 /*
491 * collapse the chains so that the user_struct-s will
492 * be still alive, but not in hashes. subsequent free_uid()
493 * will free them.
494 */
495 for (i = 0; i < UIDHASH_SZ; i++) {
496 head = ns->uidhash_table + i;
497 while (!hlist_empty(head)) {
498 nd = head->first;
499 hlist_del_init(nd);
500 }
501 }
502 spin_unlock_irqrestore(&uidhash_lock, flags);
503
504 free_uid(ns->root_user);
505}
506#endif
507
508static int __init uid_cache_init(void) 464static int __init uid_cache_init(void)
509{ 465{
510 int n; 466 int n;
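With switch_uid() and release_uids() gone, user_struct lifetime is driven purely by alloc_uid()/free_uid() and the namespace reference taken inside alloc_uid(). A hypothetical caller (the uid value is arbitrary):

struct user_struct *up;

up = alloc_uid(current_user_ns(), 1000);
if (up) {
	/* ... charge per-user resources against up ... */
	free_uid(up);	/* drops the user_ns reference on the final put */
}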
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 532858fa5b88..79084311ee57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,60 +9,55 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/cred.h>
12 13
13/* 14/*
14 * Clone a new ns copying an original user ns, setting refcount to 1 15 * Create a new user namespace, deriving the creator from the user in the
15 * @old_ns: namespace to clone 16 * passed credentials, and replacing that user with the new root user for the
16 * Return NULL on error (failure to kmalloc), new ns otherwise 17 * new namespace.
18 *
19 * This is called by copy_creds(), which will finish setting the target task's
20 * credentials.
17 */ 21 */
18static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) 22int create_user_ns(struct cred *new)
19{ 23{
20 struct user_namespace *ns; 24 struct user_namespace *ns;
21 struct user_struct *new_user; 25 struct user_struct *root_user;
22 int n; 26 int n;
23 27
24 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 28 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
25 if (!ns) 29 if (!ns)
26 return ERR_PTR(-ENOMEM); 30 return -ENOMEM;
27 31
28 kref_init(&ns->kref); 32 kref_init(&ns->kref);
29 33
30 for (n = 0; n < UIDHASH_SZ; ++n) 34 for (n = 0; n < UIDHASH_SZ; ++n)
31 INIT_HLIST_HEAD(ns->uidhash_table + n); 35 INIT_HLIST_HEAD(ns->uidhash_table + n);
32 36
33 /* Insert new root user. */ 37 /* Alloc new root user. */
34 ns->root_user = alloc_uid(ns, 0); 38 root_user = alloc_uid(ns, 0);
35 if (!ns->root_user) { 39 if (!root_user) {
36 kfree(ns); 40 kfree(ns);
37 return ERR_PTR(-ENOMEM); 41 return -ENOMEM;
38 } 42 }
39 43
40 /* Reset current->user with a new one */ 44 /* set the new root user in the credentials under preparation */
41 new_user = alloc_uid(ns, current->uid); 45 ns->creator = new->user;
42 if (!new_user) { 46 new->user = root_user;
43 free_uid(ns->root_user); 47 new->uid = new->euid = new->suid = new->fsuid = 0;
44 kfree(ns); 48 new->gid = new->egid = new->sgid = new->fsgid = 0;
45 return ERR_PTR(-ENOMEM); 49 put_group_info(new->group_info);
46 } 50 new->group_info = get_group_info(&init_groups);
47 51#ifdef CONFIG_KEYS
48 switch_uid(new_user); 52 key_put(new->request_key_auth);
49 return ns; 53 new->request_key_auth = NULL;
50} 54#endif
 51 55 /* tgcred will be cleared in our caller because CLONE_THREAD won't be set */
52struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
53{
54 struct user_namespace *new_ns;
55
56 BUG_ON(!old_ns);
57 get_user_ns(old_ns);
58
59 if (!(flags & CLONE_NEWUSER))
60 return old_ns;
61 56
62 new_ns = clone_user_ns(old_ns); 57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */
58 kref_set(&ns->kref, 1);
63 59
64 put_user_ns(old_ns); 60 return 0;
65 return new_ns;
66} 61}
67 62
68void free_user_ns(struct kref *kref) 63void free_user_ns(struct kref *kref)
@@ -70,7 +65,7 @@ void free_user_ns(struct kref *kref)
70 struct user_namespace *ns; 65 struct user_namespace *ns;
71 66
72 ns = container_of(kref, struct user_namespace, kref); 67 ns = container_of(kref, struct user_namespace, kref);
73 release_uids(ns); 68 free_uid(ns->creator);
74 kfree(ns); 69 kfree(ns);
75} 70}
76EXPORT_SYMBOL(free_user_ns); 71EXPORT_SYMBOL(free_user_ns);
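create_user_ns() is now driven from copy_creds() rather than from the nsproxy path. A hedged sketch of the caller side, modelled on copy_creds(); here 'new' is the struct cred being prepared and 'clone_flags' are the clone(2) flags:

if (clone_flags & CLONE_NEWUSER) {
	ret = create_user_ns(new);	/* installs a fresh root user in 'new' */
	if (ret < 0)
		goto error_put;
}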
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4ab9659d269e..3b34b3545936 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
60 60
61#ifdef CONFIG_SYSCTL_SYSCALL 61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */ 62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, 63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp, 64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen) 65 void __user *newval, size_t newlen)
66{ 66{
@@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
69 write = newval && newlen; 69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table)); 70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write); 71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, name, nlen, 72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 oldval, oldlenp, newval, newlen);
74 put_uts(table, write, uts_table.data); 73 put_uts(table, write, uts_table.data);
75 return r; 74 return r;
76} 75}
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..cd87131f2fc2 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
72 spin_lock_irqsave(&q->lock, flags); 72 spin_lock_irqsave(&q->lock, flags);
73 if (list_empty(&wait->task_list)) 73 if (list_empty(&wait->task_list))
74 __add_wait_queue(q, wait); 74 __add_wait_queue(q, wait);
75 /* 75 set_current_state(state);
76 * don't alter the task state if this is just going to
77 * queue an async wait queue callback
78 */
79 if (is_sync_wait(wait))
80 set_current_state(state);
81 spin_unlock_irqrestore(&q->lock, flags); 76 spin_unlock_irqrestore(&q->lock, flags);
82} 77}
83EXPORT_SYMBOL(prepare_to_wait); 78EXPORT_SYMBOL(prepare_to_wait);
@@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91 spin_lock_irqsave(&q->lock, flags); 86 spin_lock_irqsave(&q->lock, flags);
92 if (list_empty(&wait->task_list)) 87 if (list_empty(&wait->task_list))
93 __add_wait_queue_tail(q, wait); 88 __add_wait_queue_tail(q, wait);
94 /* 89 set_current_state(state);
95 * don't alter the task state if this is just going to
96 * queue an async wait queue callback
97 */
98 if (is_sync_wait(wait))
99 set_current_state(state);
100 spin_unlock_irqrestore(&q->lock, flags); 90 spin_unlock_irqrestore(&q->lock, flags);
101} 91}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
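With the async-wait special case removed, prepare_to_wait() unconditionally sets the task state, matching the canonical open-coded wait loop it is designed for ('wq' and 'condition' below are placeholders):

DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
	if (condition)
		break;
	if (signal_pending(current))
		break;
	schedule();
}
finish_wait(&wq, &wait);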
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4048e92aa04f..4952322cba45 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -9,7 +9,7 @@
9 * Derived from the taskqueue/keventd code by: 9 * Derived from the taskqueue/keventd code by:
10 * 10 *
11 * David Woodhouse <dwmw2@infradead.org> 11 * David Woodhouse <dwmw2@infradead.org>
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 * 15 *
@@ -62,6 +62,7 @@ struct workqueue_struct {
62 const char *name; 62 const char *name;
63 int singlethread; 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */ 64 int freezeable; /* Freeze threads during suspend */
65 int rt;
65#ifdef CONFIG_LOCKDEP 66#ifdef CONFIG_LOCKDEP
66 struct lockdep_map lockdep_map; 67 struct lockdep_map lockdep_map;
67#endif 68#endif
@@ -83,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly;
83static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_t cpu_populated_map __read_mostly;
84 85
85/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
86static inline int is_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
87{ 88{
88 return wq->singlethread; 89 return wq->singlethread;
89} 90}
90 91
91static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
92{ 93{
93 return is_single_threaded(wq) 94 return is_wq_single_threaded(wq)
94 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? &cpu_singlethread_map : &cpu_populated_map;
95} 96}
96 97
97static 98static
98struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) 99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
99{ 100{
100 if (unlikely(is_single_threaded(wq))) 101 if (unlikely(is_wq_single_threaded(wq)))
101 cpu = singlethread_cpu; 102 cpu = singlethread_cpu;
102 return per_cpu_ptr(wq->cpu_wq, cpu); 103 return per_cpu_ptr(wq->cpu_wq, cpu);
103} 104}
@@ -766,8 +767,9 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
766 767
767static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 768static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
768{ 769{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
769 struct workqueue_struct *wq = cwq->wq; 771 struct workqueue_struct *wq = cwq->wq;
770 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; 772 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
771 struct task_struct *p; 773 struct task_struct *p;
772 774
773 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
@@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
781 */ 783 */
782 if (IS_ERR(p)) 784 if (IS_ERR(p))
783 return PTR_ERR(p); 785 return PTR_ERR(p);
784 786 if (cwq->wq->rt)
787 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
785 cwq->thread = p; 788 cwq->thread = p;
786 789
787 return 0; 790 return 0;
@@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
801struct workqueue_struct *__create_workqueue_key(const char *name, 804struct workqueue_struct *__create_workqueue_key(const char *name,
802 int singlethread, 805 int singlethread,
803 int freezeable, 806 int freezeable,
807 int rt,
804 struct lock_class_key *key, 808 struct lock_class_key *key,
805 const char *lock_name) 809 const char *lock_name)
806{ 810{
@@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
822 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 826 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
823 wq->singlethread = singlethread; 827 wq->singlethread = singlethread;
824 wq->freezeable = freezeable; 828 wq->freezeable = freezeable;
829 wq->rt = rt;
825 INIT_LIST_HEAD(&wq->list); 830 INIT_LIST_HEAD(&wq->list);
826 831
827 if (singlethread) { 832 if (singlethread) {
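The new 'rt' flag lets a workqueue's worker threads run as SCHED_FIFO. Callers are expected to go through a convenience wrapper rather than __create_workqueue_key() directly; assuming the companion create_rt_workqueue() macro (which passes rt = 1), usage would look like:

struct workqueue_struct *wq;

wq = create_rt_workqueue("kstop");	/* worker threads run at MAX_RT_PRIO-1 */
if (!wq)
	return -ENOMEM;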
@@ -965,6 +970,51 @@ undo:
965 return ret; 970 return ret;
966} 971}
967 972
973#ifdef CONFIG_SMP
974struct work_for_cpu {
975 struct work_struct work;
976 long (*fn)(void *);
977 void *arg;
978 long ret;
979};
980
981static void do_work_for_cpu(struct work_struct *w)
982{
983 struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
984
985 wfc->ret = wfc->fn(wfc->arg);
986}
987
988/**
989 * work_on_cpu - run a function in user context on a particular cpu
990 * @cpu: the cpu to run on
991 * @fn: the function to run
992 * @arg: the function arg
993 *
 994 * This will return -EINVAL if the cpu is not online, or the return value
995 * of @fn otherwise.
996 */
997long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
998{
999 struct work_for_cpu wfc;
1000
1001 INIT_WORK(&wfc.work, do_work_for_cpu);
1002 wfc.fn = fn;
1003 wfc.arg = arg;
1004 get_online_cpus();
1005 if (unlikely(!cpu_online(cpu)))
1006 wfc.ret = -EINVAL;
1007 else {
1008 schedule_work_on(cpu, &wfc.work);
1009 flush_work(&wfc.work);
1010 }
1011 put_online_cpus();
1012
1013 return wfc.ret;
1014}
1015EXPORT_SYMBOL_GPL(work_on_cpu);
1016#endif /* CONFIG_SMP */
1017
968void __init init_workqueues(void) 1018void __init init_workqueues(void)
969{ 1019{
970 cpu_populated_map = cpu_online_map; 1020 cpu_populated_map = cpu_online_map;
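The work_on_cpu() helper added above gives callers a process-context way to run a function on a specific CPU without adjusting their own cpumask. A hypothetical use (the callback and wrapper are illustrative):

static long read_local_cpu(void *arg)
{
	/* Runs in keventd context on the requested CPU. */
	return (long)smp_processor_id();
}

static long query_cpu(unsigned int cpu)
{
	long ret = work_on_cpu(cpu, read_local_cpu, NULL);

	return ret;	/* -EINVAL if the CPU was not online */
}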